diff --git a/shared/rocroller/client/CMakeLists.txt b/shared/rocroller/client/CMakeLists.txt
index f737b9967ee..5a6cf6c7d16 100644
--- a/shared/rocroller/client/CMakeLists.txt
+++ b/shared/rocroller/client/CMakeLists.txt
@@ -32,6 +32,7 @@ target_sources(rocroller-gemm
         "${CMAKE_CURRENT_SOURCE_DIR}/include/client/GEMMSolution.hpp"
         "${CMAKE_CURRENT_SOURCE_DIR}/include/client/GraphInspector.hpp"
         "${CMAKE_CURRENT_SOURCE_DIR}/include/client/GraphInspector_impl.hpp"
+        "${CMAKE_CURRENT_SOURCE_DIR}/include/client/PreSwizzle.hpp"
         "${CMAKE_CURRENT_SOURCE_DIR}/include/client/StreamKGEMMSolution.hpp"
         "${CMAKE_CURRENT_SOURCE_DIR}/include/client/visualize.hpp"
 )
diff --git a/shared/rocroller/client/include/client/DataParallelGEMMSolution.hpp b/shared/rocroller/client/include/client/DataParallelGEMMSolution.hpp
index 4dc3419022c..97ab43c98f9 100644
--- a/shared/rocroller/client/include/client/DataParallelGEMMSolution.hpp
+++ b/shared/rocroller/client/include/client/DataParallelGEMMSolution.hpp
@@ -133,11 +133,23 @@ namespace rocRoller
                         m_tagLoadScaleA   = command->addOperation(
                             rocRoller::Operations::T_Load_Tiled(m_tagTensorScaleA.value()));
 
+                        auto scaleInputA = m_tagLoadScaleA;
+
+                        if(solutionParams.types.scaleSkipPermlane)
+                        {
+                            AssertFatal(solutionParams.types.scaleShuffleTileA.size() == 3,
+                                        ShowValue(solutionParams.types.scaleShuffleTileA));
+
+                            scaleInputA
+                                = command->addOperation(rocRoller::Operations::SubTileTranspose(
+                                    *m_tagLoadScaleA, solutionParams.types.scaleShuffleTileA));
+                        }
+
                         m_tagBlockScaleA = mulInputA
                             = command->addOperation(rocRoller::Operations::BlockScale(
                                 m_tagA,
                                 2,
-                                m_tagLoadScaleA,
+                                scaleInputA,
                                 {1,
                                  static_cast<unsigned long>(solutionParams.types.scaleBlockSize)}));
                     }
@@ -160,11 +172,22 @@ namespace rocRoller
                         m_tagLoadScaleB   = command->addOperation(
                             rocRoller::Operations::T_Load_Tiled(m_tagTensorScaleB.value()));
 
+                        auto scaleInputB = m_tagLoadScaleB;
+
+                        if(solutionParams.types.scaleSkipPermlane)
+                        {
+                            AssertFatal(solutionParams.types.scaleShuffleTileB.size() == 3);
+
+                            scaleInputB
+                                = command->addOperation(rocRoller::Operations::SubTileTranspose(
+                                    *m_tagLoadScaleB, solutionParams.types.scaleShuffleTileB));
+                        }
+
                         m_tagBlockScaleB = mulInputB
                             = command->addOperation(rocRoller::Operations::BlockScale(
                                 m_tagB,
                                 2,
-                                m_tagLoadScaleB,
+                                scaleInputB,
                                 {static_cast<unsigned long>(solutionParams.types.scaleBlockSize),
                                  1}));
                     }
diff --git a/shared/rocroller/client/include/client/GEMMParameters.hpp b/shared/rocroller/client/include/client/GEMMParameters.hpp
index b981b4b42c2..169d7f0b322 100644
--- a/shared/rocroller/client/include/client/GEMMParameters.hpp
+++ b/shared/rocroller/client/include/client/GEMMParameters.hpp
@@ -75,6 +75,10 @@ namespace rocRoller
 
                 bool scaleSkipPermlane = false;
 
+                // Order: M/N, K tile, K subtile
+                std::vector<size_t> scaleShuffleTileA;
+                std::vector<size_t> scaleShuffleTileB;
+
                 std::string kernelNamePart() const;
             };
 
diff --git a/shared/rocroller/client/include/client/GEMMParameters_serialization.hpp b/shared/rocroller/client/include/client/GEMMParameters_serialization.hpp
index 61bf28888a2..deaf056d6e8 100644
--- a/shared/rocroller/client/include/client/GEMMParameters_serialization.hpp
+++ b/shared/rocroller/client/include/client/GEMMParameters_serialization.hpp
@@ -65,6 +65,9 @@ namespace rocRoller::Serialization
 
             iot::mapRequired(io, "scaleBlockSize", params.scaleBlockSize);
             iot::mapRequired(io, "scaleSkipPermlane", params.scaleSkipPermlane);
+
+            iot::mapRequired(io, "scaleShuffleTileA", params.scaleShuffleTileA);
+            iot::mapRequired(io, "scaleShuffleTileB", params.scaleShuffleTileB);
         }
 
         static void mapping(IO& io, Client::GEMMClient::TypeParameters& params, EmptyContext& ctx)
diff --git a/shared/rocroller/client/include/client/GraphInspector.hpp b/shared/rocroller/client/include/client/GraphInspector.hpp
index 23360b5883c..d301aa5acfe 100644
--- a/shared/rocroller/client/include/client/GraphInspector.hpp
+++ b/shared/rocroller/client/include/client/GraphInspector.hpp
@@ -121,6 +121,11 @@ namespace rocRoller
 
             KernelGraph::CoordinateGraph::Transformer& tx();
 
+            KernelGraph::KernelGraphPtr graph()
+            {
+                return m_kgraph;
+            };
+
         private:
             void assignLiteralSizesAndStrides();
 
diff --git a/shared/rocroller/client/include/client/PreSwizzle.hpp b/shared/rocroller/client/include/client/PreSwizzle.hpp
new file mode 100644
index 00000000000..42252a62001
--- /dev/null
+++ b/shared/rocroller/client/include/client/PreSwizzle.hpp
@@ -0,0 +1,87 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright 2024-2025 AMD ROCm(TM) Software
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#pragma once
+
+#include <vector>
+
+#include <rocRoller/TensorDescriptor.hpp>
+
+#include <client/GEMMParameters.hpp>
+
+namespace rocRoller::Client
+{
+
+    template <typename T>
+    inline std::vector<T> preSwizzle(std::vector<T> const&      input,
+                                     TensorDescriptor const&    desc,
+                                     std::vector<size_t> const& tile)
+    {
+        AssertFatal(tile.size() == 3, ShowValue(tile.size()), ShowValue(tile));
+        AssertFatal(desc.dimensions() == 2,
+                    "Batch dimension not yet supported.",
+                    ShowValue(desc.dimensions()),
+                    ShowValue(desc));
+        AssertFatal(desc.totalAllocatedElements() == input.size(),
+                    ShowValue(desc),
+                    ShowValue(input.size()));
+
+        auto tileMN   = tile[0];
+        auto tileK    = tile[1];
+        auto subTileK = tile[2];
+
+        size_t instPerTileK   = tileK / subTileK;
+        size_t instKPerTileMN = tileMN / subTileK;
+
+        std::vector<size_t> srcSizes = {subTileK,
+                                        instPerTileK,
+                                        desc.size(0) / (tileK),
+                                        instKPerTileMN,
+                                        subTileK,
+                                        desc.size(1) / (tileMN)};
+
+        TensorDescriptor src(desc.dataType(), srcSizes);
+
+        AssertFatal(src.totalAllocatedElements() == desc.totalAllocatedElements(),
+                    ShowValue(src.totalAllocatedElements()),
+                    ShowValue(desc.totalAllocatedElements()),
+                    ShowValue(src.totalAllocatedElements() / desc.totalAllocatedElements()),
+                    ShowValue(src),
+                    ShowValue(desc));
+
+        auto dst
+            = TensorDescriptor::ShuffledNoPadding(desc.dataType(), srcSizes, {4, 1, 2, 3, 0, 5});
+
+        AssertFatal(src.totalAllocatedElements() == dst.totalAllocatedElements(),
+                    ShowValue(src.totalAllocatedElements()),
+                    ShowValue(dst.totalAllocatedElements()),
+                    ShowValue(src),
+                    ShowValue(dst));
+
+        return shuffleDims(input, dst, src);
+    }
+
+}
diff --git a/shared/rocroller/client/src/GEMMParameters.cpp b/shared/rocroller/client/src/GEMMParameters.cpp
index c6be0cfed18..7d3c797bf26 100644
--- a/shared/rocroller/client/src/GEMMParameters.cpp
+++ b/shared/rocroller/client/src/GEMMParameters.cpp
@@ -66,7 +66,9 @@ namespace rocRoller
                     rv << "_" << t;
 
                 if(scaleSkipPermlane)
-                    rv << "_PRE_SW";
+                {
+                    rv << "_PreSW_AB";
+                }
 
                 return rv.str();
             }
diff --git a/shared/rocroller/client/src/gemm.cpp b/shared/rocroller/client/src/gemm.cpp
index d0175cbd217..279d6d62370 100644
--- a/shared/rocroller/client/src/gemm.cpp
+++ b/shared/rocroller/client/src/gemm.cpp
@@ -47,6 +47,7 @@
 #include "client/DataParallelGEMMSolution.hpp"
 #include "client/GEMMParameters.hpp"
 #include "client/GEMMParameters_serialization.hpp"
+#include "client/PreSwizzle.hpp"
 #include "client/StreamKGEMMSolution.hpp"
 
 #include <CLI/CLI.hpp>
@@ -218,11 +219,47 @@ namespace rocRoller::Client::GEMMClient
                     ShowValue(problemParams.types.scaleB));
         if(problemParams.types.scaleA == Operations::ScaleMode::Separate)
         {
-            deviceScaleA = make_shared_device(hostScaleA);
+            if(problemParams.types.scaleSkipPermlane)
+            {
+                AssertFatal(problemParams.types.scaleShuffleTileA.size() == 3);
+
+                auto descScaleA = descA.withNormalizedDimensions();
+                {
+                    auto sizes = descScaleA.sizes();
+                    sizes[0] /= problemParams.types.scaleBlockSize;
+                    descScaleA = TensorDescriptor(descScaleA.dataType(), std::move(sizes));
+                }
+
+                auto tmpScaleA
+                    = preSwizzle(hostScaleA, descScaleA, problemParams.types.scaleShuffleTileA);
+                deviceScaleA = make_shared_device(tmpScaleA);
+            }
+            else
+            {
+                deviceScaleA = make_shared_device(hostScaleA);
+            }
         }
         if(problemParams.types.scaleB == Operations::ScaleMode::Separate)
         {
-            deviceScaleB = make_shared_device(hostScaleB);
+            if(problemParams.types.scaleSkipPermlane)
+            {
+                AssertFatal(problemParams.types.scaleShuffleTileB.size() == 3);
+
+                auto descScaleB = descB.withNormalizedDimensions();
+                {
+                    auto sizes = descScaleB.sizes();
+                    sizes[0] /= problemParams.types.scaleBlockSize;
+                    descScaleB = TensorDescriptor(descScaleB.dataType(), std::move(sizes));
+                }
+
+                auto tmpScaleB
+                    = preSwizzle(hostScaleB, descScaleB, problemParams.types.scaleShuffleTileB);
+                deviceScaleB = make_shared_device(tmpScaleB);
+            }
+            else
+            {
+                deviceScaleB = make_shared_device(hostScaleB);
+            }
         }
 
         std::cout << "Generating launch parameters and runtime arguments..." << std::endl;
@@ -1396,16 +1433,12 @@ int main(int argc, const char* argv[])
         AssertFatal(arch.HasCapability(GPUCapability::HasBlockScaling32),
                     fmt::format("Architecture {} does not support block scaling.",
                                 arch.target().toString()));
-        types.scaleBlockSize         = arch.GetCapability(GPUCapability::DefaultScaleBlockSize);
-        problem.types.scaleBlockSize = types.scaleBlockSize;
+        types.scaleBlockSize = arch.GetCapability(GPUCapability::DefaultScaleBlockSize);
     }
 
     AssertFatal((types.typeAcc == "float") || (types.typeAcc == "half")
                 || (types.typeAcc == "bf16"));
 
-    problem.types  = types;
-    solution.types = types;
-
     // TODO: Reevaluate the relationship between problem and solution params.
     problem.workgroupMapping = solution.workgroupMapping;
 
@@ -1417,8 +1450,8 @@ int main(int argc, const char* argv[])
     // Set default MI sizes
     if(arch.HasCapability(GPUCapability::HasMFMA))
     {
-        if(solution.types.typeA == "float" && solution.types.typeB == "float"
-           && solution.types.typeC == "float" && solution.types.typeD == "float")
+        if(types.typeA == "float" && types.typeB == "float" && types.typeC == "float"
+           && types.typeD == "float")
         {
             if(solution.waveM == -1)
                 solution.waveM = 32;
@@ -1429,7 +1462,7 @@ int main(int argc, const char* argv[])
             if(solution.waveB == -1)
                 solution.waveB = 1;
         }
-        else if(solution.types.typeA == "half" && solution.types.typeB == "half")
+        else if(types.typeA == "half" && types.typeB == "half")
         {
             if(solution.waveM == -1)
                 solution.waveM = 32;
@@ -1440,7 +1473,7 @@ int main(int argc, const char* argv[])
             if(solution.waveB == -1)
                 solution.waveB = 1;
         }
-        else if(solution.types.typeA == "bf16" && solution.types.typeB == "bf16")
+        else if(types.typeA == "bf16" && types.typeB == "bf16")
         {
             if(solution.waveM == -1)
                 solution.waveM = 16;
@@ -1451,8 +1484,8 @@ int main(int argc, const char* argv[])
             if(solution.waveB == -1)
                 solution.waveB = 1;
         }
-        else if((solution.types.typeA == "fp8" && solution.types.typeB == "fp8")
-                || (solution.types.typeA == "bf8" && solution.types.typeB == "bf8"))
+        else if((types.typeA == "fp8" && types.typeB == "fp8")
+                || (types.typeA == "bf8" && types.typeB == "bf8"))
         {
             if(solution.waveM == -1)
                 solution.waveM = 16;
@@ -1468,12 +1501,12 @@ int main(int argc, const char* argv[])
     {
         if(arch.target().isRDNA4GPU())
         {
-            if((solution.types.typeA == "half" && solution.types.typeB == "half")
-               || (solution.types.typeA == "bf16" && solution.types.typeB == "bf16")
-               || (solution.types.typeA == "fp8" && solution.types.typeB == "fp8")
-               || (solution.types.typeA == "bf8" && solution.types.typeB == "bf8")
-               || (solution.types.typeA == "bf8" && solution.types.typeB == "fp8")
-               || (solution.types.typeA == "fp8" && solution.types.typeB == "bf8"))
+            if((types.typeA == "half" && types.typeB == "half")
+               || (types.typeA == "bf16" && types.typeB == "bf16")
+               || (types.typeA == "fp8" && types.typeB == "fp8")
+               || (types.typeA == "bf8" && types.typeB == "bf8")
+               || (types.typeA == "bf8" && types.typeB == "fp8")
+               || (types.typeA == "fp8" && types.typeB == "bf8"))
             {
                 if(solution.waveM == -1)
                     solution.waveM = 16;
@@ -1489,24 +1522,24 @@ int main(int argc, const char* argv[])
                 // Override default settings for the `example` and `generate` subcommands.
                 if(example->parsed() || generate->parsed())
                 {
-                    solution.types.typeA = "half";
-                    solution.types.typeB = "half";
-                    solution.types.typeC = "half";
-                    solution.types.typeD = "half";
-                    solution.waveM       = 16;
-                    solution.waveN       = 16;
-                    solution.waveK       = 16;
-                    solution.waveB       = 1;
+                    types.typeA    = "half";
+                    types.typeB    = "half";
+                    types.typeC    = "half";
+                    types.typeD    = "half";
+                    solution.waveM = 16;
+                    solution.waveN = 16;
+                    solution.waveK = 16;
+                    solution.waveB = 1;
                 }
                 else
                 {
                     Throw<FatalError>("Unsupported MI on: ",
                                       arch.target().toString(),
-                                      ShowValue(solution.types.typeA),
-                                      ShowValue(solution.types.typeB),
-                                      ShowValue(solution.types.typeC),
-                                      ShowValue(solution.types.typeD),
-                                      ShowValue(solution.types.typeAcc));
+                                      ShowValue(types.typeA),
+                                      ShowValue(types.typeB),
+                                      ShowValue(types.typeC),
+                                      ShowValue(types.typeD),
+                                      ShowValue(types.typeAcc));
                 }
             }
             // TODO Support prefetch on gfx12
@@ -1522,6 +1555,39 @@ int main(int argc, const char* argv[])
         Throw<FatalError>("Unsupported arch for GEMM client: ", arch.target().toString());
     }
 
+    if(types.scaleSkipPermlane)
+    {
+        AssertFatal(types.transA == Client::GEMMClient::TransposeType::T, ShowValue(types));
+        AssertFatal(types.scaleA == Operations::ScaleMode::Separate, ShowValue(types));
+
+        size_t kSubtile = solution.waveK / types.scaleBlockSize;
+
+        AssertFatal(kSubtile == 2 || kSubtile == 4,
+                    ShowValue(kSubtile),
+                    ShowValue(solution.waveK),
+                    ShowValue(types.scaleBlockSize));
+
+        types.scaleShuffleTileA = {64, 4, kSubtile};
+    }
+
+    if(types.scaleSkipPermlane)
+    {
+        AssertFatal(types.transB == Client::GEMMClient::TransposeType::N, ShowValue(types));
+        AssertFatal(types.scaleB == Operations::ScaleMode::Separate, ShowValue(types));
+
+        size_t kSubtile = solution.waveK / types.scaleBlockSize;
+
+        AssertFatal(kSubtile == 2 || kSubtile == 4,
+                    ShowValue(kSubtile),
+                    ShowValue(solution.waveK),
+                    ShowValue(types.scaleBlockSize));
+
+        types.scaleShuffleTileB = {64, 4, kSubtile};
+    }
+
+    problem.types  = types;
+    solution.types = types;
+
     // Set default prefetchMixMemOps
     if(prefetchMixMemOpsFlag->count() == 0)
     {
@@ -1530,10 +1596,10 @@ int main(int argc, const char* argv[])
         if(solution.prefetchLDSFactor != 0)
             solution.prefetchMixMemOps = true;
 
-        if(solution.types.scaleB == Operations::ScaleMode::Separate && !solution.loadLDSScaleB)
+        if(types.scaleB == Operations::ScaleMode::Separate && !solution.loadLDSScaleB)
             solution.prefetchMixMemOps = false;
 
-        if(solution.types.scaleA == Operations::ScaleMode::Separate && !solution.loadLDSScaleA)
+        if(types.scaleA == Operations::ScaleMode::Separate && !solution.loadLDSScaleA)
             solution.prefetchMixMemOps = false;
 
         // TODO: enable (prefetchMixMemOps == true && prefetchLDSFactor == 2 && direct2LDSA/B = true)
diff --git a/shared/rocroller/client/test_gemm_client.py b/shared/rocroller/client/test_gemm_client.py
index 90e49d0aa72..a9b2ee0bcc2 100644
--- a/shared/rocroller/client/test_gemm_client.py
+++ b/shared/rocroller/client/test_gemm_client.py
@@ -270,6 +270,8 @@ def client_arguments(self):
   scale_B: None
   scaleType_B: None
   scaleBlockSize: 0
+  scaleShuffleTileA: []
+  scaleShuffleTileB: []
   scaleSkipPermlane: false
 streamK: false
 streamKTwoTile: false
@@ -327,6 +329,8 @@ def client_arguments(self):
   scale_B: None
   scaleType_B: None
   scaleBlockSize: 0
+  scaleShuffleTileA: []
+  scaleShuffleTileB: []
   scaleSkipPermlane: false
 loadLDSScale_A: false
 loadLDSScale_B: false
@@ -382,6 +386,8 @@ def client_arguments(self):
   scale_B: None
   scaleType_B: None
   scaleBlockSize: 0
+  scaleShuffleTileA: []
+  scaleShuffleTileB: []
   scaleSkipPermlane: false
 loadLDSScale_A: false
 loadLDSScale_B: false
diff --git a/shared/rocroller/lib/include/CMakeLists.txt b/shared/rocroller/lib/include/CMakeLists.txt
index 7fb1540a855..d17938edddf 100644
--- a/shared/rocroller/lib/include/CMakeLists.txt
+++ b/shared/rocroller/lib/include/CMakeLists.txt
@@ -33,6 +33,7 @@ target_sources(rocroller
         "${CMAKE_CURRENT_SOURCE_DIR}/rocRoller/ScheduledInstructions.hpp"
         "${CMAKE_CURRENT_SOURCE_DIR}/rocRoller/ScheduledInstructions_fwd.hpp"
         "${CMAKE_CURRENT_SOURCE_DIR}/rocRoller/TensorDescriptor.hpp"
+        "${CMAKE_CURRENT_SOURCE_DIR}/rocRoller/TensorDescriptor_impl.hpp"
 
         "${CMAKE_CURRENT_SOURCE_DIR}/rocRoller/Assemblers/Assembler.hpp"
         "${CMAKE_CURRENT_SOURCE_DIR}/rocRoller/Assemblers/Assembler_fwd.hpp"
diff --git a/shared/rocroller/lib/include/rocRoller/KernelGraph/ControlGraph/Operation.hpp b/shared/rocroller/lib/include/rocRoller/KernelGraph/ControlGraph/Operation.hpp
index bba66d9b3ab..77647d91a0b 100644
--- a/shared/rocroller/lib/include/rocRoller/KernelGraph/ControlGraph/Operation.hpp
+++ b/shared/rocroller/lib/include/rocRoller/KernelGraph/ControlGraph/Operation.hpp
@@ -450,6 +450,8 @@ namespace rocRoller
             Operations::ScaleMode scaleModeB = Operations::ScaleMode::None;
             std::vector<size_t>   scaleStridesA;
             std::vector<size_t>   scaleStridesB;
+            std::vector<size_t>   scalePreShuffledTileA;
+            std::vector<size_t>   scalePreShuffledTileB;
             VariableType          accType = DataType::Float;
 
             std::string name() const;
diff --git a/shared/rocroller/lib/include/rocRoller/KernelGraph/KernelGraph.hpp b/shared/rocroller/lib/include/rocRoller/KernelGraph/KernelGraph.hpp
index e021ce64a16..c87206c1ed9 100644
--- a/shared/rocroller/lib/include/rocRoller/KernelGraph/KernelGraph.hpp
+++ b/shared/rocroller/lib/include/rocRoller/KernelGraph/KernelGraph.hpp
@@ -182,7 +182,7 @@ namespace rocRoller
          *
          * @ingroup KernelGraph
          */
-        KernelGraph translate(CommandPtr);
+        KernelGraph translate(CommandPtr, CommandParametersPtr params = nullptr);
 
         /**
          * Generate assembly from a KernelGraph.
diff --git a/shared/rocroller/lib/include/rocRoller/KernelOptions_detail.hpp b/shared/rocroller/lib/include/rocRoller/KernelOptions_detail.hpp
index 0c0c0435147..e5a789e8e59 100644
--- a/shared/rocroller/lib/include/rocRoller/KernelOptions_detail.hpp
+++ b/shared/rocroller/lib/include/rocRoller/KernelOptions_detail.hpp
@@ -118,8 +118,8 @@ namespace rocRoller
 
         /**
          * Skip generation of permlane instructions when loading scale data.
-         * This is experimental and will cause the validation to fail, but will
-         * show better performance.
+         * This is experimental and requires that the input be specifically
+         * modified, but will show better performance.
          */
         bool scaleSkipPermlane = false;
 
diff --git a/shared/rocroller/lib/include/rocRoller/Operations/BlockScale.hpp b/shared/rocroller/lib/include/rocRoller/Operations/BlockScale.hpp
index 90f49e319d2..b1545748ca6 100644
--- a/shared/rocroller/lib/include/rocRoller/Operations/BlockScale.hpp
+++ b/shared/rocroller/lib/include/rocRoller/Operations/BlockScale.hpp
@@ -59,7 +59,7 @@ namespace rocRoller
             explicit BlockScale(OperationTag                data,
                                 int                         dimensions,
                                 std::optional<OperationTag> scale   = {},
-                                std::vector<size_t> const&  strides = {});
+                                std::vector<size_t>         strides = {});
 
             std::unordered_set<OperationTag> getInputs() const;
             std::string                      toString() const;
@@ -78,5 +78,29 @@ namespace rocRoller
             template <typename T1, typename T2, typename T3>
             friend struct rocRoller::Serialization::MappingTraits;
         };
+
+        class SubTileTranspose : public BaseOperation
+        {
+        public:
+            SubTileTranspose() = delete;
+
+            explicit SubTileTranspose(OperationTag input, std::vector<size_t> tileDimensions);
+
+            std::unordered_set<OperationTag> getInputs() const;
+            std::string                      toString() const;
+            std::vector<size_t> const&       tileDimensions() const;
+
+            auto operator<=>(SubTileTranspose const&) const = default;
+            bool operator==(SubTileTranspose const& other) const;
+
+            OperationTag input() const;
+
+        private:
+            OperationTag        m_input;
+            std::vector<size_t> m_tileDimensions;
+
+            template <typename T1, typename T2, typename T3>
+            friend struct rocRoller::Serialization::MappingTraits;
+        };
     }
 }
diff --git a/shared/rocroller/lib/include/rocRoller/Operations/Operation.hpp b/shared/rocroller/lib/include/rocRoller/Operations/Operation.hpp
index 2ac6afa0462..45bf53e8a45 100644
--- a/shared/rocroller/lib/include/rocRoller/Operations/Operation.hpp
+++ b/shared/rocroller/lib/include/rocRoller/Operations/Operation.hpp
@@ -45,6 +45,8 @@ namespace rocRoller
             OperationTag getTag() const;
             void         setTag(OperationTag tag);
 
+            std::strong_ordering operator<=>(BaseOperation const&) const;
+
         protected:
             OperationTag           m_tag;
             std::weak_ptr<Command> m_command;
diff --git a/shared/rocroller/lib/include/rocRoller/Operations/Operations.hpp b/shared/rocroller/lib/include/rocRoller/Operations/Operations.hpp
index 54365e6beb4..a75ac7c5366 100644
--- a/shared/rocroller/lib/include/rocRoller/Operations/Operations.hpp
+++ b/shared/rocroller/lib/include/rocRoller/Operations/Operations.hpp
@@ -48,7 +48,7 @@ namespace rocRoller
         {
             Nop() {}
             template <typename... Args>
-            Nop(Args&&... i)
+            explicit Nop(Args&&... i)
             {
             }
 
@@ -63,6 +63,7 @@ namespace rocRoller
             std::unordered_set<OperationTag> operator()(Scalar const&);
             std::unordered_set<OperationTag> operator()(Literal const&);
             std::unordered_set<OperationTag> operator()(BlockScale const&);
+            std::unordered_set<OperationTag> operator()(SubTileTranspose const&);
             std::unordered_set<OperationTag> operator()(T_Load_Linear const&);
             std::unordered_set<OperationTag> operator()(T_Load_Scalar const&);
             std::unordered_set<OperationTag> operator()(T_Load_Tiled const&);
@@ -87,6 +88,7 @@ namespace rocRoller
             std::unordered_set<OperationTag> operator()(Scalar const&);
             std::unordered_set<OperationTag> operator()(Literal const&);
             std::unordered_set<OperationTag> operator()(BlockScale const&);
+            std::unordered_set<OperationTag> operator()(SubTileTranspose const&);
             std::unordered_set<OperationTag> operator()(T_Load_Linear const&);
             std::unordered_set<OperationTag> operator()(T_Load_Scalar const&);
             std::unordered_set<OperationTag> operator()(T_Load_Tiled const&);
@@ -119,6 +121,7 @@ namespace rocRoller
             std::unordered_set<OperationTag> operator()(Scalar&);
             std::unordered_set<OperationTag> operator()(Literal&);
             std::unordered_set<OperationTag> operator()(BlockScale&);
+            std::unordered_set<OperationTag> operator()(SubTileTranspose&);
             std::unordered_set<OperationTag> operator()(T_Load_Linear&);
             std::unordered_set<OperationTag> operator()(T_Load_Scalar&);
             std::unordered_set<OperationTag> operator()(T_Load_Tiled&);
@@ -146,6 +149,7 @@ namespace rocRoller
             std::string operator()(Scalar const&);
             std::string operator()(Literal const&);
             std::string operator()(BlockScale const&);
+            std::string operator()(SubTileTranspose const&);
             std::string operator()(T_Load_Linear const&);
             std::string operator()(T_Load_Scalar const&);
             std::string operator()(T_Load_Tiled const&);
@@ -175,6 +179,7 @@ namespace rocRoller
             void operator()(Scalar&);
             void operator()(Literal&);
             void operator()(BlockScale&);
+            void operator()(SubTileTranspose&);
             void operator()(T_Load_Linear&);
             void operator()(T_Load_Scalar&);
             void operator()(T_Load_Tiled&);
@@ -196,6 +201,7 @@ namespace rocRoller
             void operator()(Scalar&);
             void operator()(Literal&);
             void operator()(BlockScale&);
+            void operator()(SubTileTranspose&);
             void operator()(T_Load_Linear&);
             void operator()(T_Load_Scalar&);
             void operator()(T_Load_Tiled&);
@@ -215,6 +221,7 @@ namespace rocRoller
             rocRoller::VariableType operator()(Scalar&);
             rocRoller::VariableType operator()(Literal&);
             rocRoller::VariableType operator()(BlockScale&);
+            rocRoller::VariableType operator()(SubTileTranspose&);
             rocRoller::VariableType operator()(T_Load_Linear&);
             rocRoller::VariableType operator()(T_Load_Scalar&);
             rocRoller::VariableType operator()(T_Load_Tiled&);
diff --git a/shared/rocroller/lib/include/rocRoller/Operations/Operations_fwd.hpp b/shared/rocroller/lib/include/rocRoller/Operations/Operations_fwd.hpp
index 1bf71bdbdc4..d19529ec075 100644
--- a/shared/rocroller/lib/include/rocRoller/Operations/Operations_fwd.hpp
+++ b/shared/rocroller/lib/include/rocRoller/Operations/Operations_fwd.hpp
@@ -40,6 +40,7 @@ namespace rocRoller
         struct Scalar;
         struct Literal;
         struct BlockScale;
+        struct SubTileTranspose;
         struct T_Load_Linear;
         struct T_Load_Scalar;
         struct T_Load_Tiled;
@@ -53,6 +54,7 @@ namespace rocRoller
                                        Scalar,
                                        Literal,
                                        BlockScale,
+                                       SubTileTranspose,
                                        T_Load_Linear,
                                        T_Load_Scalar,
                                        T_Load_Tiled,
diff --git a/shared/rocroller/lib/include/rocRoller/Operations/Operations_impl.hpp b/shared/rocroller/lib/include/rocRoller/Operations/Operations_impl.hpp
index 2a2ab13efe4..0e6a5b0e949 100644
--- a/shared/rocroller/lib/include/rocRoller/Operations/Operations_impl.hpp
+++ b/shared/rocroller/lib/include/rocRoller/Operations/Operations_impl.hpp
@@ -63,6 +63,11 @@ namespace rocRoller
             return blockScale.getInputs();
         }
 
+        inline std::unordered_set<OperationTag> Inputs::operator()(SubTileTranspose const& op)
+        {
+            return op.getInputs();
+        }
+
         inline std::unordered_set<OperationTag> Inputs::operator()(T_Load_Linear const& load)
         {
             return {};
@@ -153,6 +158,11 @@ namespace rocRoller
             return {blockScale.getTag()};
         }
 
+        inline std::unordered_set<OperationTag> Outputs::operator()(SubTileTranspose const& op)
+        {
+            return {op.getTag()};
+        }
+
         inline std::unordered_set<OperationTag> Outputs::operator()(T_Load_Linear const& load)
         {
             return {load.getTag()};
@@ -290,6 +300,17 @@ namespace rocRoller
             return {blockScale.getTag()};
         }
 
+        inline std::unordered_set<OperationTag> AssignOutputs::operator()(SubTileTranspose& op)
+        {
+            if(op.getTag().uninitialized())
+            {
+                op.setTag(m_nextTagValue);
+                ++m_nextTagValue;
+            }
+
+            return {op.getTag()};
+        }
+
         inline std::unordered_set<OperationTag> AssignOutputs::operator()(T_Load_Linear& load)
         {
             if(load.getTag().uninitialized())
@@ -452,6 +473,11 @@ namespace rocRoller
             return blockScale.toString();
         }
 
+        inline std::string ToStringVisitor::operator()(SubTileTranspose const& op)
+        {
+            return op.toString();
+        }
+
         inline std::string ToStringVisitor::operator()(T_Load_Linear const& load)
         {
             return load.toString();
@@ -547,6 +573,11 @@ namespace rocRoller
             blockScale.setCommand(command);
         }
 
+        inline void SetCommand::operator()(SubTileTranspose& op)
+        {
+            op.setCommand(command);
+        }
+
         inline void SetCommand::operator()(T_Load_Linear& load)
         {
             load.setCommand(command);
@@ -608,6 +639,8 @@ namespace rocRoller
 
         inline void AllocateArguments::operator()(BlockScale& blockScale) {}
 
+        inline void AllocateArguments::operator()(SubTileTranspose&) {}
+
         inline void AllocateArguments::operator()(T_Load_Linear& load) {}
 
         inline void AllocateArguments::operator()(T_Load_Scalar& load) {}
@@ -656,6 +689,12 @@ namespace rocRoller
             return {rocRoller::DataType::None};
         }
 
+        inline rocRoller::VariableType
+            rocRoller::Operations::VariableTypeVisitor::operator()(SubTileTranspose&)
+        {
+            return {rocRoller::DataType::None};
+        }
+
         inline rocRoller::VariableType
             rocRoller::Operations::VariableTypeVisitor::operator()(T_Load_Linear& load)
         {
@@ -734,6 +773,7 @@ namespace rocRoller
         RR_OPERATION_NAME(Scalar);
         RR_OPERATION_NAME(Literal);
         RR_OPERATION_NAME(BlockScale);
+        RR_OPERATION_NAME(SubTileTranspose);
         RR_OPERATION_NAME(T_Load_Linear);
         RR_OPERATION_NAME(T_Load_Scalar);
         RR_OPERATION_NAME(T_Load_Tiled);
diff --git a/shared/rocroller/lib/include/rocRoller/Serialization/ControlGraph.hpp b/shared/rocroller/lib/include/rocRoller/Serialization/ControlGraph.hpp
index 93994c1e9e7..ff8ebe872e0 100644
--- a/shared/rocroller/lib/include/rocRoller/Serialization/ControlGraph.hpp
+++ b/shared/rocroller/lib/include/rocRoller/Serialization/ControlGraph.hpp
@@ -413,12 +413,17 @@ namespace rocRoller
             using iot = IOTraits<IO>;
             static void mapping(IO& io, KernelGraph::ControlGraph::TensorContraction& op, Context&)
             {
+                static_assert(sizeof(op) == 160);
+
                 iot::mapRequired(io, "aDims", op.aDims);
                 iot::mapRequired(io, "bDims", op.bDims);
                 iot::mapRequired(io, "scaleModeA", op.scaleModeA);
                 iot::mapRequired(io, "scaleModeB", op.scaleModeB);
                 iot::mapRequired(io, "scaleStridesA", op.scaleStridesA);
                 iot::mapRequired(io, "scaleStridesB", op.scaleStridesB);
+                iot::mapRequired(io, "scalePreShuffledTileA", op.scalePreShuffledTileA);
+                iot::mapRequired(io, "scalePreShuffledTileB", op.scalePreShuffledTileB);
+                iot::mapRequired(io, "accType", op.accType);
             }
 
             static void mapping(IO& io, KernelGraph::ControlGraph::TensorContraction& op)
diff --git a/shared/rocroller/lib/include/rocRoller/Serialization/Operations.hpp b/shared/rocroller/lib/include/rocRoller/Serialization/Operations.hpp
index b186a73d777..5084d68c992 100644
--- a/shared/rocroller/lib/include/rocRoller/Serialization/Operations.hpp
+++ b/shared/rocroller/lib/include/rocRoller/Serialization/Operations.hpp
@@ -169,6 +169,27 @@ namespace rocRoller
             }
         };
 
+        template <typename IO, typename Context>
+        struct MappingTraits<Operations::SubTileTranspose, IO, Context>
+        {
+            using TOp = Operations::SubTileTranspose;
+            using iot = IOTraits<IO>;
+
+            static void mapping(IO& io, TOp& op, Context& ctx)
+            {
+                iot::mapRequired(io, "tag", op.m_tag);
+                iot::mapRequired(io, "input", op.m_input);
+                iot::mapRequired(io, "tileDimensions", op.m_tileDimensions);
+            }
+
+            static void mapping(IO& io, TOp& val)
+            {
+                AssertFatal((std::same_as<EmptyContext, Context>));
+                Context ctx;
+                mapping(io, val, ctx);
+            }
+        };
+
         template <typename IO, typename Context>
         struct MappingTraits<Operations::T_Load_Linear, IO, Context>
         {
@@ -419,6 +440,15 @@ namespace rocRoller
             }
         };
 
+        template <>
+        struct DefaultConstruct<Operations::Operation, Operations::SubTileTranspose>
+        {
+            static Operations::Operation call()
+            {
+                return Operations::SubTileTranspose(Operations::OperationTag(-1), {});
+            }
+        };
+
         template <>
         struct DefaultConstruct<Operations::Operation, Operations::RandomNumberGenerator>
         {
diff --git a/shared/rocroller/lib/include/rocRoller/TensorDescriptor.hpp b/shared/rocroller/lib/include/rocRoller/TensorDescriptor.hpp
index d3cac75c4b3..a2676d0ea92 100644
--- a/shared/rocroller/lib/include/rocRoller/TensorDescriptor.hpp
+++ b/shared/rocroller/lib/include/rocRoller/TensorDescriptor.hpp
@@ -31,9 +31,41 @@
 #include <rocRoller/CommandSolution.hpp>
 #include <rocRoller/DataTypes/DataTypes.hpp>
 #include <rocRoller/Operations/CommandArguments.hpp>
+#include <rocRoller/Utilities/Utils.hpp>
 
 namespace rocRoller
 {
+    /**
+     * [sizeBegin,sizeEnd): A range of values representing sizes of dimensions.
+     * Returns the number of coordinates within that space,
+     * i.e. the product of those values.
+     */
+    template <typename SizeIter>
+    size_t CoordCount(SizeIter sizeBegin, SizeIter sizeEnd);
+
+    /**
+     * [sizeBegin,sizeEnd): A range of values representing sizes of dimensions.
+     * Writes into [coordBegin:coordEnd) the coordinates numbered `num` within
+     * a linearization of all coordinates with earlier values being
+     * faster-moving.
+     */
+    template <typename CoordIter, typename SizeIter>
+    void CoordNumbered(
+        size_t num, CoordIter coordBegin, CoordIter coordEnd, SizeIter sizeBegin, SizeIter sizeEnd);
+
+    /**
+     * If [coordBegin, coordEnd) represents coordinate x within the
+     * linearization, updates it to represent coordinate x+1.
+     * 
+     * If [coordBegin, coordEnd) represents the last coordinate within the
+     * linearization, it will be reset to all 0s and false will be returned.
+     */
+    template <typename CoordIter, typename SizeIter>
+    bool IncrementCoord(CoordIter coordBegin,
+                        CoordIter coordEnd,
+                        SizeIter  sizeBegin,
+                        SizeIter  sizeEnd);
+
     /*
      * Describes a tensor including dimensions, memory layout, and data type.
      * Decoupled from any particular pointer value or memory location.
@@ -41,10 +73,7 @@ namespace rocRoller
     class TensorDescriptor
     {
     public:
-        TensorDescriptor()
-        {
-            this->calculate();
-        }
+        TensorDescriptor();
 
         template <typename IterA, typename IterB>
         TensorDescriptor(DataType t,
@@ -52,201 +81,127 @@ namespace rocRoller
                          IterA    sizesEnd,
                          IterB    stridesBegin,
                          IterB    stridesEnd,
-                         size_t   offset = 0)
-            : m_sizes(sizesBegin, sizesEnd)
-            , m_strides(stridesBegin, stridesEnd)
-            , m_dataType(t)
-            , m_offset(offset)
-        {
-            this->calculate();
-        }
+                         size_t   offset = 0);
 
         template <typename Iter>
-        TensorDescriptor(DataType t, Iter sizesBegin, Iter sizesEnd, size_t offset = 0)
-            : m_sizes(sizesBegin, sizesEnd)
-            , m_dataType(t)
-            , m_offset(offset)
-        {
-            this->calculate();
-        }
-
-        /*
-        *  Allow directly specifying total number of elements instead of sizes
-        */
+        TensorDescriptor(DataType t, Iter sizesBegin, Iter sizesEnd, size_t offset = 0);
+
+        /**
+         *  Allow directly specifying total number of elements instead of sizes
+         */
         TensorDescriptor(DataType                      t,
                          size_t                        totalLogicalElements,
                          std::initializer_list<size_t> strides,
-                         size_t                        offset = 0)
-            : m_totalLogicalElements(totalLogicalElements)
-            , m_strides(strides)
-            , m_dataType(t)
-            , m_offset(offset)
-        {
-            this->calculate();
-        }
-
-        TensorDescriptor(DataType t, std::initializer_list<size_t> sizes, size_t offset = 0)
-            : m_sizes(sizes)
-            , m_dataType(t)
-            , m_offset(offset)
-
-        {
-            this->calculate();
-        }
+                         size_t                        offset = 0);
+
+        TensorDescriptor(DataType t, std::initializer_list<size_t> sizes, size_t offset = 0);
+
+        TensorDescriptor(DataType t, std::vector<size_t> sizes, size_t offset = 0);
 
         TensorDescriptor(DataType                      t,
                          std::initializer_list<size_t> sizes,
                          std::initializer_list<size_t> strides,
-                         size_t                        offset = 0)
-            : m_sizes(sizes)
-            , m_strides(strides)
-            , m_dataType(t)
-            , m_offset(offset)
-        {
-            this->calculate();
-        }
-
-        // Specialized constructor for 2-D tensor (i.e., matrix)
+                         size_t                        offset = 0);
+
+        TensorDescriptor(DataType            t,
+                         std::vector<size_t> sizes,
+                         std::vector<size_t> strides,
+                         size_t              offset = 0);
+
+        /**
+         * Specialized constructor for 2-D tensor (i.e., matrix)
+         */
         TensorDescriptor(DataType              t,
                          std::array<size_t, 2> sizes,
                          std::string const&    transpose,
-                         size_t                offset = 0)
-            : m_sizes(sizes.begin(), sizes.end())
-            , m_dataType(t)
-            , m_offset(offset)
-        {
-            if(transpose == "T")
-            {
-                m_strides = {m_sizes[1], 1u};
-            }
-            else
-            {
-                m_strides = {1u, m_sizes[0]};
-            }
-            this->calculate();
-        }
-
-        void calculate()
-        {
-            if(m_strides.size() < m_sizes.size())
-            {
-                m_strides.resize(m_sizes.size(), UseDefaultStride);
-                if(m_strides[0] == UseDefaultStride)
-                {
-                    m_strides[0] = 1;
-                }
-            }
-
-            // Calculate total number of logical elements and update strides
-            if(not m_sizes.empty())
-            {
-                m_totalLogicalElements = m_sizes[0];
-            }
-            for(int i = 1; i < m_sizes.size(); i++)
-            {
-                m_totalLogicalElements *= m_sizes[i];
-                if(m_strides[i] == UseDefaultStride)
-                {
-                    m_strides[i] = m_strides[i - 1] * m_sizes[i - 1];
-                }
-            }
-
-            // Calculate total number of allocated elements
-            if(not m_sizes.empty())
-            {
-                m_totalAllocatedElements = 1;
-                for(size_t i = 0; i < m_sizes.size(); i++)
-                    m_totalAllocatedElements += m_strides[i] * (m_sizes[i] - 1);
-            }
-            else
-            {
-                m_totalAllocatedElements = m_totalLogicalElements;
-            }
-            m_totalAllocatedElements += m_offset;
-        }
-
-        const size_t size(size_t index) const
-        {
-            return m_sizes[index];
-        }
-        const std::vector<size_t>& sizes() const
-        {
-            return m_sizes;
-        }
-        const size_t stride(size_t index) const
-        {
-            return m_strides[index];
-        }
-        const std::vector<size_t>& strides() const
-        {
-            return m_strides;
-        }
-
-        size_t offset() const
-        {
-            return m_offset;
-        }
-
-        size_t dimensions() const
-        {
-            return m_sizes.size();
-        }
-        size_t totalLogicalElements() const
-        {
-            return m_totalLogicalElements;
-        }
-        size_t totalAllocatedElements() const
-        {
-            return m_totalAllocatedElements;
-        }
-        size_t elementBytes() const
-        {
-            return DataTypeInfo::Get(m_dataType).elementBytes;
-        }
-
-        DataType dataType() const
-        {
-            return m_dataType;
-        }
-
-        bool operator==(const TensorDescriptor& rhs) const
-        {
-            return m_dataType == rhs.m_dataType && m_sizes == rhs.m_sizes
-                   && m_strides == rhs.m_strides && m_offset == rhs.m_offset;
-        }
-
-        bool operator!=(const TensorDescriptor& rhs) const
-        {
-            return !(*this == rhs);
-        }
-
-        std::string toString() const
-        {
-            std::ostringstream result;
-
-            auto join = [&](std::vector<size_t> const& items) {
-                if(items.empty())
-                    return;
-
-                result << "(";
-                auto last_item = std::prev(items.end());
-                for(auto iter = items.begin(); iter != last_item; iter++)
-                    result << *iter << ",";
-                result << *last_item << ")";
-            };
-
-            result << dimensions() << "-tensor<" << dataType() << "> ";
-            join(m_sizes);
-            join(m_strides);
-            result << " offset: " << m_offset;
-            return result.str();
-        }
+                         size_t                offset = 0);
+
+        static TensorDescriptor
+            ShuffledNoPadding(DataType t, std::vector<size_t> sizes, std::vector<size_t> dimOrder);
+
+        static TensorDescriptor ShuffledNoPadding(DataType                      t,
+                                                  std::initializer_list<size_t> sizes,
+                                                  std::vector<size_t>           dimOrder);
+
+        static TensorDescriptor ShuffledNoPadding(DataType                      t,
+                                                  std::vector<size_t>           sizes,
+                                                  std::initializer_list<size_t> dimOrder);
+
+        static TensorDescriptor ShuffledNoPadding(DataType                      t,
+                                                  std::initializer_list<size_t> sizes,
+                                                  std::initializer_list<size_t> dimOrder);
+
+        inline void calculate();
+
+        const size_t size(size_t index) const;
+
+        const std::vector<size_t>& sizes() const;
+
+        const size_t stride(size_t index) const;
+
+        const std::vector<size_t>& strides() const;
+
+        size_t offset() const;
+
+        size_t dimensions() const;
+
+        size_t totalLogicalElements() const;
+
+        size_t totalAllocatedElements() const;
+
+        size_t totalAllocatedBytes() const;
+
+        size_t elementBytes() const;
+
+        /**
+         * Returns the number of elements of padding in the given dimension (0 if
+         * unpadded). May be negative if stride is less than size
+         */
+        int64_t dimensionPadding(size_t dim) const;
+
+        /**
+         * Collapses dimensions in the interval [begin, end).
+         *
+         * preconditions:
+         * - end >= begin
+         * - begin < dimensions()
+         * - end <= dimensions()
+         * - dimensions in the interval [begin, end-1) are not padded.
+         *
+         * postconditions:
+         * - dimensions() is diminished by end-begin
+         * - total elements (allocated and logical) remain the same
+         * - dimension 'begin' is the product of all the dimensions in the interval
+         * [begin, end).
+         */
+        void collapseDims(size_t begin, size_t end);
+
+        DataType dataType() const;
+
+        bool operator==(const TensorDescriptor& rhs) const;
+
+        bool operator!=(const TensorDescriptor& rhs) const;
+
+        std::string toString() const;
+
+        template <typename Container>
+        inline size_t index(Container const& indices) const;
+
+        template <typename T>
+        inline size_t index(std::initializer_list<T> indices) const;
+
+        template <std::integral... Ts>
+        inline size_t index(Ts... is) const;
+
+        inline bool incrementCoord(std::vector<size_t>& coord, size_t firstDimension = 0) const;
+
+        TensorDescriptor withNormalizedDimensions() const;
 
         friend std::ostream& operator<<(std::ostream& stream, const TensorDescriptor& t);
 
-    private:
         static inline const size_t UseDefaultStride = -1;
 
+    private:
         std::vector<size_t> m_sizes;
         std::vector<size_t> m_strides;
         size_t              m_offset = 0;
@@ -261,17 +216,24 @@ namespace rocRoller
     inline void setCommandTensorArg(rocRoller::CommandArguments&               commandArgs,
                                     rocRoller::Operations::OperationTag const& tag,
                                     TensorDescriptor&                          desc,
-                                    T                                          value)
-    {
-        commandArgs.setArgument(tag, ArgumentType::Value, value);
-        commandArgs.setArgument(tag, ArgumentType::Limit, desc.totalLogicalElements());
-
-        auto const& sizes = desc.sizes();
-        for(size_t i = 0; i < sizes.size(); i++)
-            commandArgs.setArgument(tag, ArgumentType::Size, i, sizes[i]);
-
-        auto const& strides = desc.strides();
-        for(size_t i = 0; i < strides.size(); i++)
-            commandArgs.setArgument(tag, ArgumentType::Stride, i, (size_t)strides[i]);
-    }
+                                    T                                          value);
+
+    template <typename T>
+    std::string writeTensor(std::vector<T> const& data, TensorDescriptor desc);
+
+    /**
+     * `dst` and `src` must be two TensorDescriptors with the same data type
+     * and dimension sizes (i.e. dst.sizes() == src.sizes()). They should have
+     * different strides (or this function is a no-op).
+     * 
+     * `input` must contain data arranged according to `src`.
+     * 
+     * Returns `input` rearranged according to the strides in `dst`.
+     */
+    template <typename T>
+    inline std::vector<T> shuffleDims(std::vector<T> const&   input,
+                                      TensorDescriptor const& dst,
+                                      TensorDescriptor const& src);
 }
+
+#include "TensorDescriptor_impl.hpp"
diff --git a/shared/rocroller/lib/include/rocRoller/TensorDescriptor_impl.hpp b/shared/rocroller/lib/include/rocRoller/TensorDescriptor_impl.hpp
new file mode 100644
index 00000000000..9b7f4ce06e7
--- /dev/null
+++ b/shared/rocroller/lib/include/rocRoller/TensorDescriptor_impl.hpp
@@ -0,0 +1,539 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright 2024-2025 AMD ROCm(TM) Software
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#pragma once
+
+#include <numeric>
+
+#include <rocRoller/TensorDescriptor.hpp>
+
+#include <rocRoller/CommandSolution.hpp>
+#include <rocRoller/DataTypes/DataTypes.hpp>
+#include <rocRoller/Operations/CommandArguments.hpp>
+#include <rocRoller/Utilities/Utils.hpp>
+
+namespace rocRoller
+{
+    template <typename SizeIter>
+    inline size_t CoordCount(SizeIter sizeBegin, SizeIter sizeEnd)
+    {
+        size_t rv = 1;
+
+        while(sizeBegin != sizeEnd)
+        {
+            rv *= *sizeBegin;
+            sizeBegin++;
+        }
+
+        return rv;
+    }
+
+    template <typename CoordIter, typename SizeIter>
+    inline void CoordNumbered(
+        size_t num, CoordIter coordBegin, CoordIter coordEnd, SizeIter sizeBegin, SizeIter sizeEnd)
+    {
+        auto coord = coordBegin;
+        auto size  = sizeBegin;
+
+        while(coord != coordEnd && size != sizeEnd)
+        {
+            *coord = num % *size;
+            num /= *size;
+
+            coord++;
+            size++;
+        }
+
+        if(coord != coordEnd || size != sizeEnd)
+            throw std::runtime_error("Inconsistent size of coordinates.");
+    }
+
+    template <typename CoordIter, typename SizeIter>
+    inline bool IncrementCoord(CoordIter coordBegin,
+                               CoordIter coordEnd,
+                               SizeIter  sizeBegin,
+                               SizeIter  sizeEnd)
+    {
+        auto coord = coordBegin;
+        auto size  = sizeBegin;
+
+        while(coord != coordEnd)
+        {
+            (*coord)++;
+            if(*coord < *size)
+                return true;
+
+            *coord = 0;
+
+            coord++;
+            size++;
+        }
+
+        return false;
+    }
+
+    inline TensorDescriptor::TensorDescriptor()
+    {
+        this->calculate();
+    }
+
+    template <typename IterA, typename IterB>
+    TensorDescriptor::TensorDescriptor(DataType t,
+                                       IterA    sizesBegin,
+                                       IterA    sizesEnd,
+                                       IterB    stridesBegin,
+                                       IterB    stridesEnd,
+                                       size_t   offset)
+        : m_sizes(sizesBegin, sizesEnd)
+        , m_strides(stridesBegin, stridesEnd)
+        , m_dataType(t)
+        , m_offset(offset)
+    {
+        this->calculate();
+    }
+
+    template <typename Iter>
+    TensorDescriptor::TensorDescriptor(DataType t, Iter sizesBegin, Iter sizesEnd, size_t offset)
+        : m_sizes(sizesBegin, sizesEnd)
+        , m_dataType(t)
+        , m_offset(offset)
+    {
+        this->calculate();
+    }
+
+    inline TensorDescriptor::TensorDescriptor(DataType                      t,
+                                              size_t                        totalLogicalElements,
+                                              std::initializer_list<size_t> strides,
+                                              size_t                        offset)
+        : m_totalLogicalElements(totalLogicalElements)
+        , m_strides(strides)
+        , m_dataType(t)
+        , m_offset(offset)
+    {
+        this->calculate();
+    }
+
+    inline TensorDescriptor::TensorDescriptor(DataType                      t,
+                                              std::initializer_list<size_t> sizes,
+                                              size_t                        offset)
+        : m_sizes(sizes)
+        , m_dataType(t)
+        , m_offset(offset)
+
+    {
+        this->calculate();
+    }
+
+    inline TensorDescriptor::TensorDescriptor(DataType t, std::vector<size_t> sizes, size_t offset)
+        : m_sizes(std::move(sizes))
+        , m_dataType(t)
+        , m_offset(offset)
+    {
+        this->calculate();
+    }
+
+    inline TensorDescriptor::TensorDescriptor(DataType                      t,
+                                              std::initializer_list<size_t> sizes,
+                                              std::initializer_list<size_t> strides,
+                                              size_t                        offset)
+        : m_sizes(sizes)
+        , m_strides(strides)
+        , m_dataType(t)
+        , m_offset(offset)
+    {
+        this->calculate();
+    }
+
+    inline TensorDescriptor::TensorDescriptor(DataType            t,
+                                              std::vector<size_t> sizes,
+                                              std::vector<size_t> strides,
+                                              size_t              offset)
+        : m_sizes(std::move(sizes))
+        , m_strides(std::move(strides))
+        , m_dataType(t)
+        , m_offset(offset)
+    {
+        this->calculate();
+    }
+
+    // Specialized constructor for 2-D tensor (i.e., matrix)
+    inline TensorDescriptor::TensorDescriptor(DataType              t,
+                                              std::array<size_t, 2> sizes,
+                                              std::string const&    transpose,
+                                              size_t                offset)
+        : m_sizes(sizes.begin(), sizes.end())
+        , m_dataType(t)
+        , m_offset(offset)
+    {
+        if(transpose == "T")
+        {
+            m_strides = {m_sizes[1], 1u};
+        }
+        else
+        {
+            m_strides = {1u, m_sizes[0]};
+        }
+        this->calculate();
+    }
+
+    inline TensorDescriptor TensorDescriptor::ShuffledNoPadding(DataType            t,
+                                                                std::vector<size_t> sizes,
+                                                                std::vector<size_t> dimOrder)
+    {
+        AssertFatal(sizes.size() == dimOrder.size(), ShowValue(sizes), ShowValue(dimOrder));
+
+        std::vector<size_t> strides(sizes.size(), 0);
+
+        size_t stride = 1;
+        for(auto idx : dimOrder)
+        {
+            strides.at(idx) = stride;
+            stride *= sizes.at(idx);
+        }
+
+        return TensorDescriptor(t, std::move(sizes), std::move(strides));
+    }
+
+    inline TensorDescriptor TensorDescriptor::ShuffledNoPadding(DataType                      t,
+                                                                std::initializer_list<size_t> sizes,
+                                                                std::vector<size_t> dimOrder)
+    {
+        std::vector<size_t> theSizes(std::move(sizes));
+        return ShuffledNoPadding(t, std::move(theSizes), std::move(dimOrder));
+    }
+
+    inline TensorDescriptor TensorDescriptor::ShuffledNoPadding(
+        DataType t, std::vector<size_t> sizes, std::initializer_list<size_t> dimOrder)
+    {
+        std::vector<size_t> theDimOrder(std::move(dimOrder));
+        return ShuffledNoPadding(t, std::move(sizes), std::move(theDimOrder));
+    }
+
+    inline TensorDescriptor TensorDescriptor::ShuffledNoPadding(
+        DataType t, std::initializer_list<size_t> sizes, std::initializer_list<size_t> dimOrder)
+    {
+        std::vector<size_t> theSizes(std::move(sizes));
+        std::vector<size_t> theDimOrder(std::move(dimOrder));
+
+        return ShuffledNoPadding(t, std::move(theSizes), std::move(theDimOrder));
+    }
+
+    inline void TensorDescriptor::calculate()
+    {
+        if(m_sizes.empty())
+        {
+            m_strides                = m_sizes;
+            m_totalLogicalElements   = 0;
+            m_totalAllocatedElements = 0;
+            return;
+        }
+
+        m_strides.resize(m_sizes.size(), UseDefaultStride);
+        if(m_strides[0] == UseDefaultStride)
+        {
+            m_strides[0] = 1;
+        }
+        m_totalLogicalElements = m_sizes[0];
+
+        for(int i = 1; i < m_sizes.size(); i++)
+        {
+            m_totalLogicalElements *= m_sizes[i];
+
+            if(m_strides[i] == UseDefaultStride)
+            {
+                m_strides[i] = m_strides[i - 1] * m_sizes[i - 1];
+            }
+        }
+
+        m_totalAllocatedElements = 1;
+        for(int i = 0; i < m_sizes.size(); i++)
+            m_totalAllocatedElements += m_strides[i] * (m_sizes[i] - 1);
+
+        m_totalAllocatedElements += m_offset;
+    }
+
+    inline const size_t TensorDescriptor::size(size_t index) const
+    {
+        return m_sizes[index];
+    }
+    inline const std::vector<size_t>& TensorDescriptor::sizes() const
+    {
+        return m_sizes;
+    }
+    inline const size_t TensorDescriptor::stride(size_t index) const
+    {
+        return m_strides[index];
+    }
+    inline const std::vector<size_t>& TensorDescriptor::strides() const
+    {
+        return m_strides;
+    }
+
+    inline size_t TensorDescriptor::offset() const
+    {
+        return m_offset;
+    }
+
+    inline size_t TensorDescriptor::dimensions() const
+    {
+        return m_sizes.size();
+    }
+    inline size_t TensorDescriptor::totalLogicalElements() const
+    {
+        return m_totalLogicalElements;
+    }
+    inline size_t TensorDescriptor::totalAllocatedElements() const
+    {
+        return m_totalAllocatedElements;
+    }
+    inline size_t TensorDescriptor::totalAllocatedBytes() const
+    {
+        return totalAllocatedElements() * elementBytes();
+    }
+    inline size_t TensorDescriptor::elementBytes() const
+    {
+        return DataTypeInfo::Get(m_dataType).elementBytes;
+    }
+
+    inline DataType TensorDescriptor::dataType() const
+    {
+        return m_dataType;
+    }
+
+    inline bool TensorDescriptor::operator==(const TensorDescriptor& rhs) const
+    {
+        return m_dataType == rhs.m_dataType && m_sizes == rhs.m_sizes && m_strides == rhs.m_strides
+               && m_offset == rhs.m_offset;
+    }
+
+    inline bool TensorDescriptor::operator!=(const TensorDescriptor& rhs) const
+    {
+        return !(*this == rhs);
+    }
+
+    inline std::string TensorDescriptor::toString() const
+    {
+        std::ostringstream result;
+
+        auto join = [&](std::vector<size_t> const& items) {
+            if(items.empty())
+                return;
+
+            result << "(";
+            auto last_item = std::prev(items.end());
+            for(auto iter = items.begin(); iter != last_item; iter++)
+                result << *iter << ",";
+            result << *last_item << ")";
+        };
+
+        result << dimensions() << "-tensor<" << dataType() << "> ";
+        join(m_sizes);
+        join(m_strides);
+        result << " offset: " << m_offset;
+        return result.str();
+    }
+
+    template <typename Container>
+    inline size_t TensorDescriptor::index(Container const& indices) const
+    {
+        if(indices.size() != dimensions())
+            throw std::runtime_error("Incorrect number of indices.");
+
+        for(int i = 0; i < indices.size(); i++)
+            if(indices[i] >= m_sizes[i])
+                throw std::runtime_error("Index out of bounds.");
+
+        return std::inner_product(indices.begin(), indices.end(), m_strides.begin(), m_offset);
+    }
+
+    template <typename T>
+    inline size_t TensorDescriptor::index(std::initializer_list<T> indices) const
+    {
+        if(indices.size() != dimensions())
+            throw std::runtime_error("Incorrect number of indices.");
+
+        for(auto i = std::make_pair(indices.begin(), m_sizes.begin()); i.first != indices.end();
+            i.first++, i.second++)
+            if(*i.first >= *i.second)
+                throw std::runtime_error("Index out of bounds.");
+
+        return std::inner_product(indices.begin(), indices.end(), m_strides.begin(), m_offset);
+    }
+
+    template <std::integral... Ts>
+    inline size_t TensorDescriptor::index(Ts... is) const
+    {
+        return this->index({is...});
+    }
+
+    inline bool TensorDescriptor::incrementCoord(std::vector<size_t>& coord,
+                                                 size_t               firstDimension) const
+    {
+        if(coord.size() != dimensions())
+            throw std::runtime_error(concatenate(
+                "Invalid coordinate size ", coord.size(), " for ", dimensions(), "-tensor"));
+
+        if(firstDimension >= dimensions())
+            return false;
+
+        return IncrementCoord(
+            coord.begin() + firstDimension, coord.end(), m_sizes.begin(), m_sizes.end());
+    }
+
+    inline TensorDescriptor TensorDescriptor::withNormalizedDimensions() const
+    {
+        auto dims = iota<size_t>(0, dimensions()).to<std::vector>();
+
+        std::ranges::sort(dims, [this](size_t a, size_t b) { return m_strides[a] < m_strides[b]; });
+
+        std::vector<size_t> sizes, strides;
+        sizes.reserve(dimensions());
+        strides.reserve(dimensions());
+
+        for(auto dim : dims)
+        {
+            sizes.push_back(m_sizes[dim]);
+            strides.push_back(m_strides[dim]);
+        }
+
+        return TensorDescriptor(m_dataType, std::move(sizes), std::move(strides));
+    }
+
+    template <CCommandArgumentValue T>
+    inline void setCommandTensorArg(rocRoller::CommandArguments&               commandArgs,
+                                    rocRoller::Operations::OperationTag const& tag,
+                                    TensorDescriptor&                          desc,
+                                    T                                          value)
+    {
+        commandArgs.setArgument(tag, ArgumentType::Value, value);
+        commandArgs.setArgument(tag, ArgumentType::Limit, desc.totalLogicalElements());
+
+        auto const& sizes = desc.sizes();
+        for(size_t i = 0; i < sizes.size(); i++)
+            commandArgs.setArgument(tag, ArgumentType::Size, i, sizes[i]);
+
+        auto const& strides = desc.strides();
+        for(size_t i = 0; i < strides.size(); i++)
+            commandArgs.setArgument(tag, ArgumentType::Stride, i, (size_t)strides[i]);
+    }
+
+    template <typename T>
+    std::string writeTensor(std::vector<T> const& data, TensorDescriptor desc)
+    {
+        std::string rv = desc.toString() + "\n";
+
+        auto const& sizes = desc.sizes();
+        auto        count = CoordCount(sizes.begin(), std::prev(sizes.end()));
+
+        std::vector<size_t> prevCoord(desc.dimensions(), 0);
+        for(size_t coordNum = 0; coordNum < count; coordNum++)
+        {
+            std::vector<size_t> coord(desc.dimensions(), 0);
+            CoordNumbered(coordNum,
+                          coord.begin(),
+                          std::prev(coord.end()),
+                          sizes.begin(),
+                          std::prev(sizes.end()));
+
+            for(coord.back() = 0; coord.back() < sizes.back(); coord.back()++)
+            {
+                auto idx = desc.index(coord);
+
+                if(coord.back() > 0)
+                    rv += " ";
+                else
+                    rv += fmt::format("{: >8}| ", idx);
+
+                rv += fmt::format("{: >8}", data.at(idx));
+            }
+
+            rv += "\n";
+
+            bool newDim = false;
+            for(int idx = 0; idx < desc.dimensions(); idx++)
+            {
+                if(coord[idx] < prevCoord[idx])
+                {
+                    newDim = true;
+                    break;
+                }
+            }
+
+            if(newDim)
+            {
+                rv += "\n";
+                bool first = true;
+                for(auto dim : coord)
+                {
+                    if(!first)
+                        rv += ", ";
+                    rv += std::to_string(dim);
+                    first = false;
+                }
+            }
+
+            prevCoord = std::move(coord);
+        }
+
+        return rv;
+    }
+
+    template <typename T>
+    inline std::vector<T> shuffleDims(std::vector<T> const&   input,
+                                      TensorDescriptor const& dst,
+                                      TensorDescriptor const& src)
+    {
+        AssertFatal(dst.dimensions() > 1, ShowValue(dst.dimensions()));
+        AssertFatal(dst.sizes() == src.sizes(), ShowValue(dst.sizes()), ShowValue(src.sizes()));
+        AssertFatal(dst.dataType() == src.dataType());
+
+        auto const& sizes = dst.sizes();
+
+        std::vector<T> rv(input.size());
+
+        auto count = CoordCount(sizes.begin(), std::prev(sizes.end()));
+#pragma omp parallel for
+        for(size_t coordNum = 0; coordNum < count; coordNum++)
+        {
+            std::vector<size_t> coord(dst.dimensions(), 0);
+            CoordNumbered(coordNum,
+                          coord.begin(),
+                          std::prev(coord.end()),
+                          sizes.begin(),
+                          std::prev(sizes.end()));
+
+            for(coord.back() = 0; coord.back() < sizes.back(); coord.back()++)
+            {
+                auto dstIdx = dst.index(coord);
+                auto srcIdx = src.index(coord);
+
+                rv.at(dstIdx) = input.at(srcIdx);
+            }
+        }
+
+        return rv;
+    }
+}
diff --git a/shared/rocroller/lib/source/CMakeLists.txt b/shared/rocroller/lib/source/CMakeLists.txt
index e8bce2fc109..275b7985736 100644
--- a/shared/rocroller/lib/source/CMakeLists.txt
+++ b/shared/rocroller/lib/source/CMakeLists.txt
@@ -38,6 +38,7 @@ target_sources(rocroller
         "${CMAKE_CURRENT_SOURCE_DIR}/ReplaceKernelArgs.cpp"
         "${CMAKE_CURRENT_SOURCE_DIR}/ScheduledInstructions.cpp"
         "${CMAKE_CURRENT_SOURCE_DIR}/Scheduler.cpp"
+        "${CMAKE_CURRENT_SOURCE_DIR}/TensorDescriptor.cpp"
         "${CMAKE_CURRENT_SOURCE_DIR}/Timer.cpp"
 )
 
diff --git a/shared/rocroller/lib/source/CommandSolution.cpp b/shared/rocroller/lib/source/CommandSolution.cpp
index 50d2eed23f0..3474aefbb16 100644
--- a/shared/rocroller/lib/source/CommandSolution.cpp
+++ b/shared/rocroller/lib/source/CommandSolution.cpp
@@ -318,7 +318,7 @@ namespace rocRoller
         if(!m_context->kernelOptions()->lazyAddArguments)
             m_context->kernel()->addCommandArguments(m_command->getArguments());
 
-        auto kernelGraph = KernelGraph::translate(m_command);
+        auto kernelGraph = KernelGraph::translate(m_command, m_commandParameters);
 
         if(Settings::getInstance()->get(Settings::LogGraphs))
             Log::debug("CommandKernel::generateKernel: post translate: {}",
diff --git a/shared/rocroller/lib/source/KernelGraph/LowerFromCommand.cpp b/shared/rocroller/lib/source/KernelGraph/LowerFromCommand.cpp
index e13018b66f7..8da333db67e 100644
--- a/shared/rocroller/lib/source/KernelGraph/LowerFromCommand.cpp
+++ b/shared/rocroller/lib/source/KernelGraph/LowerFromCommand.cpp
@@ -31,6 +31,7 @@
 #include <variant>
 #include <vector>
 
+#include <rocRoller/CommandSolution.hpp>
 #include <rocRoller/Expression.hpp>
 #include <rocRoller/KernelGraph/KernelGraph.hpp>
 #include <rocRoller/Operations/BlockScale.hpp>
@@ -656,23 +657,62 @@ namespace rocRoller
                     = std::visit(getBlockParams, *bSource);
 
                 // contraction dims are {1} and {0}, which is matrix multiplication
-                auto TC            = m_graph.control.addElement(contraction);
+                auto TC            = m_graph.control.addElement(NOP{});
                 m_op[mul.getTag()] = TC;
                 m_graph.mapper.connect(TC, D, NaryArgument::DEST);
 
                 std::vector<int> sourceDims;
 
                 auto connectBlockScale = [&](Operations::BlockScale const& op,
+                                             Operations::OperationTag      inputTag,
                                              NaryArgument                  valueArg,
-                                             NaryArgument                  scaleArg) {
+                                             NaryArgument scaleArg) -> std::vector<size_t> {
                     auto mode = op.scaleMode();
                     AssertFatal(mode != Operations::ScaleMode::Inline, ShowValue(mode));
 
+                    auto scaleInput   = op.scale().value();
+                    auto scaleInputOp = m_command->findTag(scaleInput);
+                    AssertFatal(scaleInputOp != nullptr);
+
+                    using TensorAndTranspose
+                        = std::tuple<Operations::OperationTag, std::vector<size_t>>;
+                    auto getTensorAndTranspose = rocRoller::overloaded{
+                        [](Operations::SubTileTranspose const& op) -> TensorAndTranspose {
+                            return {op.input(), op.tileDimensions()};
+                        },
+                        [](Operations::Nop const& op) -> TensorAndTranspose {
+                            return {Operations::OperationTag(), {}};
+                        },
+                        [](auto const& op) -> TensorAndTranspose {
+                            return {op.getTag(), {}};
+                        }};
+
+                    auto [scaleTensor, scaleTranspose]
+                        = std::visit(getTensorAndTranspose, *scaleInputOp);
+
+                    if(!scaleTranspose.empty())
+                    {
+                        AssertFatal(m_params);
+                        auto info = m_params->getDimensionInfo().at(scaleTensor);
+                        auto tile
+                            = std::get<rocRoller::KernelGraph::CoordinateGraph::MacroTile>(info);
+                        size_t miKScale = tile.miTileSizes.at(2);
+
+                        std::vector<size_t> expectedTile{64, 4, miKScale};
+
+                        AssertFatal(scaleTranspose == expectedTile,
+                                    ShowValue(scaleTranspose),
+                                    ShowValue(expectedTile),
+                                    ShowValue(valueArg),
+                                    ShowValue(tile),
+                                    ShowValue(tile.miTileSizes));
+                    }
+
                     auto X      = m_dim.at(op.data());
-                    auto XScale = m_dim.at(*op.scale());
+                    auto XScale = m_dim.at(scaleTensor);
 
                     auto loadX      = m_op.at(op.data());
-                    auto loadXScale = m_op.at(*op.scale());
+                    auto loadXScale = m_op.at(scaleTensor);
 
                     m_graph.control.addElement(Sequence(), {loadX}, {TC});
                     m_graph.control.addElement(Sequence(), {loadXScale}, {TC});
@@ -681,49 +721,52 @@ namespace rocRoller
                     m_graph.mapper.connect(TC, XScale, scaleArg);
 
                     sourceDims.insert(sourceDims.end(), {X, XScale});
-                };
-
-                // Handle A, either T_Load_Tiled or BlockScale
-                std::visit(
-                    rocRoller::overloaded{
-                        [&](auto const& op, auto, auto) { Throw<FatalError>("Can't go here!"); },
-
-                        [&](Operations::T_Load_Tiled const& op, NaryArgument valueArg, auto) {
-                            // This is difficult to make common with the B version since it needs to directly access mul.a or mul.b.
-                            auto A     = m_dim.at(mul.a);
-                            auto loadA = m_op.at(mul.a);
 
-                            sourceDims.push_back(A);
+                    return scaleTranspose;
+                };
 
-                            m_graph.control.addElement(Sequence(), {loadA}, {TC});
+                auto handleInput = rocRoller::overloaded{
+                    [&](auto const& op, auto, auto, auto) -> std::vector<size_t> {
+                        Throw<FatalError>("Can't go here!");
+                        return {};
+                    },
 
-                            m_graph.mapper.connect(TC, A, valueArg);
-                        },
-                        connectBlockScale},
-                    *aSource,
-                    singleVariant(NaryArgument::LHS),
-                    singleVariant(NaryArgument::LHS_SCALE));
+                    [&](Operations::T_Load_Tiled const& op,
+                        Operations::OperationTag        inputTag,
+                        NaryArgument                    valueArg,
+                        auto) -> std::vector<size_t> {
+                        auto AB     = m_dim.at(inputTag);
+                        auto loadAB = m_op.at(inputTag);
 
-                std::visit(
-                    rocRoller::overloaded{
-                        [&](auto const& op, auto, auto) { Throw<FatalError>("Can't go here!"); },
+                        sourceDims.push_back(AB);
 
-                        [&](Operations::T_Load_Tiled const& op, NaryArgument valueArg, auto) {
-                            auto B     = m_dim.at(mul.b);
-                            auto loadB = m_op.at(mul.b);
+                        m_graph.control.addElement(Sequence(), {loadAB}, {TC});
 
-                            sourceDims.push_back(B);
+                        m_graph.mapper.connect(TC, AB, valueArg);
 
-                            m_graph.control.addElement(Sequence(), {loadB}, {TC});
+                        return {};
+                    },
+                    connectBlockScale};
 
-                            m_graph.mapper.connect(TC, B, valueArg);
-                        },
-                        connectBlockScale},
-                    *bSource,
-                    singleVariant(NaryArgument::RHS),
-                    singleVariant(NaryArgument::RHS_SCALE));
+                // Handle A, either T_Load_Tiled or BlockScale
+                contraction.scalePreShuffledTileA
+                    = std::visit(handleInput,
+                                 *aSource,
+                                 singleVariant(mul.a),
+                                 singleVariant(NaryArgument::LHS),
+                                 singleVariant(NaryArgument::LHS_SCALE));
+
+                contraction.scalePreShuffledTileB
+                    = std::visit(handleInput,
+                                 *bSource,
+                                 singleVariant(mul.b),
+                                 singleVariant(NaryArgument::RHS),
+                                 singleVariant(NaryArgument::RHS_SCALE));
 
                 m_graph.coordinates.addElement(DataFlow(), sourceDims, std::vector{D});
+
+                // Replace contraction in graph after we have filled all the fields.
+                m_graph.control.setElement(TC, std::move(contraction));
             }
 
             void operator()(Operations::BlockScale const& t)
@@ -732,6 +775,8 @@ namespace rocRoller
                             "ScaleMode::Inline not supported yet.");
             }
 
+            void operator()(Operations::SubTileTranspose const& t) {}
+
             void operator()(Operations::Literal const& literal)
             {
                 rocRoller::Log::getLogger()->debug("KernelGraph::TranslateVisitor::Literal");
@@ -745,9 +790,10 @@ namespace rocRoller
             void operator()(Operations::Tensor const& t) {}
             void operator()(Operations::Scalar const& t) {}
 
-            KernelGraph call(CommandPtr command)
+            KernelGraph call(CommandPtr command, CommandParametersPtr params)
             {
                 m_command = command;
+                m_params  = params;
                 for(auto const& op : command->operations())
                 {
                     std::visit(*this, *op);
@@ -770,16 +816,17 @@ namespace rocRoller
             // command tag -> dimension/coordinate tag
             std::map<Operations::OperationTag, int> m_dim;
 
-            CommandPtr m_command;
+            CommandPtr           m_command;
+            CommandParametersPtr m_params;
         };
 
-        KernelGraph translate(CommandPtr command)
+        KernelGraph translate(CommandPtr command, CommandParametersPtr params)
         {
             TIMER(t, "KernelGraph::translate");
             rocRoller::Log::getLogger()->debug("KernelGraph::translate(); Command\n{}",
                                                command->toString());
             TranslateVisitor visitor;
-            return visitor.call(command);
+            return visitor.call(command, params);
         }
     }
 }
diff --git a/shared/rocroller/lib/source/KernelGraph/Transformations/LowerTensorContraction.cpp b/shared/rocroller/lib/source/KernelGraph/Transformations/LowerTensorContraction.cpp
index 4ae26952015..d0a1f6d50cc 100644
--- a/shared/rocroller/lib/source/KernelGraph/Transformations/LowerTensorContraction.cpp
+++ b/shared/rocroller/lib/source/KernelGraph/Transformations/LowerTensorContraction.cpp
@@ -28,6 +28,7 @@
 #include <rocRoller/KernelGraph/Transforms/LowerTensorContraction.hpp>
 #include <rocRoller/KernelGraph/Utils.hpp>
 #include <rocRoller/KernelGraph/Visitors.hpp>
+#include <rocRoller/KernelOptions_detail.hpp>
 #include <rocRoller/Operations/Command.hpp>
 #include <rocRoller/Operations/Operations.hpp>
 
@@ -459,6 +460,30 @@ namespace rocRoller
             auto scaleModeA = getScaleMode(info.loadAScale);
             auto scaleModeB = getScaleMode(info.loadBScale);
 
+            {
+                auto expectedSkipValue = false;
+
+                auto contraction = graph.control.get<TensorContraction>(tag).value();
+
+                if(!contraction.scalePreShuffledTileA.empty())
+                {
+                    AssertFatal(
+                        scaleModeA == Operations::ScaleMode::Separate
+                            && scaleModeB == Operations::ScaleMode::Separate
+                            && !contraction.scalePreShuffledTileB.empty(),
+                        "Pre-swizzled inputs must currently be for both A and B or neither.",
+                        ShowValue(scaleModeA),
+                        ShowValue(scaleModeB),
+                        ShowValue(contraction.scalePreShuffledTileB));
+
+                    expectedSkipValue = true;
+                }
+
+                AssertFatal(context->kernelOptions()->scaleSkipPermlane == expectedSkipValue,
+                            ShowValue(context->kernelOptions()->scaleSkipPermlane),
+                            ShowValue(expectedSkipValue));
+            }
+
             auto accumulationCoordSize = getAccumulationLoopSize(graph, a, info.userA);
 
             auto [K, forK] = rangeFor(graph, accumulationCoordSize, rocRoller::KLOOP);
diff --git a/shared/rocroller/lib/source/Operations/BlockScale.cpp b/shared/rocroller/lib/source/Operations/BlockScale.cpp
index baa915f3117..6e64f3cae90 100644
--- a/shared/rocroller/lib/source/Operations/BlockScale.cpp
+++ b/shared/rocroller/lib/source/Operations/BlockScale.cpp
@@ -35,21 +35,26 @@ namespace rocRoller
         BlockScale::BlockScale(OperationTag                data,
                                int                         dimensions,
                                std::optional<OperationTag> scale,
-                               std::vector<size_t> const&  strides)
+                               std::vector<size_t>         strides)
             : BaseOperation()
             , m_data(data)
             , m_scale(scale)
-            , m_strides([&]() {
-                if(dimensions >= 1)
-                {
-                    std::vector<size_t> rt(dimensions, 1);
-                    rt[0] = 32; // Default value for first stride based on hardware arch
-                    std::copy(strides.begin(), strides.end(), rt.begin());
-                    return rt;
-                }
-                return std::vector<size_t>{};
-            }())
         {
+            if(!strides.empty())
+            {
+                m_strides = std::move(strides);
+            }
+
+            if(m_strides.empty())
+            {
+                // Default value for first stride based on hardware arch
+                m_strides.push_back(32);
+            }
+
+            if(m_strides.size() != dimensions)
+            {
+                m_strides.resize(dimensions, 1);
+            }
         }
 
         std::unordered_set<OperationTag> BlockScale::getInputs() const
@@ -125,5 +130,42 @@ namespace rocRoller
         {
             return stream << toString(mode);
         }
+
+        SubTileTranspose::SubTileTranspose(OperationTag input, std::vector<size_t> tileDimensions)
+            : m_input(input)
+            , m_tileDimensions(std::move(tileDimensions))
+        {
+            if(!m_tileDimensions.empty())
+            {
+                AssertFatal(m_tileDimensions.size() == 3, ShowValue(m_tileDimensions));
+                AssertFatal(m_tileDimensions[0] == 64 && m_tileDimensions[1] == 4
+                                && (m_tileDimensions[2] == 2 || m_tileDimensions[2] == 4),
+                            ShowValue(m_tileDimensions));
+            }
+        }
+
+        std::unordered_set<OperationTag> SubTileTranspose::getInputs() const
+        {
+            return {m_input};
+        }
+        std::string SubTileTranspose::toString() const
+        {
+            return fmt::format(
+                "SubTileTranspose({}: input {})", concatenate(m_tileDimensions), m_input.value);
+        }
+        const std::vector<size_t>& SubTileTranspose::tileDimensions() const
+        {
+            return m_tileDimensions;
+        }
+
+        bool SubTileTranspose::operator==(SubTileTranspose const& other) const
+        {
+            return (*this <=> other) == std::strong_ordering::equal;
+        }
+
+        OperationTag SubTileTranspose::input() const
+        {
+            return m_input;
+        }
     }
 }
diff --git a/shared/rocroller/lib/source/Operations/Operation.cpp b/shared/rocroller/lib/source/Operations/Operation.cpp
index f1343a7517b..e83c0badce0 100644
--- a/shared/rocroller/lib/source/Operations/Operation.cpp
+++ b/shared/rocroller/lib/source/Operations/Operation.cpp
@@ -54,5 +54,10 @@ namespace rocRoller
             m_tag = tag;
         }
 
+        std::strong_ordering BaseOperation::operator<=>(BaseOperation const& other) const
+        {
+            return m_tag <=> other.m_tag;
+        }
+
     }
 }
diff --git a/shared/rocroller/lib/source/TensorDescriptor.cpp b/shared/rocroller/lib/source/TensorDescriptor.cpp
new file mode 100644
index 00000000000..42e7cf12815
--- /dev/null
+++ b/shared/rocroller/lib/source/TensorDescriptor.cpp
@@ -0,0 +1,76 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <algorithm>
+#include <cstddef>
+#include <numeric>
+#include <sstream>
+#include <string>
+
+#include <rocRoller/TensorDescriptor.hpp>
+
+namespace rocRoller
+{
+    int64_t TensorDescriptor::dimensionPadding(size_t dim) const
+    {
+        AssertFatal(dim < dimensions(), ShowValue(dim), ShowValue(dimensions()));
+
+        if(dim == 0)
+            return m_strides[0] - 1;
+
+        return m_strides[dim] - (m_strides[dim - 1] * m_sizes[dim - 1]);
+    }
+
+    void TensorDescriptor::collapseDims(size_t begin, size_t end)
+    {
+        AssertFatal(end >= begin, ShowValue(begin), ShowValue(end));
+        AssertFatal(begin < dimensions(), ShowValue(begin), ShowValue(dimensions()));
+        AssertFatal(end <= dimensions(), ShowValue(begin), ShowValue(dimensions()));
+
+        if(end <= (begin + 1))
+            return;
+
+        for(size_t i = begin + 1; i < end; i++)
+            AssertFatal(dimensionPadding(i) == 0, ShowValue(i), ShowValue(dimensionPadding(i)));
+
+        size_t newDimensionSize = 1;
+        for(size_t i = begin; i < end; i++)
+            newDimensionSize *= m_sizes[i];
+
+        m_sizes.erase(m_sizes.begin() + (begin + 1), m_sizes.begin() + end);
+        m_sizes[begin] = newDimensionSize;
+
+        m_strides.erase(m_strides.begin() + (begin + 1), m_strides.begin() + end);
+
+        calculate();
+    }
+
+    std::ostream& operator<<(std::ostream& stream, const TensorDescriptor& t)
+    {
+        return stream << t.toString();
+    }
+
+}
\ No newline at end of file
diff --git a/shared/rocroller/scripts/lib/rrperf/rrsuites.py b/shared/rocroller/scripts/lib/rrperf/rrsuites.py
index 9201e224085..10712d4d615 100644
--- a/shared/rocroller/scripts/lib/rrperf/rrsuites.py
+++ b/shared/rocroller/scripts/lib/rrperf/rrsuites.py
@@ -1334,9 +1334,16 @@ def fp4_16x16x128_scale_options():
     yield from addSkipPermlane(fp4_target_d2lds_mi16x16x128_pf4x1_wgm())
 
 
+def fp4_32x32x64_scale_options():
+    yield from fp4_target_d2lds_mi16x16x128_pf4x1_wgm()
+    yield from addSkipPermlane(fp4_target_d2lds_mi32x32x64_pf4x1_wgm())
+
+
 def fp4_kernels():
     yield from fp4_kernels_no_wgm()
     yield from fp4_kernels_wgm()
+    yield from fp4_16x16x128_scale_options()
+    yield from fp4_32x32x64_scale_options()
 
 
 def fp4_target_sweep_wgms():
diff --git a/shared/rocroller/test/catch/CMakeLists.txt b/shared/rocroller/test/catch/CMakeLists.txt
index cdcd45529d0..8b4d1dd262c 100644
--- a/shared/rocroller/test/catch/CMakeLists.txt
+++ b/shared/rocroller/test/catch/CMakeLists.txt
@@ -74,6 +74,7 @@ target_sources(rocroller-tests-catch
         "${CMAKE_CURRENT_SOURCE_DIR}/ScaleUtilsTest.cpp"
         "${CMAKE_CURRENT_SOURCE_DIR}/SettingsTest.cpp"
         "${CMAKE_CURRENT_SOURCE_DIR}/SubDwordExpressionTest.cpp"
+        "${CMAKE_CURRENT_SOURCE_DIR}/TensorDescriptorTest.cpp"
         "${CMAKE_CURRENT_SOURCE_DIR}/TernaryExpressionTest.cpp"
         "${CMAKE_CURRENT_SOURCE_DIR}/TestContext.cpp"
         "${CMAKE_CURRENT_SOURCE_DIR}/TestContextTest.cpp"
diff --git a/shared/rocroller/test/catch/TensorDescriptorTest.cpp b/shared/rocroller/test/catch/TensorDescriptorTest.cpp
new file mode 100644
index 00000000000..0ce5faa77ff
--- /dev/null
+++ b/shared/rocroller/test/catch/TensorDescriptorTest.cpp
@@ -0,0 +1,333 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/generators/catch_generators.hpp>
+#include <catch2/generators/catch_generators_adapters.hpp>
+#include <catch2/generators/catch_generators_random.hpp>
+#include <catch2/generators/catch_generators_range.hpp>
+
+#include <rocRoller/TensorDescriptor.hpp>
+
+#include <rocRoller/Utilities/Random.hpp>
+
+#include <cstddef>
+
+using namespace rocRoller;
+
+TEST_CASE("TensorDescriptor basic functionality", "[utils][tensor-descriptor]")
+{
+    TensorDescriptor t(DataType::Float, {11, 13, 17});
+
+    CHECK(t.dimensions() == 3);
+    CHECK(t.sizes() == std::vector<size_t>({11, 13, 17}));
+    CHECK(t.strides() == std::vector<size_t>({1, 11, 11 * 13}));
+
+    CHECK(t.totalLogicalElements() == 11 * 13 * 17);
+    CHECK(t.totalAllocatedElements() == 11 * 13 * 17);
+    CHECK(t.totalAllocatedBytes() == 11 * 13 * 17 * 4);
+
+    for(int i = 0; i < 3; i++)
+        CHECK(t.dimensionPadding(i) == 0);
+
+    CHECK(t.index(3, 4, 1) == 3 + 4 * 11 + 11 * 13);
+}
+
+TEST_CASE("TensorDescriptor works with overlapping strides", "[utils][tensor-descriptor]")
+{
+    TensorDescriptor t(DataType::Float, {4, 6, 3}, {1, 4, 1});
+
+    CHECK(t.dimensions() == 3);
+    CHECK(t.sizes() == std::vector<size_t>({4, 6, 3}));
+    CHECK(t.strides() == std::vector<size_t>({1, 4, 1}));
+
+    CHECK(t.dimensionPadding(0) == 0);
+    CHECK(t.dimensionPadding(1) == 0);
+
+    CHECK(t.totalLogicalElements() == 4 * 6 * 3);
+    CHECK(t.totalAllocatedElements() == 4 * 6 + (3 - 1));
+    CHECK(t.totalAllocatedBytes() == (4 * 6 + (3 - 1)) * 4);
+}
+
+TEST_CASE("TensorDescriptor works with padding", "[utils][tensor-descriptor]")
+{
+    TensorDescriptor t(DataType::Float, {11, 13, 17, 4}, {1, 16, 16 * 13, 16 * 13 * 17});
+
+    CHECK(t.dimensions() == 4);
+    CHECK(t.sizes() == std::vector<size_t>({11, 13, 17, 4}));
+    CHECK(t.strides() == std::vector<size_t>({1, 16, 16 * 13, 16 * 13 * 17}));
+
+    CHECK(t.totalLogicalElements() == 11 * 13 * 17 * 4);
+    CHECK(t.totalAllocatedElements()
+          == 1 + 1 * (11 - 1) + 16 * (13 - 1) + (16 * 13) * (17 - 1) + (16 * 13 * 17) * (4 - 1));
+
+    CHECK(t.dimensionPadding(0) == 0);
+    CHECK(t.dimensionPadding(1) == 5);
+    CHECK(t.dimensionPadding(2) == 0);
+    CHECK(t.dimensionPadding(3) == 0);
+
+    CHECK(t.index(3, 4, 1, 2) == 3 + 4 * 16 + 16 * 13 + 16 * 13 * 17 * 2);
+}
+
+TEST_CASE("TensorDescriptor works with simplified padding", "[utils][tensor-descriptor]")
+{
+    TensorDescriptor t(DataType::Float, {4, 5}, {1, 8});
+
+    CHECK(t.dimensions() == 2);
+    CHECK(t.sizes() == std::vector<size_t>({4, 5}));
+    CHECK(t.strides() == std::vector<size_t>({1, 8})); // default 1,4
+
+    CHECK(t.dimensionPadding(0) == 0);
+    CHECK(t.dimensionPadding(1) == 4);
+
+    CHECK(t.totalLogicalElements() == 4 * 5);
+    CHECK(t.totalAllocatedElements() == 4 + 8 * (5 - 1));
+    CHECK(t.totalAllocatedBytes() == (4 + 8 * (5 - 1)) * 4);
+}
+
+TEST_CASE("TensorDescriptor works with zero strides", "[utils][tensor-descriptor]")
+{
+    TensorDescriptor t(DataType::Float,
+                       {4, 5, 6},
+                       {TensorDescriptor::UseDefaultStride, TensorDescriptor::UseDefaultStride, 0});
+
+    CHECK(t.dimensions() == 3);
+    CHECK(t.sizes() == std::vector<size_t>({4, 5, 6}));
+    CHECK(t.strides() == std::vector<size_t>({1, 4, 0})); // default 1,4
+
+    CHECK(t.dimensionPadding(0) == 0);
+    CHECK(t.dimensionPadding(1) == 0);
+    CHECK(t.dimensionPadding(2) == -20);
+
+    CHECK(t.totalLogicalElements() == 4 * 5 * 6);
+    CHECK(t.totalAllocatedElements() == 4 * 5);
+    CHECK(t.totalAllocatedBytes() == 4 * 5 * 4);
+}
+
+TEST_CASE("TensorDescriptor::CollapseDims works", "[utils][tensor-descriptor]")
+{
+    TensorDescriptor t(DataType::Float, {11, 13, 17, 4}, {1, 16, 16 * 13, 16 * 13 * 17});
+
+    TensorDescriptor u = t;
+    CHECK_THROWS_AS(u.collapseDims(0, 2), std::runtime_error);
+
+    u.collapseDims(1, 3);
+
+    CHECK(u.dimensions() == 3);
+    CHECK(u.sizes() == std::vector<size_t>({11, 13 * 17, 4}));
+    CHECK(u.strides() == std::vector<size_t>({1, 16, 16 * 13 * 17}));
+
+    CHECK(u.totalLogicalElements() == t.totalLogicalElements());
+    CHECK(u.totalAllocatedElements() == t.totalAllocatedElements());
+    CHECK(u.totalAllocatedBytes() == t.totalAllocatedBytes());
+}
+
+TEST_CASE("TensorDescriptor::CollapseDims works part 2", "[utils][tensor-descriptor]")
+{
+    TensorDescriptor t(DataType::Float, {11, 13, 17, 4});
+
+    SECTION("0,2")
+    {
+        TensorDescriptor u = t;
+        u.collapseDims(0, 2);
+
+        CHECK(u.dimensions() == 3);
+        CHECK(u.sizes() == std::vector<size_t>({11 * 13, 17, 4}));
+        CHECK(u.strides() == std::vector<size_t>({1, 11 * 13, 11 * 13 * 17}));
+
+        CHECK(u.totalLogicalElements() == t.totalLogicalElements());
+        CHECK(u.totalAllocatedElements() == t.totalAllocatedElements());
+        CHECK(u.totalAllocatedBytes() == t.totalAllocatedBytes());
+    }
+
+    SECTION("0,4")
+    {
+        TensorDescriptor u = t;
+        u.collapseDims(0, 4);
+
+        CHECK(u.dimensions() == 1);
+        CHECK(u.sizes() == std::vector<size_t>({11 * 13 * 17 * 4}));
+        CHECK(u.strides() == std::vector<size_t>({1}));
+
+        CHECK(u.totalLogicalElements() == t.totalLogicalElements());
+        CHECK(u.totalAllocatedElements() == t.totalAllocatedElements());
+        CHECK(u.totalAllocatedBytes() == t.totalAllocatedBytes());
+    }
+
+    SECTION("1,4")
+    {
+        TensorDescriptor u = t;
+        u.collapseDims(1, 4);
+
+        CHECK(u.dimensions() == 2);
+        CHECK(u.sizes() == std::vector<size_t>({11, 13 * 17 * 4}));
+        CHECK(u.strides() == std::vector<size_t>({1, 11}));
+
+        CHECK(u.totalLogicalElements() == t.totalLogicalElements());
+        CHECK(u.totalAllocatedElements() == t.totalAllocatedElements());
+        CHECK(u.totalAllocatedBytes() == t.totalAllocatedBytes());
+    }
+
+    SECTION("1,3")
+    {
+        TensorDescriptor u = t;
+        u.collapseDims(1, 3);
+
+        CHECK(u.dimensions() == 3);
+        CHECK(u.sizes() == std::vector<size_t>({11, 13 * 17, 4}));
+        CHECK(u.strides() == std::vector<size_t>({1, 11, 11 * 13 * 17}));
+
+        CHECK(u.totalLogicalElements() == t.totalLogicalElements());
+        CHECK(u.totalAllocatedElements() == t.totalAllocatedElements());
+        CHECK(u.totalAllocatedBytes() == t.totalAllocatedBytes());
+    }
+}
+
+TEST_CASE("IncrementCoord works for 2 dimensions", "[utils][tensor-descriptor]")
+{
+    std::vector<size_t> dims{2, 4};
+    std::vector<size_t> lastCoord{1, 3};
+    std::vector<size_t> coordRef(2);
+    std::vector<size_t> coordRun(2);
+
+    for(coordRef[1] = 0; coordRef[1] < dims[1]; coordRef[1]++)
+        for(coordRef[0] = 0; coordRef[0] < dims[0]; coordRef[0]++)
+        {
+            CHECK(coordRun == coordRef);
+
+            bool continueIteration
+                = IncrementCoord(coordRun.begin(), coordRun.end(), dims.begin(), dims.end());
+            if(coordRef == lastCoord)
+                CHECK(continueIteration == false);
+            else
+                CHECK(continueIteration == true);
+        }
+
+    coordRef = {0, 0};
+    CHECK(coordRun == coordRef);
+
+    CHECK(IncrementCoord(coordRun.begin(), coordRun.end(), dims.begin(), dims.end()));
+}
+
+TEST_CASE("Default strides work for TensorDescriptor.", "[utils][tensor-descriptor]")
+{
+    TensorDescriptor desc(DataType::Float, {4, 5, 6}, {static_cast<size_t>(-1), 5});
+    CHECK(desc.dimensions() == 3);
+    CHECK(desc.sizes() == std::vector<size_t>({4, 5, 6}));
+    CHECK(desc.strides() == std::vector<size_t>({1, 5, 25}));
+}
+
+TEST_CASE("Specifying a subset of strides works for TensorDescriptor.",
+          "[utils][tensor-descriptor]")
+{
+    TensorDescriptor desc(DataType::Float, {4, 5, 6}, {5});
+    CHECK(desc.dimensions() == 3);
+    CHECK(desc.sizes() == std::vector<size_t>({4, 5, 6}));
+    CHECK(desc.strides() == std::vector<size_t>({5, 20, 100}));
+}
+
+TEST_CASE("ShuffleDims works in the no-op case", "[utils][tensor-descriptor]")
+{
+    auto dims = GENERATE(Catch::Generators::range(2, 10));
+    DYNAMIC_SECTION(fmt::format("dims={}", dims))
+    {
+        auto seed = GENERATE(Catch::Generators::take(
+            4, Catch::Generators::random(0, std::numeric_limits<int>::max())));
+
+        DYNAMIC_SECTION(fmt::format("seed={}", seed))
+        {
+            auto sizes = RandomGenerator(seed).vector<size_t>(dims, 1, 12);
+            CAPTURE(sizes);
+
+            TensorDescriptor desc(DataType::Int32, sizes);
+
+            auto input  = iota<int>(0, desc.totalAllocatedElements()).to<std::vector>();
+            auto output = shuffleDims(input, desc, desc);
+
+            CHECK(input == output);
+        }
+    }
+}
+
+TEST_CASE("ShuffleDims is reversible", "[utils][tensor-descriptor]")
+{
+    auto dims = GENERATE(Catch::Generators::range(2, 10));
+    DYNAMIC_SECTION(fmt::format("dims={}", dims))
+    {
+        auto seed = GENERATE(Catch::Generators::take(
+            4, Catch::Generators::random(0, std::numeric_limits<int>::max())));
+
+        DYNAMIC_SECTION(fmt::format("seed={}", seed))
+        {
+            auto gen   = RandomGenerator(seed);
+            auto sizes = gen.vector<size_t>(dims, 2, 12);
+            CAPTURE(sizes);
+
+            TensorDescriptor src(DataType::Int32, sizes);
+
+            std::vector<size_t> order;
+            {
+                auto indices = iota<size_t>(0, dims).to<std::vector>();
+                while(!indices.empty())
+                {
+                    auto idx = gen.next<size_t>(0, indices.size() - 1);
+                    REQUIRE(idx < dims);
+                    order.push_back(indices.at(idx));
+                    indices.erase(next(indices.begin(), idx));
+                }
+            }
+
+            auto dst = TensorDescriptor::ShuffledNoPadding(DataType::Int32, sizes, order);
+
+            auto numNonUnitSizes = std::ranges::count_if(sizes, [](auto x) { return x > 1; });
+
+            CAPTURE(src.strides());
+            CAPTURE(dst.strides());
+
+            auto input = iota<int>(0, src.totalAllocatedElements()).to<std::vector>();
+
+            SECTION("a -> b -> a")
+            {
+                auto intermediate = shuffleDims(input, dst, src);
+                CAPTURE(intermediate);
+                if(numNonUnitSizes > 1 && src.strides() != dst.strides())
+                    CHECK(input != intermediate);
+                auto output = shuffleDims(intermediate, src, dst);
+                REQUIRE(input == output);
+            }
+
+            SECTION("b -> a -> b")
+            {
+                auto intermediate = shuffleDims(input, src, dst);
+                CAPTURE(intermediate);
+                if(numNonUnitSizes > 1 && src.strides() != dst.strides())
+                    CHECK(input != intermediate);
+                auto output = shuffleDims(intermediate, dst, src);
+                REQUIRE(input == output);
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/shared/rocroller/test/common/include/common/mxDataGen.hpp b/shared/rocroller/test/common/include/common/mxDataGen.hpp
index 9fd5090b1a8..51fcc1127a4 100644
--- a/shared/rocroller/test/common/include/common/mxDataGen.hpp
+++ b/shared/rocroller/test/common/include/common/mxDataGen.hpp
@@ -94,12 +94,12 @@ namespace rocRoller
 
     template <typename rrDT>
     DGen::DataGenerator<typename rrDT2DGenDT<rrDT>::type>
-        getDataGenerator(TensorDescriptor& desc,
-                         const float       min,
-                         const float       max,
-                         const uint32_t    seed,
-                         const index_t     blockScaling = 1,
-                         const DataPattern pattern      = Bounded)
+        getDataGenerator(TensorDescriptor const& desc,
+                         const float             min,
+                         const float             max,
+                         const uint32_t          seed,
+                         const index_t           blockScaling = 1,
+                         const DataPattern       pattern      = Bounded)
     {
         auto sizes   = desc.sizes();
         auto strides = desc.strides();
@@ -175,20 +175,20 @@ namespace rocRoller
     }
 
     template <typename TA, typename TB, typename TC>
-    void DGenInput(const uint32_t        seed,
-                   std::vector<TA>&      hostA,
-                   TensorDescriptor&     descA,
-                   std::vector<TB>&      hostB,
-                   TensorDescriptor&     descB,
-                   std::vector<TC>&      hostC,
-                   TensorDescriptor&     descC,
-                   std::vector<uint8_t>& hostScaleA,
-                   std::vector<uint8_t>& hostScaleB,
-                   bool                  hasScaleA      = false,
-                   bool                  hasScaleB      = false,
-                   float                 min            = -1.f,
-                   float                 max            = 1.f,
-                   const uint            scaleBlockSize = 32
+    void DGenInput(const uint32_t          seed,
+                   std::vector<TA>&        hostA,
+                   TensorDescriptor const& descA,
+                   std::vector<TB>&        hostB,
+                   TensorDescriptor const& descB,
+                   std::vector<TC>&        hostC,
+                   TensorDescriptor const& descC,
+                   std::vector<uint8_t>&   hostScaleA,
+                   std::vector<uint8_t>&   hostScaleB,
+                   bool                    hasScaleA      = false,
+                   bool                    hasScaleB      = false,
+                   float                   min            = -1.f,
+                   float                   max            = 1.f,
+                   const uint              scaleBlockSize = 32
 
     )
     {
diff --git a/shared/rocroller/test/unit/PermLanesTest.cpp b/shared/rocroller/test/unit/PermLanesTest.cpp
index f34c1c6a437..8e47ae145c5 100644
--- a/shared/rocroller/test/unit/PermLanesTest.cpp
+++ b/shared/rocroller/test/unit/PermLanesTest.cpp
@@ -33,6 +33,7 @@
 #include <rocRoller/KernelGraph/CoordinateGraph/CoordinateGraph.hpp>
 #include <rocRoller/KernelGraph/KernelGraph.hpp>
 #include <rocRoller/KernelGraph/Transforms/All.hpp>
+#include <rocRoller/TensorDescriptor.hpp>
 
 #include "GPUContextFixture.hpp"
 
@@ -160,25 +161,50 @@ namespace PermLanesTest
                 result.data(), dResult.get(), result.size() * sizeof(uint8_t), hipMemcpyDefault),
             HasHipSuccess(0));
 
-        auto nWaves = 4;
-        auto factor = waveK / miK;
-        auto nLanes = 16;
-        for(int wave = 0; wave < nWaves; wave++)
-            for(int simdBlock = 0; simdBlock < miK; simdBlock++)
-                for(int simdIndex = 0; simdIndex < factor; simdIndex++)
-                    for(int lane = 0; lane < nLanes; lane++)
-                        for(int vgprBlock = 0; vgprBlock < factor; vgprBlock++)
-                            for(int vgprIndex = 0; vgprIndex < miK; vgprIndex++)
-                            {
-                                ASSERT_EQ(a[wave * waveK * nLanes * waveK
-                                            + simdBlock * factor * nLanes * waveK
-                                            + simdIndex * nLanes * waveK + lane * waveK
-                                            + vgprBlock * miK + vgprIndex],
-                                          result[wave * waveK * nLanes * waveK
-                                                 + vgprIndex * factor * nLanes * waveK
-                                                 + simdIndex * nLanes * waveK + lane * waveK
-                                                 + vgprBlock * miK + simdBlock]);
-                            }
+        int nWaves = 4;
+        int factor = waveK / miK;
+        int nLanes = 16;
+
+        // clang-format off
+        for(int wave      = 0;      wave < nWaves; wave++)
+        for(int simdBlock = 0; simdBlock < miK;    simdBlock++)
+        for(int simdIndex = 0; simdIndex < factor; simdIndex++)
+        for(int lane      = 0;      lane < nLanes; lane++)
+        for(int vgprBlock = 0; vgprBlock < factor; vgprBlock++)
+        for(int vgprIndex = 0; vgprIndex < miK;    vgprIndex++)
+        {
+            auto aIdx = wave * waveK * nLanes * waveK
+                        + simdBlock * factor * nLanes * waveK
+                        + simdIndex * nLanes * waveK
+                        + lane * waveK
+                        + vgprBlock * miK
+                        + vgprIndex;
+            
+            auto resultIdx = wave * waveK * nLanes * waveK
+                             + vgprIndex * factor * nLanes * waveK
+                             + simdIndex * nLanes * waveK
+                             + lane * waveK
+                             + vgprBlock * miK
+                             + simdBlock;
+
+            ASSERT_EQ(a[aIdx], result[resultIdx]);
+        }
+        // clang-format on
+
+        std::vector<size_t> sizes = {static_cast<size_t>(miK),
+                                     static_cast<size_t>(factor),
+                                     static_cast<size_t>(nLanes),
+                                     static_cast<size_t>(factor),
+                                     static_cast<size_t>(miK),
+                                     static_cast<size_t>(nWaves)};
+
+        auto order = {4, 1, 2, 3, 0, 5};
+
+        TensorDescriptor src(DataType::E8M0, sizes);
+        auto dst = TensorDescriptor::ShuffledNoPadding(DataType::E8M0, sizes, {4, 1, 2, 3, 0, 5});
+
+        auto a_reordered = shuffleDims(a, dst, src);
+        EXPECT_EQ(a_reordered, result);
     }
 
     TEST_F(PermLanesTest, PermLanesBlockScale16x4GPUTest)