From 3d920d9a84f14135a79902e48d2a04359e5cf166 Mon Sep 17 00:00:00 2001
From: Bartlomiej Kocot <barkocot@amd.com>
Date: Wed, 20 Dec 2023 20:20:02 +0000
Subject: [PATCH 1/7] Add tensor partition and generic copy for ck wrapper

---
 docs/wrapper.rst                              |   8 +
 include/ck/wrapper/layout.hpp                 |  71 +++--
 include/ck/wrapper/operations/copy.hpp        |  42 +++
 include/ck/wrapper/tensor.hpp                 |   3 +
 include/ck/wrapper/utils/layout_utils.hpp     |  10 +-
 include/ck/wrapper/utils/tensor_partition.hpp | 264 ++++++++++++++++++
 include/ck/wrapper/utils/tensor_utils.hpp     |  24 +-
 test/wrapper/CMakeLists.txt                   |   4 +
 test/wrapper/test_copy.cpp                    | 132 +++++++++
 test/wrapper/test_layout.cpp                  |  14 +-
 test/wrapper/test_partition.cpp               | 172 ++++++++++++
 test/wrapper/test_tensor.cpp                  |   7 +-
 12 files changed, 705 insertions(+), 46 deletions(-)
 create mode 100644 include/ck/wrapper/operations/copy.hpp
 create mode 100644 include/ck/wrapper/utils/tensor_partition.hpp
 create mode 100644 test/wrapper/test_copy.cpp
 create mode 100644 test/wrapper/test_partition.cpp
diff --git a/docs/wrapper.rst b/docs/wrapper.rst
index a2f60b97ae4..da3a79eda85 100644
--- a/docs/wrapper.rst
+++ b/docs/wrapper.rst
@@ -71,3 +71,11 @@ Tensor helpers
 -------------------------------------
 
 .. doxygenfile:: tensor_utils.hpp
+
+.. doxygenfile:: tensor_partition.hpp
+
+-------------------------------------
+Operations
+-------------------------------------
+
+.. doxygenfile:: copy.hpp
diff --git a/include/ck/wrapper/layout.hpp b/include/ck/wrapper/layout.hpp
index f20d985b497..4b6f1b19bf4 100644
--- a/include/ck/wrapper/layout.hpp
+++ b/include/ck/wrapper/layout.hpp
@@ -48,22 +48,24 @@ struct Layout
     // Generate packed (column-major) strides if not passed
     template <typename... Ts>
     __host__ __device__ constexpr static auto
-    GenerateColumnMajorPackedStrides(const Tuple<Ts...>& shape)
+    GenerateColumnMajorPackedStrides(const Tuple<Ts...>& shape, index_t& stride)
     {
-        const auto unrolled_shape = UnrollNestedTuple(shape);
         return generate_tuple(
             [&](auto i) {
-                if constexpr(i.value == 0)
+                const auto num_i = Number<i>{};
+                if constexpr(is_detected<is_tuple, tuple_element_t<i, Tuple<Ts...>>>::value)
                 {
-                    return I1;
+                    return GenerateColumnMajorPackedStrides(shape.At(num_i), stride);
                 }
                 else
                 {
-                    return TupleReduce<I0.value, i.value>([](auto x, auto y) { return x * y; },
-                                                          unrolled_shape);
+                    const index_t dim_stride = stride;
+                    // update stride
+                    stride *= shape.At(num_i);
+                    return dim_stride;
                 }
             },
-            Number<decltype(unrolled_shape)::Size()>{});
+            Number<Tuple<Ts...>::Size()>{});
     }
 
     // Generate LowerDims in Compile-time for MergeTrasform using passed Type
@@ -211,20 +213,28 @@ struct Layout
     __host__ __device__ static auto MakeFlattenDescriptor(const LayoutShape& shape,
                                                           const LayoutStrides& strides)
     {
-        const auto unrolled_shape   = UnrollNestedTuple(shape);
-        const auto unrolled_strides = UnrollNestedTuple(strides);
-        static_assert(unrolled_shape.Size() == unrolled_strides.Size(),
-                      "Size of strides and shape are not consistent.");
-        return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides);
+        const auto unrolled_shape = UnrollNestedTuple(shape);
+        if constexpr(is_same_v<LayoutStrides, Tuple<>>)
+        {
+            index_t start_stride = 1;
+            // if not passed, then generate
+            const auto unrolled_strides =
+                GenerateColumnMajorPackedStrides(unrolled_shape, start_stride);
+            static_assert(unrolled_shape.Size() == unrolled_strides.Size(),
+                          "Size of strides and shape are not consistent.");
+            return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides);
+        }
+        else
+        {
+            const auto unrolled_strides = UnrollNestedTuple(strides);
+            static_assert(unrolled_shape.Size() == unrolled_strides.Size(),
+                          "Size of strides and shape are not consistent.");
+            return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides);
+        }
     }
 
-    // If the stride is not passed, you can infer it from `GenerateColumnMajorPackedStrides`.
-    using DeducedStrides =
-        std::conditional_t<is_same_v<Strides, Tuple<>>,
-                           remove_cvref_t<decltype(GenerateColumnMajorPackedStrides(Shape{}))>,
-                           Strides>;
     using FlattenDescriptorType =
-        remove_cvref_t<decltype(MakeFlattenDescriptor(Shape{}, DeducedStrides{}))>;
+        remove_cvref_t<decltype(MakeFlattenDescriptor(Shape{}, Strides{}))>;
     using Descriptor1dType =
         remove_cvref_t<decltype(MakeMerge1d(Shape{}, FlattenDescriptorType{}))>;
     using DefaultIdxsTupleType = remove_cvref_t<decltype(GenerateDefaultIdxsTuple(Shape{}))>;
@@ -290,7 +300,7 @@ struct Layout
      * \param shape Shape for layout.
      */
     __host__ __device__ constexpr Layout(const Shape& shape)
-        : flatten_descriptor_{}, shape_(shape), strides_(GenerateColumnMajorPackedStrides(shape_))
+        : flatten_descriptor_{}, shape_(shape), strides_()
     {
         if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime())
         {
@@ -299,6 +309,10 @@ struct Layout
             merged_nests_descriptor_ =
                 TransformDesc(shape_, DefaultIdxsTupleType{}, flatten_descriptor_);
         }
+        else
+        {
+            static_assert(true, "Compiletime Layout require strides parameter.");
+        }
     }
 
     /**
@@ -351,7 +365,7 @@ struct Layout
      * \return Calculated size.
      */
     template <index_t IDim>
-    __host__ __device__ constexpr index_t GetLength() const
+    __host__ __device__ constexpr auto GetLength() const
     {
         const auto elem = shape_.At(Number<IDim>{});
         if constexpr(is_detected<is_tuple, tuple_element_t<IDim, Shape>>::value)
@@ -371,7 +385,7 @@ struct Layout
      *
      * \return Calculated size.
      */
-    __host__ __device__ constexpr index_t GetLengths() const
+    __host__ __device__ constexpr auto GetLengths() const
     {
         const auto unrolled_shape = UnrollNestedTuple(shape_);
         return TupleReduce<I0.value, unrolled_shape.Size()>([](auto x, auto y) { return x * y; },
@@ -390,7 +404,18 @@ struct Layout
      *
      * \return Strides.
      */
-    __host__ __device__ constexpr const DeducedStrides& GetStrides() const { return strides_; }
+    __host__ __device__ constexpr auto GetStrides() const
+    {
+        if constexpr(is_same_v<Strides, Tuple<>>)
+        {
+            index_t start_stride = 1;
+            return GenerateColumnMajorPackedStrides(shape_, start_stride);
+        }
+        else
+        {
+            return strides_;
+        }
+    }
 
     /**
      * \brief Get default lengths (tuple filled with Shape length elements).
@@ -427,7 +452,7 @@ struct Layout
     Descriptor1dType descriptor_1d_;
     MergedNestsDescriptorType merged_nests_descriptor_;
     const Shape shape_;
-    const DeducedStrides strides_;
+    const Strides strides_;
 };
 
 } // namespace wrapper
diff --git a/include/ck/wrapper/operations/copy.hpp b/include/ck/wrapper/operations/copy.hpp
new file mode 100644
index 00000000000..267464c0963
--- /dev/null
+++ b/include/ck/wrapper/operations/copy.hpp
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "../utils/tensor_utils.hpp"
+
+namespace ck {
+namespace wrapper {
+
+/**
+ * \brief Perform generic copy between two tensors. Tensors must have the
+ *  same size.
+ *
+ * \param src_tensor Source tensor.
+ * \param dst_tensor Destination tensor.
+ */
+template <typename SrcTensorType, typename DstTensorType>
+__host__ __device__ void copy(const SrcTensorType& src_tensor, DstTensorType& dst_tensor)
+{
+    assert(size(src_tensor) == size(dst_tensor));
+    using SrcSizeTensor = decltype(size(src_tensor));
+    using DstSizeTensor = decltype(size(dst_tensor));
+    if constexpr(is_known_at_compile_time<SrcSizeTensor>::value)
+    {
+        static_for<0, SrcSizeTensor{}, 1>{}([&](auto i) { dst_tensor(i) = src_tensor(i); });
+    }
+    else if constexpr(is_known_at_compile_time<DstSizeTensor>::value)
+    {
+        static_for<0, DstSizeTensor{}, 1>{}([&](auto i) { dst_tensor(i) = src_tensor(i); });
+    }
+    else
+    {
+        for(int i = 0; i < size(src_tensor); i++)
+        {
+            dst_tensor(i) = src_tensor(i);
+        }
+    }
+}
+
+} // namespace wrapper
+} // namespace ck
diff --git a/include/ck/wrapper/tensor.hpp b/include/ck/wrapper/tensor.hpp
index 4ec6498fbc6..fad790a6902 100644
--- a/include/ck/wrapper/tensor.hpp
+++ b/include/ck/wrapper/tensor.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "utils/tensor_utils.hpp"
+#include "utils/tensor_partition.hpp"
 #include "utils/layout_utils.hpp"
 
 namespace ck {
@@ -292,6 +293,8 @@ struct Tensor
         return layout_.GetDefaultDescriptor();
     }
 
+    __host__ __device__ ElementType* GetPointer() const { return buffer_.p_data_; }
+
     private:
     using DynamicBufferType = DynamicBuffer<BufferAddressSpace,
                                             ElementType,
diff --git a/include/ck/wrapper/utils/layout_utils.hpp b/include/ck/wrapper/utils/layout_utils.hpp
index 5df9dd7dea2..e99596d8f50 100644
--- a/include/ck/wrapper/utils/layout_utils.hpp
+++ b/include/ck/wrapper/utils/layout_utils.hpp
@@ -143,7 +143,7 @@ __host__ __device__ T constexpr size(const T& dim)
  * \return Requsted length.
  */
 template <index_t idx, typename Shape, typename Strides>
-__host__ __device__ constexpr index_t size(const Layout<Shape, Strides>& layout)
+__host__ __device__ constexpr auto size(const Layout<Shape, Strides>& layout)
 {
     return layout.template GetLength<idx>();
 }
@@ -155,7 +155,7 @@ __host__ __device__ constexpr index_t size(const Layout<Shape, Strides>& layout)
  * \return Requsted size.
  */
 template <typename... ShapeDims>
-__host__ __device__ constexpr index_t size(const Tuple<ShapeDims...>& shape)
+__host__ __device__ constexpr auto size(const Tuple<ShapeDims...>& shape)
 {
     const auto unrolled_shape = UnrollNestedTuple(shape);
     return TupleReduce<0, unrolled_shape.Size()>([](auto x, auto y) { return x * y; },
@@ -169,7 +169,7 @@ __host__ __device__ constexpr index_t size(const Tuple<ShapeDims...>& shape)
  * \return Requsted size.
  */
 template <typename Shape, typename Strides>
-__host__ __device__ constexpr index_t size(const Layout<Shape, Strides>& layout)
+__host__ __device__ constexpr auto size(const Layout<Shape, Strides>& layout)
 {
     return layout.GetLengths();
 }
@@ -182,7 +182,7 @@ __host__ __device__ constexpr index_t size(const Layout<Shape, Strides>& layout)
  * \return Requsted length.
  */
 template <index_t idx, typename... Ts>
-__host__ __device__ constexpr index_t size(const Tuple<Ts...>& tuple)
+__host__ __device__ constexpr auto size(const Tuple<Ts...>& tuple)
 {
     return size(tuple.At(Number<idx>{}));
 }
@@ -314,7 +314,7 @@ __host__ __device__ constexpr auto depth(const T& elem)
  * \return Requsted strides.
  */
 template <typename Shape, typename Strides>
-__host__ __device__ constexpr const auto& stride(const Layout<Shape, Strides>& layout)
+__host__ __device__ constexpr auto stride(const Layout<Shape, Strides>& layout)
 {
     return layout.GetStrides();
 }
diff --git a/include/ck/wrapper/utils/tensor_partition.hpp b/include/ck/wrapper/utils/tensor_partition.hpp
new file mode 100644
index 00000000000..8639c91b7bb
--- /dev/null
+++ b/include/ck/wrapper/utils/tensor_partition.hpp
@@ -0,0 +1,264 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "tensor_utils.hpp"
+#include "layout_utils.hpp"
+
+namespace ck {
+namespace wrapper {
+
+namespace {
+// Calculate shape for partition based on number of threads per each dim and
+// previous shape
+template <typename... Ts, typename... Ls>
+__host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple<Ts...>& shape,
+                                                                const Tuple<Ls...>& thread_lengths)
+{
+
+    static_assert(Tuple<Ts...>::Size() == Tuple<Ls...>::Size(), "Wrong thread_lengths shape.");
+    return generate_tuple(
+        [&](auto i) {
+            constexpr auto num_i = Number<i>{};
+            if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
+            {
+                // if tuple then recurrence
+                return CalculateLocalPartitionShape(shape.At(num_i), thread_lengths.At(num_i));
+            }
+            else
+            {
+                const auto slice_len = shape.At(num_i) / thread_lengths.At(i);
+                return slice_len;
+            }
+        },
+        Number<Tuple<Ts...>::Size()>{});
+}
+
+// Calculate shape for partition based on number of threads per each dim,
+// previous strides and steps
+template <typename... Ts, typename... Ls, typename... Steps>
+__host__ __device__ constexpr auto CalculateLocalPartitionStrides(
+    const Tuple<Ts...>& strides, const Tuple<Ls...>& thread_lengths, const Tuple<Steps...>& steps)
+{
+
+    static_assert(Tuple<Ts...>::Size() == Tuple<Ls...>::Size(), "Wrong thread_lengths shape.");
+    return generate_tuple(
+        [&](auto i) {
+            constexpr auto num_i = Number<i>{};
+            if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
+            {
+                // if tuple then recurrence
+                if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
+                {
+                    return CalculateLocalPartitionStrides(
+                        strides.At(num_i), thread_lengths.At(num_i), Tuple<>{});
+                }
+                else
+                {
+                    return CalculateLocalPartitionStrides(
+                        strides.At(num_i), thread_lengths.At(num_i), steps.At(num_i));
+                }
+            }
+            else
+            {
+                if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
+                {
+                    // By default raked partition
+                    const auto partition_stride = thread_lengths.At(i);
+                    return partition_stride * strides.At(i);
+                }
+                else if constexpr(!is_same_v<tuple_element_t<i.value, Tuple<Steps...>>, index_t>)
+                {
+                    // Compiletime partition
+                    if constexpr(is_same_v<tuple_element_t<i.value, Tuple<Steps...>>, Number<1>>)
+                    {
+                        // raked
+                        const auto partition_stride = thread_lengths.At(i);
+                        return partition_stride * strides.At(i);
+                    }
+                    else
+                    {
+                        // packed
+                        return strides.At(i);
+                    }
+                }
+                else
+                {
+                    // Runtime partition
+                    if(steps.At(i) == 1)
+                    {
+                        // raked
+                        const auto partition_stride = thread_lengths.At(i);
+                        return partition_stride * strides.At(i);
+                    }
+                    else
+                    {
+                        // packed
+                        return strides.At(i);
+                    }
+                }
+            }
+        },
+        Number<Tuple<Ts...>::Size()>{});
+}
+
+// Convert interger thread_idx to tuple index with applied steps
+template <typename... Ls, typename... Steps>
+__host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple<Ls...>& thread_lengths,
+                                                            const Tuple<Steps...>& steps,
+                                                            index_t& thread_id)
+{
+    return generate_tuple(
+        [&](auto i) {
+            constexpr auto num_i = Number<i>{};
+            if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ls...>>>::value)
+            {
+                // if tuple then recurrence
+                if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
+                {
+                    return CalculateLayoutOffsetIdx(thread_lengths.At(num_i), Tuple<>{}, thread_id);
+                }
+                else
+                {
+                    return CalculateLayoutOffsetIdx(
+                        thread_lengths.At(num_i), steps.At(num_i), thread_id);
+                }
+            }
+            else
+            {
+                // Update thread_id after each dim
+                const auto dim_thread_id = thread_id % thread_lengths.At(i);
+                thread_id /= thread_lengths.At(i);
+                if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
+                {
+                    return dim_thread_id;
+                }
+                else
+                {
+                    // Apply step
+                    return steps.At(num_i) * dim_thread_id;
+                }
+            }
+        },
+        Number<Tuple<Ls...>::Size()>{});
+}
+
+// Aply steps to index represented as tuple
+template <typename... Steps, typename... Idxs>
+__host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple<Steps...>& steps,
+                                                            const Tuple<Idxs...>& block_idxs)
+{
+    return generate_tuple(
+        [&](auto i) {
+            constexpr auto num_i = Number<i>{};
+            if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Idxs...>>>::value)
+            {
+                // if tuple then recurrence
+                if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
+                {
+                    return CalculateLayoutOffsetIdx(Tuple<>{}, block_idxs.At(num_i));
+                }
+                else
+                {
+                    return CalculateLayoutOffsetIdx(steps.At(num_i), block_idxs.At(num_i));
+                }
+            }
+            else
+            {
+                if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
+                {
+                    return block_idxs.At(num_i);
+                }
+                else
+                {
+                    // apply step
+                    return steps.At(num_i) * block_idxs.At(num_i);
+                }
+            }
+        },
+        Number<Tuple<Idxs...>::Size()>{});
+}
+
+// For make_local_tile user pass only shape per block. This function calculates
+// block layout based on shape.
+template <typename... Ts, typename... BlockDims>
+__host__ __device__ constexpr auto CalculateBlockLengths(const Tuple<Ts...>& shape,
+                                                         const Tuple<BlockDims...>& block_shape)
+{
+    return generate_tuple(
+        [&](auto i) {
+            constexpr auto num_i = Number<i>{};
+            if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
+            {
+                // if tuple then recurrence
+                return CalculateBlockLengths(shape.At(num_i), block_shape.At(num_i));
+            }
+            else
+            {
+                return shape.At(num_i) / block_shape.At(num_i);
+            }
+        },
+        Number<Tuple<Ts...>::Size()>{});
+}
+} // namespace
+
+/**
+ * \brief Create local partition for thread.
+ *
+ * \param tensor Tensor for partition.
+ * \param thread_lengths Layout of threads.
+ * \param thread_id Thread index represented as integer.
+ * \param steps Thread step (default=1, raked partition)
+ * \return Partition tensor.
+ */
+template <typename TensorType, typename ThreadLengthsTuple, typename StepsTuple = Tuple<>>
+__host__ __device__ constexpr auto make_local_partition(const TensorType& tensor,
+                                                        const ThreadLengthsTuple& thread_lengths,
+                                                        const index_t thread_id,
+                                                        const StepsTuple steps = StepsTuple{})
+{
+    // Create shape, strides and layout for new partition tensor
+    const auto partition_shape = CalculateLocalPartitionShape(shape(tensor), thread_lengths);
+    const auto partition_strides =
+        CalculateLocalPartitionStrides(stride(tensor), thread_lengths, steps);
+    const auto partition_layout = make_layout(partition_shape, partition_strides);
+    // Calculate offset for new partition tensor
+    index_t thread_id_copy      = thread_id;
+    const auto offset_idx       = CalculateLayoutOffsetIdx(thread_lengths, steps, thread_id_copy);
+    const auto partition_offset = layout(tensor)(offset_idx);
+    return make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer() + partition_offset,
+                                                             partition_layout);
+}
+
+/**
+ * \brief Create local tile for thread block.
+ *
+ * \param tensor Tensor for partition.
+ * \param block_shape Shapes of requested tile.
+ * \param block_idx Block index represented as tuple.
+ * \param steps Block step (default=1, raked partition)
+ * \return Tile tensor.
+ */
+template <typename TensorType,
+          typename BlockShapeTuple,
+          typename BlockIdxTuple,
+          typename StepsTuple = Tuple<>>
+__host__ __device__ constexpr auto make_local_tile(const TensorType& tensor,
+                                                   const BlockShapeTuple& block_shape,
+                                                   const BlockIdxTuple& block_idx,
+                                                   const StepsTuple steps = StepsTuple{})
+{
+    // Create block lengths, strides and layout for new tile tensor
+    const auto block_lengths = CalculateBlockLengths(shape(tensor), block_shape);
+    const auto block_strides = CalculateLocalPartitionStrides(stride(tensor), block_lengths, steps);
+    const auto tile_layout   = make_layout(block_shape, block_strides);
+    // Calculate offset for new partition tensor
+    const auto offset_idx       = CalculateLayoutOffsetIdx(steps, block_idx);
+    const auto partition_offset = layout(tensor)(offset_idx);
+    return make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer() + partition_offset,
+                                                             tile_layout);
+}
+
+} // namespace wrapper
+} // namespace ck
diff --git a/include/ck/wrapper/utils/tensor_utils.hpp b/include/ck/wrapper/utils/tensor_utils.hpp
index 5f0dc3e5006..35bb4f04be7 100644
--- a/include/ck/wrapper/utils/tensor_utils.hpp
+++ b/include/ck/wrapper/utils/tensor_utils.hpp
@@ -112,19 +112,21 @@ constexpr auto make_tensor(ElementType* pointer, const Layout<Shape, Strides>& l
  * \tparam NumVectors Number of vectors.
  * \tparam ScalarPerVector Scalars per vector.
  * \tparam ElementType Memory data type.
- * \param layout Tensor layout.
  * \return Constructed tensor.
  */
 template <MemoryTypeEnum MemoryType,
           index_t NumVectors,
           index_t ScalarPerVector,
-          typename ElementType,
-          typename Shape,
-          typename Strides>
-constexpr auto make_register_tensor(const Layout<Shape, Strides>& layout)
+          typename ElementType>
+constexpr auto make_register_tensor()
 {
-    static_assert(!IsNestedTuple(Shape{}), "Register tensor with nested layout is not supported");
-    return Tensor<MemoryType, ElementType, Shape, Strides, NumVectors, ScalarPerVector>(layout);
+    const auto layout = make_layout(make_tuple(Number<NumVectors>{}), make_tuple(Number<1>{}));
+    return Tensor<MemoryType,
+                  ElementType,
+                  Tuple<Number<NumVectors>>,
+                  Tuple<Number<1>>,
+                  NumVectors,
+                  ScalarPerVector>(layout);
 }
 
 /**
@@ -160,7 +162,7 @@ template <index_t... Idxs,
           typename Strides,
           index_t NumVectors,
           index_t ScalarPerVector>
-__host__ __device__ constexpr index_t
+__host__ __device__ constexpr auto
 size(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors, ScalarPerVector>&
          tensor)
 {
@@ -181,7 +183,7 @@ template <index_t... Idxs,
           typename Strides,
           index_t NumVectors,
           index_t ScalarPerVector>
-__host__ __device__ constexpr index_t
+__host__ __device__ constexpr auto
 rank(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors, ScalarPerVector>&
          tensor)
 {
@@ -202,7 +204,7 @@ template <index_t... Idxs,
           typename Strides,
           index_t NumVectors,
           index_t ScalarPerVector>
-__host__ __device__ constexpr index_t
+__host__ __device__ constexpr auto
 depth(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors, ScalarPerVector>&
           tensor)
 {
@@ -221,7 +223,7 @@ template <MemoryTypeEnum BufferAddressSpace,
           typename Strides,
           index_t NumVectors,
           index_t ScalarPerVector>
-__host__ __device__ constexpr const auto&
+__host__ __device__ constexpr auto
 stride(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors, ScalarPerVector>&
            tensor)
 {
diff --git a/test/wrapper/CMakeLists.txt b/test/wrapper/CMakeLists.txt
index 6b25c08a8a5..6c3e29ab87d 100644
--- a/test/wrapper/CMakeLists.txt
+++ b/test/wrapper/CMakeLists.txt
@@ -2,3 +2,7 @@ add_gtest_executable(test_layout test_layout.cpp)
 target_link_libraries(test_layout PRIVATE utility)
 add_gtest_executable(test_tensor test_tensor.cpp)
 target_link_libraries(test_tensor PRIVATE utility)
+add_gtest_executable(test_copy test_copy.cpp)
+target_link_libraries(test_copy PRIVATE utility)
+add_gtest_executable(test_partition test_partition.cpp)
+target_link_libraries(test_partition PRIVATE utility)
diff --git a/test/wrapper/test_copy.cpp b/test/wrapper/test_copy.cpp
new file mode 100644
index 00000000000..21693ad3ce1
--- /dev/null
+++ b/test/wrapper/test_copy.cpp
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <numeric>
+#include <cstdlib>
+#include <iostream>
+#include <initializer_list>
+#include <vector>
+#include <gtest/gtest.h>
+
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+#include "ck/host_utility/kernel_launch.hpp"
+
+#include "ck/utility/common_header.hpp"
+
+#include "ck/wrapper/layout.hpp"
+#include "ck/wrapper/tensor.hpp"
+#include "ck/wrapper/operations/copy.hpp"
+
+// Test copy from Global to Global through LDS and SGPR
+template <typename InputTensor,
+          typename OutputTensor,
+          typename BlockShape,
+          typename ThreadLayoutShape,
+          typename LocalTileSteps,
+          typename LocalPartitionSteps>
+__global__ void TestCopyDevice(const InputTensor input_tensor,
+                               OutputTensor output_tensor,
+                               const BlockShape block_shape,
+                               const ThreadLayoutShape thread_layout,
+                               const LocalTileSteps block_steps,
+                               const LocalPartitionSteps thread_steps)
+{
+    __shared__ ck::index_t p_shared[ck::wrapper::size(block_shape)];
+    auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
+        p_shared, ck::wrapper::make_layout(block_shape));
+
+    const auto block_idxs = ck::make_tuple(ck::make_tuple(0, 0), blockIdx.x);
+
+    // Get local tiles for global memory
+    const auto input_make_local_tile =
+        ck::wrapper::make_local_tile(input_tensor, block_shape, block_idxs, block_steps);
+    const auto output_make_local_tile =
+        ck::wrapper::make_local_tile(output_tensor, block_shape, block_idxs, block_steps);
+
+    // Get partition per thread
+    const auto input_make_local_partition = ck::wrapper::make_local_partition(
+        input_make_local_tile, thread_layout, threadIdx.x, thread_steps);
+    auto lds_make_local_partition =
+        ck::wrapper::make_local_partition(tensor_lds, thread_layout, threadIdx.x, thread_steps);
+    auto output_make_local_partition = ck::wrapper::make_local_partition(
+        output_make_local_tile, thread_layout, threadIdx.x, thread_steps);
+
+    // Allocate sgpr
+    constexpr ck::index_t scalar_per_vector = 1;
+    constexpr ck::index_t sgpr_size         = ck::wrapper::size(lds_make_local_partition);
+    auto tensor_sgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Sgpr,
+                                                         sgpr_size,
+                                                         scalar_per_vector,
+                                                         ck::index_t>();
+
+    // Perform copy
+    ck::wrapper::copy(input_make_local_partition, lds_make_local_partition);
+    ck::wrapper::copy(lds_make_local_partition, tensor_sgpr);
+    ck::wrapper::copy(tensor_sgpr, output_make_local_partition);
+}
+
+void PerformCopyGlobalToGlobalViaLDS()
+{
+    const auto shape =
+        ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<256>{});
+    const auto strides =
+        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<2>{}), ck::Number<4>{});
+    const auto layout = ck::wrapper::make_layout(shape, strides);
+
+    // 0,1,2...size(shape) - 1
+    std::vector<ck::index_t> input_data(ck::wrapper::size(shape));
+    std::iota(input_data.begin(), input_data.end(), 0);
+
+    // Global memory buffers
+    DeviceMem in_buf(ck::wrapper::size(layout) * sizeof(ck::index_t));
+    DeviceMem out_buf(ck::wrapper::size(layout) * sizeof(ck::index_t));
+
+    in_buf.ToDevice(input_data.data());
+    out_buf.SetZero();
+
+    // Create tensors for global memory
+    const auto input_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
+        static_cast<const ck::index_t*>(in_buf.GetDeviceBuffer()), layout);
+    auto output_tensor_global = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
+        static_cast<ck::index_t*>(out_buf.GetDeviceBuffer()), layout);
+
+    const auto thread_layout =
+        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<32>{});
+    const auto block_shape =
+        ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<64>{});
+
+    const auto thread_steps =
+        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<2>{});
+    const auto block_steps =
+        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<64>{});
+
+    const ck::index_t grid_size = ck::math::integer_divide_ceil(
+        ck::wrapper::size(input_tensor_global), ck::wrapper::size(block_shape));
+
+    const auto kernel = TestCopyDevice<decltype(input_tensor_global),
+                                       decltype(output_tensor_global),
+                                       decltype(block_shape),
+                                       decltype(thread_layout),
+                                       decltype(block_steps),
+                                       decltype(thread_steps)>;
+    launch_and_time_kernel(StreamConfig{},
+                           kernel,
+                           dim3(grid_size),
+                           dim3(ck::wrapper::size(thread_layout)),
+                           0,
+                           input_tensor_global,
+                           output_tensor_global,
+                           block_shape,
+                           thread_layout,
+                           block_steps,
+                           thread_steps);
+
+    // Verify results
+    std::vector<ck::index_t> output_data(ck::wrapper::size(shape));
+    out_buf.FromDevice(output_data.data());
+    EXPECT_TRUE(ck::utils::check_err(output_data, input_data));
+}
+
+TEST(TestCopy, CopyGlobalToGlobalViaLDS) { PerformCopyGlobalToGlobalViaLDS(); }
diff --git a/test/wrapper/test_layout.cpp b/test/wrapper/test_layout.cpp
index 14a8b964628..d5dba4841f5 100644
--- a/test/wrapper/test_layout.cpp
+++ b/test/wrapper/test_layout.cpp
@@ -84,7 +84,8 @@ TEST_F(TestWrapperLayout, 2d)
         ck::make_tuple(ck::Sequence<0>{}));
     const auto layout_runtime = ck::wrapper::make_layout(ck::make_tuple(d1, d0));
     const auto layout_compiletime =
-        ck::wrapper::make_layout(ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}));
+        ck::wrapper::make_layout(ck::make_tuple(ck::Number<d1>{}, ck::Number<d0>{}),
+                                 ck::make_tuple(ck::Number<s1>{}, ck::Number<s0>{}));
     std::vector<ck::Tuple<ck::index_t, ck::index_t>> idxs;
 
     for(ck::index_t h = 0; h < d1; h++)
@@ -436,18 +437,25 @@ TEST(TestLayoutHelpers, ShapeAndStrides)
         std::is_same_v<decltype(shape_compiletime),
                        std::remove_reference_t<decltype(shape(layout_compiletime))>>;
     constexpr bool check_compiletime_strides =
-        std::is_same_v<decltype(strides_compiletime),
+        std::is_same_v<std::remove_const_t<decltype(strides_compiletime)>,
                        std::remove_reference_t<decltype(stride(layout_compiletime))>>;
     constexpr bool check_runtime_shape =
         std::is_same_v<decltype(shape_runtime),
                        std::remove_reference_t<decltype(shape(layout_runtime))>>;
     constexpr bool check_runtime_strides =
-        std::is_same_v<decltype(strides_runtime),
+        std::is_same_v<std::remove_const_t<decltype(strides_runtime)>,
                        std::remove_reference_t<decltype(stride(layout_runtime))>>;
     EXPECT_TRUE(check_compiletime_shape);
     EXPECT_TRUE(check_compiletime_strides);
     EXPECT_TRUE(check_runtime_shape);
     EXPECT_TRUE(check_runtime_strides);
+
+    // Check packed strides generation
+    const auto packed_layout = ck::wrapper::make_layout(shape_runtime);
+    constexpr bool check_packed_layout_strides =
+        std::is_same_v<std::remove_const_t<decltype(strides_runtime)>,
+                       std::remove_reference_t<decltype(stride(packed_layout))>>;
+    EXPECT_TRUE(check_packed_layout_strides);
 }
 
 TEST(TestLayoutHelpers, Hierarchical)
diff --git a/test/wrapper/test_partition.cpp b/test/wrapper/test_partition.cpp
new file mode 100644
index 00000000000..a2ae30f5de0
--- /dev/null
+++ b/test/wrapper/test_partition.cpp
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <numeric>
+#include <cstdlib>
+#include <iostream>
+#include <initializer_list>
+#include <vector>
+#include <gtest/gtest.h>
+
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+#include "ck/host_utility/kernel_launch.hpp"
+
+#include "ck/utility/common_header.hpp"
+
+#include "ck/wrapper/layout.hpp"
+#include "ck/wrapper/tensor.hpp"
+
+TEST(TestPartition, LocalPartition)
+{
+    const auto shape =
+        ck::make_tuple(ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}), ck::Number<4>{});
+    const auto strides =
+        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}), ck::Number<64>{});
+    const auto layout = ck::wrapper::make_layout(shape, strides);
+
+    std::vector<ck::index_t> data(ck::wrapper::size(layout));
+    std::iota(data.begin(), data.end(), 0);
+
+    const auto tensor =
+        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);
+
+    const auto thread_steps =
+        ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<1>{}), ck::Number<1>{});
+    const auto thread_layout =
+        ck::make_tuple(ck::make_tuple(ck::Number<8>{}, ck::Number<1>{}), ck::Number<1>{});
+
+    for(ck::index_t thread_id = 0; thread_id < ck::wrapper::size(thread_layout); thread_id++)
+    {
+        const auto raked_partition =
+            ck::wrapper::make_local_partition(tensor, thread_layout, thread_id);
+
+        const auto expected_partition_size =
+            ck::wrapper::size(tensor) / ck::wrapper::size(thread_layout);
+        EXPECT_EQ(ck::wrapper::size(raked_partition), expected_partition_size);
+        EXPECT_EQ(raked_partition(0), thread_id);
+
+        const auto expected_partition_stride_00 =
+            ck::wrapper::size<0, 0>(strides) * ck::wrapper::size<0, 0>(thread_layout);
+        const auto expected_partition_stride_01 =
+            ck::wrapper::size<0, 1>(strides) * ck::wrapper::size<0, 1>(thread_layout);
+        const auto expected_partition_stride_1 =
+            ck::wrapper::size<1>(strides) * ck::wrapper::size<1>(thread_layout);
+        const auto partition_stride_00 =
+            ck::wrapper::size<0, 0>(ck::wrapper::stride(raked_partition));
+        const auto partition_stride_01 =
+            ck::wrapper::size<0, 1>(ck::wrapper::stride(raked_partition));
+        const auto partition_stride_1 = ck::wrapper::size<1>(ck::wrapper::stride(raked_partition));
+        EXPECT_EQ(partition_stride_00, expected_partition_stride_00);
+        EXPECT_EQ(partition_stride_01, expected_partition_stride_01);
+        EXPECT_EQ(partition_stride_1, expected_partition_stride_1);
+    }
+
+    for(ck::index_t thread_id = 0; thread_id < ck::wrapper::size(thread_layout); thread_id++)
+    {
+        const auto packed_partition =
+            ck::wrapper::make_local_partition(tensor, thread_layout, thread_id, thread_steps);
+
+        const auto expected_partition_size =
+            ck::wrapper::size(tensor) / ck::wrapper::size(thread_layout);
+        const auto expected_partition_first_val = thread_id * ck::wrapper::size<0, 0>(thread_steps);
+        EXPECT_EQ(ck::wrapper::size(packed_partition), expected_partition_size);
+        EXPECT_EQ(packed_partition(0), expected_partition_first_val);
+
+        const auto expected_partition_stride_00 = ck::wrapper::size<0, 0>(strides);
+        const auto expected_partition_stride_01 = ck::wrapper::size<0, 1>(strides);
+        const auto expected_partition_stride_1  = ck::wrapper::size<1>(strides);
+        const auto partition_stride_00 =
+            ck::wrapper::size<0, 0>(ck::wrapper::stride(packed_partition));
+        const auto partition_stride_01 =
+            ck::wrapper::size<0, 1>(ck::wrapper::stride(packed_partition));
+        const auto partition_stride_1 = ck::wrapper::size<1>(ck::wrapper::stride(packed_partition));
+        EXPECT_EQ(partition_stride_00, expected_partition_stride_00);
+        EXPECT_EQ(partition_stride_01, expected_partition_stride_01);
+        EXPECT_EQ(partition_stride_1, expected_partition_stride_1);
+    }
+}
+
+TEST(TestPartition, LocalTile)
+{
+    const auto shape =
+        ck::make_tuple(ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}), ck::Number<4>{});
+    const auto strides =
+        ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}), ck::Number<64>{});
+    const auto layout = ck::wrapper::make_layout(shape, strides);
+
+    std::vector<ck::index_t> data(ck::wrapper::size(layout));
+    std::iota(data.begin(), data.end(), 0);
+
+    const auto tensor =
+        ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(data.data(), layout);
+
+    const auto block_steps =
+        ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{});
+    const auto block_shape =
+        ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{});
+    const auto block_layout =
+        ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{});
+
+    std::vector<ck::Tuple<ck::Tuple<ck::index_t, ck::index_t>, ck::index_t>> block_idxs;
+    for(ck::index_t x = 0; x < ck::wrapper::size<0, 0>(block_layout); x++)
+    {
+        for(ck::index_t y = 0; y < ck::wrapper::size<0, 1>(block_layout); y++)
+        {
+            for(ck::index_t z = 0; z < ck::wrapper::size<1>(block_layout); z++)
+            {
+                block_idxs.emplace_back(ck::make_tuple(x, y), z);
+            }
+        }
+    }
+
+    for(const auto& block_idx : block_idxs)
+    {
+        const auto raked_tile = ck::wrapper::make_local_tile(tensor, block_shape, block_idx);
+
+        const auto expected_tile_size = ck::wrapper::size(block_shape);
+        EXPECT_EQ(ck::wrapper::size(raked_tile), expected_tile_size);
+        EXPECT_EQ(raked_tile(0), layout(block_idx));
+
+        const auto expected_tile_stride_00 =
+            ck::wrapper::size<0, 0>(strides) * ck::wrapper::size<0, 0>(block_layout);
+        const auto expected_tile_stride_01 =
+            ck::wrapper::size<0, 1>(strides) * ck::wrapper::size<0, 1>(block_layout);
+        const auto expected_tile_stride_1 =
+            ck::wrapper::size<1>(strides) * ck::wrapper::size<1>(block_layout);
+        const auto tile_stride_00 = ck::wrapper::size<0, 0>(ck::wrapper::stride(raked_tile));
+        const auto tile_stride_01 = ck::wrapper::size<0, 1>(ck::wrapper::stride(raked_tile));
+        const auto tile_stride_1  = ck::wrapper::size<1>(ck::wrapper::stride(raked_tile));
+        EXPECT_EQ(tile_stride_00, expected_tile_stride_00);
+        EXPECT_EQ(tile_stride_01, expected_tile_stride_01);
+        EXPECT_EQ(tile_stride_1, expected_tile_stride_1);
+    }
+
+    for(const auto& block_idx : block_idxs)
+    {
+        const auto packed_tile =
+            ck::wrapper::make_local_tile(tensor, block_shape, block_idx, block_steps);
+
+        const auto expected_tile_size = ck::wrapper::size(block_shape);
+        const auto expected_tile_first_val =
+            ck::wrapper::size<0, 0>(block_idx) * ck::wrapper::size<0, 0>(block_shape) *
+                ck::wrapper::size<0, 0>(strides) +
+            ck::wrapper::size<0, 1>(block_idx) * ck::wrapper::size<0, 1>(block_shape) *
+                ck::wrapper::size<0, 1>(strides) +
+            ck::wrapper::size<1>(block_idx) * ck::wrapper::size<1>(block_shape) *
+                ck::wrapper::size<1>(strides);
+        EXPECT_EQ(ck::wrapper::size(packed_tile), expected_tile_size);
+        EXPECT_EQ(packed_tile(0), expected_tile_first_val);
+
+        const auto expected_tile_stride_00 = ck::wrapper::size<0, 0>(strides);
+        const auto expected_tile_stride_01 = ck::wrapper::size<0, 1>(strides);
+        const auto expected_tile_stride_1  = ck::wrapper::size<1>(strides);
+        const auto tile_stride_00 = ck::wrapper::size<0, 0>(ck::wrapper::stride(packed_tile));
+        const auto tile_stride_01 = ck::wrapper::size<0, 1>(ck::wrapper::stride(packed_tile));
+        const auto tile_stride_1  = ck::wrapper::size<1>(ck::wrapper::stride(packed_tile));
+        EXPECT_EQ(tile_stride_00, expected_tile_stride_00);
+        EXPECT_EQ(tile_stride_01, expected_tile_stride_01);
+        EXPECT_EQ(tile_stride_1, expected_tile_stride_1);
+    }
+}
diff --git a/test/wrapper/test_tensor.cpp b/test/wrapper/test_tensor.cpp
index 92f8e2e1bdc..72b56068749 100644
--- a/test/wrapper/test_tensor.cpp
+++ b/test/wrapper/test_tensor.cpp
@@ -108,7 +108,6 @@ __global__ void TestTensorReadWriteDevice(void* data, void* success)
     bool* casted_success_ptr     = static_cast<bool*>(success);
 
     const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2));
-    constexpr auto register_layout = ck::wrapper::make_layout(ck::make_tuple(ck::Number<8>{}));
 
     auto tensor_global =
         ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(casted_data_ptr, layout);
@@ -116,11 +115,11 @@ __global__ void TestTensorReadWriteDevice(void* data, void* success)
     auto tensor_vgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr,
                                                          nelems,
                                                          scalar_per_vector,
-                                                         ck::index_t>(register_layout);
+                                                         ck::index_t>();
     auto tensor_sgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Sgpr,
                                                          nelems,
                                                          scalar_per_vector,
-                                                         ck::index_t>(register_layout);
+                                                         ck::index_t>();
 
     InitTensor(tensor_global);
     InitTensor(tensor_lds);
@@ -151,7 +150,7 @@ TEST(TestTensor, ReadWriteGlobalLdsRegistersMemory)
                            TestTensorReadWriteDevice,
                            dim3(1),
                            dim3(1),
-                           nelems * sizeof(ck::index_t),
+                           0,
                            data_buf.GetDeviceBuffer(),
                            success_buf.GetDeviceBuffer());
 

From 40f423aaf4cf409912bcb5999445e0ad5e2e93e8 Mon Sep 17 00:00:00 2001
From: Bartlomiej Kocot <barkocot@amd.com>
Date: Wed, 20 Dec 2023 20:22:25 +0000
Subject: [PATCH 2/7] Update changelog

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2891b8585b6..abca69142e1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,7 +19,7 @@ None
 - Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804)
 - Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799)
 - Support for Batched Gemm DL (#732)
-- Introduce wrapper sublibrary (limited functionality). (#1071, #1098)
+- Introduce wrapper sublibrary (limited functionality). (#1071, #1098, #1108)
 
 ### Changes
  - Changed the grouped convolution API to maintain consistency with other convolution kernels (#817)

From c609ee7b69068b8d4472fc61ec9198f83e78313b Mon Sep 17 00:00:00 2001
From: Bartlomiej Kocot <barkocot@amd.com>
Date: Thu, 21 Dec 2023 16:32:12 +0000
Subject: [PATCH 3/7] Stylistic fixes

---
 include/ck/wrapper/operations/copy.hpp        | 13 +++--
 include/ck/wrapper/utils/tensor_partition.hpp | 14 +++---
 test/wrapper/test_copy.cpp                    | 48 +++++++++----------
 3 files changed, 37 insertions(+), 38 deletions(-)

diff --git a/include/ck/wrapper/operations/copy.hpp b/include/ck/wrapper/operations/copy.hpp
index 267464c0963..aec80f9ca72 100644
--- a/include/ck/wrapper/operations/copy.hpp
+++ b/include/ck/wrapper/operations/copy.hpp
@@ -18,16 +18,15 @@ namespace wrapper {
 template <typename SrcTensorType, typename DstTensorType>
 __host__ __device__ void copy(const SrcTensorType& src_tensor, DstTensorType& dst_tensor)
 {
-    assert(size(src_tensor) == size(dst_tensor));
-    using SrcSizeTensor = decltype(size(src_tensor));
-    using DstSizeTensor = decltype(size(dst_tensor));
-    if constexpr(is_known_at_compile_time<SrcSizeTensor>::value)
+    if constexpr(!SrcTensorType::IsDynamicBuffer)
     {
-        static_for<0, SrcSizeTensor{}, 1>{}([&](auto i) { dst_tensor(i) = src_tensor(i); });
+        using SizeType = decltype(size(src_tensor));
+        static_for<0, SizeType{}, 1>{}([&](auto i) { dst_tensor(i) = src_tensor(i); });
     }
-    else if constexpr(is_known_at_compile_time<DstSizeTensor>::value)
+    else if constexpr(!DstTensorType::IsDynamicBuffer)
     {
-        static_for<0, DstSizeTensor{}, 1>{}([&](auto i) { dst_tensor(i) = src_tensor(i); });
+        using SizeType = decltype(size(dst_tensor));
+        static_for<0, SizeType{}, 1>{}([&](auto i) { dst_tensor(i) = src_tensor(i); });
     }
     else
     {
diff --git a/include/ck/wrapper/utils/tensor_partition.hpp b/include/ck/wrapper/utils/tensor_partition.hpp
index 8639c91b7bb..edc61e1e466 100644
--- a/include/ck/wrapper/utils/tensor_partition.hpp
+++ b/include/ck/wrapper/utils/tensor_partition.hpp
@@ -184,7 +184,7 @@ __host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple<Steps...
 // block layout based on shape.
 template <typename... Ts, typename... BlockDims>
 __host__ __device__ constexpr auto CalculateBlockLengths(const Tuple<Ts...>& shape,
-                                                         const Tuple<BlockDims...>& block_shape)
+                                                         const Tuple<BlockDims...>& tile_shape)
 {
     return generate_tuple(
         [&](auto i) {
@@ -192,11 +192,11 @@ __host__ __device__ constexpr auto CalculateBlockLengths(const Tuple<Ts...>& sha
             if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
             {
                 // if tuple then recurrence
-                return CalculateBlockLengths(shape.At(num_i), block_shape.At(num_i));
+                return CalculateBlockLengths(shape.At(num_i), tile_shape.At(num_i));
             }
             else
             {
-                return shape.At(num_i) / block_shape.At(num_i);
+                return shape.At(num_i) / tile_shape.At(num_i);
             }
         },
         Number<Tuple<Ts...>::Size()>{});
@@ -235,7 +235,7 @@ __host__ __device__ constexpr auto make_local_partition(const TensorType& tensor
  * \brief Create local tile for thread block.
  *
  * \param tensor Tensor for partition.
- * \param block_shape Shapes of requested tile.
+ * \param tile_shape Shapes of requested tile.
  * \param block_idx Block index represented as tuple.
  * \param steps Block step (default=1, raked partition)
  * \return Tile tensor.
@@ -245,14 +245,14 @@ template <typename TensorType,
           typename BlockIdxTuple,
           typename StepsTuple = Tuple<>>
 __host__ __device__ constexpr auto make_local_tile(const TensorType& tensor,
-                                                   const BlockShapeTuple& block_shape,
+                                                   const BlockShapeTuple& tile_shape,
                                                    const BlockIdxTuple& block_idx,
                                                    const StepsTuple steps = StepsTuple{})
 {
     // Create block lengths, strides and layout for new tile tensor
-    const auto block_lengths = CalculateBlockLengths(shape(tensor), block_shape);
+    const auto block_lengths = CalculateBlockLengths(shape(tensor), tile_shape);
     const auto block_strides = CalculateLocalPartitionStrides(stride(tensor), block_lengths, steps);
-    const auto tile_layout   = make_layout(block_shape, block_strides);
+    const auto tile_layout   = make_layout(tile_shape, block_strides);
     // Calculate offset for new partition tensor
     const auto offset_idx       = CalculateLayoutOffsetIdx(steps, block_idx);
     const auto partition_offset = layout(tensor)(offset_idx);
diff --git a/test/wrapper/test_copy.cpp b/test/wrapper/test_copy.cpp
index 21693ad3ce1..457288a4a97 100644
--- a/test/wrapper/test_copy.cpp
+++ b/test/wrapper/test_copy.cpp
@@ -19,7 +19,7 @@
 #include "ck/wrapper/tensor.hpp"
 #include "ck/wrapper/operations/copy.hpp"
 
-// Test copy from Global to Global through LDS and SGPR
+// Test copy from Global to Global through LDS and VGPR
 template <typename InputTensor,
           typename OutputTensor,
           typename BlockShape,
@@ -28,43 +28,43 @@ template <typename InputTensor,
           typename LocalPartitionSteps>
 __global__ void TestCopyDevice(const InputTensor input_tensor,
                                OutputTensor output_tensor,
-                               const BlockShape block_shape,
+                               const BlockShape tile_shape,
                                const ThreadLayoutShape thread_layout,
                                const LocalTileSteps block_steps,
                                const LocalPartitionSteps thread_steps)
 {
-    __shared__ ck::index_t p_shared[ck::wrapper::size(block_shape)];
+    __shared__ ck::index_t p_shared[ck::wrapper::size(tile_shape)];
     auto tensor_lds = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
-        p_shared, ck::wrapper::make_layout(block_shape));
+        p_shared, ck::wrapper::make_layout(tile_shape));
 
     const auto block_idxs = ck::make_tuple(ck::make_tuple(0, 0), blockIdx.x);
 
     // Get local tiles for global memory
-    const auto input_make_local_tile =
-        ck::wrapper::make_local_tile(input_tensor, block_shape, block_idxs, block_steps);
-    const auto output_make_local_tile =
-        ck::wrapper::make_local_tile(output_tensor, block_shape, block_idxs, block_steps);
+    const auto input_local_tile =
+        ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idxs, block_steps);
+    const auto output_local_tile =
+        ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idxs, block_steps);
 
     // Get partition per thread
-    const auto input_make_local_partition = ck::wrapper::make_local_partition(
-        input_make_local_tile, thread_layout, threadIdx.x, thread_steps);
-    auto lds_make_local_partition =
+    const auto input_local_partition = ck::wrapper::make_local_partition(
+        input_local_tile, thread_layout, threadIdx.x, thread_steps);
+    auto lds_local_partition =
         ck::wrapper::make_local_partition(tensor_lds, thread_layout, threadIdx.x, thread_steps);
-    auto output_make_local_partition = ck::wrapper::make_local_partition(
-        output_make_local_tile, thread_layout, threadIdx.x, thread_steps);
+    auto output_local_partition = ck::wrapper::make_local_partition(
+        output_local_tile, thread_layout, threadIdx.x, thread_steps);
 
-    // Allocate sgpr
+    // Allocate VGPR
     constexpr ck::index_t scalar_per_vector = 1;
-    constexpr ck::index_t sgpr_size         = ck::wrapper::size(lds_make_local_partition);
-    auto tensor_sgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Sgpr,
-                                                         sgpr_size,
+    constexpr ck::index_t vgpr_size         = ck::wrapper::size(lds_local_partition);
+    auto tensor_vgpr = ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr,
+                                                         vgpr_size,
                                                          scalar_per_vector,
                                                          ck::index_t>();
 
     // Perform copy
-    ck::wrapper::copy(input_make_local_partition, lds_make_local_partition);
-    ck::wrapper::copy(lds_make_local_partition, tensor_sgpr);
-    ck::wrapper::copy(tensor_sgpr, output_make_local_partition);
+    ck::wrapper::copy(input_local_partition, lds_local_partition);
+    ck::wrapper::copy(lds_local_partition, tensor_vgpr);
+    ck::wrapper::copy(tensor_vgpr, output_local_partition);
 }
 
 void PerformCopyGlobalToGlobalViaLDS()
@@ -94,7 +94,7 @@ void PerformCopyGlobalToGlobalViaLDS()
 
     const auto thread_layout =
         ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<32>{});
-    const auto block_shape =
+    const auto tile_shape =
         ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<64>{});
 
     const auto thread_steps =
@@ -103,11 +103,11 @@ void PerformCopyGlobalToGlobalViaLDS()
         ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<64>{});
 
     const ck::index_t grid_size = ck::math::integer_divide_ceil(
-        ck::wrapper::size(input_tensor_global), ck::wrapper::size(block_shape));
+        ck::wrapper::size(input_tensor_global), ck::wrapper::size(tile_shape));
 
     const auto kernel = TestCopyDevice<decltype(input_tensor_global),
                                        decltype(output_tensor_global),
-                                       decltype(block_shape),
+                                       decltype(tile_shape),
                                        decltype(thread_layout),
                                        decltype(block_steps),
                                        decltype(thread_steps)>;
@@ -118,7 +118,7 @@ void PerformCopyGlobalToGlobalViaLDS()
                            0,
                            input_tensor_global,
                            output_tensor_global,
-                           block_shape,
+                           tile_shape,
                            thread_layout,
                            block_steps,
                            thread_steps);

From 0147b9431ec10d7d2b2a6df5a92b3a4c35a54e7e Mon Sep 17 00:00:00 2001
From: Bartlomiej Kocot <barkocot@amd.com>
Date: Fri, 29 Dec 2023 00:05:51 +0000
Subject: [PATCH 4/7] Change shape/strides logic to descriptor transforms

---
 include/ck/utility/tuple_helper.hpp           |  11 +
 include/ck/wrapper/layout.hpp                 | 117 ++--------
 include/ck/wrapper/tensor.hpp                 | 213 ++++++++++--------
 include/ck/wrapper/utils/layout_utils.hpp     | 155 +++++++++----
 include/ck/wrapper/utils/tensor_partition.hpp | 121 +++++-----
 include/ck/wrapper/utils/tensor_utils.hpp     |  95 ++++----
 test/wrapper/test_layout.cpp                  |  15 --
 test/wrapper/test_partition.cpp               |  50 ----
 test/wrapper/test_tensor.cpp                  |  15 +-
 9 files changed, 378 insertions(+), 414 deletions(-)

diff --git a/include/ck/utility/tuple_helper.hpp b/include/ck/utility/tuple_helper.hpp
index 75f2693f205..f3652300541 100644
--- a/include/ck/utility/tuple_helper.hpp
+++ b/include/ck/utility/tuple_helper.hpp
@@ -178,4 +178,15 @@ __host__ __device__ constexpr auto TupleDepth(const Tuple<Ts...>&)
     return math::max(TupleDepth<depth + 1>(Ts{})...);
 }
 
+template <index_t from, index_t to, typename... Ts>
+__host__ __device__ constexpr auto TupleSlice(const Tuple<Ts...>& tuple)
+{
+    return generate_tuple(
+        [&](auto i) {
+            using Idx = Number<from + i>;
+            return tuple.At(Idx{});
+        },
+        Number<to - from>{});
+}
+
 } // namespace ck
diff --git a/include/ck/wrapper/layout.hpp b/include/ck/wrapper/layout.hpp
index 4b6f1b19bf4..c0a52c03327 100644
--- a/include/ck/wrapper/layout.hpp
+++ b/include/ck/wrapper/layout.hpp
@@ -14,11 +14,9 @@ namespace wrapper {
  * \tparam Shape Tuple of Number<> (for compile-time layout) or index_t
  *         (dynamic layout). It is possible to pass nested shapes
  *         (e.g. ((4, 2), 2)), nested dimensions are merged.
- * \tparam Strides Tuple of Number<> (for compile-time layout) or index_t
- *         (dynamic layout). Stride tuple should be nested if shape tuple is
- *         nested.
+ * \tparam FlattenDescriptorType Tensor descriptor for flatten shape dims.
  */
-template <typename Shape, typename Strides>
+template <typename Shape, typename FlattenDescriptorType>
 struct Layout
 {
     private:
@@ -45,29 +43,6 @@ struct Layout
             Number<Tuple<Ts...>::Size()>{});
     }
 
-    // Generate packed (column-major) strides if not passed
-    template <typename... Ts>
-    __host__ __device__ constexpr static auto
-    GenerateColumnMajorPackedStrides(const Tuple<Ts...>& shape, index_t& stride)
-    {
-        return generate_tuple(
-            [&](auto i) {
-                const auto num_i = Number<i>{};
-                if constexpr(is_detected<is_tuple, tuple_element_t<i, Tuple<Ts...>>>::value)
-                {
-                    return GenerateColumnMajorPackedStrides(shape.At(num_i), stride);
-                }
-                else
-                {
-                    const index_t dim_stride = stride;
-                    // update stride
-                    stride *= shape.At(num_i);
-                    return dim_stride;
-                }
-            },
-            Number<Tuple<Ts...>::Size()>{});
-    }
-
     // Generate LowerDims in Compile-time for MergeTrasform using passed Type
     // If element of Tuple<Ts...> is also tuple, then merge (generate sequence for merge)
     // If tuple is element, then pass through (sequence with one element)
@@ -209,32 +184,6 @@ struct Layout
         return transform_tensor_descriptor(desc, transforms, lower_dims, upper_dims);
     }
 
-    template <typename LayoutShape, typename LayoutStrides>
-    __host__ __device__ static auto MakeFlattenDescriptor(const LayoutShape& shape,
-                                                          const LayoutStrides& strides)
-    {
-        const auto unrolled_shape = UnrollNestedTuple(shape);
-        if constexpr(is_same_v<LayoutStrides, Tuple<>>)
-        {
-            index_t start_stride = 1;
-            // if not passed, then generate
-            const auto unrolled_strides =
-                GenerateColumnMajorPackedStrides(unrolled_shape, start_stride);
-            static_assert(unrolled_shape.Size() == unrolled_strides.Size(),
-                          "Size of strides and shape are not consistent.");
-            return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides);
-        }
-        else
-        {
-            const auto unrolled_strides = UnrollNestedTuple(strides);
-            static_assert(unrolled_shape.Size() == unrolled_strides.Size(),
-                          "Size of strides and shape are not consistent.");
-            return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides);
-        }
-    }
-
-    using FlattenDescriptorType =
-        remove_cvref_t<decltype(MakeFlattenDescriptor(Shape{}, Strides{}))>;
     using Descriptor1dType =
         remove_cvref_t<decltype(MakeMerge1d(Shape{}, FlattenDescriptorType{}))>;
     using DefaultIdxsTupleType = remove_cvref_t<decltype(GenerateDefaultIdxsTuple(Shape{}))>;
@@ -275,44 +224,25 @@ struct Layout
     }
 
     __host__ __device__ Layout() = delete;
+
     /**
      * \brief Layout constructor.
      *
      * \param shape Shape for layout.
-     * \param strides Strides for layout (optional if tensor is packed).
+     * \param flatten_descriptor Descriptor
      */
-    __host__ __device__ constexpr Layout(const Shape& shape, const Strides& strides)
-        : flatten_descriptor_{}, shape_(shape), strides_(strides)
+    __host__ __device__ constexpr Layout(const Shape& shape,
+                                         const FlattenDescriptorType& flatten_descriptor)
+        : shape_(shape)
     {
         // Construct if runtime mode
         if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime())
         {
-            flatten_descriptor_ = MakeFlattenDescriptor(shape_, strides_);
-            descriptor_1d_      = MakeMerge1d(shape_, flatten_descriptor_);
-            merged_nests_descriptor_ =
-                TransformDesc(shape_, DefaultIdxsTupleType{}, flatten_descriptor_);
-        }
-    }
-
-    /**
-     * \brief Layout constructor (with default packed column-major strides).
-     *
-     * \param shape Shape for layout.
-     */
-    __host__ __device__ constexpr Layout(const Shape& shape)
-        : flatten_descriptor_{}, shape_(shape), strides_()
-    {
-        if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime())
-        {
-            flatten_descriptor_ = MakeFlattenDescriptor(shape_, strides_);
+            flatten_descriptor_ = flatten_descriptor;
             descriptor_1d_      = MakeMerge1d(shape_, flatten_descriptor_);
             merged_nests_descriptor_ =
                 TransformDesc(shape_, DefaultIdxsTupleType{}, flatten_descriptor_);
         }
-        else
-        {
-            static_assert(true, "Compiletime Layout require strides parameter.");
-        }
     }
 
     /**
@@ -399,24 +329,6 @@ struct Layout
      */
     __host__ __device__ constexpr const Shape& GetShape() const { return shape_; }
 
-    /**
-     * \brief Strides getter.
-     *
-     * \return Strides.
-     */
-    __host__ __device__ constexpr auto GetStrides() const
-    {
-        if constexpr(is_same_v<Strides, Tuple<>>)
-        {
-            index_t start_stride = 1;
-            return GenerateColumnMajorPackedStrides(shape_, start_stride);
-        }
-        else
-        {
-            return strides_;
-        }
-    }
-
     /**
      * \brief Get default lengths (tuple filled with Shape length elements).
      *
@@ -442,17 +354,26 @@ struct Layout
      *
      * \return Default descriptor.
      */
-    __host__ __device__ constexpr MergedNestsDescriptorType GetDefaultDescriptor()
+    __host__ __device__ constexpr const MergedNestsDescriptorType& GetDefaultDescriptor() const
     {
         return merged_nests_descriptor_;
     }
 
+    /**
+     * \brief Get flatten descriptor (with unrolled dims)
+     *
+     * \return Flatten descriptor.
+     */
+    __host__ __device__ constexpr const FlattenDescriptorType& GetFlattenDescriptor() const
+    {
+        return flatten_descriptor_;
+    }
+
     private:
     FlattenDescriptorType flatten_descriptor_;
     Descriptor1dType descriptor_1d_;
     MergedNestsDescriptorType merged_nests_descriptor_;
     const Shape shape_;
-    const Strides strides_;
 };
 
 } // namespace wrapper
diff --git a/include/ck/wrapper/tensor.hpp b/include/ck/wrapper/tensor.hpp
index fad790a6902..bc598178752 100644
--- a/include/ck/wrapper/tensor.hpp
+++ b/include/ck/wrapper/tensor.hpp
@@ -16,14 +16,14 @@ namespace wrapper {
  * \tparam BufferAddressSpace Memory type (Generic, Global, LDS, VGPR, SGPR).
  * \tparam ElementType Element data type.
  * \tparam Shape Tensor shape (layout component).
- * \tparam Strides Tensor strides (layout component).
+ * \tparam FlattenDescriptorType Flatten descriptor (layout component).
  * \tparam NumVectors Number of vectors (only for VGPR, SGPR).
  * \tparam ScalarPerVector Scalars per vector (only for VGPR, SGPR).
  */
 template <MemoryTypeEnum BufferAddressSpace,
           typename ElementType,
           typename Shape,
-          typename Strides,
+          typename FlattenDescriptorType,
           index_t NumVectors,     // param for Register memory
           index_t ScalarPerVector // param for Register memory
           >
@@ -32,50 +32,20 @@ struct Tensor
     private:
     // Check if Tuple contains Slice object
     template <typename T>
-    constexpr static bool IsSlicing(T&&)
+    __host__ __device__ constexpr static bool IsSlicing(T&&)
     {
         return is_detected<is_slice, T>::value;
     }
     template <typename... Ts>
-    constexpr static bool IsSlicing(Tuple<Ts...>&&)
+    __host__ __device__ constexpr static bool IsSlicing(Tuple<Ts...>&&)
     {
         return (IsSlicing(Ts{}) || ...);
     }
 
-    // Calculate first index of new tensor after slice
-    // It is needed to calculate offset for new tensor
-    template <typename... Ts>
-    constexpr auto GetStartIdxForSlicedTensor(const Tuple<Ts...>& idx) const
-    {
-        const auto start_idx_for_sliced_tensor = generate_tuple(
-            [&](auto i) {
-                constexpr auto num_i = Number<i>{};
-                if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
-                {
-                    // if tuple then recurrence
-                    return GetStartIdxForSlicedTensor(idx.At(num_i));
-                }
-                else if constexpr(is_detected<is_slice,
-                                              tuple_element_t<i.value, Tuple<Ts...>>>::value)
-                {
-                    // if slice, return the beginning of the interval
-                    return idx.At(num_i).from_;
-                }
-                else
-                {
-                    // if one dim selected
-                    return idx.At(num_i);
-                }
-            },
-            Number<Tuple<Ts...>::Size()>{});
-
-        return start_idx_for_sliced_tensor;
-    }
-
     // Calculate new tensor shape after slice
     template <typename... Ts, typename ShapeTmpType>
-    constexpr auto GetShapeFromSlicedTensor(const Tuple<Ts...>& idx,
-                                            const ShapeTmpType& shape) const
+    __host__ __device__ constexpr auto GetShapeFromSlicedTensor(const Tuple<Ts...>& idx,
+                                                                const ShapeTmpType& shape) const
     {
         // Pack each value in tuple to remove empty tuples after generation
         auto new_shape = generate_tuple(
@@ -113,67 +83,133 @@ struct Tensor
         return UnrollNestedTuple<0, 1>(new_shape);
     }
 
-    template <typename... Ts, typename StridesTmpType>
-    constexpr auto GetStridesFromSlicedTensor(const Tuple<Ts...>& idx,
-                                              const StridesTmpType& strides) const
+    // Generate Freeze for each of nested shape
+    template <typename T, typename ShapeTmpType>
+    __host__ __device__ constexpr auto GenerateMultipleFreeze(const T& idx,
+                                                              const ShapeTmpType& shape) const
+    {
+        const auto unrolled_shape = UnrollNestedTuple(shape);
+        return generate_tuple(
+            [&](auto) {
+                // dimension offset from idx
+                return make_freeze_transform(idx);
+            },
+            Number<decltype(unrolled_shape)::Size()>{});
+    }
+
+    template <typename... Ts, typename ShapeTmpType>
+    __host__ __device__ constexpr auto
+    GetTransformsFromSlicedTensor(const Tuple<Ts...>& idx, const ShapeTmpType& shape) const
     {
         // Pack each value in tuple to remove empty tuples after generation
-        auto new_strides = generate_tuple(
+        auto transforms = generate_tuple(
             [&](auto i) {
                 constexpr auto num_i = Number<i>{};
                 if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
                 {
-                    if constexpr(!IsSlicing(tuple_element_t<i.value, Tuple<Ts...>>{}))
-                    {
-                        // if tuple does not have any slice then we can remove dimension
-                        return Tuple<>{};
-                    }
-                    else
-                    {
-                        // if tuple then recurrence
-                        return make_tuple(
-                            GetStridesFromSlicedTensor(idx.At(num_i), strides.At(num_i)));
-                    }
+                    return GetTransformsFromSlicedTensor(idx.At(num_i), shape.At(num_i));
                 }
                 else if constexpr(is_detected<is_slice,
                                               tuple_element_t<i.value, Tuple<Ts...>>>::value)
                 {
-                    // Stride will be the same
-                    return make_tuple(strides.At(num_i));
+
+                    const auto from  = idx.At(num_i).from_;
+                    const auto dim   = shape.At(num_i);
+                    const auto range = idx.At(num_i).range(dim);
+                    return make_slice_transform(dim, from, from + range);
                 }
                 else
                 {
                     // remove dimension for just value
-                    return Tuple<>{};
+                    return GenerateMultipleFreeze(idx.At(num_i), shape.At(num_i));
                 }
             },
             Number<Tuple<Ts...>::Size()>{});
         // Remove empty tuples (deleted elements) and return
-        return UnrollNestedTuple<0, 1>(new_strides);
+        return UnrollNestedTuple(transforms);
+    }
+
+    // There is no output for Freeze transform
+    template <index_t i, typename LowerIndex>
+    __host__ __device__ constexpr auto GetSequenceVal(const ck::Freeze<LowerIndex>&) const
+    {
+        return Sequence<>{};
+    }
+
+    template <index_t i, typename LowLength, typename SliceBegin, typename SliceEnd>
+    __host__ __device__ constexpr auto
+    GetSequenceVal(const ck::Slice<LowLength, SliceBegin, SliceEnd>&) const
+    {
+        return Sequence<i>{};
+    }
+
+    template <index_t i>
+    __host__ __device__ constexpr auto GenerateUpperDims(const Tuple<>&) const
+    {
+        return Tuple<>{};
+    }
+
+    template <index_t i, typename... Transforms>
+    __host__ __device__ constexpr auto
+    GenerateUpperDims(const Tuple<Transforms...>& transforms) const
+    {
+        constexpr auto num_transforms = Tuple<Transforms...>::Size();
+        // Deduce Sequence element for specific transform
+        const auto currect_elem = GetSequenceVal<i>(transforms.At(Number<0>{}));
+        if constexpr(is_same_v<decltype(currect_elem), const Sequence<>>)
+        {
+            const auto next_tuple = GenerateUpperDims<i>(TupleSlice<1, num_transforms>(transforms));
+            return concat_tuple(make_tuple(currect_elem), next_tuple);
+        }
+        else
+        {
+            // Increase i if current_elem is Slice transform
+            const auto next_tuple =
+                GenerateUpperDims<i + 1>(TupleSlice<1, num_transforms>(transforms));
+            return concat_tuple(make_tuple(currect_elem), next_tuple);
+        }
+    }
+
+    template <typename... Ts, typename ShapeTmpType, typename FlattenDescriptor>
+    __host__ __device__ constexpr auto
+    GetDescriptorFromSlicedTensor(const Tuple<Ts...>& idx,
+                                  const ShapeTmpType& shape,
+                                  const FlattenDescriptor& flatten_desc) const
+    {
+        constexpr auto old_shape_dims = decltype(UnrollNestedTuple(shape))::Size();
+
+        const auto transforms     = GetTransformsFromSlicedTensor(idx, shape);
+        using TransformsTupleType = decltype(transforms);
+
+        const auto lower_dims =
+            generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<old_shape_dims>{});
+        const auto upper_dims = decltype(GenerateUpperDims<0>(TransformsTupleType{})){};
+        return transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims);
     }
 
     public:
-    using ElementSpaceSize  = decltype(Layout<Shape, Strides>{
-        Shape{}, Strides{}}.GetElementSpaceSize()); // SpaceSize type for buffer
-    using TensorElementType = ElementType;           // DataType
+    using ElementSpaceSize  = decltype(Layout<Shape, FlattenDescriptorType>{
+        Shape{}, FlattenDescriptorType{}}.GetElementSpaceSize()); // SpaceSize type for buffer
+    using TensorElementType = ElementType;                         // DataType
 
     static constexpr MemoryTypeEnum TensorBufferAddressSpace = BufferAddressSpace;
     static constexpr bool IsDynamicBuffer = !(BufferAddressSpace == MemoryTypeEnum ::Sgpr ||
                                               BufferAddressSpace == MemoryTypeEnum ::Vgpr);
 
     __host__ __device__ Tensor() = delete;
-    __host__ __device__ Tensor(ElementType* pointer, const Layout<Shape, Strides>& layout)
+    __host__ __device__ Tensor(ElementType* pointer,
+                               const Layout<Shape, FlattenDescriptorType>& layout)
         : layout_(layout),
           buffer_(make_dynamic_buffer<BufferAddressSpace>(pointer, layout.GetElementSpaceSize()))
     {
     }
 
-    __host__ __device__ Tensor(const Layout<Shape, Strides>& layout) : layout_(layout)
+    __host__ __device__ Tensor(const Layout<Shape, FlattenDescriptorType>& layout) : layout_(layout)
     {
         static_assert(!IsDynamicBuffer, "Wrong BufferAddressSpace for register.");
     }
 
-    __host__ __device__ constexpr const Layout<Shape, Strides>& GetLayout() const
+    __host__ __device__ constexpr const Layout<Shape, FlattenDescriptorType>& GetLayout() const
     {
         return layout_;
     }
@@ -183,21 +219,14 @@ struct Tensor
     __host__ __device__ auto operator[](const Tuple<Ts...>& idx) const
     {
         static_assert(IsDynamicBuffer, "Register slice is not supported");
-        // Calculate offset based on first idx for new tensor
-        const index_t offset = layout_(GetStartIdxForSlicedTensor(idx));
+        const auto& shape = layout_.GetShape();
+        auto new_shape    = GetShapeFromSlicedTensor(idx, shape);
 
-        auto new_shape = GetShapeFromSlicedTensor(idx, layout_.GetShape());
-        if constexpr(is_same_v<Strides, Tuple<>>)
-        {
-            auto new_layout = make_layout(new_shape);
-            return make_tensor<BufferAddressSpace>(buffer_.p_data_ + offset, new_layout);
-        }
-        else
-        {
-            auto new_strides = GetStridesFromSlicedTensor(idx, layout_.GetStrides());
-            auto new_layout  = make_layout(new_shape, new_strides);
-            return make_tensor<BufferAddressSpace>(buffer_.p_data_ + offset, new_layout);
-        }
+        const auto& flatten_desc = layout_.GetFlattenDescriptor();
+        auto new_desc            = GetDescriptorFromSlicedTensor(idx, shape, flatten_desc);
+        const auto new_layout =
+            Layout<decltype(new_shape), decltype(new_desc)>(new_shape, new_desc);
+        return make_tensor<BufferAddressSpace>(buffer_.p_data_, new_layout);
     }
 
     template <typename... Ts, enable_if_t<IsSlicing(Tuple<Ts...>{}), bool> = false>
@@ -223,18 +252,10 @@ struct Tensor
         }
         else
         {
-            if constexpr(is_same_v<Strides, Tuple<>>)
-            {
-                constexpr index_t offset =
-                    Layout<Shape, Strides>{Shape{}}.template operator()<Tuple<Ts...>>();
-                return buffer_[Number<offset>{}];
-            }
-            else
-            {
-                constexpr index_t offset =
-                    Layout<Shape, Strides>{Shape{}, Strides{}}.template operator()<Tuple<Ts...>>();
-                return buffer_[Number<offset>{}];
-            }
+            constexpr index_t offset = Layout<Shape, FlattenDescriptorType>{
+                Shape{},
+                FlattenDescriptorType{}}.template operator()<Tuple<Ts...>>();
+            return buffer_[Number<offset>{}];
         }
     }
 
@@ -261,18 +282,10 @@ struct Tensor
         }
         else
         {
-            if constexpr(is_same_v<Strides, Tuple<>>)
-            {
-                constexpr index_t offset =
-                    Layout<Shape, Strides>{Shape{}}.template operator()<Tuple<Ts...>>();
-                return buffer_(Number<offset>{});
-            }
-            else
-            {
-                constexpr index_t offset =
-                    Layout<Shape, Strides>{Shape{}, Strides{}}.template operator()<Tuple<Ts...>>();
-                return buffer_(Number<offset>{});
-            }
+            constexpr index_t offset = Layout<Shape, FlattenDescriptorType>{
+                Shape{},
+                FlattenDescriptorType{}}.template operator()<Tuple<Ts...>>();
+            return buffer_(Number<offset>{});
         }
     }
 
@@ -309,7 +322,7 @@ struct Tensor
     // If register use static buffer, else use dynamic buffer
     using Buffer = std::conditional_t<IsDynamicBuffer, DynamicBufferType, StaticBufferType>;
 
-    const Layout<Shape, Strides> layout_;
+    const Layout<Shape, FlattenDescriptorType> layout_;
     Buffer buffer_;
 };
 
diff --git a/include/ck/wrapper/utils/layout_utils.hpp b/include/ck/wrapper/utils/layout_utils.hpp
index e99596d8f50..a33311e1422 100644
--- a/include/ck/wrapper/utils/layout_utils.hpp
+++ b/include/ck/wrapper/utils/layout_utils.hpp
@@ -22,11 +22,61 @@ namespace wrapper {
 // Disable from doxygen docs generation
 /// @cond
 // forward declaration
-template <typename Shape, typename Strides>
+template <typename Shape, typename FlattenDescriptorType>
 struct Layout;
 
 template <typename T>
 using is_tuple = decltype(std::declval<T&>().IsTuple());
+
+namespace {
+// Generate packed (column-major) strides if not passed
+template <typename... Ts>
+__host__ __device__ constexpr static auto
+GenerateColumnMajorPackedStrides(const Tuple<Ts...>& shape, index_t& stride)
+{
+    return generate_tuple(
+        [&](auto i) {
+            const auto num_i = Number<i>{};
+            if constexpr(is_detected<is_tuple, tuple_element_t<i, Tuple<Ts...>>>::value)
+            {
+                return GenerateColumnMajorPackedStrides(shape.At(num_i), stride);
+            }
+            else
+            {
+                const index_t dim_stride = stride;
+                // update stride
+                stride *= shape.At(num_i);
+                return dim_stride;
+            }
+        },
+        Number<Tuple<Ts...>::Size()>{});
+}
+
+template <typename LayoutShape, typename LayoutStrides>
+__host__ __device__ constexpr auto MakeFlattenDescriptor(const LayoutShape& shape,
+                                                         const LayoutStrides& strides)
+{
+    const auto unrolled_shape = UnrollNestedTuple(shape);
+    if constexpr(is_same_v<LayoutStrides, Tuple<>>)
+    {
+        index_t start_stride = 1;
+        // if not passed, then generate
+        const auto unrolled_strides =
+            GenerateColumnMajorPackedStrides(unrolled_shape, start_stride);
+        static_assert(unrolled_shape.Size() == unrolled_strides.Size(),
+                      "Size of strides and shape are not consistent.");
+        return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides);
+    }
+    else
+    {
+        const auto unrolled_strides = UnrollNestedTuple(strides);
+        static_assert(unrolled_shape.Size() == unrolled_strides.Size(),
+                      "Size of strides and shape are not consistent.");
+        return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides);
+    }
+}
+} // namespace
+
 /// @endcond
 
 // make_*
@@ -38,10 +88,10 @@ using is_tuple = decltype(std::declval<T&>().IsTuple());
  * \return Constructed layout.
  */
 template <typename Shape, typename Strides>
-__host__ __device__ constexpr Layout<Shape, Strides> make_layout(const Shape& shape,
-                                                                 const Strides& strides)
+__host__ __device__ constexpr auto make_layout(const Shape& shape, const Strides& strides)
 {
-    return Layout<Shape, Strides>(shape, strides);
+    using FlattenDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Strides{}));
+    return Layout<Shape, FlattenDescriptorType>(shape, MakeFlattenDescriptor(shape, strides));
 }
 
 /**
@@ -52,9 +102,10 @@ __host__ __device__ constexpr Layout<Shape, Strides> make_layout(const Shape& sh
  * \return Constructed layout.
  */
 template <typename Shape>
-__host__ __device__ constexpr Layout<Shape, Tuple<>> make_layout(const Shape& shape)
+__host__ __device__ constexpr auto make_layout(const Shape& shape)
 {
-    return Layout<Shape, Tuple<>>(shape);
+    using FlattenDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Tuple<>{}));
+    return Layout<Shape, FlattenDescriptorType>(shape, MakeFlattenDescriptor(shape, Tuple<>{}));
 }
 
 // Layout helpers
@@ -89,26 +140,51 @@ __host__ __device__ constexpr auto get(const Tuple<Dims...>& tuple)
  * \param layout Layout to create sub layout.
  * \return Requsted sub layout.
  */
-template <index_t idx, typename Shape, typename Strides>
-__host__ __device__ constexpr auto get(const Layout<Shape, Strides>& layout)
+template <index_t idx, typename Shape, typename FlattenDesc>
+__host__ __device__ constexpr auto get(const Layout<Shape, FlattenDesc>& layout)
 {
-    const auto& shape     = layout.GetShape();
-    const auto& new_shape = get<idx>(shape);
+    const auto& shape    = layout.GetShape();
+    const auto new_shape = get<idx>(shape);
     static_assert(is_detected<is_tuple, decltype(new_shape)>::value,
                   "Shape of sub layout must be tuple");
-    if constexpr(is_same_v<Strides, Tuple<>>)
-    {
-        // If stride not passed, create without strides
-        return make_layout(new_shape);
-    }
-    else
-    {
-        const auto& strides     = layout.GetStrides();
-        const auto& new_strides = get<idx>(strides);
-        static_assert(is_detected<is_tuple, decltype(new_strides)>::value,
-                      "Strides of sub layout must be tuple");
-        return make_layout(new_shape, new_strides);
-    }
+
+    constexpr auto old_shape_dims = decltype(UnrollNestedTuple(shape))::Size();
+    constexpr auto new_shape_dims = decltype(UnrollNestedTuple(new_shape))::Size();
+    constexpr auto shape_offset   = decltype(UnrollNestedTuple(TupleSlice<0, idx>(shape)))::Size();
+
+    const auto unrolled_shape = UnrollNestedTuple(shape);
+    const auto transforms     = generate_tuple(
+        [&](auto i) {
+            // Compare Idx with shape
+            if constexpr(i < shape_offset || i >= shape_offset + new_shape_dims)
+            {
+                // Remove dimension
+                return make_freeze_transform(Number<0>{});
+            }
+            else
+            {
+                return make_pass_through_transform(unrolled_shape.At(i));
+            }
+        },
+        Number<old_shape_dims>{});
+
+    const auto lower_dims =
+        generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<old_shape_dims>{});
+    const auto upper_dims = generate_tuple(
+        [&](auto i) {
+            if constexpr(i < shape_offset || i >= shape_offset + new_shape_dims)
+                return Sequence<>{};
+
+            else
+            {
+                return Sequence<i.value - shape_offset>{};
+            }
+        },
+        Number<old_shape_dims>{});
+
+    const auto& flatten_desc = layout.GetFlattenDescriptor();
+    auto new_desc = transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims);
+    return Layout<decltype(new_shape), decltype(new_desc)>(new_shape, new_desc);
 }
 
 /**
@@ -142,8 +218,8 @@ __host__ __device__ T constexpr size(const T& dim)
  * \param layout Layout to get Shape of.
  * \return Requsted length.
  */
-template <index_t idx, typename Shape, typename Strides>
-__host__ __device__ constexpr auto size(const Layout<Shape, Strides>& layout)
+template <index_t idx, typename Shape, typename FlattenDescriptorType>
+__host__ __device__ constexpr auto size(const Layout<Shape, FlattenDescriptorType>& layout)
 {
     return layout.template GetLength<idx>();
 }
@@ -168,8 +244,8 @@ __host__ __device__ constexpr auto size(const Tuple<ShapeDims...>& shape)
  * \param layout Layout to calculate shape size.
  * \return Requsted size.
  */
-template <typename Shape, typename Strides>
-__host__ __device__ constexpr auto size(const Layout<Shape, Strides>& layout)
+template <typename Shape, typename FlattenDescriptorType>
+__host__ __device__ constexpr auto size(const Layout<Shape, FlattenDescriptorType>& layout)
 {
     return layout.GetLengths();
 }
@@ -208,8 +284,9 @@ __host__ __device__ constexpr auto size(const T& elem)
  * \param layout Layout to calculate rank.
  * \return Requsted rank.
  */
-template <typename Shape, typename Strides>
-__host__ __device__ constexpr auto rank([[maybe_unused]] const Layout<Shape, Strides>& layout)
+template <typename Shape, typename FlattenDescriptorType>
+__host__ __device__ constexpr auto
+rank([[maybe_unused]] const Layout<Shape, FlattenDescriptorType>& layout)
 {
     return Shape::Size();
 }
@@ -261,8 +338,8 @@ __host__ __device__ constexpr auto rank(const T& elem)
  * \param layout Layout to calculate depth.
  * \return Requsted depth.
  */
-template <typename Shape, typename Strides>
-__host__ __device__ constexpr auto depth(const Layout<Shape, Strides>& layout)
+template <typename Shape, typename FlattenDescriptorType>
+__host__ __device__ constexpr auto depth(const Layout<Shape, FlattenDescriptorType>& layout)
 {
     const auto& shape = layout.GetShape();
     return TupleDepth(shape);
@@ -307,26 +384,14 @@ __host__ __device__ constexpr auto depth(const T& elem)
     return depth(get<Idxs...>(elem));
 }
 
-/**
- * \brief Get Layout strides.
- *
- * \param layout Layout to get strides from.
- * \return Requsted strides.
- */
-template <typename Shape, typename Strides>
-__host__ __device__ constexpr auto stride(const Layout<Shape, Strides>& layout)
-{
-    return layout.GetStrides();
-}
-
 /**
  * \brief Get Layout shape.
  *
  * \param layout Layout to get shape from.
  * \return Requsted shape.
  */
-template <typename Shape, typename Strides>
-__host__ __device__ constexpr const auto& shape(const Layout<Shape, Strides>& layout)
+template <typename LayoutType>
+__host__ __device__ constexpr const auto& shape(const LayoutType& layout)
 {
     return layout.GetShape();
 }
diff --git a/include/ck/wrapper/utils/tensor_partition.hpp b/include/ck/wrapper/utils/tensor_partition.hpp
index edc61e1e466..96a53591ef2 100644
--- a/include/ck/wrapper/utils/tensor_partition.hpp
+++ b/include/ck/wrapper/utils/tensor_partition.hpp
@@ -16,7 +16,6 @@ template <typename... Ts, typename... Ls>
 __host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple<Ts...>& shape,
                                                                 const Tuple<Ls...>& thread_lengths)
 {
-
     static_assert(Tuple<Ts...>::Size() == Tuple<Ls...>::Size(), "Wrong thread_lengths shape.");
     return generate_tuple(
         [&](auto i) {
@@ -28,7 +27,7 @@ __host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple<Ts..
             }
             else
             {
-                const auto slice_len = shape.At(num_i) / thread_lengths.At(i);
+                const auto slice_len = shape.At(num_i) / thread_lengths.At(num_i);
                 return slice_len;
             }
         },
@@ -37,70 +36,75 @@ __host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple<Ts..
 
 // Calculate shape for partition based on number of threads per each dim,
 // previous strides and steps
-template <typename... Ts, typename... Ls, typename... Steps>
-__host__ __device__ constexpr auto CalculateLocalPartitionStrides(
-    const Tuple<Ts...>& strides, const Tuple<Ls...>& thread_lengths, const Tuple<Steps...>& steps)
+template <typename... Ts, typename... Ls, typename... Steps, typename FlattenDescType>
+__host__ __device__ constexpr auto
+CalculateLocalPartitionDescriptor(const Tuple<Ts...>& shape,
+                                  const Tuple<Ls...>& thread_lengths,
+                                  const Tuple<Steps...>& steps,
+                                  const FlattenDescType& flatten_desc)
 {
 
     static_assert(Tuple<Ts...>::Size() == Tuple<Ls...>::Size(), "Wrong thread_lengths shape.");
-    return generate_tuple(
+    const auto unrolled_thread_lengths = UnrollNestedTuple(thread_lengths);
+    const auto unrolled_shape          = UnrollNestedTuple(shape);
+    constexpr auto dims                = decltype(unrolled_thread_lengths)::Size();
+
+    using UnrolledStepsType = decltype(UnrollNestedTuple(steps));
+
+    using I1 = Number<1>;
+
+    const auto transforms = generate_tuple(
         [&](auto i) {
             constexpr auto num_i = Number<i>{};
-            if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
+            if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
             {
-                // if tuple then recurrence
-                if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
+                // By default raked partition
+                const auto partition_stride = unrolled_thread_lengths.At(num_i);
+                return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
+                                            make_tuple(partition_stride));
+            }
+            else if constexpr(!is_same_v<tuple_element_t<i.value, UnrolledStepsType>, index_t>)
+            {
+                // Compiletime partition
+                if constexpr(is_same_v<tuple_element_t<i.value, UnrolledStepsType>, I1>)
                 {
-                    return CalculateLocalPartitionStrides(
-                        strides.At(num_i), thread_lengths.At(num_i), Tuple<>{});
+                    // raked
+                    const auto partition_stride = unrolled_thread_lengths.At(num_i);
+                    return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
+                                                make_tuple(partition_stride));
                 }
                 else
                 {
-                    return CalculateLocalPartitionStrides(
-                        strides.At(num_i), thread_lengths.At(num_i), steps.At(num_i));
+                    // packed
+                    return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
+                                                make_tuple(I1{}));
                 }
             }
             else
             {
-                if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
-                {
-                    // By default raked partition
-                    const auto partition_stride = thread_lengths.At(i);
-                    return partition_stride * strides.At(i);
-                }
-                else if constexpr(!is_same_v<tuple_element_t<i.value, Tuple<Steps...>>, index_t>)
+                // Runtime partition
+                if(steps.At(num_i) == 1)
                 {
-                    // Compiletime partition
-                    if constexpr(is_same_v<tuple_element_t<i.value, Tuple<Steps...>>, Number<1>>)
-                    {
-                        // raked
-                        const auto partition_stride = thread_lengths.At(i);
-                        return partition_stride * strides.At(i);
-                    }
-                    else
-                    {
-                        // packed
-                        return strides.At(i);
-                    }
+                    // raked
+                    const auto partition_stride = unrolled_thread_lengths.At(num_i);
+                    return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
+                                                make_tuple(partition_stride));
                 }
                 else
                 {
-                    // Runtime partition
-                    if(steps.At(i) == 1)
-                    {
-                        // raked
-                        const auto partition_stride = thread_lengths.At(i);
-                        return partition_stride * strides.At(i);
-                    }
-                    else
-                    {
-                        // packed
-                        return strides.At(i);
-                    }
+                    // packed
+                    return make_embed_transform(make_tuple(unrolled_shape.At(num_i)),
+                                                make_tuple(I1{}));
                 }
             }
         },
-        Number<Tuple<Ts...>::Size()>{});
+        Number<dims>{});
+
+    const auto lower_dims =
+        generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<dims>{});
+    const auto upper_dims =
+        generate_tuple([&](auto i) { return Sequence<i.value>{}; }, Number<dims>{});
+    return transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims);
 }
 
 // Convert interger thread_idx to tuple index with applied steps
@@ -128,8 +132,8 @@ __host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple<Ls...>&
             else
             {
                 // Update thread_id after each dim
-                const auto dim_thread_id = thread_id % thread_lengths.At(i);
-                thread_id /= thread_lengths.At(i);
+                const auto dim_thread_id = thread_id % thread_lengths.At(num_i);
+                thread_id /= thread_lengths.At(num_i);
                 if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
                 {
                     return dim_thread_id;
@@ -220,9 +224,12 @@ __host__ __device__ constexpr auto make_local_partition(const TensorType& tensor
 {
     // Create shape, strides and layout for new partition tensor
     const auto partition_shape = CalculateLocalPartitionShape(shape(tensor), thread_lengths);
-    const auto partition_strides =
-        CalculateLocalPartitionStrides(stride(tensor), thread_lengths, steps);
-    const auto partition_layout = make_layout(partition_shape, partition_strides);
+    // Create new descriptor and layout
+    const auto& flatten_desc = layout(tensor).GetFlattenDescriptor();
+    auto partition_desc =
+        CalculateLocalPartitionDescriptor(shape(tensor), thread_lengths, steps, flatten_desc);
+    const auto partition_layout = Layout<decltype(partition_shape), decltype(partition_desc)>(
+        partition_shape, partition_desc);
     // Calculate offset for new partition tensor
     index_t thread_id_copy      = thread_id;
     const auto offset_idx       = CalculateLayoutOffsetIdx(thread_lengths, steps, thread_id_copy);
@@ -251,12 +258,16 @@ __host__ __device__ constexpr auto make_local_tile(const TensorType& tensor,
 {
     // Create block lengths, strides and layout for new tile tensor
     const auto block_lengths = CalculateBlockLengths(shape(tensor), tile_shape);
-    const auto block_strides = CalculateLocalPartitionStrides(stride(tensor), block_lengths, steps);
-    const auto tile_layout   = make_layout(tile_shape, block_strides);
+    // Create new descriptor and layout
+    const auto& flatten_desc = layout(tensor).GetFlattenDescriptor();
+    auto tile_desc =
+        CalculateLocalPartitionDescriptor(tile_shape, block_lengths, steps, flatten_desc);
+    const auto tile_layout = Layout<remove_reference_t<decltype(tile_shape)>, decltype(tile_desc)>(
+        tile_shape, tile_desc);
     // Calculate offset for new partition tensor
-    const auto offset_idx       = CalculateLayoutOffsetIdx(steps, block_idx);
-    const auto partition_offset = layout(tensor)(offset_idx);
-    return make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer() + partition_offset,
+    const auto offset_idx  = CalculateLayoutOffsetIdx(steps, block_idx);
+    const auto tile_offset = layout(tensor)(offset_idx);
+    return make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer() + tile_offset,
                                                              tile_layout);
 }
 
diff --git a/include/ck/wrapper/utils/tensor_utils.hpp b/include/ck/wrapper/utils/tensor_utils.hpp
index 35bb4f04be7..a2b36d29e95 100644
--- a/include/ck/wrapper/utils/tensor_utils.hpp
+++ b/include/ck/wrapper/utils/tensor_utils.hpp
@@ -27,12 +27,12 @@ using MemoryTypeEnum = AddressSpaceEnum;
 // Disable from doxygen docs generation
 /// @cond
 // forward declarations
-template <typename Shape, typename Strides>
+template <typename Shape, typename FlattenDescriptorType>
 struct Layout;
 template <MemoryTypeEnum BufferAddressSpace,
           typename ElementType,
           typename Shape,
-          typename Strides,
+          typename FlattenDescriptorType,
           index_t NumVectors,     // params for Register memory
           index_t ScalarPerVector // param for Register memory
           >
@@ -98,11 +98,18 @@ using is_tuple = decltype(std::declval<T&>().IsTuple());
  * \param layout Tensor layout.
  * \return Constructed tensor.
  */
-template <MemoryTypeEnum MemoryType, typename ElementType, typename Shape, typename Strides>
-constexpr auto make_tensor(ElementType* pointer, const Layout<Shape, Strides>& layout)
+template <MemoryTypeEnum MemoryType,
+          typename ElementType,
+          typename Shape,
+          typename FlattenDescriptorType>
+constexpr auto make_tensor(ElementType* pointer, const Layout<Shape, FlattenDescriptorType>& layout)
 {
-    return Tensor<MemoryType, ElementType, Shape, Strides, 0 /*NumVectors*/, 0 /*ScalarPerVector*/>(
-        pointer, layout);
+    return Tensor<MemoryType,
+                  ElementType,
+                  Shape,
+                  FlattenDescriptorType,
+                  0 /*NumVectors*/,
+                  0 /*ScalarPerVector*/>(pointer, layout);
 }
 
 /**
@@ -124,7 +131,7 @@ constexpr auto make_register_tensor()
     return Tensor<MemoryType,
                   ElementType,
                   Tuple<Number<NumVectors>>,
-                  Tuple<Number<1>>,
+                  std::remove_const_t<remove_reference_t<decltype(layout.GetFlattenDescriptor())>>,
                   NumVectors,
                   ScalarPerVector>(layout);
 }
@@ -138,12 +145,15 @@ constexpr auto make_register_tensor()
 template <MemoryTypeEnum BufferAddressSpace,
           typename ElementType,
           typename Shape,
-          typename Strides,
+          typename FlattenDescriptorType,
           index_t NumVectors,
           index_t ScalarPerVector>
-__host__ __device__ constexpr const auto&
-layout(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors, ScalarPerVector>&
-           tensor)
+__host__ __device__ constexpr const auto& layout(const Tensor<BufferAddressSpace,
+                                                              ElementType,
+                                                              Shape,
+                                                              FlattenDescriptorType,
+                                                              NumVectors,
+                                                              ScalarPerVector>& tensor)
 {
     return tensor.GetLayout();
 }
@@ -159,12 +169,15 @@ template <index_t... Idxs,
           MemoryTypeEnum BufferAddressSpace,
           typename ElementType,
           typename Shape,
-          typename Strides,
+          typename FlattenDescriptorType,
           index_t NumVectors,
           index_t ScalarPerVector>
-__host__ __device__ constexpr auto
-size(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors, ScalarPerVector>&
-         tensor)
+__host__ __device__ constexpr auto size(const Tensor<BufferAddressSpace,
+                                                     ElementType,
+                                                     Shape,
+                                                     FlattenDescriptorType,
+                                                     NumVectors,
+                                                     ScalarPerVector>& tensor)
 {
     return size<Idxs...>(tensor.GetLayout());
 }
@@ -180,12 +193,15 @@ template <index_t... Idxs,
           MemoryTypeEnum BufferAddressSpace,
           typename ElementType,
           typename Shape,
-          typename Strides,
+          typename FlattenDescriptorType,
           index_t NumVectors,
           index_t ScalarPerVector>
-__host__ __device__ constexpr auto
-rank(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors, ScalarPerVector>&
-         tensor)
+__host__ __device__ constexpr auto rank(const Tensor<BufferAddressSpace,
+                                                     ElementType,
+                                                     Shape,
+                                                     FlattenDescriptorType,
+                                                     NumVectors,
+                                                     ScalarPerVector>& tensor)
 {
     return rank<Idxs...>(tensor.GetLayout());
 }
@@ -201,35 +217,19 @@ template <index_t... Idxs,
           MemoryTypeEnum BufferAddressSpace,
           typename ElementType,
           typename Shape,
-          typename Strides,
+          typename FlattenDescriptorType,
           index_t NumVectors,
           index_t ScalarPerVector>
-__host__ __device__ constexpr auto
-depth(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors, ScalarPerVector>&
-          tensor)
+__host__ __device__ constexpr auto depth(const Tensor<BufferAddressSpace,
+                                                      ElementType,
+                                                      Shape,
+                                                      FlattenDescriptorType,
+                                                      NumVectors,
+                                                      ScalarPerVector>& tensor)
 {
     return depth<Idxs...>(tensor.GetLayout());
 }
 
-/**
- * \brief Get Tensor strides.
- *
- * \param tensor Tensor to get strides from.
- * \return Requsted strides.
- */
-template <MemoryTypeEnum BufferAddressSpace,
-          typename ElementType,
-          typename Shape,
-          typename Strides,
-          index_t NumVectors,
-          index_t ScalarPerVector>
-__host__ __device__ constexpr auto
-stride(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors, ScalarPerVector>&
-           tensor)
-{
-    return stride(tensor.GetLayout());
-}
-
 /**
  * \brief Get Tensor shape.
  *
@@ -239,12 +239,15 @@ stride(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors,
 template <MemoryTypeEnum BufferAddressSpace,
           typename ElementType,
           typename Shape,
-          typename Strides,
+          typename FlattenDescriptorType,
           index_t NumVectors,
           index_t ScalarPerVector>
-__host__ __device__ constexpr const auto&
-shape(const Tensor<BufferAddressSpace, ElementType, Shape, Strides, NumVectors, ScalarPerVector>&
-          tensor)
+__host__ __device__ constexpr const auto& shape(const Tensor<BufferAddressSpace,
+                                                             ElementType,
+                                                             Shape,
+                                                             FlattenDescriptorType,
+                                                             NumVectors,
+                                                             ScalarPerVector>& tensor)
 {
     return shape(tensor.GetLayout());
 }
diff --git a/test/wrapper/test_layout.cpp b/test/wrapper/test_layout.cpp
index d5dba4841f5..a128a6d84f5 100644
--- a/test/wrapper/test_layout.cpp
+++ b/test/wrapper/test_layout.cpp
@@ -436,26 +436,11 @@ TEST(TestLayoutHelpers, ShapeAndStrides)
     constexpr bool check_compiletime_shape =
         std::is_same_v<decltype(shape_compiletime),
                        std::remove_reference_t<decltype(shape(layout_compiletime))>>;
-    constexpr bool check_compiletime_strides =
-        std::is_same_v<std::remove_const_t<decltype(strides_compiletime)>,
-                       std::remove_reference_t<decltype(stride(layout_compiletime))>>;
     constexpr bool check_runtime_shape =
         std::is_same_v<decltype(shape_runtime),
                        std::remove_reference_t<decltype(shape(layout_runtime))>>;
-    constexpr bool check_runtime_strides =
-        std::is_same_v<std::remove_const_t<decltype(strides_runtime)>,
-                       std::remove_reference_t<decltype(stride(layout_runtime))>>;
     EXPECT_TRUE(check_compiletime_shape);
-    EXPECT_TRUE(check_compiletime_strides);
     EXPECT_TRUE(check_runtime_shape);
-    EXPECT_TRUE(check_runtime_strides);
-
-    // Check packed strides generation
-    const auto packed_layout = ck::wrapper::make_layout(shape_runtime);
-    constexpr bool check_packed_layout_strides =
-        std::is_same_v<std::remove_const_t<decltype(strides_runtime)>,
-                       std::remove_reference_t<decltype(stride(packed_layout))>>;
-    EXPECT_TRUE(check_packed_layout_strides);
 }
 
 TEST(TestLayoutHelpers, Hierarchical)
diff --git a/test/wrapper/test_partition.cpp b/test/wrapper/test_partition.cpp
index a2ae30f5de0..fe214e9ed07 100644
--- a/test/wrapper/test_partition.cpp
+++ b/test/wrapper/test_partition.cpp
@@ -46,21 +46,6 @@ TEST(TestPartition, LocalPartition)
             ck::wrapper::size(tensor) / ck::wrapper::size(thread_layout);
         EXPECT_EQ(ck::wrapper::size(raked_partition), expected_partition_size);
         EXPECT_EQ(raked_partition(0), thread_id);
-
-        const auto expected_partition_stride_00 =
-            ck::wrapper::size<0, 0>(strides) * ck::wrapper::size<0, 0>(thread_layout);
-        const auto expected_partition_stride_01 =
-            ck::wrapper::size<0, 1>(strides) * ck::wrapper::size<0, 1>(thread_layout);
-        const auto expected_partition_stride_1 =
-            ck::wrapper::size<1>(strides) * ck::wrapper::size<1>(thread_layout);
-        const auto partition_stride_00 =
-            ck::wrapper::size<0, 0>(ck::wrapper::stride(raked_partition));
-        const auto partition_stride_01 =
-            ck::wrapper::size<0, 1>(ck::wrapper::stride(raked_partition));
-        const auto partition_stride_1 = ck::wrapper::size<1>(ck::wrapper::stride(raked_partition));
-        EXPECT_EQ(partition_stride_00, expected_partition_stride_00);
-        EXPECT_EQ(partition_stride_01, expected_partition_stride_01);
-        EXPECT_EQ(partition_stride_1, expected_partition_stride_1);
     }
 
     for(ck::index_t thread_id = 0; thread_id < ck::wrapper::size(thread_layout); thread_id++)
@@ -73,18 +58,6 @@ TEST(TestPartition, LocalPartition)
         const auto expected_partition_first_val = thread_id * ck::wrapper::size<0, 0>(thread_steps);
         EXPECT_EQ(ck::wrapper::size(packed_partition), expected_partition_size);
         EXPECT_EQ(packed_partition(0), expected_partition_first_val);
-
-        const auto expected_partition_stride_00 = ck::wrapper::size<0, 0>(strides);
-        const auto expected_partition_stride_01 = ck::wrapper::size<0, 1>(strides);
-        const auto expected_partition_stride_1  = ck::wrapper::size<1>(strides);
-        const auto partition_stride_00 =
-            ck::wrapper::size<0, 0>(ck::wrapper::stride(packed_partition));
-        const auto partition_stride_01 =
-            ck::wrapper::size<0, 1>(ck::wrapper::stride(packed_partition));
-        const auto partition_stride_1 = ck::wrapper::size<1>(ck::wrapper::stride(packed_partition));
-        EXPECT_EQ(partition_stride_00, expected_partition_stride_00);
-        EXPECT_EQ(partition_stride_01, expected_partition_stride_01);
-        EXPECT_EQ(partition_stride_1, expected_partition_stride_1);
     }
 }
 
@@ -128,19 +101,6 @@ TEST(TestPartition, LocalTile)
         const auto expected_tile_size = ck::wrapper::size(block_shape);
         EXPECT_EQ(ck::wrapper::size(raked_tile), expected_tile_size);
         EXPECT_EQ(raked_tile(0), layout(block_idx));
-
-        const auto expected_tile_stride_00 =
-            ck::wrapper::size<0, 0>(strides) * ck::wrapper::size<0, 0>(block_layout);
-        const auto expected_tile_stride_01 =
-            ck::wrapper::size<0, 1>(strides) * ck::wrapper::size<0, 1>(block_layout);
-        const auto expected_tile_stride_1 =
-            ck::wrapper::size<1>(strides) * ck::wrapper::size<1>(block_layout);
-        const auto tile_stride_00 = ck::wrapper::size<0, 0>(ck::wrapper::stride(raked_tile));
-        const auto tile_stride_01 = ck::wrapper::size<0, 1>(ck::wrapper::stride(raked_tile));
-        const auto tile_stride_1  = ck::wrapper::size<1>(ck::wrapper::stride(raked_tile));
-        EXPECT_EQ(tile_stride_00, expected_tile_stride_00);
-        EXPECT_EQ(tile_stride_01, expected_tile_stride_01);
-        EXPECT_EQ(tile_stride_1, expected_tile_stride_1);
     }
 
     for(const auto& block_idx : block_idxs)
@@ -158,15 +118,5 @@ TEST(TestPartition, LocalTile)
                 ck::wrapper::size<1>(strides);
         EXPECT_EQ(ck::wrapper::size(packed_tile), expected_tile_size);
         EXPECT_EQ(packed_tile(0), expected_tile_first_val);
-
-        const auto expected_tile_stride_00 = ck::wrapper::size<0, 0>(strides);
-        const auto expected_tile_stride_01 = ck::wrapper::size<0, 1>(strides);
-        const auto expected_tile_stride_1  = ck::wrapper::size<1>(strides);
-        const auto tile_stride_00 = ck::wrapper::size<0, 0>(ck::wrapper::stride(packed_tile));
-        const auto tile_stride_01 = ck::wrapper::size<0, 1>(ck::wrapper::stride(packed_tile));
-        const auto tile_stride_1  = ck::wrapper::size<1>(ck::wrapper::stride(packed_tile));
-        EXPECT_EQ(tile_stride_00, expected_tile_stride_00);
-        EXPECT_EQ(tile_stride_01, expected_tile_stride_01);
-        EXPECT_EQ(tile_stride_1, expected_tile_stride_1);
     }
 }
diff --git a/test/wrapper/test_tensor.cpp b/test/wrapper/test_tensor.cpp
index 72b56068749..1392ee1b516 100644
--- a/test/wrapper/test_tensor.cpp
+++ b/test/wrapper/test_tensor.cpp
@@ -126,7 +126,7 @@ __global__ void TestTensorReadWriteDevice(void* data, void* success)
     StaticInitTensor<nelems>(tensor_vgpr);
     StaticInitTensor<nelems>(tensor_sgpr);
 
-    *casted_success_ptr &= TestTensorCheck1d(tensor_global);
+    *casted_success_ptr = TestTensorCheck1d(tensor_global);
     *casted_success_ptr &= TestTensorCheck3d(tensor_global);
 
     *casted_success_ptr &= TestTensorCheck1d(tensor_lds);
@@ -172,33 +172,38 @@ TEST(TestTensor, Slicing)
 
     auto tensor2x2x2 =
         tensor(ck::make_tuple(ck::wrapper::slice(2), ck::wrapper::slice(2)), ck::wrapper::slice(2));
+    EXPECT_EQ(tensor2x2x2(0), layout(ck::make_tuple(ck::make_tuple(0, 0), 0)));
     EXPECT_EQ(ck::wrapper::rank(tensor2x2x2), 2);
     EXPECT_EQ(ck::wrapper::depth(tensor2x2x2), 2);
     EXPECT_EQ(ck::wrapper::size(tensor2x2x2), 8);
     EXPECT_TRUE(TestTensorCheck1d(tensor2x2x2));
 
     auto tensor2x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(2)), ck::wrapper::slice(2));
+    EXPECT_EQ(tensor2x2(0), layout(ck::make_tuple(ck::make_tuple(1, 0), 0)));
     EXPECT_EQ(ck::wrapper::rank(tensor2x2), 2);
     EXPECT_EQ(ck::wrapper::depth(tensor2x2), 2);
     EXPECT_EQ(ck::wrapper::size(tensor2x2), 4);
-    EXPECT_TRUE(TestTensorCheck1d(tensor2x2, layout(ck::make_tuple(ck::make_tuple(1, 0), 0))));
+    EXPECT_TRUE(TestTensorCheck1d(tensor2x2));
 
     auto tensor1x1 = tensor(ck::make_tuple(1, ck::wrapper::slice(1, 2)), ck::wrapper::slice(1, 2));
+    EXPECT_EQ(tensor1x1(0), layout(ck::make_tuple(ck::make_tuple(1, 1), 1)));
     EXPECT_EQ(rank(tensor1x1), 2);
     EXPECT_EQ(depth(tensor1x1), 2);
     EXPECT_EQ(size(tensor1x1), 1);
-    EXPECT_TRUE(TestTensorCheck1d(tensor1x1, layout(ck::make_tuple(ck::make_tuple(1, 1), 1))));
+    EXPECT_TRUE(TestTensorCheck1d(tensor1x1));
 
     auto tensor2 = tensor(ck::make_tuple(1, 1), ck::wrapper::slice(0, 2));
+    EXPECT_EQ(tensor2(0), layout(ck::make_tuple(ck::make_tuple(1, 1), 0)));
     EXPECT_EQ(ck::wrapper::rank(tensor2), 1);
     EXPECT_EQ(ck::wrapper::depth(tensor2), 1);
     EXPECT_EQ(ck::wrapper::size(tensor2), 2);
-    EXPECT_TRUE(TestTensorCheck1d(tensor2, layout(ck::make_tuple(ck::make_tuple(1, 1), 0))));
+    EXPECT_TRUE(TestTensorCheck1d(tensor2));
 
     // negative indexing
     auto tensor1x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(0, -2)), ck::wrapper::slice());
+    EXPECT_EQ(tensor1x2(0), layout(ck::make_tuple(ck::make_tuple(1, 0), 0)));
     EXPECT_EQ(rank(tensor1x2), 2);
     EXPECT_EQ(depth(tensor1x2), 2);
     EXPECT_EQ(size(tensor1x2), 2);
-    EXPECT_TRUE(TestTensorCheck1d(tensor1x2, layout(ck::make_tuple(ck::make_tuple(1, 0), 0))));
+    EXPECT_TRUE(TestTensorCheck1d(tensor1x2));
 }

From 12213d7dd8eb7f1d9cec999d4850e18f18367fd0 Mon Sep 17 00:00:00 2001
From: Bartlomiej Kocot <barkocot@amd.com>
Date: Fri, 29 Dec 2023 12:30:49 +0000
Subject: [PATCH 5/7] Fixes

---
 include/ck/wrapper/layout.hpp                 | 40 +++++++++----------
 include/ck/wrapper/tensor.hpp                 | 40 ++++++++++---------
 include/ck/wrapper/utils/layout_utils.hpp     | 28 ++++++-------
 include/ck/wrapper/utils/tensor_partition.hpp | 30 +++++++++-----
 include/ck/wrapper/utils/tensor_utils.hpp     | 33 +++++++--------
 test/wrapper/test_tensor.cpp                  |  7 ++++
 6 files changed, 100 insertions(+), 78 deletions(-)

diff --git a/include/ck/wrapper/layout.hpp b/include/ck/wrapper/layout.hpp
index c0a52c03327..1643eb73833 100644
--- a/include/ck/wrapper/layout.hpp
+++ b/include/ck/wrapper/layout.hpp
@@ -14,9 +14,9 @@ namespace wrapper {
  * \tparam Shape Tuple of Number<> (for compile-time layout) or index_t
  *         (dynamic layout). It is possible to pass nested shapes
  *         (e.g. ((4, 2), 2)), nested dimensions are merged.
- * \tparam FlattenDescriptorType Tensor descriptor for flatten shape dims.
+ * \tparam UnnestedDescriptorType Tensor descriptor for unnested shape dims.
  */
-template <typename Shape, typename FlattenDescriptorType>
+template <typename Shape, typename UnnestedDescriptorType>
 struct Layout
 {
     private:
@@ -29,7 +29,7 @@ struct Layout
     {
         return generate_tuple(
             [&](auto) {
-                if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime())
+                if constexpr(!UnnestedDescriptorType::IsKnownAtCompileTime())
                 {
                     // runtime layout
                     return index_t(0);
@@ -185,14 +185,14 @@ struct Layout
     }
 
     using Descriptor1dType =
-        remove_cvref_t<decltype(MakeMerge1d(Shape{}, FlattenDescriptorType{}))>;
+        remove_cvref_t<decltype(MakeMerge1d(Shape{}, UnnestedDescriptorType{}))>;
     using DefaultIdxsTupleType = remove_cvref_t<decltype(GenerateDefaultIdxsTuple(Shape{}))>;
 
     template <typename... ShapeDims, typename... IdxDims>
     __host__ __device__ constexpr static auto
     TransformDesc(const Tuple<ShapeDims...>& shape,
                   const Tuple<IdxDims...>& idx,
-                  const FlattenDescriptorType& naive_descriptor)
+                  const UnnestedDescriptorType& naive_descriptor)
     {
         if constexpr(Tuple<IdxDims...>::Size() == I1)
         {
@@ -215,12 +215,12 @@ struct Layout
     }
 
     using MergedNestsDescriptorType = remove_cvref_t<decltype(TransformDesc(
-        Shape{}, DefaultIdxsTupleType{}, FlattenDescriptorType{}))>;
+        Shape{}, DefaultIdxsTupleType{}, UnnestedDescriptorType{}))>;
 
     public:
     __host__ __device__ constexpr auto GetElementSpaceSize() const
     {
-        return flatten_descriptor_.GetElementSpaceSize();
+        return unnested_descriptor_.GetElementSpaceSize();
     }
 
     __host__ __device__ Layout() = delete;
@@ -229,19 +229,19 @@ struct Layout
      * \brief Layout constructor.
      *
      * \param shape Shape for layout.
-     * \param flatten_descriptor Descriptor
+     * \param unnested_descriptor Descriptor
      */
     __host__ __device__ constexpr Layout(const Shape& shape,
-                                         const FlattenDescriptorType& flatten_descriptor)
+                                         const UnnestedDescriptorType& unnested_descriptor)
         : shape_(shape)
     {
         // Construct if runtime mode
-        if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime())
+        if constexpr(!UnnestedDescriptorType::IsKnownAtCompileTime())
         {
-            flatten_descriptor_ = flatten_descriptor;
-            descriptor_1d_      = MakeMerge1d(shape_, flatten_descriptor_);
+            unnested_descriptor_ = unnested_descriptor;
+            descriptor_1d_       = MakeMerge1d(shape_, unnested_descriptor_);
             merged_nests_descriptor_ =
-                TransformDesc(shape_, DefaultIdxsTupleType{}, flatten_descriptor_);
+                TransformDesc(shape_, DefaultIdxsTupleType{}, unnested_descriptor_);
         }
     }
 
@@ -254,9 +254,9 @@ struct Layout
     template <typename Idxs>
     __host__ __device__ constexpr index_t operator()() const
     {
-        static_assert(FlattenDescriptorType::IsKnownAtCompileTime(),
+        static_assert(UnnestedDescriptorType::IsKnownAtCompileTime(),
                       "Compiletime operator used on runtime layout.");
-        using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{}, FlattenDescriptorType{}));
+        using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{}, UnnestedDescriptorType{}));
         using UnrolledIdx     = decltype(UnrollNestedTuple(Idxs{}));
         return TransformedDesc{}.CalculateOffset(UnrolledIdx{});
     }
@@ -283,7 +283,7 @@ struct Layout
         else
         {
             // Custom index, need to transform descriptor
-            const auto transformed_desc = TransformDesc(shape_, Idx, flatten_descriptor_);
+            const auto transformed_desc = TransformDesc(shape_, Idx, unnested_descriptor_);
             return transformed_desc.CalculateOffset(UnrollNestedTuple(Idx));
         }
     }
@@ -360,17 +360,17 @@ struct Layout
     }
 
     /**
-     * \brief Get flatten descriptor (with unrolled dims)
+     * \brief Get unnested descriptor (with unrolled dims)
      *
      * \return Flatten descriptor.
      */
-    __host__ __device__ constexpr const FlattenDescriptorType& GetFlattenDescriptor() const
+    __host__ __device__ constexpr const UnnestedDescriptorType& GetUnnestedDescriptor() const
     {
-        return flatten_descriptor_;
+        return unnested_descriptor_;
     }
 
     private:
-    FlattenDescriptorType flatten_descriptor_;
+    UnnestedDescriptorType unnested_descriptor_;
     Descriptor1dType descriptor_1d_;
     MergedNestsDescriptorType merged_nests_descriptor_;
     const Shape shape_;
diff --git a/include/ck/wrapper/tensor.hpp b/include/ck/wrapper/tensor.hpp
index bc598178752..787c5b2176e 100644
--- a/include/ck/wrapper/tensor.hpp
+++ b/include/ck/wrapper/tensor.hpp
@@ -16,14 +16,14 @@ namespace wrapper {
  * \tparam BufferAddressSpace Memory type (Generic, Global, LDS, VGPR, SGPR).
  * \tparam ElementType Element data type.
  * \tparam Shape Tensor shape (layout component).
- * \tparam FlattenDescriptorType Flatten descriptor (layout component).
+ * \tparam UnnestedDescriptorType Flatten descriptor (layout component).
  * \tparam NumVectors Number of vectors (only for VGPR, SGPR).
  * \tparam ScalarPerVector Scalars per vector (only for VGPR, SGPR).
  */
 template <MemoryTypeEnum BufferAddressSpace,
           typename ElementType,
           typename Shape,
-          typename FlattenDescriptorType,
+          typename UnnestedDescriptorType,
           index_t NumVectors,     // param for Register memory
           index_t ScalarPerVector // param for Register memory
           >
@@ -85,14 +85,17 @@ struct Tensor
 
     // Generate Freeze for each of nested shape
     template <typename T, typename ShapeTmpType>
-    __host__ __device__ constexpr auto GenerateMultipleFreeze(const T& idx,
+    __host__ __device__ constexpr auto GenerateMultipleFreeze(T idx,
                                                               const ShapeTmpType& shape) const
     {
         const auto unrolled_shape = UnrollNestedTuple(shape);
         return generate_tuple(
-            [&](auto) {
+            [&](auto i) {
                 // dimension offset from idx
-                return make_freeze_transform(idx);
+                const auto dim     = unrolled_shape.At(Number<i>{});
+                const auto dim_idx = idx % dim;
+                idx /= dim;
+                return make_freeze_transform(dim_idx);
             },
             Number<decltype(unrolled_shape)::Size()>{});
     }
@@ -116,7 +119,7 @@ struct Tensor
                     const auto from  = idx.At(num_i).from_;
                     const auto dim   = shape.At(num_i);
                     const auto range = idx.At(num_i).range(dim);
-                    return make_slice_transform(dim, from, from + range);
+                    return make_slice_transform(range, from, from + range);
                 }
                 else
                 {
@@ -188,9 +191,9 @@ struct Tensor
     }
 
     public:
-    using ElementSpaceSize  = decltype(Layout<Shape, FlattenDescriptorType>{
-        Shape{}, FlattenDescriptorType{}}.GetElementSpaceSize()); // SpaceSize type for buffer
-    using TensorElementType = ElementType;                         // DataType
+    using ElementSpaceSize  = decltype(Layout<Shape, UnnestedDescriptorType>{
+        Shape{}, UnnestedDescriptorType{}}.GetElementSpaceSize()); // SpaceSize type for buffer
+    using TensorElementType = ElementType;                          // DataType
 
     static constexpr MemoryTypeEnum TensorBufferAddressSpace = BufferAddressSpace;
     static constexpr bool IsDynamicBuffer = !(BufferAddressSpace == MemoryTypeEnum ::Sgpr ||
@@ -198,18 +201,19 @@ struct Tensor
 
     __host__ __device__ Tensor() = delete;
     __host__ __device__ Tensor(ElementType* pointer,
-                               const Layout<Shape, FlattenDescriptorType>& layout)
+                               const Layout<Shape, UnnestedDescriptorType>& layout)
         : layout_(layout),
           buffer_(make_dynamic_buffer<BufferAddressSpace>(pointer, layout.GetElementSpaceSize()))
     {
     }
 
-    __host__ __device__ Tensor(const Layout<Shape, FlattenDescriptorType>& layout) : layout_(layout)
+    __host__ __device__ Tensor(const Layout<Shape, UnnestedDescriptorType>& layout)
+        : layout_(layout)
     {
         static_assert(!IsDynamicBuffer, "Wrong BufferAddressSpace for register.");
     }
 
-    __host__ __device__ constexpr const Layout<Shape, FlattenDescriptorType>& GetLayout() const
+    __host__ __device__ constexpr const Layout<Shape, UnnestedDescriptorType>& GetLayout() const
     {
         return layout_;
     }
@@ -222,7 +226,7 @@ struct Tensor
         const auto& shape = layout_.GetShape();
         auto new_shape    = GetShapeFromSlicedTensor(idx, shape);
 
-        const auto& flatten_desc = layout_.GetFlattenDescriptor();
+        const auto& flatten_desc = layout_.GetUnnestedDescriptor();
         auto new_desc            = GetDescriptorFromSlicedTensor(idx, shape, flatten_desc);
         const auto new_layout =
             Layout<decltype(new_shape), decltype(new_desc)>(new_shape, new_desc);
@@ -252,9 +256,9 @@ struct Tensor
         }
         else
         {
-            constexpr index_t offset = Layout<Shape, FlattenDescriptorType>{
+            constexpr index_t offset = Layout<Shape, UnnestedDescriptorType>{
                 Shape{},
-                FlattenDescriptorType{}}.template operator()<Tuple<Ts...>>();
+                UnnestedDescriptorType{}}.template operator()<Tuple<Ts...>>();
             return buffer_[Number<offset>{}];
         }
     }
@@ -282,9 +286,9 @@ struct Tensor
         }
         else
         {
-            constexpr index_t offset = Layout<Shape, FlattenDescriptorType>{
+            constexpr index_t offset = Layout<Shape, UnnestedDescriptorType>{
                 Shape{},
-                FlattenDescriptorType{}}.template operator()<Tuple<Ts...>>();
+                UnnestedDescriptorType{}}.template operator()<Tuple<Ts...>>();
             return buffer_(Number<offset>{});
         }
     }
@@ -322,7 +326,7 @@ struct Tensor
     // If register use static buffer, else use dynamic buffer
     using Buffer = std::conditional_t<IsDynamicBuffer, DynamicBufferType, StaticBufferType>;
 
-    const Layout<Shape, FlattenDescriptorType> layout_;
+    const Layout<Shape, UnnestedDescriptorType> layout_;
     Buffer buffer_;
 };
 
diff --git a/include/ck/wrapper/utils/layout_utils.hpp b/include/ck/wrapper/utils/layout_utils.hpp
index a33311e1422..684cec42453 100644
--- a/include/ck/wrapper/utils/layout_utils.hpp
+++ b/include/ck/wrapper/utils/layout_utils.hpp
@@ -22,7 +22,7 @@ namespace wrapper {
 // Disable from doxygen docs generation
 /// @cond
 // forward declaration
-template <typename Shape, typename FlattenDescriptorType>
+template <typename Shape, typename UnnestedDescriptorType>
 struct Layout;
 
 template <typename T>
@@ -90,8 +90,8 @@ __host__ __device__ constexpr auto MakeFlattenDescriptor(const LayoutShape& shap
 template <typename Shape, typename Strides>
 __host__ __device__ constexpr auto make_layout(const Shape& shape, const Strides& strides)
 {
-    using FlattenDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Strides{}));
-    return Layout<Shape, FlattenDescriptorType>(shape, MakeFlattenDescriptor(shape, strides));
+    using UnnestedDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Strides{}));
+    return Layout<Shape, UnnestedDescriptorType>(shape, MakeFlattenDescriptor(shape, strides));
 }
 
 /**
@@ -104,8 +104,8 @@ __host__ __device__ constexpr auto make_layout(const Shape& shape, const Strides
 template <typename Shape>
 __host__ __device__ constexpr auto make_layout(const Shape& shape)
 {
-    using FlattenDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Tuple<>{}));
-    return Layout<Shape, FlattenDescriptorType>(shape, MakeFlattenDescriptor(shape, Tuple<>{}));
+    using UnnestedDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Tuple<>{}));
+    return Layout<Shape, UnnestedDescriptorType>(shape, MakeFlattenDescriptor(shape, Tuple<>{}));
 }
 
 // Layout helpers
@@ -182,7 +182,7 @@ __host__ __device__ constexpr auto get(const Layout<Shape, FlattenDesc>& layout)
         },
         Number<old_shape_dims>{});
 
-    const auto& flatten_desc = layout.GetFlattenDescriptor();
+    const auto& flatten_desc = layout.GetUnnestedDescriptor();
     auto new_desc = transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims);
     return Layout<decltype(new_shape), decltype(new_desc)>(new_shape, new_desc);
 }
@@ -218,8 +218,8 @@ __host__ __device__ T constexpr size(const T& dim)
  * \param layout Layout to get Shape of.
  * \return Requsted length.
  */
-template <index_t idx, typename Shape, typename FlattenDescriptorType>
-__host__ __device__ constexpr auto size(const Layout<Shape, FlattenDescriptorType>& layout)
+template <index_t idx, typename Shape, typename UnnestedDescriptorType>
+__host__ __device__ constexpr auto size(const Layout<Shape, UnnestedDescriptorType>& layout)
 {
     return layout.template GetLength<idx>();
 }
@@ -244,8 +244,8 @@ __host__ __device__ constexpr auto size(const Tuple<ShapeDims...>& shape)
  * \param layout Layout to calculate shape size.
  * \return Requsted size.
  */
-template <typename Shape, typename FlattenDescriptorType>
-__host__ __device__ constexpr auto size(const Layout<Shape, FlattenDescriptorType>& layout)
+template <typename Shape, typename UnnestedDescriptorType>
+__host__ __device__ constexpr auto size(const Layout<Shape, UnnestedDescriptorType>& layout)
 {
     return layout.GetLengths();
 }
@@ -284,9 +284,9 @@ __host__ __device__ constexpr auto size(const T& elem)
  * \param layout Layout to calculate rank.
  * \return Requsted rank.
  */
-template <typename Shape, typename FlattenDescriptorType>
+template <typename Shape, typename UnnestedDescriptorType>
 __host__ __device__ constexpr auto
-rank([[maybe_unused]] const Layout<Shape, FlattenDescriptorType>& layout)
+rank([[maybe_unused]] const Layout<Shape, UnnestedDescriptorType>& layout)
 {
     return Shape::Size();
 }
@@ -338,8 +338,8 @@ __host__ __device__ constexpr auto rank(const T& elem)
  * \param layout Layout to calculate depth.
  * \return Requsted depth.
  */
-template <typename Shape, typename FlattenDescriptorType>
-__host__ __device__ constexpr auto depth(const Layout<Shape, FlattenDescriptorType>& layout)
+template <typename Shape, typename UnnestedDescriptorType>
+__host__ __device__ constexpr auto depth(const Layout<Shape, UnnestedDescriptorType>& layout)
 {
     const auto& shape = layout.GetShape();
     return TupleDepth(shape);
diff --git a/include/ck/wrapper/utils/tensor_partition.hpp b/include/ck/wrapper/utils/tensor_partition.hpp
index 96a53591ef2..38df6531aec 100644
--- a/include/ck/wrapper/utils/tensor_partition.hpp
+++ b/include/ck/wrapper/utils/tensor_partition.hpp
@@ -107,11 +107,10 @@ CalculateLocalPartitionDescriptor(const Tuple<Ts...>& shape,
     return transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims);
 }
 
-// Convert interger thread_idx to tuple index with applied steps
 template <typename... Ls, typename... Steps>
-__host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple<Ls...>& thread_lengths,
-                                                            const Tuple<Steps...>& steps,
-                                                            index_t& thread_id)
+__host__ __device__ constexpr auto CalculateLayoutOffsetIdxImpl(const Tuple<Ls...>& thread_lengths,
+                                                                const Tuple<Steps...>& steps,
+                                                                index_t& thread_id)
 {
     return generate_tuple(
         [&](auto i) {
@@ -121,11 +120,12 @@ __host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple<Ls...>&
                 // if tuple then recurrence
                 if constexpr(is_same_v<Tuple<Steps...>, Tuple<>>)
                 {
-                    return CalculateLayoutOffsetIdx(thread_lengths.At(num_i), Tuple<>{}, thread_id);
+                    return CalculateLayoutOffsetIdxImpl(
+                        thread_lengths.At(num_i), Tuple<>{}, thread_id);
                 }
                 else
                 {
-                    return CalculateLayoutOffsetIdx(
+                    return CalculateLayoutOffsetIdxImpl(
                         thread_lengths.At(num_i), steps.At(num_i), thread_id);
                 }
             }
@@ -148,6 +148,17 @@ __host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple<Ls...>&
         Number<Tuple<Ls...>::Size()>{});
 }
 
+// Convert interger thread_idx to tuple index with applied steps
+template <typename... Ls, typename... Steps>
+__host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple<Ls...>& thread_lengths,
+                                                            const Tuple<Steps...>& steps,
+                                                            const index_t thread_id)
+{
+    // Create tmp thread_id copy for CalculateLayoutOffsetIdxImpl updates
+    index_t thread_id_copy = thread_id;
+    return CalculateLayoutOffsetIdxImpl(thread_lengths, steps, thread_id_copy);
+}
+
 // Aply steps to index represented as tuple
 template <typename... Steps, typename... Idxs>
 __host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple<Steps...>& steps,
@@ -225,14 +236,13 @@ __host__ __device__ constexpr auto make_local_partition(const TensorType& tensor
     // Create shape, strides and layout for new partition tensor
     const auto partition_shape = CalculateLocalPartitionShape(shape(tensor), thread_lengths);
     // Create new descriptor and layout
-    const auto& flatten_desc = layout(tensor).GetFlattenDescriptor();
+    const auto& flatten_desc = layout(tensor).GetUnnestedDescriptor();
     auto partition_desc =
         CalculateLocalPartitionDescriptor(shape(tensor), thread_lengths, steps, flatten_desc);
     const auto partition_layout = Layout<decltype(partition_shape), decltype(partition_desc)>(
         partition_shape, partition_desc);
     // Calculate offset for new partition tensor
-    index_t thread_id_copy      = thread_id;
-    const auto offset_idx       = CalculateLayoutOffsetIdx(thread_lengths, steps, thread_id_copy);
+    const auto offset_idx       = CalculateLayoutOffsetIdx(thread_lengths, steps, thread_id);
     const auto partition_offset = layout(tensor)(offset_idx);
     return make_tensor<TensorType::TensorBufferAddressSpace>(tensor.GetPointer() + partition_offset,
                                                              partition_layout);
@@ -259,7 +269,7 @@ __host__ __device__ constexpr auto make_local_tile(const TensorType& tensor,
     // Create block lengths, strides and layout for new tile tensor
     const auto block_lengths = CalculateBlockLengths(shape(tensor), tile_shape);
     // Create new descriptor and layout
-    const auto& flatten_desc = layout(tensor).GetFlattenDescriptor();
+    const auto& flatten_desc = layout(tensor).GetUnnestedDescriptor();
     auto tile_desc =
         CalculateLocalPartitionDescriptor(tile_shape, block_lengths, steps, flatten_desc);
     const auto tile_layout = Layout<remove_reference_t<decltype(tile_shape)>, decltype(tile_desc)>(
diff --git a/include/ck/wrapper/utils/tensor_utils.hpp b/include/ck/wrapper/utils/tensor_utils.hpp
index a2b36d29e95..1e932e62e16 100644
--- a/include/ck/wrapper/utils/tensor_utils.hpp
+++ b/include/ck/wrapper/utils/tensor_utils.hpp
@@ -27,12 +27,12 @@ using MemoryTypeEnum = AddressSpaceEnum;
 // Disable from doxygen docs generation
 /// @cond
 // forward declarations
-template <typename Shape, typename FlattenDescriptorType>
+template <typename Shape, typename UnnestedDescriptorType>
 struct Layout;
 template <MemoryTypeEnum BufferAddressSpace,
           typename ElementType,
           typename Shape,
-          typename FlattenDescriptorType,
+          typename UnnestedDescriptorType,
           index_t NumVectors,     // params for Register memory
           index_t ScalarPerVector // param for Register memory
           >
@@ -101,13 +101,14 @@ using is_tuple = decltype(std::declval<T&>().IsTuple());
 template <MemoryTypeEnum MemoryType,
           typename ElementType,
           typename Shape,
-          typename FlattenDescriptorType>
-constexpr auto make_tensor(ElementType* pointer, const Layout<Shape, FlattenDescriptorType>& layout)
+          typename UnnestedDescriptorType>
+constexpr auto make_tensor(ElementType* pointer,
+                           const Layout<Shape, UnnestedDescriptorType>& layout)
 {
     return Tensor<MemoryType,
                   ElementType,
                   Shape,
-                  FlattenDescriptorType,
+                  UnnestedDescriptorType,
                   0 /*NumVectors*/,
                   0 /*ScalarPerVector*/>(pointer, layout);
 }
@@ -131,7 +132,7 @@ constexpr auto make_register_tensor()
     return Tensor<MemoryType,
                   ElementType,
                   Tuple<Number<NumVectors>>,
-                  std::remove_const_t<remove_reference_t<decltype(layout.GetFlattenDescriptor())>>,
+                  std::remove_const_t<remove_reference_t<decltype(layout.GetUnnestedDescriptor())>>,
                   NumVectors,
                   ScalarPerVector>(layout);
 }
@@ -145,13 +146,13 @@ constexpr auto make_register_tensor()
 template <MemoryTypeEnum BufferAddressSpace,
           typename ElementType,
           typename Shape,
-          typename FlattenDescriptorType,
+          typename UnnestedDescriptorType,
           index_t NumVectors,
           index_t ScalarPerVector>
 __host__ __device__ constexpr const auto& layout(const Tensor<BufferAddressSpace,
                                                               ElementType,
                                                               Shape,
-                                                              FlattenDescriptorType,
+                                                              UnnestedDescriptorType,
                                                               NumVectors,
                                                               ScalarPerVector>& tensor)
 {
@@ -169,13 +170,13 @@ template <index_t... Idxs,
           MemoryTypeEnum BufferAddressSpace,
           typename ElementType,
           typename Shape,
-          typename FlattenDescriptorType,
+          typename UnnestedDescriptorType,
           index_t NumVectors,
           index_t ScalarPerVector>
 __host__ __device__ constexpr auto size(const Tensor<BufferAddressSpace,
                                                      ElementType,
                                                      Shape,
-                                                     FlattenDescriptorType,
+                                                     UnnestedDescriptorType,
                                                      NumVectors,
                                                      ScalarPerVector>& tensor)
 {
@@ -193,13 +194,13 @@ template <index_t... Idxs,
           MemoryTypeEnum BufferAddressSpace,
           typename ElementType,
           typename Shape,
-          typename FlattenDescriptorType,
+          typename UnnestedDescriptorType,
           index_t NumVectors,
           index_t ScalarPerVector>
 __host__ __device__ constexpr auto rank(const Tensor<BufferAddressSpace,
                                                      ElementType,
                                                      Shape,
-                                                     FlattenDescriptorType,
+                                                     UnnestedDescriptorType,
                                                      NumVectors,
                                                      ScalarPerVector>& tensor)
 {
@@ -217,13 +218,13 @@ template <index_t... Idxs,
           MemoryTypeEnum BufferAddressSpace,
           typename ElementType,
           typename Shape,
-          typename FlattenDescriptorType,
+          typename UnnestedDescriptorType,
           index_t NumVectors,
           index_t ScalarPerVector>
 __host__ __device__ constexpr auto depth(const Tensor<BufferAddressSpace,
                                                       ElementType,
                                                       Shape,
-                                                      FlattenDescriptorType,
+                                                      UnnestedDescriptorType,
                                                       NumVectors,
                                                       ScalarPerVector>& tensor)
 {
@@ -239,13 +240,13 @@ __host__ __device__ constexpr auto depth(const Tensor<BufferAddressSpace,
 template <MemoryTypeEnum BufferAddressSpace,
           typename ElementType,
           typename Shape,
-          typename FlattenDescriptorType,
+          typename UnnestedDescriptorType,
           index_t NumVectors,
           index_t ScalarPerVector>
 __host__ __device__ constexpr const auto& shape(const Tensor<BufferAddressSpace,
                                                              ElementType,
                                                              Shape,
-                                                             FlattenDescriptorType,
+                                                             UnnestedDescriptorType,
                                                              NumVectors,
                                                              ScalarPerVector>& tensor)
 {
diff --git a/test/wrapper/test_tensor.cpp b/test/wrapper/test_tensor.cpp
index 1392ee1b516..2d4d6f2750b 100644
--- a/test/wrapper/test_tensor.cpp
+++ b/test/wrapper/test_tensor.cpp
@@ -199,6 +199,13 @@ TEST(TestTensor, Slicing)
     EXPECT_EQ(ck::wrapper::size(tensor2), 2);
     EXPECT_TRUE(TestTensorCheck1d(tensor2));
 
+    auto tensor2_v2 = tensor(2, ck::wrapper::slice(0, 2));
+    EXPECT_EQ(tensor2_v2(0), layout(ck::make_tuple(2, 0)));
+    EXPECT_EQ(ck::wrapper::rank(tensor2_v2), 1);
+    EXPECT_EQ(ck::wrapper::depth(tensor2_v2), 1);
+    EXPECT_EQ(ck::wrapper::size(tensor2_v2), 2);
+    EXPECT_TRUE(TestTensorCheck1d(tensor2_v2));
+
     // negative indexing
     auto tensor1x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(0, -2)), ck::wrapper::slice());
     EXPECT_EQ(tensor1x2(0), layout(ck::make_tuple(ck::make_tuple(1, 0), 0)));

From 309b77b00e11411584bc6b57acdc22a78ea33daf Mon Sep 17 00:00:00 2001
From: Bartlomiej Kocot <barkocot@amd.com>
Date: Mon, 1 Jan 2024 22:26:34 +0000
Subject: [PATCH 6/7] Fix client example

---
 include/ck/wrapper/utils/layout_utils.hpp | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/include/ck/wrapper/utils/layout_utils.hpp b/include/ck/wrapper/utils/layout_utils.hpp
index 684cec42453..f4ba0a969ff 100644
--- a/include/ck/wrapper/utils/layout_utils.hpp
+++ b/include/ck/wrapper/utils/layout_utils.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -32,24 +32,22 @@ namespace {
 // Generate packed (column-major) strides if not passed
 template <typename... Ts>
 __host__ __device__ constexpr static auto
-GenerateColumnMajorPackedStrides(const Tuple<Ts...>& shape, index_t& stride)
+GenerateColumnMajorPackedStrides(const Tuple<Ts...>& shape)
 {
+    const auto unrolled_shape = UnrollNestedTuple(shape);
     return generate_tuple(
         [&](auto i) {
-            const auto num_i = Number<i>{};
-            if constexpr(is_detected<is_tuple, tuple_element_t<i, Tuple<Ts...>>>::value)
+            if constexpr(i.value == 0)
             {
-                return GenerateColumnMajorPackedStrides(shape.At(num_i), stride);
+                return Number<1>{};
             }
             else
             {
-                const index_t dim_stride = stride;
-                // update stride
-                stride *= shape.At(num_i);
-                return dim_stride;
+                return TupleReduce<Number<0>{}.value, i.value>([](auto x, auto y) { return x * y; },
+                                                               unrolled_shape);
             }
         },
-        Number<Tuple<Ts...>::Size()>{});
+        Number<decltype(unrolled_shape)::Size()>{});
 }
 
 template <typename LayoutShape, typename LayoutStrides>
@@ -59,10 +57,8 @@ __host__ __device__ constexpr auto MakeFlattenDescriptor(const LayoutShape& shap
     const auto unrolled_shape = UnrollNestedTuple(shape);
     if constexpr(is_same_v<LayoutStrides, Tuple<>>)
     {
-        index_t start_stride = 1;
         // if not passed, then generate
-        const auto unrolled_strides =
-            GenerateColumnMajorPackedStrides(unrolled_shape, start_stride);
+        const auto unrolled_strides = GenerateColumnMajorPackedStrides(unrolled_shape);
         static_assert(unrolled_shape.Size() == unrolled_strides.Size(),
                       "Size of strides and shape are not consistent.");
         return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides);

From 9a9f572b1ab8f6fea121772c8714cac77af32bd4 Mon Sep 17 00:00:00 2001
From: Bartlomiej Kocot <barkocot@amd.com>
Date: Tue, 2 Jan 2024 11:22:25 +0000
Subject: [PATCH 7/7] Fix comments

---
 include/ck/wrapper/tensor.hpp                 |  4 ++--
 include/ck/wrapper/utils/tensor_partition.hpp | 10 +++++-----
 test/wrapper/test_copy.cpp                    |  9 +++------
 test/wrapper/test_partition.cpp               |  7 ++-----
 4 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/include/ck/wrapper/tensor.hpp b/include/ck/wrapper/tensor.hpp
index 787c5b2176e..a3636413730 100644
--- a/include/ck/wrapper/tensor.hpp
+++ b/include/ck/wrapper/tensor.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -16,7 +16,7 @@ namespace wrapper {
  * \tparam BufferAddressSpace Memory type (Generic, Global, LDS, VGPR, SGPR).
  * \tparam ElementType Element data type.
  * \tparam Shape Tensor shape (layout component).
- * \tparam UnnestedDescriptorType Flatten descriptor (layout component).
+ * \tparam UnnestedDescriptorType Unnested descriptor (layout component).
  * \tparam NumVectors Number of vectors (only for VGPR, SGPR).
  * \tparam ScalarPerVector Scalars per vector (only for VGPR, SGPR).
  */
diff --git a/include/ck/wrapper/utils/tensor_partition.hpp b/include/ck/wrapper/utils/tensor_partition.hpp
index 38df6531aec..a0634f6b38d 100644
--- a/include/ck/wrapper/utils/tensor_partition.hpp
+++ b/include/ck/wrapper/utils/tensor_partition.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -148,7 +148,7 @@ __host__ __device__ constexpr auto CalculateLayoutOffsetIdxImpl(const Tuple<Ls..
         Number<Tuple<Ls...>::Size()>{});
 }
 
-// Convert interger thread_idx to tuple index with applied steps
+// Convert integer thread_idx to tuple index with steps applied
 template <typename... Ls, typename... Steps>
 __host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple<Ls...>& thread_lengths,
                                                             const Tuple<Steps...>& steps,
@@ -159,7 +159,7 @@ __host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple<Ls...>&
     return CalculateLayoutOffsetIdxImpl(thread_lengths, steps, thread_id_copy);
 }
 
-// Aply steps to index represented as tuple
+// Apply steps to index represented as tuple
 template <typename... Steps, typename... Idxs>
 __host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple<Steps...>& steps,
                                                             const Tuple<Idxs...>& block_idxs)
@@ -195,8 +195,8 @@ __host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple<Steps...
         Number<Tuple<Idxs...>::Size()>{});
 }
 
-// For make_local_tile user pass only shape per block. This function calculates
-// block layout based on shape.
+// User passes only shape per block to the make_local_tile function. This function calculates
+// block layout based on the shape.
 template <typename... Ts, typename... BlockDims>
 __host__ __device__ constexpr auto CalculateBlockLengths(const Tuple<Ts...>& shape,
                                                          const Tuple<BlockDims...>& tile_shape)
diff --git a/test/wrapper/test_copy.cpp b/test/wrapper/test_copy.cpp
index 457288a4a97..5cf09a54be9 100644
--- a/test/wrapper/test_copy.cpp
+++ b/test/wrapper/test_copy.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <numeric>
 #include <cstdlib>
@@ -8,13 +8,10 @@
 #include <vector>
 #include <gtest/gtest.h>
 
+#include "ck/host_utility/kernel_launch.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/check_err.hpp"
-
-#include "ck/host_utility/kernel_launch.hpp"
-
 #include "ck/utility/common_header.hpp"
-
 #include "ck/wrapper/layout.hpp"
 #include "ck/wrapper/tensor.hpp"
 #include "ck/wrapper/operations/copy.hpp"
@@ -75,7 +72,7 @@ void PerformCopyGlobalToGlobalViaLDS()
         ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<2>{}), ck::Number<4>{});
     const auto layout = ck::wrapper::make_layout(shape, strides);
 
-    // 0,1,2...size(shape) - 1
+    // 0, 1, 2, ..., size(shape) - 1
     std::vector<ck::index_t> input_data(ck::wrapper::size(shape));
     std::iota(input_data.begin(), input_data.end(), 0);
 
diff --git a/test/wrapper/test_partition.cpp b/test/wrapper/test_partition.cpp
index fe214e9ed07..df56b879f69 100644
--- a/test/wrapper/test_partition.cpp
+++ b/test/wrapper/test_partition.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <numeric>
 #include <cstdlib>
@@ -8,13 +8,10 @@
 #include <vector>
 #include <gtest/gtest.h>
 
+#include "ck/host_utility/kernel_launch.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/check_err.hpp"
-
-#include "ck/host_utility/kernel_launch.hpp"
-
 #include "ck/utility/common_header.hpp"
-
 #include "ck/wrapper/layout.hpp"
 #include "ck/wrapper/tensor.hpp"