From 3d920d9a84f14135a79902e48d2a04359e5cf166 Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Wed, 20 Dec 2023 20:20:02 +0000 Subject: [PATCH 1/7] Add tensor partition and generic copy for ck wrapper --- docs/wrapper.rst | 8 + include/ck/wrapper/layout.hpp | 71 +++-- include/ck/wrapper/operations/copy.hpp | 42 +++ include/ck/wrapper/tensor.hpp | 3 + include/ck/wrapper/utils/layout_utils.hpp | 10 +- include/ck/wrapper/utils/tensor_partition.hpp | 264 ++++++++++++++++++ include/ck/wrapper/utils/tensor_utils.hpp | 24 +- test/wrapper/CMakeLists.txt | 4 + test/wrapper/test_copy.cpp | 132 +++++++++ test/wrapper/test_layout.cpp | 14 +- test/wrapper/test_partition.cpp | 172 ++++++++++++ test/wrapper/test_tensor.cpp | 7 +- 12 files changed, 705 insertions(+), 46 deletions(-) create mode 100644 include/ck/wrapper/operations/copy.hpp create mode 100644 include/ck/wrapper/utils/tensor_partition.hpp create mode 100644 test/wrapper/test_copy.cpp create mode 100644 test/wrapper/test_partition.cpp diff --git a/docs/wrapper.rst b/docs/wrapper.rst index a2f60b97ae4..da3a79eda85 100644 --- a/docs/wrapper.rst +++ b/docs/wrapper.rst @@ -71,3 +71,11 @@ Tensor helpers ------------------------------------- .. doxygenfile:: tensor_utils.hpp + +.. doxygenfile:: tensor_partition.hpp + +------------------------------------- +Operations +------------------------------------- + +.. doxygenfile:: copy.hpp diff --git a/include/ck/wrapper/layout.hpp b/include/ck/wrapper/layout.hpp index f20d985b497..4b6f1b19bf4 100644 --- a/include/ck/wrapper/layout.hpp +++ b/include/ck/wrapper/layout.hpp @@ -48,22 +48,24 @@ struct Layout // Generate packed (column-major) strides if not passed template __host__ __device__ constexpr static auto - GenerateColumnMajorPackedStrides(const Tuple& shape) + GenerateColumnMajorPackedStrides(const Tuple& shape, index_t& stride) { - const auto unrolled_shape = UnrollNestedTuple(shape); return generate_tuple( [&](auto i) { - if constexpr(i.value == 0) + const auto num_i = Number{}; + if constexpr(is_detected>>::value) { - return I1; + return GenerateColumnMajorPackedStrides(shape.At(num_i), stride); } else { - return TupleReduce([](auto x, auto y) { return x * y; }, - unrolled_shape); + const index_t dim_stride = stride; + // update stride + stride *= shape.At(num_i); + return dim_stride; } }, - Number{}); + Number::Size()>{}); } // Generate LowerDims in Compile-time for MergeTrasform using passed Type @@ -211,20 +213,28 @@ struct Layout __host__ __device__ static auto MakeFlattenDescriptor(const LayoutShape& shape, const LayoutStrides& strides) { - const auto unrolled_shape = UnrollNestedTuple(shape); - const auto unrolled_strides = UnrollNestedTuple(strides); - static_assert(unrolled_shape.Size() == unrolled_strides.Size(), - "Size of strides and shape are not consistent."); - return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides); + const auto unrolled_shape = UnrollNestedTuple(shape); + if constexpr(is_same_v>) + { + index_t start_stride = 1; + // if not passed, then generate + const auto unrolled_strides = + GenerateColumnMajorPackedStrides(unrolled_shape, start_stride); + static_assert(unrolled_shape.Size() == unrolled_strides.Size(), + "Size of strides and shape are not consistent."); + return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides); + } + else + { + const auto unrolled_strides = UnrollNestedTuple(strides); + static_assert(unrolled_shape.Size() == unrolled_strides.Size(), + "Size of strides and shape are not consistent."); + return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides); + } } - // If the stride is not passed, you can infer it from `GenerateColumnMajorPackedStrides`. - using DeducedStrides = - std::conditional_t>, - remove_cvref_t, - Strides>; using FlattenDescriptorType = - remove_cvref_t; + remove_cvref_t; using Descriptor1dType = remove_cvref_t; using DefaultIdxsTupleType = remove_cvref_t; @@ -290,7 +300,7 @@ struct Layout * \param shape Shape for layout. */ __host__ __device__ constexpr Layout(const Shape& shape) - : flatten_descriptor_{}, shape_(shape), strides_(GenerateColumnMajorPackedStrides(shape_)) + : flatten_descriptor_{}, shape_(shape), strides_() { if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime()) { @@ -299,6 +309,10 @@ struct Layout merged_nests_descriptor_ = TransformDesc(shape_, DefaultIdxsTupleType{}, flatten_descriptor_); } + else + { + static_assert(true, "Compiletime Layout require strides parameter."); + } } /** @@ -351,7 +365,7 @@ struct Layout * \return Calculated size. */ template - __host__ __device__ constexpr index_t GetLength() const + __host__ __device__ constexpr auto GetLength() const { const auto elem = shape_.At(Number{}); if constexpr(is_detected>::value) @@ -371,7 +385,7 @@ struct Layout * * \return Calculated size. */ - __host__ __device__ constexpr index_t GetLengths() const + __host__ __device__ constexpr auto GetLengths() const { const auto unrolled_shape = UnrollNestedTuple(shape_); return TupleReduce([](auto x, auto y) { return x * y; }, @@ -390,7 +404,18 @@ struct Layout * * \return Strides. */ - __host__ __device__ constexpr const DeducedStrides& GetStrides() const { return strides_; } + __host__ __device__ constexpr auto GetStrides() const + { + if constexpr(is_same_v>) + { + index_t start_stride = 1; + return GenerateColumnMajorPackedStrides(shape_, start_stride); + } + else + { + return strides_; + } + } /** * \brief Get default lengths (tuple filled with Shape length elements). @@ -427,7 +452,7 @@ struct Layout Descriptor1dType descriptor_1d_; MergedNestsDescriptorType merged_nests_descriptor_; const Shape shape_; - const DeducedStrides strides_; + const Strides strides_; }; } // namespace wrapper diff --git a/include/ck/wrapper/operations/copy.hpp b/include/ck/wrapper/operations/copy.hpp new file mode 100644 index 00000000000..267464c0963 --- /dev/null +++ b/include/ck/wrapper/operations/copy.hpp @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "../utils/tensor_utils.hpp" + +namespace ck { +namespace wrapper { + +/** + * \brief Perform generic copy between two tensors. Tensors must have the + * same size. + * + * \param src_tensor Source tensor. + * \param dst_tensor Destination tensor. + */ +template +__host__ __device__ void copy(const SrcTensorType& src_tensor, DstTensorType& dst_tensor) +{ + assert(size(src_tensor) == size(dst_tensor)); + using SrcSizeTensor = decltype(size(src_tensor)); + using DstSizeTensor = decltype(size(dst_tensor)); + if constexpr(is_known_at_compile_time::value) + { + static_for<0, SrcSizeTensor{}, 1>{}([&](auto i) { dst_tensor(i) = src_tensor(i); }); + } + else if constexpr(is_known_at_compile_time::value) + { + static_for<0, DstSizeTensor{}, 1>{}([&](auto i) { dst_tensor(i) = src_tensor(i); }); + } + else + { + for(int i = 0; i < size(src_tensor); i++) + { + dst_tensor(i) = src_tensor(i); + } + } +} + +} // namespace wrapper +} // namespace ck diff --git a/include/ck/wrapper/tensor.hpp b/include/ck/wrapper/tensor.hpp index 4ec6498fbc6..fad790a6902 100644 --- a/include/ck/wrapper/tensor.hpp +++ b/include/ck/wrapper/tensor.hpp @@ -4,6 +4,7 @@ #pragma once #include "utils/tensor_utils.hpp" +#include "utils/tensor_partition.hpp" #include "utils/layout_utils.hpp" namespace ck { @@ -292,6 +293,8 @@ struct Tensor return layout_.GetDefaultDescriptor(); } + __host__ __device__ ElementType* GetPointer() const { return buffer_.p_data_; } + private: using DynamicBufferType = DynamicBuffer -__host__ __device__ constexpr index_t size(const Layout& layout) +__host__ __device__ constexpr auto size(const Layout& layout) { return layout.template GetLength(); } @@ -155,7 +155,7 @@ __host__ __device__ constexpr index_t size(const Layout& layout) * \return Requsted size. */ template -__host__ __device__ constexpr index_t size(const Tuple& shape) +__host__ __device__ constexpr auto size(const Tuple& shape) { const auto unrolled_shape = UnrollNestedTuple(shape); return TupleReduce<0, unrolled_shape.Size()>([](auto x, auto y) { return x * y; }, @@ -169,7 +169,7 @@ __host__ __device__ constexpr index_t size(const Tuple& shape) * \return Requsted size. */ template -__host__ __device__ constexpr index_t size(const Layout& layout) +__host__ __device__ constexpr auto size(const Layout& layout) { return layout.GetLengths(); } @@ -182,7 +182,7 @@ __host__ __device__ constexpr index_t size(const Layout& layout) * \return Requsted length. */ template -__host__ __device__ constexpr index_t size(const Tuple& tuple) +__host__ __device__ constexpr auto size(const Tuple& tuple) { return size(tuple.At(Number{})); } @@ -314,7 +314,7 @@ __host__ __device__ constexpr auto depth(const T& elem) * \return Requsted strides. */ template -__host__ __device__ constexpr const auto& stride(const Layout& layout) +__host__ __device__ constexpr auto stride(const Layout& layout) { return layout.GetStrides(); } diff --git a/include/ck/wrapper/utils/tensor_partition.hpp b/include/ck/wrapper/utils/tensor_partition.hpp new file mode 100644 index 00000000000..8639c91b7bb --- /dev/null +++ b/include/ck/wrapper/utils/tensor_partition.hpp @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "tensor_utils.hpp" +#include "layout_utils.hpp" + +namespace ck { +namespace wrapper { + +namespace { +// Calculate shape for partition based on number of threads per each dim and +// previous shape +template +__host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple& shape, + const Tuple& thread_lengths) +{ + + static_assert(Tuple::Size() == Tuple::Size(), "Wrong thread_lengths shape."); + return generate_tuple( + [&](auto i) { + constexpr auto num_i = Number{}; + if constexpr(is_detected>>::value) + { + // if tuple then recurrence + return CalculateLocalPartitionShape(shape.At(num_i), thread_lengths.At(num_i)); + } + else + { + const auto slice_len = shape.At(num_i) / thread_lengths.At(i); + return slice_len; + } + }, + Number::Size()>{}); +} + +// Calculate shape for partition based on number of threads per each dim, +// previous strides and steps +template +__host__ __device__ constexpr auto CalculateLocalPartitionStrides( + const Tuple& strides, const Tuple& thread_lengths, const Tuple& steps) +{ + + static_assert(Tuple::Size() == Tuple::Size(), "Wrong thread_lengths shape."); + return generate_tuple( + [&](auto i) { + constexpr auto num_i = Number{}; + if constexpr(is_detected>>::value) + { + // if tuple then recurrence + if constexpr(is_same_v, Tuple<>>) + { + return CalculateLocalPartitionStrides( + strides.At(num_i), thread_lengths.At(num_i), Tuple<>{}); + } + else + { + return CalculateLocalPartitionStrides( + strides.At(num_i), thread_lengths.At(num_i), steps.At(num_i)); + } + } + else + { + if constexpr(is_same_v, Tuple<>>) + { + // By default raked partition + const auto partition_stride = thread_lengths.At(i); + return partition_stride * strides.At(i); + } + else if constexpr(!is_same_v>, index_t>) + { + // Compiletime partition + if constexpr(is_same_v>, Number<1>>) + { + // raked + const auto partition_stride = thread_lengths.At(i); + return partition_stride * strides.At(i); + } + else + { + // packed + return strides.At(i); + } + } + else + { + // Runtime partition + if(steps.At(i) == 1) + { + // raked + const auto partition_stride = thread_lengths.At(i); + return partition_stride * strides.At(i); + } + else + { + // packed + return strides.At(i); + } + } + } + }, + Number::Size()>{}); +} + +// Convert interger thread_idx to tuple index with applied steps +template +__host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple& thread_lengths, + const Tuple& steps, + index_t& thread_id) +{ + return generate_tuple( + [&](auto i) { + constexpr auto num_i = Number{}; + if constexpr(is_detected>>::value) + { + // if tuple then recurrence + if constexpr(is_same_v, Tuple<>>) + { + return CalculateLayoutOffsetIdx(thread_lengths.At(num_i), Tuple<>{}, thread_id); + } + else + { + return CalculateLayoutOffsetIdx( + thread_lengths.At(num_i), steps.At(num_i), thread_id); + } + } + else + { + // Update thread_id after each dim + const auto dim_thread_id = thread_id % thread_lengths.At(i); + thread_id /= thread_lengths.At(i); + if constexpr(is_same_v, Tuple<>>) + { + return dim_thread_id; + } + else + { + // Apply step + return steps.At(num_i) * dim_thread_id; + } + } + }, + Number::Size()>{}); +} + +// Aply steps to index represented as tuple +template +__host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple& steps, + const Tuple& block_idxs) +{ + return generate_tuple( + [&](auto i) { + constexpr auto num_i = Number{}; + if constexpr(is_detected>>::value) + { + // if tuple then recurrence + if constexpr(is_same_v, Tuple<>>) + { + return CalculateLayoutOffsetIdx(Tuple<>{}, block_idxs.At(num_i)); + } + else + { + return CalculateLayoutOffsetIdx(steps.At(num_i), block_idxs.At(num_i)); + } + } + else + { + if constexpr(is_same_v, Tuple<>>) + { + return block_idxs.At(num_i); + } + else + { + // apply step + return steps.At(num_i) * block_idxs.At(num_i); + } + } + }, + Number::Size()>{}); +} + +// For make_local_tile user pass only shape per block. This function calculates +// block layout based on shape. +template +__host__ __device__ constexpr auto CalculateBlockLengths(const Tuple& shape, + const Tuple& block_shape) +{ + return generate_tuple( + [&](auto i) { + constexpr auto num_i = Number{}; + if constexpr(is_detected>>::value) + { + // if tuple then recurrence + return CalculateBlockLengths(shape.At(num_i), block_shape.At(num_i)); + } + else + { + return shape.At(num_i) / block_shape.At(num_i); + } + }, + Number::Size()>{}); +} +} // namespace + +/** + * \brief Create local partition for thread. + * + * \param tensor Tensor for partition. + * \param thread_lengths Layout of threads. + * \param thread_id Thread index represented as integer. + * \param steps Thread step (default=1, raked partition) + * \return Partition tensor. + */ +template > +__host__ __device__ constexpr auto make_local_partition(const TensorType& tensor, + const ThreadLengthsTuple& thread_lengths, + const index_t thread_id, + const StepsTuple steps = StepsTuple{}) +{ + // Create shape, strides and layout for new partition tensor + const auto partition_shape = CalculateLocalPartitionShape(shape(tensor), thread_lengths); + const auto partition_strides = + CalculateLocalPartitionStrides(stride(tensor), thread_lengths, steps); + const auto partition_layout = make_layout(partition_shape, partition_strides); + // Calculate offset for new partition tensor + index_t thread_id_copy = thread_id; + const auto offset_idx = CalculateLayoutOffsetIdx(thread_lengths, steps, thread_id_copy); + const auto partition_offset = layout(tensor)(offset_idx); + return make_tensor(tensor.GetPointer() + partition_offset, + partition_layout); +} + +/** + * \brief Create local tile for thread block. + * + * \param tensor Tensor for partition. + * \param block_shape Shapes of requested tile. + * \param block_idx Block index represented as tuple. + * \param steps Block step (default=1, raked partition) + * \return Tile tensor. + */ +template > +__host__ __device__ constexpr auto make_local_tile(const TensorType& tensor, + const BlockShapeTuple& block_shape, + const BlockIdxTuple& block_idx, + const StepsTuple steps = StepsTuple{}) +{ + // Create block lengths, strides and layout for new tile tensor + const auto block_lengths = CalculateBlockLengths(shape(tensor), block_shape); + const auto block_strides = CalculateLocalPartitionStrides(stride(tensor), block_lengths, steps); + const auto tile_layout = make_layout(block_shape, block_strides); + // Calculate offset for new partition tensor + const auto offset_idx = CalculateLayoutOffsetIdx(steps, block_idx); + const auto partition_offset = layout(tensor)(offset_idx); + return make_tensor(tensor.GetPointer() + partition_offset, + tile_layout); +} + +} // namespace wrapper +} // namespace ck diff --git a/include/ck/wrapper/utils/tensor_utils.hpp b/include/ck/wrapper/utils/tensor_utils.hpp index 5f0dc3e5006..35bb4f04be7 100644 --- a/include/ck/wrapper/utils/tensor_utils.hpp +++ b/include/ck/wrapper/utils/tensor_utils.hpp @@ -112,19 +112,21 @@ constexpr auto make_tensor(ElementType* pointer, const Layout& l * \tparam NumVectors Number of vectors. * \tparam ScalarPerVector Scalars per vector. * \tparam ElementType Memory data type. - * \param layout Tensor layout. * \return Constructed tensor. */ template -constexpr auto make_register_tensor(const Layout& layout) + typename ElementType> +constexpr auto make_register_tensor() { - static_assert(!IsNestedTuple(Shape{}), "Register tensor with nested layout is not supported"); - return Tensor(layout); + const auto layout = make_layout(make_tuple(Number{}), make_tuple(Number<1>{})); + return Tensor>, + Tuple>, + NumVectors, + ScalarPerVector>(layout); } /** @@ -160,7 +162,7 @@ template -__host__ __device__ constexpr index_t +__host__ __device__ constexpr auto size(const Tensor& tensor) { @@ -181,7 +183,7 @@ template -__host__ __device__ constexpr index_t +__host__ __device__ constexpr auto rank(const Tensor& tensor) { @@ -202,7 +204,7 @@ template -__host__ __device__ constexpr index_t +__host__ __device__ constexpr auto depth(const Tensor& tensor) { @@ -221,7 +223,7 @@ template -__host__ __device__ constexpr const auto& +__host__ __device__ constexpr auto stride(const Tensor& tensor) { diff --git a/test/wrapper/CMakeLists.txt b/test/wrapper/CMakeLists.txt index 6b25c08a8a5..6c3e29ab87d 100644 --- a/test/wrapper/CMakeLists.txt +++ b/test/wrapper/CMakeLists.txt @@ -2,3 +2,7 @@ add_gtest_executable(test_layout test_layout.cpp) target_link_libraries(test_layout PRIVATE utility) add_gtest_executable(test_tensor test_tensor.cpp) target_link_libraries(test_tensor PRIVATE utility) +add_gtest_executable(test_copy test_copy.cpp) +target_link_libraries(test_copy PRIVATE utility) +add_gtest_executable(test_partition test_partition.cpp) +target_link_libraries(test_partition PRIVATE utility) diff --git a/test/wrapper/test_copy.cpp b/test/wrapper/test_copy.cpp new file mode 100644 index 00000000000..21693ad3ce1 --- /dev/null +++ b/test/wrapper/test_copy.cpp @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include + +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/check_err.hpp" + +#include "ck/host_utility/kernel_launch.hpp" + +#include "ck/utility/common_header.hpp" + +#include "ck/wrapper/layout.hpp" +#include "ck/wrapper/tensor.hpp" +#include "ck/wrapper/operations/copy.hpp" + +// Test copy from Global to Global through LDS and SGPR +template +__global__ void TestCopyDevice(const InputTensor input_tensor, + OutputTensor output_tensor, + const BlockShape block_shape, + const ThreadLayoutShape thread_layout, + const LocalTileSteps block_steps, + const LocalPartitionSteps thread_steps) +{ + __shared__ ck::index_t p_shared[ck::wrapper::size(block_shape)]; + auto tensor_lds = ck::wrapper::make_tensor( + p_shared, ck::wrapper::make_layout(block_shape)); + + const auto block_idxs = ck::make_tuple(ck::make_tuple(0, 0), blockIdx.x); + + // Get local tiles for global memory + const auto input_make_local_tile = + ck::wrapper::make_local_tile(input_tensor, block_shape, block_idxs, block_steps); + const auto output_make_local_tile = + ck::wrapper::make_local_tile(output_tensor, block_shape, block_idxs, block_steps); + + // Get partition per thread + const auto input_make_local_partition = ck::wrapper::make_local_partition( + input_make_local_tile, thread_layout, threadIdx.x, thread_steps); + auto lds_make_local_partition = + ck::wrapper::make_local_partition(tensor_lds, thread_layout, threadIdx.x, thread_steps); + auto output_make_local_partition = ck::wrapper::make_local_partition( + output_make_local_tile, thread_layout, threadIdx.x, thread_steps); + + // Allocate sgpr + constexpr ck::index_t scalar_per_vector = 1; + constexpr ck::index_t sgpr_size = ck::wrapper::size(lds_make_local_partition); + auto tensor_sgpr = ck::wrapper::make_register_tensor(); + + // Perform copy + ck::wrapper::copy(input_make_local_partition, lds_make_local_partition); + ck::wrapper::copy(lds_make_local_partition, tensor_sgpr); + ck::wrapper::copy(tensor_sgpr, output_make_local_partition); +} + +void PerformCopyGlobalToGlobalViaLDS() +{ + const auto shape = + ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<256>{}); + const auto strides = + ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<2>{}), ck::Number<4>{}); + const auto layout = ck::wrapper::make_layout(shape, strides); + + // 0,1,2...size(shape) - 1 + std::vector input_data(ck::wrapper::size(shape)); + std::iota(input_data.begin(), input_data.end(), 0); + + // Global memory buffers + DeviceMem in_buf(ck::wrapper::size(layout) * sizeof(ck::index_t)); + DeviceMem out_buf(ck::wrapper::size(layout) * sizeof(ck::index_t)); + + in_buf.ToDevice(input_data.data()); + out_buf.SetZero(); + + // Create tensors for global memory + const auto input_tensor_global = ck::wrapper::make_tensor( + static_cast(in_buf.GetDeviceBuffer()), layout); + auto output_tensor_global = ck::wrapper::make_tensor( + static_cast(out_buf.GetDeviceBuffer()), layout); + + const auto thread_layout = + ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<32>{}); + const auto block_shape = + ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<64>{}); + + const auto thread_steps = + ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<2>{}); + const auto block_steps = + ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<64>{}); + + const ck::index_t grid_size = ck::math::integer_divide_ceil( + ck::wrapper::size(input_tensor_global), ck::wrapper::size(block_shape)); + + const auto kernel = TestCopyDevice; + launch_and_time_kernel(StreamConfig{}, + kernel, + dim3(grid_size), + dim3(ck::wrapper::size(thread_layout)), + 0, + input_tensor_global, + output_tensor_global, + block_shape, + thread_layout, + block_steps, + thread_steps); + + // Verify results + std::vector output_data(ck::wrapper::size(shape)); + out_buf.FromDevice(output_data.data()); + EXPECT_TRUE(ck::utils::check_err(output_data, input_data)); +} + +TEST(TestCopy, CopyGlobalToGlobalViaLDS) { PerformCopyGlobalToGlobalViaLDS(); } diff --git a/test/wrapper/test_layout.cpp b/test/wrapper/test_layout.cpp index 14a8b964628..d5dba4841f5 100644 --- a/test/wrapper/test_layout.cpp +++ b/test/wrapper/test_layout.cpp @@ -84,7 +84,8 @@ TEST_F(TestWrapperLayout, 2d) ck::make_tuple(ck::Sequence<0>{})); const auto layout_runtime = ck::wrapper::make_layout(ck::make_tuple(d1, d0)); const auto layout_compiletime = - ck::wrapper::make_layout(ck::make_tuple(ck::Number{}, ck::Number{})); + ck::wrapper::make_layout(ck::make_tuple(ck::Number{}, ck::Number{}), + ck::make_tuple(ck::Number{}, ck::Number{})); std::vector> idxs; for(ck::index_t h = 0; h < d1; h++) @@ -436,18 +437,25 @@ TEST(TestLayoutHelpers, ShapeAndStrides) std::is_same_v>; constexpr bool check_compiletime_strides = - std::is_same_v, std::remove_reference_t>; constexpr bool check_runtime_shape = std::is_same_v>; constexpr bool check_runtime_strides = - std::is_same_v, std::remove_reference_t>; EXPECT_TRUE(check_compiletime_shape); EXPECT_TRUE(check_compiletime_strides); EXPECT_TRUE(check_runtime_shape); EXPECT_TRUE(check_runtime_strides); + + // Check packed strides generation + const auto packed_layout = ck::wrapper::make_layout(shape_runtime); + constexpr bool check_packed_layout_strides = + std::is_same_v, + std::remove_reference_t>; + EXPECT_TRUE(check_packed_layout_strides); } TEST(TestLayoutHelpers, Hierarchical) diff --git a/test/wrapper/test_partition.cpp b/test/wrapper/test_partition.cpp new file mode 100644 index 00000000000..a2ae30f5de0 --- /dev/null +++ b/test/wrapper/test_partition.cpp @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include + +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/check_err.hpp" + +#include "ck/host_utility/kernel_launch.hpp" + +#include "ck/utility/common_header.hpp" + +#include "ck/wrapper/layout.hpp" +#include "ck/wrapper/tensor.hpp" + +TEST(TestPartition, LocalPartition) +{ + const auto shape = + ck::make_tuple(ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}), ck::Number<4>{}); + const auto strides = + ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}), ck::Number<64>{}); + const auto layout = ck::wrapper::make_layout(shape, strides); + + std::vector data(ck::wrapper::size(layout)); + std::iota(data.begin(), data.end(), 0); + + const auto tensor = + ck::wrapper::make_tensor(data.data(), layout); + + const auto thread_steps = + ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<1>{}), ck::Number<1>{}); + const auto thread_layout = + ck::make_tuple(ck::make_tuple(ck::Number<8>{}, ck::Number<1>{}), ck::Number<1>{}); + + for(ck::index_t thread_id = 0; thread_id < ck::wrapper::size(thread_layout); thread_id++) + { + const auto raked_partition = + ck::wrapper::make_local_partition(tensor, thread_layout, thread_id); + + const auto expected_partition_size = + ck::wrapper::size(tensor) / ck::wrapper::size(thread_layout); + EXPECT_EQ(ck::wrapper::size(raked_partition), expected_partition_size); + EXPECT_EQ(raked_partition(0), thread_id); + + const auto expected_partition_stride_00 = + ck::wrapper::size<0, 0>(strides) * ck::wrapper::size<0, 0>(thread_layout); + const auto expected_partition_stride_01 = + ck::wrapper::size<0, 1>(strides) * ck::wrapper::size<0, 1>(thread_layout); + const auto expected_partition_stride_1 = + ck::wrapper::size<1>(strides) * ck::wrapper::size<1>(thread_layout); + const auto partition_stride_00 = + ck::wrapper::size<0, 0>(ck::wrapper::stride(raked_partition)); + const auto partition_stride_01 = + ck::wrapper::size<0, 1>(ck::wrapper::stride(raked_partition)); + const auto partition_stride_1 = ck::wrapper::size<1>(ck::wrapper::stride(raked_partition)); + EXPECT_EQ(partition_stride_00, expected_partition_stride_00); + EXPECT_EQ(partition_stride_01, expected_partition_stride_01); + EXPECT_EQ(partition_stride_1, expected_partition_stride_1); + } + + for(ck::index_t thread_id = 0; thread_id < ck::wrapper::size(thread_layout); thread_id++) + { + const auto packed_partition = + ck::wrapper::make_local_partition(tensor, thread_layout, thread_id, thread_steps); + + const auto expected_partition_size = + ck::wrapper::size(tensor) / ck::wrapper::size(thread_layout); + const auto expected_partition_first_val = thread_id * ck::wrapper::size<0, 0>(thread_steps); + EXPECT_EQ(ck::wrapper::size(packed_partition), expected_partition_size); + EXPECT_EQ(packed_partition(0), expected_partition_first_val); + + const auto expected_partition_stride_00 = ck::wrapper::size<0, 0>(strides); + const auto expected_partition_stride_01 = ck::wrapper::size<0, 1>(strides); + const auto expected_partition_stride_1 = ck::wrapper::size<1>(strides); + const auto partition_stride_00 = + ck::wrapper::size<0, 0>(ck::wrapper::stride(packed_partition)); + const auto partition_stride_01 = + ck::wrapper::size<0, 1>(ck::wrapper::stride(packed_partition)); + const auto partition_stride_1 = ck::wrapper::size<1>(ck::wrapper::stride(packed_partition)); + EXPECT_EQ(partition_stride_00, expected_partition_stride_00); + EXPECT_EQ(partition_stride_01, expected_partition_stride_01); + EXPECT_EQ(partition_stride_1, expected_partition_stride_1); + } +} + +TEST(TestPartition, LocalTile) +{ + const auto shape = + ck::make_tuple(ck::make_tuple(ck::Number<16>{}, ck::Number<4>{}), ck::Number<4>{}); + const auto strides = + ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<16>{}), ck::Number<64>{}); + const auto layout = ck::wrapper::make_layout(shape, strides); + + std::vector data(ck::wrapper::size(layout)); + std::iota(data.begin(), data.end(), 0); + + const auto tensor = + ck::wrapper::make_tensor(data.data(), layout); + + const auto block_steps = + ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{}); + const auto block_shape = + ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{}); + const auto block_layout = + ck::make_tuple(ck::make_tuple(ck::Number<4>{}, ck::Number<2>{}), ck::Number<2>{}); + + std::vector, ck::index_t>> block_idxs; + for(ck::index_t x = 0; x < ck::wrapper::size<0, 0>(block_layout); x++) + { + for(ck::index_t y = 0; y < ck::wrapper::size<0, 1>(block_layout); y++) + { + for(ck::index_t z = 0; z < ck::wrapper::size<1>(block_layout); z++) + { + block_idxs.emplace_back(ck::make_tuple(x, y), z); + } + } + } + + for(const auto& block_idx : block_idxs) + { + const auto raked_tile = ck::wrapper::make_local_tile(tensor, block_shape, block_idx); + + const auto expected_tile_size = ck::wrapper::size(block_shape); + EXPECT_EQ(ck::wrapper::size(raked_tile), expected_tile_size); + EXPECT_EQ(raked_tile(0), layout(block_idx)); + + const auto expected_tile_stride_00 = + ck::wrapper::size<0, 0>(strides) * ck::wrapper::size<0, 0>(block_layout); + const auto expected_tile_stride_01 = + ck::wrapper::size<0, 1>(strides) * ck::wrapper::size<0, 1>(block_layout); + const auto expected_tile_stride_1 = + ck::wrapper::size<1>(strides) * ck::wrapper::size<1>(block_layout); + const auto tile_stride_00 = ck::wrapper::size<0, 0>(ck::wrapper::stride(raked_tile)); + const auto tile_stride_01 = ck::wrapper::size<0, 1>(ck::wrapper::stride(raked_tile)); + const auto tile_stride_1 = ck::wrapper::size<1>(ck::wrapper::stride(raked_tile)); + EXPECT_EQ(tile_stride_00, expected_tile_stride_00); + EXPECT_EQ(tile_stride_01, expected_tile_stride_01); + EXPECT_EQ(tile_stride_1, expected_tile_stride_1); + } + + for(const auto& block_idx : block_idxs) + { + const auto packed_tile = + ck::wrapper::make_local_tile(tensor, block_shape, block_idx, block_steps); + + const auto expected_tile_size = ck::wrapper::size(block_shape); + const auto expected_tile_first_val = + ck::wrapper::size<0, 0>(block_idx) * ck::wrapper::size<0, 0>(block_shape) * + ck::wrapper::size<0, 0>(strides) + + ck::wrapper::size<0, 1>(block_idx) * ck::wrapper::size<0, 1>(block_shape) * + ck::wrapper::size<0, 1>(strides) + + ck::wrapper::size<1>(block_idx) * ck::wrapper::size<1>(block_shape) * + ck::wrapper::size<1>(strides); + EXPECT_EQ(ck::wrapper::size(packed_tile), expected_tile_size); + EXPECT_EQ(packed_tile(0), expected_tile_first_val); + + const auto expected_tile_stride_00 = ck::wrapper::size<0, 0>(strides); + const auto expected_tile_stride_01 = ck::wrapper::size<0, 1>(strides); + const auto expected_tile_stride_1 = ck::wrapper::size<1>(strides); + const auto tile_stride_00 = ck::wrapper::size<0, 0>(ck::wrapper::stride(packed_tile)); + const auto tile_stride_01 = ck::wrapper::size<0, 1>(ck::wrapper::stride(packed_tile)); + const auto tile_stride_1 = ck::wrapper::size<1>(ck::wrapper::stride(packed_tile)); + EXPECT_EQ(tile_stride_00, expected_tile_stride_00); + EXPECT_EQ(tile_stride_01, expected_tile_stride_01); + EXPECT_EQ(tile_stride_1, expected_tile_stride_1); + } +} diff --git a/test/wrapper/test_tensor.cpp b/test/wrapper/test_tensor.cpp index 92f8e2e1bdc..72b56068749 100644 --- a/test/wrapper/test_tensor.cpp +++ b/test/wrapper/test_tensor.cpp @@ -108,7 +108,6 @@ __global__ void TestTensorReadWriteDevice(void* data, void* success) bool* casted_success_ptr = static_cast(success); const auto layout = ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 2), 2)); - constexpr auto register_layout = ck::wrapper::make_layout(ck::make_tuple(ck::Number<8>{})); auto tensor_global = ck::wrapper::make_tensor(casted_data_ptr, layout); @@ -116,11 +115,11 @@ __global__ void TestTensorReadWriteDevice(void* data, void* success) auto tensor_vgpr = ck::wrapper::make_register_tensor(register_layout); + ck::index_t>(); auto tensor_sgpr = ck::wrapper::make_register_tensor(register_layout); + ck::index_t>(); InitTensor(tensor_global); InitTensor(tensor_lds); @@ -151,7 +150,7 @@ TEST(TestTensor, ReadWriteGlobalLdsRegistersMemory) TestTensorReadWriteDevice, dim3(1), dim3(1), - nelems * sizeof(ck::index_t), + 0, data_buf.GetDeviceBuffer(), success_buf.GetDeviceBuffer()); From 40f423aaf4cf409912bcb5999445e0ad5e2e93e8 Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Wed, 20 Dec 2023 20:22:25 +0000 Subject: [PATCH 2/7] Update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2891b8585b6..abca69142e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ None - Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804) - Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799) - Support for Batched Gemm DL (#732) -- Introduce wrapper sublibrary (limited functionality). (#1071, #1098) +- Introduce wrapper sublibrary (limited functionality). (#1071, #1098, #1108) ### Changes - Changed the grouped convolution API to maintain consistency with other convolution kernels (#817) From c609ee7b69068b8d4472fc61ec9198f83e78313b Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Thu, 21 Dec 2023 16:32:12 +0000 Subject: [PATCH 3/7] Stylistic fixes --- include/ck/wrapper/operations/copy.hpp | 13 +++-- include/ck/wrapper/utils/tensor_partition.hpp | 14 +++--- test/wrapper/test_copy.cpp | 48 +++++++++---------- 3 files changed, 37 insertions(+), 38 deletions(-) diff --git a/include/ck/wrapper/operations/copy.hpp b/include/ck/wrapper/operations/copy.hpp index 267464c0963..aec80f9ca72 100644 --- a/include/ck/wrapper/operations/copy.hpp +++ b/include/ck/wrapper/operations/copy.hpp @@ -18,16 +18,15 @@ namespace wrapper { template __host__ __device__ void copy(const SrcTensorType& src_tensor, DstTensorType& dst_tensor) { - assert(size(src_tensor) == size(dst_tensor)); - using SrcSizeTensor = decltype(size(src_tensor)); - using DstSizeTensor = decltype(size(dst_tensor)); - if constexpr(is_known_at_compile_time::value) + if constexpr(!SrcTensorType::IsDynamicBuffer) { - static_for<0, SrcSizeTensor{}, 1>{}([&](auto i) { dst_tensor(i) = src_tensor(i); }); + using SizeType = decltype(size(src_tensor)); + static_for<0, SizeType{}, 1>{}([&](auto i) { dst_tensor(i) = src_tensor(i); }); } - else if constexpr(is_known_at_compile_time::value) + else if constexpr(!DstTensorType::IsDynamicBuffer) { - static_for<0, DstSizeTensor{}, 1>{}([&](auto i) { dst_tensor(i) = src_tensor(i); }); + using SizeType = decltype(size(dst_tensor)); + static_for<0, SizeType{}, 1>{}([&](auto i) { dst_tensor(i) = src_tensor(i); }); } else { diff --git a/include/ck/wrapper/utils/tensor_partition.hpp b/include/ck/wrapper/utils/tensor_partition.hpp index 8639c91b7bb..edc61e1e466 100644 --- a/include/ck/wrapper/utils/tensor_partition.hpp +++ b/include/ck/wrapper/utils/tensor_partition.hpp @@ -184,7 +184,7 @@ __host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple __host__ __device__ constexpr auto CalculateBlockLengths(const Tuple& shape, - const Tuple& block_shape) + const Tuple& tile_shape) { return generate_tuple( [&](auto i) { @@ -192,11 +192,11 @@ __host__ __device__ constexpr auto CalculateBlockLengths(const Tuple& sha if constexpr(is_detected>>::value) { // if tuple then recurrence - return CalculateBlockLengths(shape.At(num_i), block_shape.At(num_i)); + return CalculateBlockLengths(shape.At(num_i), tile_shape.At(num_i)); } else { - return shape.At(num_i) / block_shape.At(num_i); + return shape.At(num_i) / tile_shape.At(num_i); } }, Number::Size()>{}); @@ -235,7 +235,7 @@ __host__ __device__ constexpr auto make_local_partition(const TensorType& tensor * \brief Create local tile for thread block. * * \param tensor Tensor for partition. - * \param block_shape Shapes of requested tile. + * \param tile_shape Shapes of requested tile. * \param block_idx Block index represented as tuple. * \param steps Block step (default=1, raked partition) * \return Tile tensor. @@ -245,14 +245,14 @@ template > __host__ __device__ constexpr auto make_local_tile(const TensorType& tensor, - const BlockShapeTuple& block_shape, + const BlockShapeTuple& tile_shape, const BlockIdxTuple& block_idx, const StepsTuple steps = StepsTuple{}) { // Create block lengths, strides and layout for new tile tensor - const auto block_lengths = CalculateBlockLengths(shape(tensor), block_shape); + const auto block_lengths = CalculateBlockLengths(shape(tensor), tile_shape); const auto block_strides = CalculateLocalPartitionStrides(stride(tensor), block_lengths, steps); - const auto tile_layout = make_layout(block_shape, block_strides); + const auto tile_layout = make_layout(tile_shape, block_strides); // Calculate offset for new partition tensor const auto offset_idx = CalculateLayoutOffsetIdx(steps, block_idx); const auto partition_offset = layout(tensor)(offset_idx); diff --git a/test/wrapper/test_copy.cpp b/test/wrapper/test_copy.cpp index 21693ad3ce1..457288a4a97 100644 --- a/test/wrapper/test_copy.cpp +++ b/test/wrapper/test_copy.cpp @@ -19,7 +19,7 @@ #include "ck/wrapper/tensor.hpp" #include "ck/wrapper/operations/copy.hpp" -// Test copy from Global to Global through LDS and SGPR +// Test copy from Global to Global through LDS and VGPR template __global__ void TestCopyDevice(const InputTensor input_tensor, OutputTensor output_tensor, - const BlockShape block_shape, + const BlockShape tile_shape, const ThreadLayoutShape thread_layout, const LocalTileSteps block_steps, const LocalPartitionSteps thread_steps) { - __shared__ ck::index_t p_shared[ck::wrapper::size(block_shape)]; + __shared__ ck::index_t p_shared[ck::wrapper::size(tile_shape)]; auto tensor_lds = ck::wrapper::make_tensor( - p_shared, ck::wrapper::make_layout(block_shape)); + p_shared, ck::wrapper::make_layout(tile_shape)); const auto block_idxs = ck::make_tuple(ck::make_tuple(0, 0), blockIdx.x); // Get local tiles for global memory - const auto input_make_local_tile = - ck::wrapper::make_local_tile(input_tensor, block_shape, block_idxs, block_steps); - const auto output_make_local_tile = - ck::wrapper::make_local_tile(output_tensor, block_shape, block_idxs, block_steps); + const auto input_local_tile = + ck::wrapper::make_local_tile(input_tensor, tile_shape, block_idxs, block_steps); + const auto output_local_tile = + ck::wrapper::make_local_tile(output_tensor, tile_shape, block_idxs, block_steps); // Get partition per thread - const auto input_make_local_partition = ck::wrapper::make_local_partition( - input_make_local_tile, thread_layout, threadIdx.x, thread_steps); - auto lds_make_local_partition = + const auto input_local_partition = ck::wrapper::make_local_partition( + input_local_tile, thread_layout, threadIdx.x, thread_steps); + auto lds_local_partition = ck::wrapper::make_local_partition(tensor_lds, thread_layout, threadIdx.x, thread_steps); - auto output_make_local_partition = ck::wrapper::make_local_partition( - output_make_local_tile, thread_layout, threadIdx.x, thread_steps); + auto output_local_partition = ck::wrapper::make_local_partition( + output_local_tile, thread_layout, threadIdx.x, thread_steps); - // Allocate sgpr + // Allocate VGPR constexpr ck::index_t scalar_per_vector = 1; - constexpr ck::index_t sgpr_size = ck::wrapper::size(lds_make_local_partition); - auto tensor_sgpr = ck::wrapper::make_register_tensor(); // Perform copy - ck::wrapper::copy(input_make_local_partition, lds_make_local_partition); - ck::wrapper::copy(lds_make_local_partition, tensor_sgpr); - ck::wrapper::copy(tensor_sgpr, output_make_local_partition); + ck::wrapper::copy(input_local_partition, lds_local_partition); + ck::wrapper::copy(lds_local_partition, tensor_vgpr); + ck::wrapper::copy(tensor_vgpr, output_local_partition); } void PerformCopyGlobalToGlobalViaLDS() @@ -94,7 +94,7 @@ void PerformCopyGlobalToGlobalViaLDS() const auto thread_layout = ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<32>{}); - const auto block_shape = + const auto tile_shape = ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<64>{}); const auto thread_steps = @@ -103,11 +103,11 @@ void PerformCopyGlobalToGlobalViaLDS() ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<1>{}), ck::Number<64>{}); const ck::index_t grid_size = ck::math::integer_divide_ceil( - ck::wrapper::size(input_tensor_global), ck::wrapper::size(block_shape)); + ck::wrapper::size(input_tensor_global), ck::wrapper::size(tile_shape)); const auto kernel = TestCopyDevice; @@ -118,7 +118,7 @@ void PerformCopyGlobalToGlobalViaLDS() 0, input_tensor_global, output_tensor_global, - block_shape, + tile_shape, thread_layout, block_steps, thread_steps); From 0147b9431ec10d7d2b2a6df5a92b3a4c35a54e7e Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Fri, 29 Dec 2023 00:05:51 +0000 Subject: [PATCH 4/7] Change shape/strides logic to descriptor transforms --- include/ck/utility/tuple_helper.hpp | 11 + include/ck/wrapper/layout.hpp | 117 ++-------- include/ck/wrapper/tensor.hpp | 213 ++++++++++-------- include/ck/wrapper/utils/layout_utils.hpp | 155 +++++++++---- include/ck/wrapper/utils/tensor_partition.hpp | 121 +++++----- include/ck/wrapper/utils/tensor_utils.hpp | 95 ++++---- test/wrapper/test_layout.cpp | 15 -- test/wrapper/test_partition.cpp | 50 ---- test/wrapper/test_tensor.cpp | 15 +- 9 files changed, 378 insertions(+), 414 deletions(-) diff --git a/include/ck/utility/tuple_helper.hpp b/include/ck/utility/tuple_helper.hpp index 75f2693f205..f3652300541 100644 --- a/include/ck/utility/tuple_helper.hpp +++ b/include/ck/utility/tuple_helper.hpp @@ -178,4 +178,15 @@ __host__ __device__ constexpr auto TupleDepth(const Tuple&) return math::max(TupleDepth(Ts{})...); } +template +__host__ __device__ constexpr auto TupleSlice(const Tuple& tuple) +{ + return generate_tuple( + [&](auto i) { + using Idx = Number; + return tuple.At(Idx{}); + }, + Number{}); +} + } // namespace ck diff --git a/include/ck/wrapper/layout.hpp b/include/ck/wrapper/layout.hpp index 4b6f1b19bf4..c0a52c03327 100644 --- a/include/ck/wrapper/layout.hpp +++ b/include/ck/wrapper/layout.hpp @@ -14,11 +14,9 @@ namespace wrapper { * \tparam Shape Tuple of Number<> (for compile-time layout) or index_t * (dynamic layout). It is possible to pass nested shapes * (e.g. ((4, 2), 2)), nested dimensions are merged. - * \tparam Strides Tuple of Number<> (for compile-time layout) or index_t - * (dynamic layout). Stride tuple should be nested if shape tuple is - * nested. + * \tparam FlattenDescriptorType Tensor descriptor for flatten shape dims. */ -template +template struct Layout { private: @@ -45,29 +43,6 @@ struct Layout Number::Size()>{}); } - // Generate packed (column-major) strides if not passed - template - __host__ __device__ constexpr static auto - GenerateColumnMajorPackedStrides(const Tuple& shape, index_t& stride) - { - return generate_tuple( - [&](auto i) { - const auto num_i = Number{}; - if constexpr(is_detected>>::value) - { - return GenerateColumnMajorPackedStrides(shape.At(num_i), stride); - } - else - { - const index_t dim_stride = stride; - // update stride - stride *= shape.At(num_i); - return dim_stride; - } - }, - Number::Size()>{}); - } - // Generate LowerDims in Compile-time for MergeTrasform using passed Type // If element of Tuple is also tuple, then merge (generate sequence for merge) // If tuple is element, then pass through (sequence with one element) @@ -209,32 +184,6 @@ struct Layout return transform_tensor_descriptor(desc, transforms, lower_dims, upper_dims); } - template - __host__ __device__ static auto MakeFlattenDescriptor(const LayoutShape& shape, - const LayoutStrides& strides) - { - const auto unrolled_shape = UnrollNestedTuple(shape); - if constexpr(is_same_v>) - { - index_t start_stride = 1; - // if not passed, then generate - const auto unrolled_strides = - GenerateColumnMajorPackedStrides(unrolled_shape, start_stride); - static_assert(unrolled_shape.Size() == unrolled_strides.Size(), - "Size of strides and shape are not consistent."); - return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides); - } - else - { - const auto unrolled_strides = UnrollNestedTuple(strides); - static_assert(unrolled_shape.Size() == unrolled_strides.Size(), - "Size of strides and shape are not consistent."); - return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides); - } - } - - using FlattenDescriptorType = - remove_cvref_t; using Descriptor1dType = remove_cvref_t; using DefaultIdxsTupleType = remove_cvref_t; @@ -275,44 +224,25 @@ struct Layout } __host__ __device__ Layout() = delete; + /** * \brief Layout constructor. * * \param shape Shape for layout. - * \param strides Strides for layout (optional if tensor is packed). + * \param flatten_descriptor Descriptor */ - __host__ __device__ constexpr Layout(const Shape& shape, const Strides& strides) - : flatten_descriptor_{}, shape_(shape), strides_(strides) + __host__ __device__ constexpr Layout(const Shape& shape, + const FlattenDescriptorType& flatten_descriptor) + : shape_(shape) { // Construct if runtime mode if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime()) { - flatten_descriptor_ = MakeFlattenDescriptor(shape_, strides_); - descriptor_1d_ = MakeMerge1d(shape_, flatten_descriptor_); - merged_nests_descriptor_ = - TransformDesc(shape_, DefaultIdxsTupleType{}, flatten_descriptor_); - } - } - - /** - * \brief Layout constructor (with default packed column-major strides). - * - * \param shape Shape for layout. - */ - __host__ __device__ constexpr Layout(const Shape& shape) - : flatten_descriptor_{}, shape_(shape), strides_() - { - if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime()) - { - flatten_descriptor_ = MakeFlattenDescriptor(shape_, strides_); + flatten_descriptor_ = flatten_descriptor; descriptor_1d_ = MakeMerge1d(shape_, flatten_descriptor_); merged_nests_descriptor_ = TransformDesc(shape_, DefaultIdxsTupleType{}, flatten_descriptor_); } - else - { - static_assert(true, "Compiletime Layout require strides parameter."); - } } /** @@ -399,24 +329,6 @@ struct Layout */ __host__ __device__ constexpr const Shape& GetShape() const { return shape_; } - /** - * \brief Strides getter. - * - * \return Strides. - */ - __host__ __device__ constexpr auto GetStrides() const - { - if constexpr(is_same_v>) - { - index_t start_stride = 1; - return GenerateColumnMajorPackedStrides(shape_, start_stride); - } - else - { - return strides_; - } - } - /** * \brief Get default lengths (tuple filled with Shape length elements). * @@ -442,17 +354,26 @@ struct Layout * * \return Default descriptor. */ - __host__ __device__ constexpr MergedNestsDescriptorType GetDefaultDescriptor() + __host__ __device__ constexpr const MergedNestsDescriptorType& GetDefaultDescriptor() const { return merged_nests_descriptor_; } + /** + * \brief Get flatten descriptor (with unrolled dims) + * + * \return Flatten descriptor. + */ + __host__ __device__ constexpr const FlattenDescriptorType& GetFlattenDescriptor() const + { + return flatten_descriptor_; + } + private: FlattenDescriptorType flatten_descriptor_; Descriptor1dType descriptor_1d_; MergedNestsDescriptorType merged_nests_descriptor_; const Shape shape_; - const Strides strides_; }; } // namespace wrapper diff --git a/include/ck/wrapper/tensor.hpp b/include/ck/wrapper/tensor.hpp index fad790a6902..bc598178752 100644 --- a/include/ck/wrapper/tensor.hpp +++ b/include/ck/wrapper/tensor.hpp @@ -16,14 +16,14 @@ namespace wrapper { * \tparam BufferAddressSpace Memory type (Generic, Global, LDS, VGPR, SGPR). * \tparam ElementType Element data type. * \tparam Shape Tensor shape (layout component). - * \tparam Strides Tensor strides (layout component). + * \tparam FlattenDescriptorType Flatten descriptor (layout component). * \tparam NumVectors Number of vectors (only for VGPR, SGPR). * \tparam ScalarPerVector Scalars per vector (only for VGPR, SGPR). */ template @@ -32,50 +32,20 @@ struct Tensor private: // Check if Tuple contains Slice object template - constexpr static bool IsSlicing(T&&) + __host__ __device__ constexpr static bool IsSlicing(T&&) { return is_detected::value; } template - constexpr static bool IsSlicing(Tuple&&) + __host__ __device__ constexpr static bool IsSlicing(Tuple&&) { return (IsSlicing(Ts{}) || ...); } - // Calculate first index of new tensor after slice - // It is needed to calculate offset for new tensor - template - constexpr auto GetStartIdxForSlicedTensor(const Tuple& idx) const - { - const auto start_idx_for_sliced_tensor = generate_tuple( - [&](auto i) { - constexpr auto num_i = Number{}; - if constexpr(is_detected>>::value) - { - // if tuple then recurrence - return GetStartIdxForSlicedTensor(idx.At(num_i)); - } - else if constexpr(is_detected>>::value) - { - // if slice, return the beginning of the interval - return idx.At(num_i).from_; - } - else - { - // if one dim selected - return idx.At(num_i); - } - }, - Number::Size()>{}); - - return start_idx_for_sliced_tensor; - } - // Calculate new tensor shape after slice template - constexpr auto GetShapeFromSlicedTensor(const Tuple& idx, - const ShapeTmpType& shape) const + __host__ __device__ constexpr auto GetShapeFromSlicedTensor(const Tuple& idx, + const ShapeTmpType& shape) const { // Pack each value in tuple to remove empty tuples after generation auto new_shape = generate_tuple( @@ -113,67 +83,133 @@ struct Tensor return UnrollNestedTuple<0, 1>(new_shape); } - template - constexpr auto GetStridesFromSlicedTensor(const Tuple& idx, - const StridesTmpType& strides) const + // Generate Freeze for each of nested shape + template + __host__ __device__ constexpr auto GenerateMultipleFreeze(const T& idx, + const ShapeTmpType& shape) const + { + const auto unrolled_shape = UnrollNestedTuple(shape); + return generate_tuple( + [&](auto) { + // dimension offset from idx + return make_freeze_transform(idx); + }, + Number{}); + } + + template + __host__ __device__ constexpr auto + GetTransformsFromSlicedTensor(const Tuple& idx, const ShapeTmpType& shape) const { // Pack each value in tuple to remove empty tuples after generation - auto new_strides = generate_tuple( + auto transforms = generate_tuple( [&](auto i) { constexpr auto num_i = Number{}; if constexpr(is_detected>>::value) { - if constexpr(!IsSlicing(tuple_element_t>{})) - { - // if tuple does not have any slice then we can remove dimension - return Tuple<>{}; - } - else - { - // if tuple then recurrence - return make_tuple( - GetStridesFromSlicedTensor(idx.At(num_i), strides.At(num_i))); - } + return GetTransformsFromSlicedTensor(idx.At(num_i), shape.At(num_i)); } else if constexpr(is_detected>>::value) { - // Stride will be the same - return make_tuple(strides.At(num_i)); + + const auto from = idx.At(num_i).from_; + const auto dim = shape.At(num_i); + const auto range = idx.At(num_i).range(dim); + return make_slice_transform(dim, from, from + range); } else { // remove dimension for just value - return Tuple<>{}; + return GenerateMultipleFreeze(idx.At(num_i), shape.At(num_i)); } }, Number::Size()>{}); // Remove empty tuples (deleted elements) and return - return UnrollNestedTuple<0, 1>(new_strides); + return UnrollNestedTuple(transforms); + } + + // There is no output for Freeze transform + template + __host__ __device__ constexpr auto GetSequenceVal(const ck::Freeze&) const + { + return Sequence<>{}; + } + + template + __host__ __device__ constexpr auto + GetSequenceVal(const ck::Slice&) const + { + return Sequence{}; + } + + template + __host__ __device__ constexpr auto GenerateUpperDims(const Tuple<>&) const + { + return Tuple<>{}; + } + + template + __host__ __device__ constexpr auto + GenerateUpperDims(const Tuple& transforms) const + { + constexpr auto num_transforms = Tuple::Size(); + // Deduce Sequence element for specific transform + const auto currect_elem = GetSequenceVal(transforms.At(Number<0>{})); + if constexpr(is_same_v>) + { + const auto next_tuple = GenerateUpperDims(TupleSlice<1, num_transforms>(transforms)); + return concat_tuple(make_tuple(currect_elem), next_tuple); + } + else + { + // Increase i if current_elem is Slice transform + const auto next_tuple = + GenerateUpperDims(TupleSlice<1, num_transforms>(transforms)); + return concat_tuple(make_tuple(currect_elem), next_tuple); + } + } + + template + __host__ __device__ constexpr auto + GetDescriptorFromSlicedTensor(const Tuple& idx, + const ShapeTmpType& shape, + const FlattenDescriptor& flatten_desc) const + { + constexpr auto old_shape_dims = decltype(UnrollNestedTuple(shape))::Size(); + + const auto transforms = GetTransformsFromSlicedTensor(idx, shape); + using TransformsTupleType = decltype(transforms); + + const auto lower_dims = + generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + const auto upper_dims = decltype(GenerateUpperDims<0>(TransformsTupleType{})){}; + return transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims); } public: - using ElementSpaceSize = decltype(Layout{ - Shape{}, Strides{}}.GetElementSpaceSize()); // SpaceSize type for buffer - using TensorElementType = ElementType; // DataType + using ElementSpaceSize = decltype(Layout{ + Shape{}, FlattenDescriptorType{}}.GetElementSpaceSize()); // SpaceSize type for buffer + using TensorElementType = ElementType; // DataType static constexpr MemoryTypeEnum TensorBufferAddressSpace = BufferAddressSpace; static constexpr bool IsDynamicBuffer = !(BufferAddressSpace == MemoryTypeEnum ::Sgpr || BufferAddressSpace == MemoryTypeEnum ::Vgpr); __host__ __device__ Tensor() = delete; - __host__ __device__ Tensor(ElementType* pointer, const Layout& layout) + __host__ __device__ Tensor(ElementType* pointer, + const Layout& layout) : layout_(layout), buffer_(make_dynamic_buffer(pointer, layout.GetElementSpaceSize())) { } - __host__ __device__ Tensor(const Layout& layout) : layout_(layout) + __host__ __device__ Tensor(const Layout& layout) : layout_(layout) { static_assert(!IsDynamicBuffer, "Wrong BufferAddressSpace for register."); } - __host__ __device__ constexpr const Layout& GetLayout() const + __host__ __device__ constexpr const Layout& GetLayout() const { return layout_; } @@ -183,21 +219,14 @@ struct Tensor __host__ __device__ auto operator[](const Tuple& idx) const { static_assert(IsDynamicBuffer, "Register slice is not supported"); - // Calculate offset based on first idx for new tensor - const index_t offset = layout_(GetStartIdxForSlicedTensor(idx)); + const auto& shape = layout_.GetShape(); + auto new_shape = GetShapeFromSlicedTensor(idx, shape); - auto new_shape = GetShapeFromSlicedTensor(idx, layout_.GetShape()); - if constexpr(is_same_v>) - { - auto new_layout = make_layout(new_shape); - return make_tensor(buffer_.p_data_ + offset, new_layout); - } - else - { - auto new_strides = GetStridesFromSlicedTensor(idx, layout_.GetStrides()); - auto new_layout = make_layout(new_shape, new_strides); - return make_tensor(buffer_.p_data_ + offset, new_layout); - } + const auto& flatten_desc = layout_.GetFlattenDescriptor(); + auto new_desc = GetDescriptorFromSlicedTensor(idx, shape, flatten_desc); + const auto new_layout = + Layout(new_shape, new_desc); + return make_tensor(buffer_.p_data_, new_layout); } template {}), bool> = false> @@ -223,18 +252,10 @@ struct Tensor } else { - if constexpr(is_same_v>) - { - constexpr index_t offset = - Layout{Shape{}}.template operator()>(); - return buffer_[Number{}]; - } - else - { - constexpr index_t offset = - Layout{Shape{}, Strides{}}.template operator()>(); - return buffer_[Number{}]; - } + constexpr index_t offset = Layout{ + Shape{}, + FlattenDescriptorType{}}.template operator()>(); + return buffer_[Number{}]; } } @@ -261,18 +282,10 @@ struct Tensor } else { - if constexpr(is_same_v>) - { - constexpr index_t offset = - Layout{Shape{}}.template operator()>(); - return buffer_(Number{}); - } - else - { - constexpr index_t offset = - Layout{Shape{}, Strides{}}.template operator()>(); - return buffer_(Number{}); - } + constexpr index_t offset = Layout{ + Shape{}, + FlattenDescriptorType{}}.template operator()>(); + return buffer_(Number{}); } } @@ -309,7 +322,7 @@ struct Tensor // If register use static buffer, else use dynamic buffer using Buffer = std::conditional_t; - const Layout layout_; + const Layout layout_; Buffer buffer_; }; diff --git a/include/ck/wrapper/utils/layout_utils.hpp b/include/ck/wrapper/utils/layout_utils.hpp index e99596d8f50..a33311e1422 100644 --- a/include/ck/wrapper/utils/layout_utils.hpp +++ b/include/ck/wrapper/utils/layout_utils.hpp @@ -22,11 +22,61 @@ namespace wrapper { // Disable from doxygen docs generation /// @cond // forward declaration -template +template struct Layout; template using is_tuple = decltype(std::declval().IsTuple()); + +namespace { +// Generate packed (column-major) strides if not passed +template +__host__ __device__ constexpr static auto +GenerateColumnMajorPackedStrides(const Tuple& shape, index_t& stride) +{ + return generate_tuple( + [&](auto i) { + const auto num_i = Number{}; + if constexpr(is_detected>>::value) + { + return GenerateColumnMajorPackedStrides(shape.At(num_i), stride); + } + else + { + const index_t dim_stride = stride; + // update stride + stride *= shape.At(num_i); + return dim_stride; + } + }, + Number::Size()>{}); +} + +template +__host__ __device__ constexpr auto MakeFlattenDescriptor(const LayoutShape& shape, + const LayoutStrides& strides) +{ + const auto unrolled_shape = UnrollNestedTuple(shape); + if constexpr(is_same_v>) + { + index_t start_stride = 1; + // if not passed, then generate + const auto unrolled_strides = + GenerateColumnMajorPackedStrides(unrolled_shape, start_stride); + static_assert(unrolled_shape.Size() == unrolled_strides.Size(), + "Size of strides and shape are not consistent."); + return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides); + } + else + { + const auto unrolled_strides = UnrollNestedTuple(strides); + static_assert(unrolled_shape.Size() == unrolled_strides.Size(), + "Size of strides and shape are not consistent."); + return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides); + } +} +} // namespace + /// @endcond // make_* @@ -38,10 +88,10 @@ using is_tuple = decltype(std::declval().IsTuple()); * \return Constructed layout. */ template -__host__ __device__ constexpr Layout make_layout(const Shape& shape, - const Strides& strides) +__host__ __device__ constexpr auto make_layout(const Shape& shape, const Strides& strides) { - return Layout(shape, strides); + using FlattenDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Strides{})); + return Layout(shape, MakeFlattenDescriptor(shape, strides)); } /** @@ -52,9 +102,10 @@ __host__ __device__ constexpr Layout make_layout(const Shape& sh * \return Constructed layout. */ template -__host__ __device__ constexpr Layout> make_layout(const Shape& shape) +__host__ __device__ constexpr auto make_layout(const Shape& shape) { - return Layout>(shape); + using FlattenDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Tuple<>{})); + return Layout(shape, MakeFlattenDescriptor(shape, Tuple<>{})); } // Layout helpers @@ -89,26 +140,51 @@ __host__ __device__ constexpr auto get(const Tuple& tuple) * \param layout Layout to create sub layout. * \return Requsted sub layout. */ -template -__host__ __device__ constexpr auto get(const Layout& layout) +template +__host__ __device__ constexpr auto get(const Layout& layout) { - const auto& shape = layout.GetShape(); - const auto& new_shape = get(shape); + const auto& shape = layout.GetShape(); + const auto new_shape = get(shape); static_assert(is_detected::value, "Shape of sub layout must be tuple"); - if constexpr(is_same_v>) - { - // If stride not passed, create without strides - return make_layout(new_shape); - } - else - { - const auto& strides = layout.GetStrides(); - const auto& new_strides = get(strides); - static_assert(is_detected::value, - "Strides of sub layout must be tuple"); - return make_layout(new_shape, new_strides); - } + + constexpr auto old_shape_dims = decltype(UnrollNestedTuple(shape))::Size(); + constexpr auto new_shape_dims = decltype(UnrollNestedTuple(new_shape))::Size(); + constexpr auto shape_offset = decltype(UnrollNestedTuple(TupleSlice<0, idx>(shape)))::Size(); + + const auto unrolled_shape = UnrollNestedTuple(shape); + const auto transforms = generate_tuple( + [&](auto i) { + // Compare Idx with shape + if constexpr(i < shape_offset || i >= shape_offset + new_shape_dims) + { + // Remove dimension + return make_freeze_transform(Number<0>{}); + } + else + { + return make_pass_through_transform(unrolled_shape.At(i)); + } + }, + Number{}); + + const auto lower_dims = + generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + const auto upper_dims = generate_tuple( + [&](auto i) { + if constexpr(i < shape_offset || i >= shape_offset + new_shape_dims) + return Sequence<>{}; + + else + { + return Sequence{}; + } + }, + Number{}); + + const auto& flatten_desc = layout.GetFlattenDescriptor(); + auto new_desc = transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims); + return Layout(new_shape, new_desc); } /** @@ -142,8 +218,8 @@ __host__ __device__ T constexpr size(const T& dim) * \param layout Layout to get Shape of. * \return Requsted length. */ -template -__host__ __device__ constexpr auto size(const Layout& layout) +template +__host__ __device__ constexpr auto size(const Layout& layout) { return layout.template GetLength(); } @@ -168,8 +244,8 @@ __host__ __device__ constexpr auto size(const Tuple& shape) * \param layout Layout to calculate shape size. * \return Requsted size. */ -template -__host__ __device__ constexpr auto size(const Layout& layout) +template +__host__ __device__ constexpr auto size(const Layout& layout) { return layout.GetLengths(); } @@ -208,8 +284,9 @@ __host__ __device__ constexpr auto size(const T& elem) * \param layout Layout to calculate rank. * \return Requsted rank. */ -template -__host__ __device__ constexpr auto rank([[maybe_unused]] const Layout& layout) +template +__host__ __device__ constexpr auto +rank([[maybe_unused]] const Layout& layout) { return Shape::Size(); } @@ -261,8 +338,8 @@ __host__ __device__ constexpr auto rank(const T& elem) * \param layout Layout to calculate depth. * \return Requsted depth. */ -template -__host__ __device__ constexpr auto depth(const Layout& layout) +template +__host__ __device__ constexpr auto depth(const Layout& layout) { const auto& shape = layout.GetShape(); return TupleDepth(shape); @@ -307,26 +384,14 @@ __host__ __device__ constexpr auto depth(const T& elem) return depth(get(elem)); } -/** - * \brief Get Layout strides. - * - * \param layout Layout to get strides from. - * \return Requsted strides. - */ -template -__host__ __device__ constexpr auto stride(const Layout& layout) -{ - return layout.GetStrides(); -} - /** * \brief Get Layout shape. * * \param layout Layout to get shape from. * \return Requsted shape. */ -template -__host__ __device__ constexpr const auto& shape(const Layout& layout) +template +__host__ __device__ constexpr const auto& shape(const LayoutType& layout) { return layout.GetShape(); } diff --git a/include/ck/wrapper/utils/tensor_partition.hpp b/include/ck/wrapper/utils/tensor_partition.hpp index edc61e1e466..96a53591ef2 100644 --- a/include/ck/wrapper/utils/tensor_partition.hpp +++ b/include/ck/wrapper/utils/tensor_partition.hpp @@ -16,7 +16,6 @@ template __host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple& shape, const Tuple& thread_lengths) { - static_assert(Tuple::Size() == Tuple::Size(), "Wrong thread_lengths shape."); return generate_tuple( [&](auto i) { @@ -28,7 +27,7 @@ __host__ __device__ constexpr auto CalculateLocalPartitionShape(const Tuple -__host__ __device__ constexpr auto CalculateLocalPartitionStrides( - const Tuple& strides, const Tuple& thread_lengths, const Tuple& steps) +template +__host__ __device__ constexpr auto +CalculateLocalPartitionDescriptor(const Tuple& shape, + const Tuple& thread_lengths, + const Tuple& steps, + const FlattenDescType& flatten_desc) { static_assert(Tuple::Size() == Tuple::Size(), "Wrong thread_lengths shape."); - return generate_tuple( + const auto unrolled_thread_lengths = UnrollNestedTuple(thread_lengths); + const auto unrolled_shape = UnrollNestedTuple(shape); + constexpr auto dims = decltype(unrolled_thread_lengths)::Size(); + + using UnrolledStepsType = decltype(UnrollNestedTuple(steps)); + + using I1 = Number<1>; + + const auto transforms = generate_tuple( [&](auto i) { constexpr auto num_i = Number{}; - if constexpr(is_detected>>::value) + if constexpr(is_same_v, Tuple<>>) { - // if tuple then recurrence - if constexpr(is_same_v, Tuple<>>) + // By default raked partition + const auto partition_stride = unrolled_thread_lengths.At(num_i); + return make_embed_transform(make_tuple(unrolled_shape.At(num_i)), + make_tuple(partition_stride)); + } + else if constexpr(!is_same_v, index_t>) + { + // Compiletime partition + if constexpr(is_same_v, I1>) { - return CalculateLocalPartitionStrides( - strides.At(num_i), thread_lengths.At(num_i), Tuple<>{}); + // raked + const auto partition_stride = unrolled_thread_lengths.At(num_i); + return make_embed_transform(make_tuple(unrolled_shape.At(num_i)), + make_tuple(partition_stride)); } else { - return CalculateLocalPartitionStrides( - strides.At(num_i), thread_lengths.At(num_i), steps.At(num_i)); + // packed + return make_embed_transform(make_tuple(unrolled_shape.At(num_i)), + make_tuple(I1{})); } } else { - if constexpr(is_same_v, Tuple<>>) - { - // By default raked partition - const auto partition_stride = thread_lengths.At(i); - return partition_stride * strides.At(i); - } - else if constexpr(!is_same_v>, index_t>) + // Runtime partition + if(steps.At(num_i) == 1) { - // Compiletime partition - if constexpr(is_same_v>, Number<1>>) - { - // raked - const auto partition_stride = thread_lengths.At(i); - return partition_stride * strides.At(i); - } - else - { - // packed - return strides.At(i); - } + // raked + const auto partition_stride = unrolled_thread_lengths.At(num_i); + return make_embed_transform(make_tuple(unrolled_shape.At(num_i)), + make_tuple(partition_stride)); } else { - // Runtime partition - if(steps.At(i) == 1) - { - // raked - const auto partition_stride = thread_lengths.At(i); - return partition_stride * strides.At(i); - } - else - { - // packed - return strides.At(i); - } + // packed + return make_embed_transform(make_tuple(unrolled_shape.At(num_i)), + make_tuple(I1{})); } } }, - Number::Size()>{}); + Number{}); + + const auto lower_dims = + generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + const auto upper_dims = + generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + return transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims); } // Convert interger thread_idx to tuple index with applied steps @@ -128,8 +132,8 @@ __host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple& else { // Update thread_id after each dim - const auto dim_thread_id = thread_id % thread_lengths.At(i); - thread_id /= thread_lengths.At(i); + const auto dim_thread_id = thread_id % thread_lengths.At(num_i); + thread_id /= thread_lengths.At(num_i); if constexpr(is_same_v, Tuple<>>) { return dim_thread_id; @@ -220,9 +224,12 @@ __host__ __device__ constexpr auto make_local_partition(const TensorType& tensor { // Create shape, strides and layout for new partition tensor const auto partition_shape = CalculateLocalPartitionShape(shape(tensor), thread_lengths); - const auto partition_strides = - CalculateLocalPartitionStrides(stride(tensor), thread_lengths, steps); - const auto partition_layout = make_layout(partition_shape, partition_strides); + // Create new descriptor and layout + const auto& flatten_desc = layout(tensor).GetFlattenDescriptor(); + auto partition_desc = + CalculateLocalPartitionDescriptor(shape(tensor), thread_lengths, steps, flatten_desc); + const auto partition_layout = Layout( + partition_shape, partition_desc); // Calculate offset for new partition tensor index_t thread_id_copy = thread_id; const auto offset_idx = CalculateLayoutOffsetIdx(thread_lengths, steps, thread_id_copy); @@ -251,12 +258,16 @@ __host__ __device__ constexpr auto make_local_tile(const TensorType& tensor, { // Create block lengths, strides and layout for new tile tensor const auto block_lengths = CalculateBlockLengths(shape(tensor), tile_shape); - const auto block_strides = CalculateLocalPartitionStrides(stride(tensor), block_lengths, steps); - const auto tile_layout = make_layout(tile_shape, block_strides); + // Create new descriptor and layout + const auto& flatten_desc = layout(tensor).GetFlattenDescriptor(); + auto tile_desc = + CalculateLocalPartitionDescriptor(tile_shape, block_lengths, steps, flatten_desc); + const auto tile_layout = Layout, decltype(tile_desc)>( + tile_shape, tile_desc); // Calculate offset for new partition tensor - const auto offset_idx = CalculateLayoutOffsetIdx(steps, block_idx); - const auto partition_offset = layout(tensor)(offset_idx); - return make_tensor(tensor.GetPointer() + partition_offset, + const auto offset_idx = CalculateLayoutOffsetIdx(steps, block_idx); + const auto tile_offset = layout(tensor)(offset_idx); + return make_tensor(tensor.GetPointer() + tile_offset, tile_layout); } diff --git a/include/ck/wrapper/utils/tensor_utils.hpp b/include/ck/wrapper/utils/tensor_utils.hpp index 35bb4f04be7..a2b36d29e95 100644 --- a/include/ck/wrapper/utils/tensor_utils.hpp +++ b/include/ck/wrapper/utils/tensor_utils.hpp @@ -27,12 +27,12 @@ using MemoryTypeEnum = AddressSpaceEnum; // Disable from doxygen docs generation /// @cond // forward declarations -template +template struct Layout; template @@ -98,11 +98,18 @@ using is_tuple = decltype(std::declval().IsTuple()); * \param layout Tensor layout. * \return Constructed tensor. */ -template -constexpr auto make_tensor(ElementType* pointer, const Layout& layout) +template +constexpr auto make_tensor(ElementType* pointer, const Layout& layout) { - return Tensor( - pointer, layout); + return Tensor(pointer, layout); } /** @@ -124,7 +131,7 @@ constexpr auto make_register_tensor() return Tensor>, - Tuple>, + std::remove_const_t>, NumVectors, ScalarPerVector>(layout); } @@ -138,12 +145,15 @@ constexpr auto make_register_tensor() template -__host__ __device__ constexpr const auto& -layout(const Tensor& - tensor) +__host__ __device__ constexpr const auto& layout(const Tensor& tensor) { return tensor.GetLayout(); } @@ -159,12 +169,15 @@ template -__host__ __device__ constexpr auto -size(const Tensor& - tensor) +__host__ __device__ constexpr auto size(const Tensor& tensor) { return size(tensor.GetLayout()); } @@ -180,12 +193,15 @@ template -__host__ __device__ constexpr auto -rank(const Tensor& - tensor) +__host__ __device__ constexpr auto rank(const Tensor& tensor) { return rank(tensor.GetLayout()); } @@ -201,35 +217,19 @@ template -__host__ __device__ constexpr auto -depth(const Tensor& - tensor) +__host__ __device__ constexpr auto depth(const Tensor& tensor) { return depth(tensor.GetLayout()); } -/** - * \brief Get Tensor strides. - * - * \param tensor Tensor to get strides from. - * \return Requsted strides. - */ -template -__host__ __device__ constexpr auto -stride(const Tensor& - tensor) -{ - return stride(tensor.GetLayout()); -} - /** * \brief Get Tensor shape. * @@ -239,12 +239,15 @@ stride(const Tensor -__host__ __device__ constexpr const auto& -shape(const Tensor& - tensor) +__host__ __device__ constexpr const auto& shape(const Tensor& tensor) { return shape(tensor.GetLayout()); } diff --git a/test/wrapper/test_layout.cpp b/test/wrapper/test_layout.cpp index d5dba4841f5..a128a6d84f5 100644 --- a/test/wrapper/test_layout.cpp +++ b/test/wrapper/test_layout.cpp @@ -436,26 +436,11 @@ TEST(TestLayoutHelpers, ShapeAndStrides) constexpr bool check_compiletime_shape = std::is_same_v>; - constexpr bool check_compiletime_strides = - std::is_same_v, - std::remove_reference_t>; constexpr bool check_runtime_shape = std::is_same_v>; - constexpr bool check_runtime_strides = - std::is_same_v, - std::remove_reference_t>; EXPECT_TRUE(check_compiletime_shape); - EXPECT_TRUE(check_compiletime_strides); EXPECT_TRUE(check_runtime_shape); - EXPECT_TRUE(check_runtime_strides); - - // Check packed strides generation - const auto packed_layout = ck::wrapper::make_layout(shape_runtime); - constexpr bool check_packed_layout_strides = - std::is_same_v, - std::remove_reference_t>; - EXPECT_TRUE(check_packed_layout_strides); } TEST(TestLayoutHelpers, Hierarchical) diff --git a/test/wrapper/test_partition.cpp b/test/wrapper/test_partition.cpp index a2ae30f5de0..fe214e9ed07 100644 --- a/test/wrapper/test_partition.cpp +++ b/test/wrapper/test_partition.cpp @@ -46,21 +46,6 @@ TEST(TestPartition, LocalPartition) ck::wrapper::size(tensor) / ck::wrapper::size(thread_layout); EXPECT_EQ(ck::wrapper::size(raked_partition), expected_partition_size); EXPECT_EQ(raked_partition(0), thread_id); - - const auto expected_partition_stride_00 = - ck::wrapper::size<0, 0>(strides) * ck::wrapper::size<0, 0>(thread_layout); - const auto expected_partition_stride_01 = - ck::wrapper::size<0, 1>(strides) * ck::wrapper::size<0, 1>(thread_layout); - const auto expected_partition_stride_1 = - ck::wrapper::size<1>(strides) * ck::wrapper::size<1>(thread_layout); - const auto partition_stride_00 = - ck::wrapper::size<0, 0>(ck::wrapper::stride(raked_partition)); - const auto partition_stride_01 = - ck::wrapper::size<0, 1>(ck::wrapper::stride(raked_partition)); - const auto partition_stride_1 = ck::wrapper::size<1>(ck::wrapper::stride(raked_partition)); - EXPECT_EQ(partition_stride_00, expected_partition_stride_00); - EXPECT_EQ(partition_stride_01, expected_partition_stride_01); - EXPECT_EQ(partition_stride_1, expected_partition_stride_1); } for(ck::index_t thread_id = 0; thread_id < ck::wrapper::size(thread_layout); thread_id++) @@ -73,18 +58,6 @@ TEST(TestPartition, LocalPartition) const auto expected_partition_first_val = thread_id * ck::wrapper::size<0, 0>(thread_steps); EXPECT_EQ(ck::wrapper::size(packed_partition), expected_partition_size); EXPECT_EQ(packed_partition(0), expected_partition_first_val); - - const auto expected_partition_stride_00 = ck::wrapper::size<0, 0>(strides); - const auto expected_partition_stride_01 = ck::wrapper::size<0, 1>(strides); - const auto expected_partition_stride_1 = ck::wrapper::size<1>(strides); - const auto partition_stride_00 = - ck::wrapper::size<0, 0>(ck::wrapper::stride(packed_partition)); - const auto partition_stride_01 = - ck::wrapper::size<0, 1>(ck::wrapper::stride(packed_partition)); - const auto partition_stride_1 = ck::wrapper::size<1>(ck::wrapper::stride(packed_partition)); - EXPECT_EQ(partition_stride_00, expected_partition_stride_00); - EXPECT_EQ(partition_stride_01, expected_partition_stride_01); - EXPECT_EQ(partition_stride_1, expected_partition_stride_1); } } @@ -128,19 +101,6 @@ TEST(TestPartition, LocalTile) const auto expected_tile_size = ck::wrapper::size(block_shape); EXPECT_EQ(ck::wrapper::size(raked_tile), expected_tile_size); EXPECT_EQ(raked_tile(0), layout(block_idx)); - - const auto expected_tile_stride_00 = - ck::wrapper::size<0, 0>(strides) * ck::wrapper::size<0, 0>(block_layout); - const auto expected_tile_stride_01 = - ck::wrapper::size<0, 1>(strides) * ck::wrapper::size<0, 1>(block_layout); - const auto expected_tile_stride_1 = - ck::wrapper::size<1>(strides) * ck::wrapper::size<1>(block_layout); - const auto tile_stride_00 = ck::wrapper::size<0, 0>(ck::wrapper::stride(raked_tile)); - const auto tile_stride_01 = ck::wrapper::size<0, 1>(ck::wrapper::stride(raked_tile)); - const auto tile_stride_1 = ck::wrapper::size<1>(ck::wrapper::stride(raked_tile)); - EXPECT_EQ(tile_stride_00, expected_tile_stride_00); - EXPECT_EQ(tile_stride_01, expected_tile_stride_01); - EXPECT_EQ(tile_stride_1, expected_tile_stride_1); } for(const auto& block_idx : block_idxs) @@ -158,15 +118,5 @@ TEST(TestPartition, LocalTile) ck::wrapper::size<1>(strides); EXPECT_EQ(ck::wrapper::size(packed_tile), expected_tile_size); EXPECT_EQ(packed_tile(0), expected_tile_first_val); - - const auto expected_tile_stride_00 = ck::wrapper::size<0, 0>(strides); - const auto expected_tile_stride_01 = ck::wrapper::size<0, 1>(strides); - const auto expected_tile_stride_1 = ck::wrapper::size<1>(strides); - const auto tile_stride_00 = ck::wrapper::size<0, 0>(ck::wrapper::stride(packed_tile)); - const auto tile_stride_01 = ck::wrapper::size<0, 1>(ck::wrapper::stride(packed_tile)); - const auto tile_stride_1 = ck::wrapper::size<1>(ck::wrapper::stride(packed_tile)); - EXPECT_EQ(tile_stride_00, expected_tile_stride_00); - EXPECT_EQ(tile_stride_01, expected_tile_stride_01); - EXPECT_EQ(tile_stride_1, expected_tile_stride_1); } } diff --git a/test/wrapper/test_tensor.cpp b/test/wrapper/test_tensor.cpp index 72b56068749..1392ee1b516 100644 --- a/test/wrapper/test_tensor.cpp +++ b/test/wrapper/test_tensor.cpp @@ -126,7 +126,7 @@ __global__ void TestTensorReadWriteDevice(void* data, void* success) StaticInitTensor(tensor_vgpr); StaticInitTensor(tensor_sgpr); - *casted_success_ptr &= TestTensorCheck1d(tensor_global); + *casted_success_ptr = TestTensorCheck1d(tensor_global); *casted_success_ptr &= TestTensorCheck3d(tensor_global); *casted_success_ptr &= TestTensorCheck1d(tensor_lds); @@ -172,33 +172,38 @@ TEST(TestTensor, Slicing) auto tensor2x2x2 = tensor(ck::make_tuple(ck::wrapper::slice(2), ck::wrapper::slice(2)), ck::wrapper::slice(2)); + EXPECT_EQ(tensor2x2x2(0), layout(ck::make_tuple(ck::make_tuple(0, 0), 0))); EXPECT_EQ(ck::wrapper::rank(tensor2x2x2), 2); EXPECT_EQ(ck::wrapper::depth(tensor2x2x2), 2); EXPECT_EQ(ck::wrapper::size(tensor2x2x2), 8); EXPECT_TRUE(TestTensorCheck1d(tensor2x2x2)); auto tensor2x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(2)), ck::wrapper::slice(2)); + EXPECT_EQ(tensor2x2(0), layout(ck::make_tuple(ck::make_tuple(1, 0), 0))); EXPECT_EQ(ck::wrapper::rank(tensor2x2), 2); EXPECT_EQ(ck::wrapper::depth(tensor2x2), 2); EXPECT_EQ(ck::wrapper::size(tensor2x2), 4); - EXPECT_TRUE(TestTensorCheck1d(tensor2x2, layout(ck::make_tuple(ck::make_tuple(1, 0), 0)))); + EXPECT_TRUE(TestTensorCheck1d(tensor2x2)); auto tensor1x1 = tensor(ck::make_tuple(1, ck::wrapper::slice(1, 2)), ck::wrapper::slice(1, 2)); + EXPECT_EQ(tensor1x1(0), layout(ck::make_tuple(ck::make_tuple(1, 1), 1))); EXPECT_EQ(rank(tensor1x1), 2); EXPECT_EQ(depth(tensor1x1), 2); EXPECT_EQ(size(tensor1x1), 1); - EXPECT_TRUE(TestTensorCheck1d(tensor1x1, layout(ck::make_tuple(ck::make_tuple(1, 1), 1)))); + EXPECT_TRUE(TestTensorCheck1d(tensor1x1)); auto tensor2 = tensor(ck::make_tuple(1, 1), ck::wrapper::slice(0, 2)); + EXPECT_EQ(tensor2(0), layout(ck::make_tuple(ck::make_tuple(1, 1), 0))); EXPECT_EQ(ck::wrapper::rank(tensor2), 1); EXPECT_EQ(ck::wrapper::depth(tensor2), 1); EXPECT_EQ(ck::wrapper::size(tensor2), 2); - EXPECT_TRUE(TestTensorCheck1d(tensor2, layout(ck::make_tuple(ck::make_tuple(1, 1), 0)))); + EXPECT_TRUE(TestTensorCheck1d(tensor2)); // negative indexing auto tensor1x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(0, -2)), ck::wrapper::slice()); + EXPECT_EQ(tensor1x2(0), layout(ck::make_tuple(ck::make_tuple(1, 0), 0))); EXPECT_EQ(rank(tensor1x2), 2); EXPECT_EQ(depth(tensor1x2), 2); EXPECT_EQ(size(tensor1x2), 2); - EXPECT_TRUE(TestTensorCheck1d(tensor1x2, layout(ck::make_tuple(ck::make_tuple(1, 0), 0)))); + EXPECT_TRUE(TestTensorCheck1d(tensor1x2)); } From 12213d7dd8eb7f1d9cec999d4850e18f18367fd0 Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Fri, 29 Dec 2023 12:30:49 +0000 Subject: [PATCH 5/7] Fixes --- include/ck/wrapper/layout.hpp | 40 +++++++++---------- include/ck/wrapper/tensor.hpp | 40 ++++++++++--------- include/ck/wrapper/utils/layout_utils.hpp | 28 ++++++------- include/ck/wrapper/utils/tensor_partition.hpp | 30 +++++++++----- include/ck/wrapper/utils/tensor_utils.hpp | 33 +++++++-------- test/wrapper/test_tensor.cpp | 7 ++++ 6 files changed, 100 insertions(+), 78 deletions(-) diff --git a/include/ck/wrapper/layout.hpp b/include/ck/wrapper/layout.hpp index c0a52c03327..1643eb73833 100644 --- a/include/ck/wrapper/layout.hpp +++ b/include/ck/wrapper/layout.hpp @@ -14,9 +14,9 @@ namespace wrapper { * \tparam Shape Tuple of Number<> (for compile-time layout) or index_t * (dynamic layout). It is possible to pass nested shapes * (e.g. ((4, 2), 2)), nested dimensions are merged. - * \tparam FlattenDescriptorType Tensor descriptor for flatten shape dims. + * \tparam UnnestedDescriptorType Tensor descriptor for unnested shape dims. */ -template +template struct Layout { private: @@ -29,7 +29,7 @@ struct Layout { return generate_tuple( [&](auto) { - if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime()) + if constexpr(!UnnestedDescriptorType::IsKnownAtCompileTime()) { // runtime layout return index_t(0); @@ -185,14 +185,14 @@ struct Layout } using Descriptor1dType = - remove_cvref_t; + remove_cvref_t; using DefaultIdxsTupleType = remove_cvref_t; template __host__ __device__ constexpr static auto TransformDesc(const Tuple& shape, const Tuple& idx, - const FlattenDescriptorType& naive_descriptor) + const UnnestedDescriptorType& naive_descriptor) { if constexpr(Tuple::Size() == I1) { @@ -215,12 +215,12 @@ struct Layout } using MergedNestsDescriptorType = remove_cvref_t; + Shape{}, DefaultIdxsTupleType{}, UnnestedDescriptorType{}))>; public: __host__ __device__ constexpr auto GetElementSpaceSize() const { - return flatten_descriptor_.GetElementSpaceSize(); + return unnested_descriptor_.GetElementSpaceSize(); } __host__ __device__ Layout() = delete; @@ -229,19 +229,19 @@ struct Layout * \brief Layout constructor. * * \param shape Shape for layout. - * \param flatten_descriptor Descriptor + * \param unnested_descriptor Descriptor */ __host__ __device__ constexpr Layout(const Shape& shape, - const FlattenDescriptorType& flatten_descriptor) + const UnnestedDescriptorType& unnested_descriptor) : shape_(shape) { // Construct if runtime mode - if constexpr(!FlattenDescriptorType::IsKnownAtCompileTime()) + if constexpr(!UnnestedDescriptorType::IsKnownAtCompileTime()) { - flatten_descriptor_ = flatten_descriptor; - descriptor_1d_ = MakeMerge1d(shape_, flatten_descriptor_); + unnested_descriptor_ = unnested_descriptor; + descriptor_1d_ = MakeMerge1d(shape_, unnested_descriptor_); merged_nests_descriptor_ = - TransformDesc(shape_, DefaultIdxsTupleType{}, flatten_descriptor_); + TransformDesc(shape_, DefaultIdxsTupleType{}, unnested_descriptor_); } } @@ -254,9 +254,9 @@ struct Layout template __host__ __device__ constexpr index_t operator()() const { - static_assert(FlattenDescriptorType::IsKnownAtCompileTime(), + static_assert(UnnestedDescriptorType::IsKnownAtCompileTime(), "Compiletime operator used on runtime layout."); - using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{}, FlattenDescriptorType{})); + using TransformedDesc = decltype(TransformDesc(Shape{}, Idxs{}, UnnestedDescriptorType{})); using UnrolledIdx = decltype(UnrollNestedTuple(Idxs{})); return TransformedDesc{}.CalculateOffset(UnrolledIdx{}); } @@ -283,7 +283,7 @@ struct Layout else { // Custom index, need to transform descriptor - const auto transformed_desc = TransformDesc(shape_, Idx, flatten_descriptor_); + const auto transformed_desc = TransformDesc(shape_, Idx, unnested_descriptor_); return transformed_desc.CalculateOffset(UnrollNestedTuple(Idx)); } } @@ -360,17 +360,17 @@ struct Layout } /** - * \brief Get flatten descriptor (with unrolled dims) + * \brief Get unnested descriptor (with unrolled dims) * * \return Flatten descriptor. */ - __host__ __device__ constexpr const FlattenDescriptorType& GetFlattenDescriptor() const + __host__ __device__ constexpr const UnnestedDescriptorType& GetUnnestedDescriptor() const { - return flatten_descriptor_; + return unnested_descriptor_; } private: - FlattenDescriptorType flatten_descriptor_; + UnnestedDescriptorType unnested_descriptor_; Descriptor1dType descriptor_1d_; MergedNestsDescriptorType merged_nests_descriptor_; const Shape shape_; diff --git a/include/ck/wrapper/tensor.hpp b/include/ck/wrapper/tensor.hpp index bc598178752..787c5b2176e 100644 --- a/include/ck/wrapper/tensor.hpp +++ b/include/ck/wrapper/tensor.hpp @@ -16,14 +16,14 @@ namespace wrapper { * \tparam BufferAddressSpace Memory type (Generic, Global, LDS, VGPR, SGPR). * \tparam ElementType Element data type. * \tparam Shape Tensor shape (layout component). - * \tparam FlattenDescriptorType Flatten descriptor (layout component). + * \tparam UnnestedDescriptorType Flatten descriptor (layout component). * \tparam NumVectors Number of vectors (only for VGPR, SGPR). * \tparam ScalarPerVector Scalars per vector (only for VGPR, SGPR). */ template @@ -85,14 +85,17 @@ struct Tensor // Generate Freeze for each of nested shape template - __host__ __device__ constexpr auto GenerateMultipleFreeze(const T& idx, + __host__ __device__ constexpr auto GenerateMultipleFreeze(T idx, const ShapeTmpType& shape) const { const auto unrolled_shape = UnrollNestedTuple(shape); return generate_tuple( - [&](auto) { + [&](auto i) { // dimension offset from idx - return make_freeze_transform(idx); + const auto dim = unrolled_shape.At(Number{}); + const auto dim_idx = idx % dim; + idx /= dim; + return make_freeze_transform(dim_idx); }, Number{}); } @@ -116,7 +119,7 @@ struct Tensor const auto from = idx.At(num_i).from_; const auto dim = shape.At(num_i); const auto range = idx.At(num_i).range(dim); - return make_slice_transform(dim, from, from + range); + return make_slice_transform(range, from, from + range); } else { @@ -188,9 +191,9 @@ struct Tensor } public: - using ElementSpaceSize = decltype(Layout{ - Shape{}, FlattenDescriptorType{}}.GetElementSpaceSize()); // SpaceSize type for buffer - using TensorElementType = ElementType; // DataType + using ElementSpaceSize = decltype(Layout{ + Shape{}, UnnestedDescriptorType{}}.GetElementSpaceSize()); // SpaceSize type for buffer + using TensorElementType = ElementType; // DataType static constexpr MemoryTypeEnum TensorBufferAddressSpace = BufferAddressSpace; static constexpr bool IsDynamicBuffer = !(BufferAddressSpace == MemoryTypeEnum ::Sgpr || @@ -198,18 +201,19 @@ struct Tensor __host__ __device__ Tensor() = delete; __host__ __device__ Tensor(ElementType* pointer, - const Layout& layout) + const Layout& layout) : layout_(layout), buffer_(make_dynamic_buffer(pointer, layout.GetElementSpaceSize())) { } - __host__ __device__ Tensor(const Layout& layout) : layout_(layout) + __host__ __device__ Tensor(const Layout& layout) + : layout_(layout) { static_assert(!IsDynamicBuffer, "Wrong BufferAddressSpace for register."); } - __host__ __device__ constexpr const Layout& GetLayout() const + __host__ __device__ constexpr const Layout& GetLayout() const { return layout_; } @@ -222,7 +226,7 @@ struct Tensor const auto& shape = layout_.GetShape(); auto new_shape = GetShapeFromSlicedTensor(idx, shape); - const auto& flatten_desc = layout_.GetFlattenDescriptor(); + const auto& flatten_desc = layout_.GetUnnestedDescriptor(); auto new_desc = GetDescriptorFromSlicedTensor(idx, shape, flatten_desc); const auto new_layout = Layout(new_shape, new_desc); @@ -252,9 +256,9 @@ struct Tensor } else { - constexpr index_t offset = Layout{ + constexpr index_t offset = Layout{ Shape{}, - FlattenDescriptorType{}}.template operator()>(); + UnnestedDescriptorType{}}.template operator()>(); return buffer_[Number{}]; } } @@ -282,9 +286,9 @@ struct Tensor } else { - constexpr index_t offset = Layout{ + constexpr index_t offset = Layout{ Shape{}, - FlattenDescriptorType{}}.template operator()>(); + UnnestedDescriptorType{}}.template operator()>(); return buffer_(Number{}); } } @@ -322,7 +326,7 @@ struct Tensor // If register use static buffer, else use dynamic buffer using Buffer = std::conditional_t; - const Layout layout_; + const Layout layout_; Buffer buffer_; }; diff --git a/include/ck/wrapper/utils/layout_utils.hpp b/include/ck/wrapper/utils/layout_utils.hpp index a33311e1422..684cec42453 100644 --- a/include/ck/wrapper/utils/layout_utils.hpp +++ b/include/ck/wrapper/utils/layout_utils.hpp @@ -22,7 +22,7 @@ namespace wrapper { // Disable from doxygen docs generation /// @cond // forward declaration -template +template struct Layout; template @@ -90,8 +90,8 @@ __host__ __device__ constexpr auto MakeFlattenDescriptor(const LayoutShape& shap template __host__ __device__ constexpr auto make_layout(const Shape& shape, const Strides& strides) { - using FlattenDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Strides{})); - return Layout(shape, MakeFlattenDescriptor(shape, strides)); + using UnnestedDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Strides{})); + return Layout(shape, MakeFlattenDescriptor(shape, strides)); } /** @@ -104,8 +104,8 @@ __host__ __device__ constexpr auto make_layout(const Shape& shape, const Strides template __host__ __device__ constexpr auto make_layout(const Shape& shape) { - using FlattenDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Tuple<>{})); - return Layout(shape, MakeFlattenDescriptor(shape, Tuple<>{})); + using UnnestedDescriptorType = decltype(MakeFlattenDescriptor(Shape{}, Tuple<>{})); + return Layout(shape, MakeFlattenDescriptor(shape, Tuple<>{})); } // Layout helpers @@ -182,7 +182,7 @@ __host__ __device__ constexpr auto get(const Layout& layout) }, Number{}); - const auto& flatten_desc = layout.GetFlattenDescriptor(); + const auto& flatten_desc = layout.GetUnnestedDescriptor(); auto new_desc = transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims); return Layout(new_shape, new_desc); } @@ -218,8 +218,8 @@ __host__ __device__ T constexpr size(const T& dim) * \param layout Layout to get Shape of. * \return Requsted length. */ -template -__host__ __device__ constexpr auto size(const Layout& layout) +template +__host__ __device__ constexpr auto size(const Layout& layout) { return layout.template GetLength(); } @@ -244,8 +244,8 @@ __host__ __device__ constexpr auto size(const Tuple& shape) * \param layout Layout to calculate shape size. * \return Requsted size. */ -template -__host__ __device__ constexpr auto size(const Layout& layout) +template +__host__ __device__ constexpr auto size(const Layout& layout) { return layout.GetLengths(); } @@ -284,9 +284,9 @@ __host__ __device__ constexpr auto size(const T& elem) * \param layout Layout to calculate rank. * \return Requsted rank. */ -template +template __host__ __device__ constexpr auto -rank([[maybe_unused]] const Layout& layout) +rank([[maybe_unused]] const Layout& layout) { return Shape::Size(); } @@ -338,8 +338,8 @@ __host__ __device__ constexpr auto rank(const T& elem) * \param layout Layout to calculate depth. * \return Requsted depth. */ -template -__host__ __device__ constexpr auto depth(const Layout& layout) +template +__host__ __device__ constexpr auto depth(const Layout& layout) { const auto& shape = layout.GetShape(); return TupleDepth(shape); diff --git a/include/ck/wrapper/utils/tensor_partition.hpp b/include/ck/wrapper/utils/tensor_partition.hpp index 96a53591ef2..38df6531aec 100644 --- a/include/ck/wrapper/utils/tensor_partition.hpp +++ b/include/ck/wrapper/utils/tensor_partition.hpp @@ -107,11 +107,10 @@ CalculateLocalPartitionDescriptor(const Tuple& shape, return transform_tensor_descriptor(flatten_desc, transforms, lower_dims, upper_dims); } -// Convert interger thread_idx to tuple index with applied steps template -__host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple& thread_lengths, - const Tuple& steps, - index_t& thread_id) +__host__ __device__ constexpr auto CalculateLayoutOffsetIdxImpl(const Tuple& thread_lengths, + const Tuple& steps, + index_t& thread_id) { return generate_tuple( [&](auto i) { @@ -121,11 +120,12 @@ __host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple& // if tuple then recurrence if constexpr(is_same_v, Tuple<>>) { - return CalculateLayoutOffsetIdx(thread_lengths.At(num_i), Tuple<>{}, thread_id); + return CalculateLayoutOffsetIdxImpl( + thread_lengths.At(num_i), Tuple<>{}, thread_id); } else { - return CalculateLayoutOffsetIdx( + return CalculateLayoutOffsetIdxImpl( thread_lengths.At(num_i), steps.At(num_i), thread_id); } } @@ -148,6 +148,17 @@ __host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple& Number::Size()>{}); } +// Convert interger thread_idx to tuple index with applied steps +template +__host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple& thread_lengths, + const Tuple& steps, + const index_t thread_id) +{ + // Create tmp thread_id copy for CalculateLayoutOffsetIdxImpl updates + index_t thread_id_copy = thread_id; + return CalculateLayoutOffsetIdxImpl(thread_lengths, steps, thread_id_copy); +} + // Aply steps to index represented as tuple template __host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple& steps, @@ -225,14 +236,13 @@ __host__ __device__ constexpr auto make_local_partition(const TensorType& tensor // Create shape, strides and layout for new partition tensor const auto partition_shape = CalculateLocalPartitionShape(shape(tensor), thread_lengths); // Create new descriptor and layout - const auto& flatten_desc = layout(tensor).GetFlattenDescriptor(); + const auto& flatten_desc = layout(tensor).GetUnnestedDescriptor(); auto partition_desc = CalculateLocalPartitionDescriptor(shape(tensor), thread_lengths, steps, flatten_desc); const auto partition_layout = Layout( partition_shape, partition_desc); // Calculate offset for new partition tensor - index_t thread_id_copy = thread_id; - const auto offset_idx = CalculateLayoutOffsetIdx(thread_lengths, steps, thread_id_copy); + const auto offset_idx = CalculateLayoutOffsetIdx(thread_lengths, steps, thread_id); const auto partition_offset = layout(tensor)(offset_idx); return make_tensor(tensor.GetPointer() + partition_offset, partition_layout); @@ -259,7 +269,7 @@ __host__ __device__ constexpr auto make_local_tile(const TensorType& tensor, // Create block lengths, strides and layout for new tile tensor const auto block_lengths = CalculateBlockLengths(shape(tensor), tile_shape); // Create new descriptor and layout - const auto& flatten_desc = layout(tensor).GetFlattenDescriptor(); + const auto& flatten_desc = layout(tensor).GetUnnestedDescriptor(); auto tile_desc = CalculateLocalPartitionDescriptor(tile_shape, block_lengths, steps, flatten_desc); const auto tile_layout = Layout, decltype(tile_desc)>( diff --git a/include/ck/wrapper/utils/tensor_utils.hpp b/include/ck/wrapper/utils/tensor_utils.hpp index a2b36d29e95..1e932e62e16 100644 --- a/include/ck/wrapper/utils/tensor_utils.hpp +++ b/include/ck/wrapper/utils/tensor_utils.hpp @@ -27,12 +27,12 @@ using MemoryTypeEnum = AddressSpaceEnum; // Disable from doxygen docs generation /// @cond // forward declarations -template +template struct Layout; template @@ -101,13 +101,14 @@ using is_tuple = decltype(std::declval().IsTuple()); template -constexpr auto make_tensor(ElementType* pointer, const Layout& layout) + typename UnnestedDescriptorType> +constexpr auto make_tensor(ElementType* pointer, + const Layout& layout) { return Tensor(pointer, layout); } @@ -131,7 +132,7 @@ constexpr auto make_register_tensor() return Tensor>, - std::remove_const_t>, + std::remove_const_t>, NumVectors, ScalarPerVector>(layout); } @@ -145,13 +146,13 @@ constexpr auto make_register_tensor() template __host__ __device__ constexpr const auto& layout(const Tensor& tensor) { @@ -169,13 +170,13 @@ template __host__ __device__ constexpr auto size(const Tensor& tensor) { @@ -193,13 +194,13 @@ template __host__ __device__ constexpr auto rank(const Tensor& tensor) { @@ -217,13 +218,13 @@ template __host__ __device__ constexpr auto depth(const Tensor& tensor) { @@ -239,13 +240,13 @@ __host__ __device__ constexpr auto depth(const Tensor __host__ __device__ constexpr const auto& shape(const Tensor& tensor) { diff --git a/test/wrapper/test_tensor.cpp b/test/wrapper/test_tensor.cpp index 1392ee1b516..2d4d6f2750b 100644 --- a/test/wrapper/test_tensor.cpp +++ b/test/wrapper/test_tensor.cpp @@ -199,6 +199,13 @@ TEST(TestTensor, Slicing) EXPECT_EQ(ck::wrapper::size(tensor2), 2); EXPECT_TRUE(TestTensorCheck1d(tensor2)); + auto tensor2_v2 = tensor(2, ck::wrapper::slice(0, 2)); + EXPECT_EQ(tensor2_v2(0), layout(ck::make_tuple(2, 0))); + EXPECT_EQ(ck::wrapper::rank(tensor2_v2), 1); + EXPECT_EQ(ck::wrapper::depth(tensor2_v2), 1); + EXPECT_EQ(ck::wrapper::size(tensor2_v2), 2); + EXPECT_TRUE(TestTensorCheck1d(tensor2_v2)); + // negative indexing auto tensor1x2 = tensor(ck::make_tuple(1, ck::wrapper::slice(0, -2)), ck::wrapper::slice()); EXPECT_EQ(tensor1x2(0), layout(ck::make_tuple(ck::make_tuple(1, 0), 0))); From 309b77b00e11411584bc6b57acdc22a78ea33daf Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Mon, 1 Jan 2024 22:26:34 +0000 Subject: [PATCH 6/7] Fix client example --- include/ck/wrapper/utils/layout_utils.hpp | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/include/ck/wrapper/utils/layout_utils.hpp b/include/ck/wrapper/utils/layout_utils.hpp index 684cec42453..f4ba0a969ff 100644 --- a/include/ck/wrapper/utils/layout_utils.hpp +++ b/include/ck/wrapper/utils/layout_utils.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -32,24 +32,22 @@ namespace { // Generate packed (column-major) strides if not passed template __host__ __device__ constexpr static auto -GenerateColumnMajorPackedStrides(const Tuple& shape, index_t& stride) +GenerateColumnMajorPackedStrides(const Tuple& shape) { + const auto unrolled_shape = UnrollNestedTuple(shape); return generate_tuple( [&](auto i) { - const auto num_i = Number{}; - if constexpr(is_detected>>::value) + if constexpr(i.value == 0) { - return GenerateColumnMajorPackedStrides(shape.At(num_i), stride); + return Number<1>{}; } else { - const index_t dim_stride = stride; - // update stride - stride *= shape.At(num_i); - return dim_stride; + return TupleReduce{}.value, i.value>([](auto x, auto y) { return x * y; }, + unrolled_shape); } }, - Number::Size()>{}); + Number{}); } template @@ -59,10 +57,8 @@ __host__ __device__ constexpr auto MakeFlattenDescriptor(const LayoutShape& shap const auto unrolled_shape = UnrollNestedTuple(shape); if constexpr(is_same_v>) { - index_t start_stride = 1; // if not passed, then generate - const auto unrolled_strides = - GenerateColumnMajorPackedStrides(unrolled_shape, start_stride); + const auto unrolled_strides = GenerateColumnMajorPackedStrides(unrolled_shape); static_assert(unrolled_shape.Size() == unrolled_strides.Size(), "Size of strides and shape are not consistent."); return make_naive_tensor_descriptor(unrolled_shape, unrolled_strides); From 9a9f572b1ab8f6fea121772c8714cac77af32bd4 Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Tue, 2 Jan 2024 11:22:25 +0000 Subject: [PATCH 7/7] Fix comments --- include/ck/wrapper/tensor.hpp | 4 ++-- include/ck/wrapper/utils/tensor_partition.hpp | 10 +++++----- test/wrapper/test_copy.cpp | 9 +++------ test/wrapper/test_partition.cpp | 7 ++----- 4 files changed, 12 insertions(+), 18 deletions(-) diff --git a/include/ck/wrapper/tensor.hpp b/include/ck/wrapper/tensor.hpp index 787c5b2176e..a3636413730 100644 --- a/include/ck/wrapper/tensor.hpp +++ b/include/ck/wrapper/tensor.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -16,7 +16,7 @@ namespace wrapper { * \tparam BufferAddressSpace Memory type (Generic, Global, LDS, VGPR, SGPR). * \tparam ElementType Element data type. * \tparam Shape Tensor shape (layout component). - * \tparam UnnestedDescriptorType Flatten descriptor (layout component). + * \tparam UnnestedDescriptorType Unnested descriptor (layout component). * \tparam NumVectors Number of vectors (only for VGPR, SGPR). * \tparam ScalarPerVector Scalars per vector (only for VGPR, SGPR). */ diff --git a/include/ck/wrapper/utils/tensor_partition.hpp b/include/ck/wrapper/utils/tensor_partition.hpp index 38df6531aec..a0634f6b38d 100644 --- a/include/ck/wrapper/utils/tensor_partition.hpp +++ b/include/ck/wrapper/utils/tensor_partition.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -148,7 +148,7 @@ __host__ __device__ constexpr auto CalculateLayoutOffsetIdxImpl(const Tuple::Size()>{}); } -// Convert interger thread_idx to tuple index with applied steps +// Convert integer thread_idx to tuple index with steps applied template __host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple& thread_lengths, const Tuple& steps, @@ -159,7 +159,7 @@ __host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple& return CalculateLayoutOffsetIdxImpl(thread_lengths, steps, thread_id_copy); } -// Aply steps to index represented as tuple +// Apply steps to index represented as tuple template __host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple& steps, const Tuple& block_idxs) @@ -195,8 +195,8 @@ __host__ __device__ constexpr auto CalculateLayoutOffsetIdx(const Tuple::Size()>{}); } -// For make_local_tile user pass only shape per block. This function calculates -// block layout based on shape. +// User passes only shape per block to the make_local_tile function. This function calculates +// block layout based on the shape. template __host__ __device__ constexpr auto CalculateBlockLengths(const Tuple& shape, const Tuple& tile_shape) diff --git a/test/wrapper/test_copy.cpp b/test/wrapper/test_copy.cpp index 457288a4a97..5cf09a54be9 100644 --- a/test/wrapper/test_copy.cpp +++ b/test/wrapper/test_copy.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. #include #include @@ -8,13 +8,10 @@ #include #include +#include "ck/host_utility/kernel_launch.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/check_err.hpp" - -#include "ck/host_utility/kernel_launch.hpp" - #include "ck/utility/common_header.hpp" - #include "ck/wrapper/layout.hpp" #include "ck/wrapper/tensor.hpp" #include "ck/wrapper/operations/copy.hpp" @@ -75,7 +72,7 @@ void PerformCopyGlobalToGlobalViaLDS() ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<2>{}), ck::Number<4>{}); const auto layout = ck::wrapper::make_layout(shape, strides); - // 0,1,2...size(shape) - 1 + // 0, 1, 2, ..., size(shape) - 1 std::vector input_data(ck::wrapper::size(shape)); std::iota(input_data.begin(), input_data.end(), 0); diff --git a/test/wrapper/test_partition.cpp b/test/wrapper/test_partition.cpp index fe214e9ed07..df56b879f69 100644 --- a/test/wrapper/test_partition.cpp +++ b/test/wrapper/test_partition.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. #include #include @@ -8,13 +8,10 @@ #include #include +#include "ck/host_utility/kernel_launch.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/check_err.hpp" - -#include "ck/host_utility/kernel_launch.hpp" - #include "ck/utility/common_header.hpp" - #include "ck/wrapper/layout.hpp" #include "ck/wrapper/tensor.hpp"