diff --git a/backends/vulkan/runtime/api/Tensor.h b/backends/vulkan/runtime/api/Tensor.h
index e1e28bf582c..110e94ab943 100644
--- a/backends/vulkan/runtime/api/Tensor.h
+++ b/backends/vulkan/runtime/api/Tensor.h
@@ -149,13 +149,14 @@ class vTensor final {
   // to be interpreted as a tensor with a different size.
   api::utils::uvec3 virtual_extents_;
 
-  // A Vulkan uniform buffer containing the tensor sizes that can be passed into
-  // a shader.
+  // A Vulkan uniform buffer containing the tensor sizes in WHCN that can be
+  // passed into a shader.
   std::shared_ptr<api::UniformParamsBuffer> cpu_sizes_uniform_;
 
-  // A Vulkan uniform buffer containing the GPU tensor sizes that can be passed
-  // into a shader. GPU sizes refers to the sizes of the tensor after padding
-  // has been applied to one dimension to align it to the next multiple of 4.
+  // A Vulkan uniform buffer containing the GPU tensor sizes in WHCN that can
+  // be passed into a shader. GPU sizes refers to the sizes of the tensor after
+  // padding has been applied to one dimension to align it to the next multiple
+  // of 4.
   std::shared_ptr<api::UniformParamsBuffer> gpu_sizes_uniform_;
 
   // A Vulkan uniform buffer containing the image extents of the underlying
diff --git a/backends/vulkan/runtime/api/Utils.h b/backends/vulkan/runtime/api/Utils.h
index f04c11ba030..3b0139b8efb 100644
--- a/backends/vulkan/runtime/api/Utils.h
+++ b/backends/vulkan/runtime/api/Utils.h
@@ -262,6 +262,12 @@ inline std::ostream& operator<<(std::ostream& os, const uvec3& v) {
   return os;
 }
 
+inline std::ostream& operator<<(std::ostream& os, const uvec4& v) {
+  os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", "
+     << v.data[3u] << ")";
+  return os;
+}
+
 //
 // std::vector Handling
 //
diff --git a/backends/vulkan/runtime/graph/Logging.h b/backends/vulkan/runtime/graph/Logging.h
index f2684081332..2c42b78fa5e 100644
--- a/backends/vulkan/runtime/graph/Logging.h
+++ b/backends/vulkan/runtime/graph/Logging.h
@@ -29,4 +29,8 @@ inline std::ostream& operator<<(std::ostream& os, const api::utils::uvec3& v) {
   return api::utils::operator<<(os, v);
 }
 
+inline std::ostream& operator<<(std::ostream& os, const api::utils::uvec4& v) {
+  return api::utils::operator<<(os, v);
+}
+
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute.glsl
new file mode 100644
index 00000000000..53883c68e3b
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/permute.glsl
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_type(DTYPE)}
+
+layout(std430) buffer;
+
+#include "indexing_utils.h"
+
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
+
+layout(set = 0, binding = 2) uniform PRECISION restrict OutExtents {
+  // tensor size in WHCN.
+  uvec4 data;
+}
+out_sizes;
+
+/*
+ * Params Buffer
+ */
+layout(set = 0, binding = 3) uniform PRECISION restrict Block {
+  // output dims
+  uvec4 out_ndims;
+  // x = output channels aligned to 4, y = input channels aligned to 4
+  uvec2 ch_info;
+}
+uBlock;
+
+/*
+ * Local Work Group
+ */
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const ivec3 posOut = ivec3(gl_GlobalInvocationID);
+
+  const ivec4 idx = to_tensor_idx_C_packed(posOut, out_sizes.data);
+  if (any(greaterThanEqual(idx, out_sizes.data))) {
+    return;
+  }
+
+  const int out_channel_4up = int(uBlock.ch_info.x);
+  const int in_channel_4up = int(uBlock.ch_info.y);
+  const int out_batch = int(out_sizes.data[3]);
+  const int max_dst_index = out_batch * out_channel_4up;
+  VEC4_T outval = VEC4_T(0.0);
+
+  for (int j = 0; j < 4; ++j) {
+    int dst_index = posOut.z * 4 + j;
+    if (dst_index >= max_dst_index) {
+      // out of range
+      break;
+    }
+
+    ivec4 v = ivec4(0); // holds b,c,h,w
+    v[uBlock.out_ndims[0]] = dst_index / out_channel_4up;
+    v[uBlock.out_ndims[1]] = dst_index % out_channel_4up;
+    v[uBlock.out_ndims[2]] = posOut.y;
+    v[uBlock.out_ndims[3]] = posOut.x;
+
+    int src_index = v[0] * in_channel_4up + v[1];
+    int w = v[3];
+    int h = v[2];
+
+    VEC4_T inval = VEC4_T(texelFetch(image_in, ivec3(w, h, src_index / 4), 0));
+    outval[j] = inval[src_index % 4];
+  }
+  imageStore(image_out, posOut, outval);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute.yaml b/backends/vulkan/runtime/graph/ops/glsl/permute.yaml
new file mode 100644
index 00000000000..77491a52856
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/permute.yaml
@@ -0,0 +1,10 @@
+permute:
+  parameter_names_with_default_values:
+    DTYPE: float
+    NDIM: 3
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+  shader_variants:
+    - NAME: permute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
new file mode 100644
index 00000000000..0f43631e84d
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+using api::utils::ivec3;
+using api::utils::uvec2;
+using api::utils::uvec4;
+
+void check_args(
+    const vTensor& in,
+    const IntListPtr& permute_dims,
+    const vTensor& out) {
+  VK_CHECK_COND(check_memory_layout_is(in, api::kChannelsPacked));
+  VK_CHECK_COND(check_memory_layout_is(out, api::kChannelsPacked));
+
+  int64_t in_dim = in.dim();
+  VK_CHECK_COND(
+      in_dim == permute_dims->size(),
+      "Input tensor dim size must match argument");
+}
+
+void add_permute_node(
+    ComputeGraph& graph,
+    ValueRef in,
+    ValueRef permute_dims_ref,
+    ValueRef out) {
+  vTensorPtr t_in = graph.get_tensor(in);
+  vTensorPtr t_out = graph.get_tensor(out);
+
+  IntListPtr permute_dims = graph.get_int_list(permute_dims_ref);
+
+  check_args(*t_in, permute_dims, *t_out);
+
+  uvec4 in_size{1u, 1u, 1u, 1u}, out_size{1u, 1u, 1u, 1u};
+  uvec4 out_dims{0u, 1u, 2u, 3u};
+
+  int64_t in_dim = t_in->dim();
+
+  std::vector<bool> seen(in_dim);
+  for (int i = 0; i < in_dim; i++) {
+    int64_t permute_dim = (*permute_dims)[i];
+    VK_CHECK_COND(
+        !seen[permute_dim], "Argument dim ", permute_dim, " is repeated");
+    seen[permute_dim] = true;
+
+    // Map to 4D tensor dims.
+    in_size.data[(4u - in_dim) + i] = t_in->size(i);
+    out_size.data[(4u - in_dim) + i] = t_in->size(permute_dim);
+    out_dims.data[(4u - in_dim) + i] = permute_dim + (4u - in_dim);
+  }
+
+  std::string kernel_name = "permute";
+  kernel_name.reserve(kShaderNameReserve);
+  add_dtype_suffix(kernel_name, *t_out);
+
+  uint32_t out_channels = out_size.data[1u];
+  uint32_t in_channels = in_size.data[1u];
+
+  uint32_t out_c_aligned = api::utils::align_up(out_channels, 4u);
+  uint32_t in_c_aligned = api::utils::align_up(in_channels, 4u);
+
+  const struct Block final {
+    uvec4 out_ndims;
+    uvec2 ch_info;
+  } params{
+      out_dims,
+      {out_c_aligned, in_c_aligned},
+  };
+
+  api::utils::uvec3 global_size = t_out->virtual_extents();
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  graph.execute_nodes().emplace_back(new ExecuteNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      global_size,
+      local_size,
+      {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}},
+      {t_out->gpu_sizes_ubo(), graph.create_params_buffer(params)}));
+}
+
+void permute(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  return add_permute_node(graph, args[0], args[1], args[2]);
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(aten.permute_copy.default, permute);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index 7ebe7bbcffa..49c3188174b 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -171,6 +171,29 @@ def get_select_int_inputs():
     return test_suite
 
 
+def get_permute_inputs():
+    test_suite = VkTestSuite(
+        [
+            ((9, 2, 9, 4), [0, 1, 2, 3]),
+            ((9, 2, 9, 4), [0, 1, 3, 2]),
+            ((9, 2, 9, 4), [0, 2, 1, 3]),
+            ((9, 2, 9, 4), [0, 2, 3, 1]),
+            ((9, 2, 9, 4), [0, 3, 1, 2]),
+            ((9, 2, 9, 4), [0, 3, 2, 1]),
+            ((9, 2, 9, 4), [3, 0, 1, 2]),
+            ((9, 2, 9, 4), [3, 2, 0, 1]),
+            ((9, 2, 9, 4), [2, 3, 0, 1]),
+            ((9, 2, 9, 4), [2, 0, 3, 1]),
+            ((9, 2, 9), [2, 0, 1]),
+            ((9, 2, 9), [1, 2, 0]),
+            ((9, 2), [0, 1]),
+            ((9, 2), [1, 0]),
+        ]
+    )
+    test_suite.layouts = ["api::kChannelsPacked"]
+    return test_suite
+
+
 test_suites = {
     "aten.add.Tensor": get_binary_elementwise_inputs(),
     "aten.sub.Tensor": get_binary_elementwise_inputs(),
@@ -183,4 +206,5 @@ def get_select_int_inputs():
     "aten.full.default": get_full_inputs(),
     "aten.select.int": get_select_int_inputs(),
     "aten.select_copy.int": get_select_int_inputs(),
+    "aten.permute_copy.default": get_permute_inputs(),
 }
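
The host-side loop in `add_permute_node` right-aligns an N-dimensional permutation into the fixed 4-slot NCHW layout the shader expects. Below is a minimal sketch of that arithmetic, in plain Python and not part of the patch (the helper name `remap_to_4d` is made up for illustration), using the `((9, 2, 9), [2, 0, 1])` case from `cases.py`:

```python
def remap_to_4d(sizes, permute_dims):
    # Mirrors the loop in add_permute_node: sizes and the permutation are
    # right-aligned into 4 slots; leading slots stay 1 (sizes) / identity (dims).
    offset = 4 - len(sizes)
    in_size, out_size, out_ndims = [1] * 4, [1] * 4, [0, 1, 2, 3]
    for i, p in enumerate(permute_dims):
        in_size[offset + i] = sizes[i]
        out_size[offset + i] = sizes[p]
        out_ndims[offset + i] = p + offset
    return in_size, out_size, out_ndims


# ((9, 2, 9), [2, 0, 1]) -> in_size [1, 9, 2, 9], out_size [1, 9, 9, 2],
# out_ndims [0, 3, 1, 2]: each slot of out_ndims records which padded input
# dim feeds that output dim, which is what uBlock.out_ndims consumes.
print(remap_to_4d((9, 2, 9), [2, 0, 1]))
```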
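For reviewing the shader's per-texel gather, a CPU reference that follows the same index math may help. This is a sketch under assumptions and not part of the patch (`permute_channels_packed` and `align_up` are illustrative names): it walks `dst_index` over batches times channels aligned to 4, scatters the output coordinate through the permutation the way the shader's `v[uBlock.out_ndims[k]]` assignments do, and checks against `np.transpose`, which has the same semantics as `aten.permute_copy` for these cases:

```python
import numpy as np


def align_up(x: int, m: int) -> int:
    return (x + m - 1) // m * m


def permute_channels_packed(x: np.ndarray, perm) -> np.ndarray:
    # x is NCHW; perm[k] is the input dim that output dim k is taken from.
    out_shape = tuple(x.shape[p] for p in perm)
    n_out, c_out, h_out, w_out = out_shape
    out_c4 = align_up(c_out, 4)

    out = np.zeros(out_shape, dtype=x.dtype)
    # dst_index enumerates (batch, channel) pairs with channels padded to a
    # multiple of 4, the CPU analogue of posOut.z * 4 + j in the shader.
    for dst_index in range(n_out * out_c4):
        b, c = dst_index // out_c4, dst_index % out_c4
        if c >= c_out:
            continue  # padding lane; the GPU texel slot holds a don't-care value
        for h in range(h_out):
            for w in range(w_out):
                v = [0, 0, 0, 0]  # output coords scattered into input (n, c, h, w)
                v[perm[0]] = b
                v[perm[1]] = c
                v[perm[2]] = h
                v[perm[3]] = w
                out[b, c, h, w] = x[v[0], v[1], v[2], v[3]]
    return out


if __name__ == "__main__":
    x = np.arange(9 * 2 * 9 * 4, dtype=np.float32).reshape(9, 2, 9, 4)
    for perm in ([0, 2, 3, 1], [3, 2, 0, 1], [2, 0, 3, 1]):
        assert np.array_equal(permute_channels_packed(x, perm), np.transpose(x, perm))
```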