diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl
new file mode 100644
index 00000000000..cb4d21e172a
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+layout(std430) buffer;
+
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
+layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
+layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
+
+layout(set = 0, binding = 4) uniform PRECISION restrict OutExtents {
+  uvec4 data;
+}
+out_extents;
+
+layout(set = 0, binding = 5) uniform PRECISION restrict InExtents {
+  uvec4 data;
+}
+in_extents;
+
+layout(set = 0, binding = 6) uniform PRECISION restrict Params {
+  ivec2 kernel_size;
+  ivec2 stride;
+  ivec2 padding;
+  ivec2 dilation;
+}
+params;
+
+// If fields are separated, SwiftShader cannot identify in_group_size.
+layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
+  ivec2 overlay_region;
+  int in_group_size;
+}
+extra_params;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+/*
+ * Computes a 2D transpose convolution. Each shader invocation calculates the
+ * output at a single output location. For details, refer to conv2d.glsl which
+ * uses a similar approach.
+ */
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  if (any(greaterThanEqual(pos, out_extents.data.xyz))) {
+    return;
+  }
+
+  ivec2 ipos = pos.xy + params.padding;
+
+  const ivec2 start = max(
+      ivec2(0),
+      ivec2(ceil((vec2(ipos) - params.kernel_size + 1) / vec2(params.stride))));
+  const ivec2 end =
+      min(ivec2(in_extents.data.xy),
+          ivec2(floor(vec2(ipos) / vec2(params.stride))) + 1);
+
+  const int ic = extra_params.in_group_size;
+  const int kx_stride = ic * (params.stride.x - 1);
+
+  int ky_start = extra_params.overlay_region.y - 1 -
+      (ipos.y - params.stride.y * start.y) + pos.z * params.kernel_size.y;
+  int kx_start = (extra_params.overlay_region.x - 1 -
+      (ipos.x - params.stride.x * start.x)) * ic;
+
+  ${VEC4_T[DTYPE]} sum = texelFetch(bias_in, ivec2(pos.z, 0), 0);
+  for (int y = start.y, ky = ky_start; y < end.y; ++y, ky += params.stride.y) {
+    for (int x = start.x, kx = kx_start; x < end.x; ++x, kx += kx_stride) {
+      for (int z4 = 0; z4 < ic / 4; ++z4, kx += 4) {
+        const ${VEC4_T[DTYPE]} in_texel = texelFetch(image_in, ivec3(x, y, z4), 0);
+        const ivec4 kxs = kx + ivec4(0, 1, 2, 3);
+
+        sum = fma(in_texel.xxxx, texelFetch(kernel_in, ivec2(kxs.x, ky), 0), sum);
+        sum = fma(in_texel.yyyy, texelFetch(kernel_in, ivec2(kxs.y, ky), 0), sum);
+        sum = fma(in_texel.zzzz, texelFetch(kernel_in, ivec2(kxs.z, ky), 0), sum);
+        sum = fma(in_texel.wwww, texelFetch(kernel_in, ivec2(kxs.w, ky), 0), sum);
+      }
+    }
+  }
+
+  imageStore(image_out, pos, sum);
+}
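The start/end window above is the heart of computing a transposed convolution as a gather: rather than scattering each input texel into a stride-spaced footprint, every invocation collects exactly the input positions whose footprint covers its output texel. A minimal CPU sketch of that arithmetic (a hypothetical helper written only to illustrate the shader's math, assuming dilation 1 so that overlay_region equals kernel_size):

```cpp
#include <algorithm>
#include <cmath>

// For one output column `out_x`, compute the half-open range [start, end) of
// input columns that contribute, mirroring the ceil/floor expressions in
// main() above. The kernel tap used for input x is (ipos - x * stride); the
// shader fetches it at the flipped index kernel - 1 - (ipos - x * stride),
// which reads the original weight tap because prepacking already flipped the
// weights along H and W.
void gather_window(
    int out_x, int padding, int kernel, int stride, int in_size,
    int& start, int& end) {
  const int ipos = out_x + padding;
  start = std::max(
      0, static_cast<int>(std::ceil(float(ipos - kernel + 1) / stride)));
  end = std::min(
      in_size, static_cast<int>(std::floor(float(ipos) / stride)) + 1);
}
```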
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.yaml
new file mode 100644
index 00000000000..ab2c82a901e
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.yaml
@@ -0,0 +1,18 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+conv_transpose2d:
+  parameter_names_with_default_values:
+    NDIM: 3
+    DTYPE: float
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+        SUFFIX: half
+      - VALUE: float
+        SUFFIX: float
+  shader_variants:
+    - NAME: conv_transpose2d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl
new file mode 100644
index 00000000000..22bdbb506f7
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#include "indexing_utils.h"
+
+layout(std430) buffer;
+
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[2][DTYPE]} image_out;
+layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer {
+  ${T[DTYPE]} data[];
+}
+buffer_in;
+
+// Corresponds to {1,4,6,36} in the example below.
+layout(set = 0, binding = 2) uniform PRECISION restrict GpuSizes {
+  ivec4 data;
+}
+gpu_sizes;
+
+// Corresponds to {3,3,7,10} in the example below.
+layout(set = 0, binding = 3) uniform PRECISION restrict OriginalSizes {
+  ivec4 data;
+}
+original_sizes;
+
+// Corresponds to {8,12} in the example below.
+layout(set = 0, binding = 4) uniform PRECISION restrict PaddedSizes {
+  ivec2 data;
+}
+padded_sizes;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+/*
+ * Computes special prepacking for a 2D transpose convolution. Each shader
+ * invocation calculates the input buffer location to read into the desired
+ * texel.
+ *
+ * For details, refer to conv2d_prepack_weights.glsl which uses a similar
+ * approach. For transpose, there are slight differences to reflect the data
+ * access pattern in the shader. First, the weight tensor is flipped along the
+ * H and W dims. Second, steps 3 and 4 are slightly different so that the
+ * splits are interleaved.
+ */
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+  const ivec4 coord = POS_TO_COORD_CHANNELS_PACKED(pos, gpu_sizes.data);
+
+  if (any(greaterThanEqual(coord, gpu_sizes.data))) {
+    return;
+  }
+
+  // As in usual staging shaders, map from GPU texel position to normal CPU
+  // buffer indices: (36,6) -> (4,6,36)
+  const int base_index = COORD_TO_BUFFER_IDX(coord, gpu_sizes.data);
+  const ivec4 p0 =
+      base_index + ivec4(0, 1, 2, 3) * STRIDE_CHANNELS_PACKED(gpu_sizes.data);
+
+  // Re-map the normal CPU buffer indices to special indices, through a series
+  // of mappings: reshape is a no-op to the underlying indices, so we only map
+  // for flip, pad, and permute.
+  const int Np = padded_sizes.data.y;
+  const int Cp = padded_sizes.data.x;
+  const int N = original_sizes.data.w;
+  const int C = original_sizes.data.z;
+  const int H = original_sizes.data.y;
+  const int W = original_sizes.data.x;
+
+  // Undo step 6 permute: (4,2,3,36) -> (2,4,3,36)
+  // In the following comments, a=b=c=3.
+  // Undo step 3 permute, part 1: (8,a,b,c,4) -> (8,a,c,b,4)
+  // Undo step 3 permute, part 2: (8,a,c,b,4) -> (8,c,a,b,4)
+  // Undo step 3 permute, part 3: (8,c,a,b,4) -> (8,c,a,4,b)
+  // Undo step 3 permute, part 4: (8,c,a,4,b) -> (8,c,4,a,b)
+  const ivec4 p1 = SWAP_ADJ_DIMS(p0, 4, (Cp / 4), (H * Np * W));
+  const ivec4 p2 = SWAP_ADJ_DIMS(p1, W, (Np / 4), 4);
+  const ivec4 p3 = SWAP_ADJ_DIMS(p2, H, (Np / 4), (W * 4));
+  const ivec4 p4 = SWAP_ADJ_DIMS(p3, W, 4, 1);
+  const ivec4 p5 = SWAP_ADJ_DIMS(p4, H, 4, W);
+
+  // Undo step 0 permute: (8,12,3,3) -> (12,8,3,3)
+  const ivec4 p6 = SWAP_ADJ_DIMS(p5, Cp, Np, (W * H));
+  // Undo step 0 flip: (2,3)
+  const ivec4 w = p6 % W;
+  const ivec4 h = p6 % (H * W) / W;
+  const ivec4 p7 = p6 + W - 1 - 2 * w + W * (H - 1 - 2 * h);
+
+  // Undo step 1 pad: (12,8,3,3) -> (10,7,3,3)
+  // For values in the padded region, write zero instead of buffer data.
+  const ivec4 c = p7 % (Cp * H * W) / (H * W);
+  const ivec4 n = p7 / (Cp * H * W);
+  const ivec4 p8 = p7 - n * (Cp - C) * H * W;
+  const ivec4 mask = ivec4(greaterThanEqual(c, ivec4(C))) |
+      ivec4(greaterThanEqual(n, ivec4(N)));
+
+  ${T[DTYPE]} val_x = mix(buffer_in.data[p8.x], 0, mask.x);
+  ${T[DTYPE]} val_y = mix(buffer_in.data[p8.y], 0, mask.y);
+  ${T[DTYPE]} val_z = mix(buffer_in.data[p8.z], 0, mask.z);
+  ${T[DTYPE]} val_w = mix(buffer_in.data[p8.w], 0, mask.w);
+
+  ${VEC4_T[DTYPE]} texel = ${VEC4_T[DTYPE]}(val_x, val_y, val_z, val_w);
+
+  imageStore(image_out, pos.xy, texel);
+}
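To make the numbered steps in the comments above concrete, here is a CPU-side sketch of the forward direction of the flip and pad stages only, using the {10,7,3,3} weight from the example sizes in the comments. Step 0's N/C permute and the later reshape/permute steps that interleave the N splits are deliberately omitted; this is an illustration of part of the mapping the shader inverts, not code from the tree:

```cpp
#include <vector>

// Flip a 4-D NCHW weight along H and W, then zero-pad N and C up to
// multiples of 4 (e.g. N=10 -> Np=12 and C=7 -> Cp=8, matching the {8,12}
// padded_sizes in the comments above).
std::vector<float> flip_and_pad(
    const std::vector<float>& w, int N, int C, int H, int W) {
  const int Np = (N + 3) / 4 * 4;
  const int Cp = (C + 3) / 4 * 4;
  std::vector<float> out(Np * Cp * H * W, 0.f);
  for (int n = 0; n < N; ++n) {
    for (int c = 0; c < C; ++c) {
      for (int h = 0; h < H; ++h) {
        for (int x = 0; x < W; ++x) {
          out[((n * Cp + c) * H + (H - 1 - h)) * W + (W - 1 - x)] =
              w[((n * C + c) * H + h) * W + x];
        }
      }
    }
  }
  return out;
}
```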
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.yaml
new file mode 100644
index 00000000000..a6cae5c6a15
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.yaml
@@ -0,0 +1,18 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+conv_transpose2d_prepack_weights:
+  parameter_names_with_default_values:
+    NDIM: 3
+    DTYPE: float
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+        SUFFIX: half
+      - VALUE: float
+        SUFFIX: float
+  shader_variants:
+    - NAME: conv_transpose2d_prepack_weights
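Both YAML files drive the shader codegen the same way: generate_variant_forall emits one SPIR-V variant per DTYPE, named NAME plus the SUFFIX. A sketch of the naming convention the runtime relies on (the enum here is hypothetical; the real lookup composes the name through get_conv2d_shader and apply_dtype_suffix, both visible further down in this diff):

```cpp
#include <string>

enum class Dtype { Half, Float };  // stand-in for the runtime's dtype enum

// e.g. variant_name(true, Dtype::Float)
//   == "conv_transpose2d_prepack_weights_float"
std::string variant_name(bool prepack_weights, Dtype dtype) {
  std::string name = "conv_transpose2d";
  if (prepack_weights) {
    name += "_prepack_weights";
  }
  name += (dtype == Dtype::Half) ? "_half" : "_float";
  return name;
}
```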
diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv2d.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv2d.cpp
index 03f9b40c2f3..5d4b36f03f3 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Conv2d.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Conv2d.cpp
@@ -28,13 +28,15 @@ void resize_conv2d_node(
   size_t ndim = self.sizes().size();
   std::vector<int64_t> new_out_sizes(ndim);
+  const bool transposed = graph->get_val(extra_args[4]).toBool();
 
   // Batch, Channel
   if (ndim == 4) {
     new_out_sizes.at(ndim - 4) = self.sizes().at(ndim - 4);
   }
   const auto weight_sizes = graph->get_val(extra_args[0]).toTensorRef().sizes;
-  new_out_sizes.at(ndim - 3) = weight_sizes.at(ndim - 4);
+  new_out_sizes.at(ndim - 3) =
+      transposed ? weight_sizes.at(ndim - 3) : weight_sizes.at(ndim - 4);
 
   // Height, Width
   const auto new_out_sizes_hw = calc_out_sizes_hw(
@@ -42,9 +44,8 @@
       self.sizes(),
       extra_args[0],
       /*kernel_size_only = */ false,
-      extra_args[1],
-      extra_args[2],
-      extra_args[3]);
+      {extra_args[1], extra_args[2], extra_args[3], extra_args[5]},
+      transposed);
 
   new_out_sizes.at(ndim - 2) = new_out_sizes_hw.at(0);
   new_out_sizes.at(ndim - 1) = new_out_sizes_hw.at(1);
@@ -79,9 +80,16 @@ ValueRef prepack_biases(ComputeGraph& graph, const ValueRef vref) {
   return v;
 }
 
-api::ShaderInfo get_conv2d_shader(const vTensor& t_out, bool prepack_weights) {
+api::ShaderInfo get_conv2d_shader(
+    const vTensor& t_out,
+    const bool prepack_weights,
+    const bool transposed) {
   std::stringstream kernel_name;
-  kernel_name << "conv2d";
+  if (transposed) {
+    kernel_name << "conv_transpose2d";
+  } else {
+    kernel_name << "conv2d";
+  }
   if (prepack_weights) {
     kernel_name << "_prepack_weights";
   }
@@ -90,7 +98,10 @@ api::ShaderInfo get_conv2d_shader(const vTensor& t_out, bool prepack_weights) {
   return VK_KERNEL_FROM_STR(kernel_name.str());
 }
 
-ValueRef prepack_weights(ComputeGraph& graph, const ValueRef vref) {
+ValueRef prepack_weights(
+    ComputeGraph& graph,
+    const ValueRef vref,
+    const bool transposed) {
   const auto original_sizes = graph.get_val(vref).toTensorRef().sizes;
 
   int64_t batch_padded =
@@ -101,7 +112,9 @@
   int64_t width = api::utils::val_at(-1, original_sizes);
 
   const auto final_sizes = std::vector<int64_t>{
-      4, batch_padded * height / 4, channels_padded * width};
+      4,
+      transposed ? channels_padded * height / 4 : batch_padded * height / 4,
+      transposed ? batch_padded * width : channels_padded * width};
 
   ValueRef v = graph.add_tensor(
       final_sizes,
@@ -113,7 +126,8 @@
   api::utils::uvec3 global_size = t.extents();
   api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
 
-  api::ShaderInfo shader = get_conv2d_shader(t, /*prepack_weights = */ true);
+  api::ShaderInfo shader =
+      get_conv2d_shader(t, /*prepack_weights = */ true, transposed);
 
   const auto padded_sizes = std::vector<int64_t>{batch_padded, channels_padded};
@@ -152,7 +166,8 @@ struct Conv2dParams final {
 Conv2dParams create_conv2d_params(
     ComputeGraph& graph,
     const ValueRef weight,
-    const KernelParams& p) {
+    const KernelParams& p,
+    const bool transposed) {
   const auto overlay_region = api::utils::make_ivec2({
       p.kernel_size.data[0] +
           (p.kernel_size.data[0] - 1) * (p.dilation.data[0] - 1),
@@ -160,12 +175,19 @@ Conv2dParams create_conv2d_params(
           (p.kernel_size.data[1] - 1) * (p.dilation.data[1] - 1),
   });
   const auto weight_sizes = graph.get_val(weight).toTensorRef().sizes;
-  const int32_t in_group_size = api::utils::safe_downcast<int32_t>(
-      api::utils::align_up(weight_sizes.at(1), INT64_C(4)));
+  const int32_t in_group_size =
+      api::utils::safe_downcast<int32_t>(api::utils::align_up(
+          transposed ? weight_sizes.at(0) : weight_sizes.at(1), INT64_C(4)));
 
   return {overlay_region, in_group_size};
 }
 
-void check_conv2d_params(const KernelParams& p) {
+void check_conv2d_params(const KernelParams& p, const bool transposed) {
+  if (transposed) {
+    if (p.dilation.data[0] > 1 || p.dilation.data[1] > 1) {
+      VK_THROW(
+          "aten.convolution.default: transposed = true, dilation > 1 is not supported yet!");
+    }
+  }
   if ((p.padding.data[0] > 0 && p.kernel_size.data[0] > 1 &&
        p.dilation.data[0] > 1) ||
       (p.padding.data[1] > 0 && p.kernel_size.data[1] > 1 &&
@@ -183,14 +205,17 @@
     const ValueRef stride,
     const ValueRef padding,
     const ValueRef dilation,
+    const ValueRef transposed,
+    const ValueRef output_padding,
     const ValueRef out) {
+  const bool transposed_val = graph.get_val(transposed).toBool();
+
   ValueRef arg_in = prepack_if_tensor_ref(graph, in);
-  ValueRef arg_weight = prepack_weights(graph, weight);
+  ValueRef arg_weight = prepack_weights(graph, weight, transposed_val);
   ValueRef arg_bias = prepack_biases(graph, bias);
 
   vTensor& t_in = graph.get_val(arg_in).toTensor();
   vTensor& t_out = graph.get_val(out).toTensor();
-  check_conv2d_args(t_in, t_out);
 
   api::utils::uvec3 global_size = t_out.virtual_extents();
@@ -204,12 +229,12 @@
       padding,
       dilation);
   Conv2dParams extra_params =
-      create_conv2d_params(graph, weight, kernel_params);
+      create_conv2d_params(graph, weight, kernel_params, transposed_val);
 
-  check_conv2d_params(kernel_params);
+  check_conv2d_params(kernel_params, transposed_val);
 
   api::ShaderInfo shader =
-      get_conv2d_shader(t_out, /*prepack_weights = */ false);
+      get_conv2d_shader(t_out, /*prepack_weights = */ false, transposed_val);
 
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
@@ -228,20 +253,25 @@
       },
       // Resizing
       resize_conv2d_node,
-      {weight, stride, padding, dilation}));
+      {weight, stride, padding, dilation, transposed, output_padding}));
 }
 
 void conv2d(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  const bool transposed = graph.get_val(args[6]).toBool();
-  if (transposed) {
-    VK_THROW("aten.convolution.default: transpose is not supported yet!");
-  }
-
   const int64_t groups = graph.get_val(args[8]).toInt();
   if (groups > 1) {
     VK_THROW("aten.convolution.default: groups > 1 is not supported yet!");
   }
 
   return add_conv2d_node(
-      graph, args[0], args[1], args[2], args[3], args[4], args[5], args[9]);
+      graph,
+      args[0],
+      args[1],
+      args[2],
+      args[3],
+      args[4],
+      args[5],
+      args[6],
+      args[7],
+      args[9]);
 }
 
 REGISTER_OPERATORS {
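The dimension swaps in resize_conv2d_node and create_conv2d_params above both come from PyTorch's weight layouts: Conv2d stores weights as (out_channels, in_channels/groups, kH, kW), while ConvTranspose2d stores them as (in_channels, out_channels/groups, kH, kW). A standalone sketch for a 4-D weight (hypothetical helpers, for illustration only):

```cpp
#include <cstdint>
#include <vector>

int64_t out_channels(const std::vector<int64_t>& weight_sizes, bool transposed) {
  // Conv2d: dim 0 is out_channels; ConvTranspose2d: dim 1 is.
  return transposed ? weight_sizes.at(1) : weight_sizes.at(0);
}

int64_t in_group_size(const std::vector<int64_t>& weight_sizes, bool transposed) {
  // Conv2d: dim 1 is in_channels/groups; ConvTranspose2d: dim 0 is.
  // Aligned up to a texel of 4, as in create_conv2d_params.
  const int64_t in = transposed ? weight_sizes.at(0) : weight_sizes.at(1);
  return (in + 3) / 4 * 4;
}
```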
diff --git a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
index d5f16cd98a8..a6ac2a1cb87 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
@@ -40,10 +40,7 @@ void resize_max_pool2d_node(
       self.sizes(),
       extra_args[0],
       /*kernel_size_only = */ true,
-      extra_args[1],
-      extra_args[2],
-      extra_args[3],
-      extra_args[4]);
+      {extra_args[1], extra_args[2], extra_args[3], extra_args[4]});
 
   new_out_sizes.at(ndim - 2) = new_out_sizes_hw.at(0);
   new_out_sizes.at(ndim - 1) = new_out_sizes_hw.at(1);
diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp
index de55b296b9a..f1f3bfc6828 100644
--- a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp
@@ -56,50 +56,106 @@ int64_t calc_out_size(
   if (ceil_mode && (out_size - 1) * stride >= in_size + padding) {
     --out_size;
   }
+  VK_CHECK_COND(out_size >= 1);
   return out_size;
 }
 
 std::vector<int64_t> calc_out_sizes_hw(
-    ComputeGraph& graph,
     const std::vector<int64_t>& in_sizes,
-    const ValueRef weight,
-    const bool kernel_size_only,
-    const ValueRef stride,
-    const ValueRef padding,
-    const ValueRef dilation,
-    const ValueRef ceil_mode) {
+    const api::utils::ivec2& kernel_size,
+    const api::utils::ivec2& stride,
+    const api::utils::ivec2& padding,
+    const api::utils::ivec2& dilation,
+    const bool ceil_mode) {
   const int64_t ndim = in_sizes.size();
   std::vector<int64_t> out_sizes(2);
 
-  const auto kernel_vec =
-      make_ivec2_kernel_size(graph, weight, kernel_size_only);
-  const auto stride_vec = make_ivec2_from_list(graph, stride);
-  const auto padding_vec = make_ivec2_from_list(graph, padding);
-  const auto dilation_vec = make_ivec2_from_list(graph, dilation);
-  const bool ceil_mode_val =
-      ceil_mode == kDummyValueRef ? false : graph.get_val(ceil_mode).toBool();
-
   // Height
   out_sizes.at(0) = calc_out_size(
       in_sizes.at(ndim - 2),
-      kernel_vec.data[1],
-      stride_vec.data[1],
-      padding_vec.data[1],
-      dilation_vec.data[1],
-      ceil_mode_val);
+      kernel_size.data[1],
+      stride.data[1],
+      padding.data[1],
+      dilation.data[1],
+      ceil_mode);
   // Width
   out_sizes.at(1) = calc_out_size(
       in_sizes.at(ndim - 1),
-      kernel_vec.data[0],
-      stride_vec.data[0],
-      padding_vec.data[0],
-      dilation_vec.data[0],
-      ceil_mode_val);
+      kernel_size.data[0],
+      stride.data[0],
+      padding.data[0],
+      dilation.data[0],
+      ceil_mode);
 
-  VK_CHECK_COND(out_sizes.at(0) >= 1);
-  VK_CHECK_COND(out_sizes.at(1) >= 1);
+  return out_sizes;
+}
+
+int64_t calc_transpose_out_size(
+    const int64_t in_size,
+    const int64_t kernel,
+    const int64_t stride,
+    const int64_t padding,
+    const int64_t dilation,
+    const int64_t output_padding) {
+  int64_t out_size = (in_size - 1) * stride - 2 * padding +
+      dilation * (kernel - 1) + output_padding + 1;
+  VK_CHECK_COND(out_size >= 1);
+  return out_size;
+}
+
+std::vector<int64_t> calc_transpose_out_sizes_hw(
+    const std::vector<int64_t>& in_sizes,
+    const api::utils::ivec2& kernel_size,
+    const api::utils::ivec2& stride,
+    const api::utils::ivec2& padding,
+    const api::utils::ivec2& dilation,
+    const api::utils::ivec2& output_padding) {
+  const int64_t ndim = in_sizes.size();
+  std::vector<int64_t> out_sizes(2);
+
+  // Height
+  out_sizes.at(0) = calc_transpose_out_size(
+      in_sizes.at(ndim - 2),
+      kernel_size.data[1],
+      stride.data[1],
+      padding.data[1],
+      dilation.data[1],
+      output_padding.data[1]);
+  // Width
+  out_sizes.at(1) = calc_transpose_out_size(
+      in_sizes.at(ndim - 1),
+      kernel_size.data[0],
+      stride.data[0],
+      padding.data[0],
+      dilation.data[0],
+      output_padding.data[0]);
 
   return out_sizes;
 }
 
+std::vector<int64_t> calc_out_sizes_hw(
+    ComputeGraph& graph,
+    const std::vector<int64_t>& in_sizes,
+    const ValueRef weight,
+    const bool kernel_size_only,
+    const std::vector<ValueRef>& args,
+    const bool transposed) {
+  const auto kernel_size =
+      make_ivec2_kernel_size(graph, weight, kernel_size_only);
+  const auto stride = make_ivec2_from_list(graph, args[0]);
+  const auto padding = make_ivec2_from_list(graph, args[1]);
+  const auto dilation = make_ivec2_from_list(graph, args[2]);
+
+  if (transposed) {
+    const auto output_padding = make_ivec2_from_list(graph, args[3]);
+    return calc_transpose_out_sizes_hw(
+        in_sizes, kernel_size, stride, padding, dilation, output_padding);
+  } else {
+    Value& vref = graph.get_val(args[3]);
+    const bool ceil_mode = vref.isBool() ? vref.toBool() : false;
+    return calc_out_sizes_hw(
+        in_sizes, kernel_size, stride, padding, dilation, ceil_mode);
+  }
+}
+
 } // namespace vkcompute
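calc_transpose_out_size is the standard ConvTranspose2d shape formula, the inverse of the forward formula in calc_out_size up to the output_padding term, which disambiguates input sizes that a strided conv maps to the same output. Note also that the isBool() check in the unified wrapper is what lets pooling pass ceil_mode (a Bool) and non-transposed convolution pass output_padding (an int list) in the same args[3] slot without the latter being misread as ceil_mode. Checking the formula against the parameters used by the Python test added later in this diff (input 40x50, kernel 3x3, stride (1,2), padding (2,3), output_padding (0,1), dilation 1); the forward-formula mirror below follows the standard Conv2d shape rule, since calc_out_size's body sits outside this hunk's context:

```cpp
#include <cstdint>

// Compile-time mirrors of the two shape formulas, for a quick sanity check.
constexpr int64_t conv_out(
    int64_t in, int64_t k, int64_t s, int64_t p, int64_t d) {
  return (in + 2 * p - d * (k - 1) - 1) / s + 1;
}
constexpr int64_t conv_transpose_out(
    int64_t in, int64_t k, int64_t s, int64_t p, int64_t d, int64_t op) {
  return (in - 1) * s - 2 * p + d * (k - 1) + op + 1;
}

static_assert(conv_transpose_out(40, 3, 1, 2, 1, 0) == 38, "height");
static_assert(conv_transpose_out(50, 3, 2, 3, 1, 1) == 96, "width");
// Round trip: a conv with the same parameters maps 96 columns back to 50.
static_assert(conv_out(96, 3, 2, 3, 1) == 50, "inverse");
```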
diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h
index 923b3d8fd74..fafb00e126c 100644
--- a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h
+++ b/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h
@@ -36,9 +36,7 @@ std::vector<int64_t> calc_out_sizes_hw(
     const std::vector<int64_t>& in_sizes,
     const ValueRef weight,
     const bool kernel_size_only,
-    const ValueRef stride,
-    const ValueRef padding,
-    const ValueRef dilation,
-    const ValueRef ceil_mode = kDummyValueRef);
+    const std::vector<ValueRef>& args,
+    const bool transposed = false);
 
 } // namespace vkcompute
diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py
index d305fd19663..37b0b691b3f 100644
--- a/backends/vulkan/test/test_vulkan_delegate.py
+++ b/backends/vulkan/test/test_vulkan_delegate.py
@@ -523,3 +523,31 @@ def forward(self, x):
             sample_inputs,
             memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED],
         )
+
+    def test_vulkan_backend_conv_transpose2d(self):
+        class ConvTranspose2dModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv = torch.nn.ConvTranspose2d(
+                    in_channels=6,
+                    out_channels=8,
+                    kernel_size=(3, 3),
+                    padding=(2, 3),
+                    stride=(1, 2),
+                    output_padding=(0, 1),
+                    dilation=1,
+                    groups=1,
+                    bias=True,
+                )
+
+            def forward(self, x):
+                return self.conv(x)
+
+        conv_transpose2d_module = ConvTranspose2dModule()
+        sample_inputs = (torch.randn(size=(1, 6, 40, 50), dtype=torch.float32),)
+
+        self.lower_module_and_test_output(
+            conv_transpose2d_module,
+            sample_inputs,
+            memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED],
+        )
diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp
index 0f0edafe75a..caa94dd8f02 100644
--- a/backends/vulkan/test/utils/test_utils.cpp
+++ b/backends/vulkan/test/utils/test_utils.cpp
@@ -59,11 +59,17 @@ void record_conv2d_prepack_weights_op(
     api::VulkanBuffer& src_buffer,
     vTensor& v_dst,
     const std::vector<int64_t>& original_sizes,
-    const std::vector<int64_t>& padded_sizes) {
+    const std::vector<int64_t>& padded_sizes,
+    const bool transposed) {
   api::PipelineBarrier pipeline_barrier{};

   std::stringstream kernel_name;
-  kernel_name << "conv2d_prepack_weights";
+  if (transposed) {
+    kernel_name << "conv_transpose2d";
+  } else {
+    kernel_name << "conv2d";
+  }
+  kernel_name << "_prepack_weights";
   apply_dtype_suffix(kernel_name, v_dst);

   api::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name.str());
diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h
index 2d7d0b0746f..a1f3b93dc3a 100644
--- a/backends/vulkan/test/utils/test_utils.h
+++ b/backends/vulkan/test/utils/test_utils.h
@@ -86,7 +86,8 @@ void record_conv2d_prepack_weights_op(
     api::VulkanBuffer& src_buffer,
     vTensor& v_dst,
     const std::vector<int64_t>& original_sizes,
-    const std::vector<int64_t>& padded_sizes);
+    const std::vector<int64_t>& padded_sizes,
+    const bool transposed);
 
 void record_binary_op(
     api::Context* const context,
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index c8e58c25cd2..88e0e68120d 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -1173,11 +1173,12 @@ TEST(VulkanComputeGraphOpsTest, max_pool2d_smoke_test) {
       kernel);
 }
 
-TEST(VulkanComputeGraphOpsTest, conv2d_prepack_test) {
-  const auto original_sizes = std::vector<int64_t>{2, 3, 1, 2};
-  const auto padded_sizes = std::vector<int64_t>{4, 4};
-  const auto gpu_sizes = std::vector<int64_t>{4, 1, 8};
-
+void test_conv2d(
+    const std::vector<int64_t>& original_sizes,
+    const std::vector<int64_t>& padded_sizes,
+    const std::vector<int64_t>& gpu_sizes,
+    const bool transposed,
+    const std::vector<float>& data_out_expected) {
   vTensor vten = vTensor(
       api::context(),
       gpu_sizes,
@@ -1207,7 +1208,8 @@
       staging_buffer_in.buffer(),
       vten,
       original_sizes,
-      padded_sizes);
+      padded_sizes,
+      transposed);
   record_image_to_nchw_op(api::context(), vten, staging_buffer_out.buffer());
 
   // Execute command buffer
@@ -1219,10 +1221,26 @@
       staging_buffer_out, data_out.data(), sizeof(float) * out_numel);
 
   // Check data matches results copied from ATen-VK
-  std::vector<float> data_out_expected = {1, 3, 5,  0, 2, 4, 6, 0, 7, 9, 11,
-                                          0, 8, 10, 12, 0, 0, 0, 0, 0, 0, 0,
-                                          0, 0, 0,  0, 0, 0, 0, 0, 0, 0};
   for (int i = 0; i < vten.numel(); i++) {
     CHECK_VALUE(data_out, i, data_out_expected[i]);
   }
 }
+
+TEST(VulkanComputeGraphOpsTest, conv2d_prepack_test) {
+  test_conv2d(
+      /*original_sizes = */ {2, 3, 1, 2},
+      /*padded_sizes = */ {4, 4},
+      /*gpu_sizes = */ {4, 1, 8},
+      /*transposed = */ false,
+      /*data_out_expected = */ {1, 3, 5,  0, 2, 4, 6, 0, 7, 9, 11,
+                                0, 8, 10, 12, 0, 0, 0, 0, 0, 0, 0,
+                                0, 0, 0,  0, 0, 0, 0, 0, 0, 0});
+  test_conv2d(
+      /*original_sizes = */ {2, 3, 1, 2},
+      /*padded_sizes = */ {4, 4},
+      /*gpu_sizes = */ {4, 1, 8},
+      /*transposed = */ true,
+      /*data_out_expected = */ {2, 8, 0, 0, 1, 7,  0, 0, 4, 10, 0,
+                                0, 3, 9, 0, 0, 6, 12, 0, 0, 5, 11,
+                                0, 0, 0, 0, 0, 0,  0, 0, 0, 0});
+}
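The transposed expectation can be derived by hand. Assuming the staging buffer is seeded with 1..12 in NCHW order (consistent with the non-transposed expectation {1, 3, 5, ...} for the same 2x3x1x2 weight), each output texel of the transposed prepack holds the four batch entries weight[n][c][h][w'] for a fixed (c, h, w'), with w' walking the width in reverse and n >= N reading as zero padding. A hypothetical reference generator, inferred from this one test vector rather than from the shader's general index math:

```cpp
#include <vector>

std::vector<float> transposed_prepack_reference() {
  const int N = 2, C = 3, H = 1, W = 2;  // original weight sizes
  const int Np = 4, Cp = 4;              // N and C padded up to 4
  auto value_at = [&](int n, int c, int h, int w) -> float {
    if (n >= N || c >= C) {
      return 0.f;  // zero in the padded region
    }
    return 1.f + ((n * C + c) * H + h) * W + w;  // values 1..12, NCHW order
  };
  std::vector<float> out;
  for (int c = 0; c < Cp; ++c) {
    for (int h = H - 1; h >= 0; --h) {    // H flipped (trivial here: H == 1)
      for (int w = W - 1; w >= 0; --w) {  // W flipped
        for (int n = 0; n < Np; ++n) {    // batches interleaved within a texel
          out.push_back(value_at(n, c, h, w));
        }
      }
    }
  }
  // {2,8,0,0, 1,7,0,0, 4,10,0,0, 3,9,0,0, 6,12,0,0, 5,11,0,0, 0 x 8}
  return out;
}
```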