diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl
new file mode 100644
index 00000000000..cb4d21e172a
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+layout(std430) buffer;
+
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
+layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
+layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
+
+layout(set = 0, binding = 4) uniform PRECISION restrict OutExtents {
+  uvec4 data;
+}
+out_extents;
+
+layout(set = 0, binding = 5) uniform PRECISION restrict InExtents {
+  uvec4 data;
+}
+in_extents;
+
+layout(set = 0, binding = 6) uniform PRECISION restrict Params {
+  ivec2 kernel_size;
+  ivec2 stride;
+  ivec2 padding;
+  ivec2 dilation;
+}
+params;
+
+// If fields are separated, SwiftShader cannot identify in_group_size.
+layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
+  ivec2 overlay_region;
+  int in_group_size;
+}
+extra_params;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+/*
+ * Computes a 2D transpose convolution. Each shader invocation calculates the
+ * output at a single output location. For details, refer to conv2d.glsl which
+ * uses a similar approach.
+ */
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  if (any(greaterThanEqual(pos, out_extents.data.xyz))) {
+    return;
+  }
+
+  ivec2 ipos = pos.xy + params.padding;
+
+  const ivec2 start = max(
+      ivec2(0),
+      ivec2(ceil((vec2(ipos) - params.kernel_size + 1) / vec2(params.stride))));
+  const ivec2 end =
+      min(ivec2(in_extents.data.xy),
+          ivec2(floor(vec2(ipos) / vec2(params.stride))) + 1);
+
+  const int ic = extra_params.in_group_size;
+  const int kx_stride = ic * (params.stride.x - 1);
+
+  int ky_start = extra_params.overlay_region.y - 1 -
+      (ipos.y - params.stride.y * start.y) + pos.z * params.kernel_size.y;
+  int kx_start = (extra_params.overlay_region.x - 1 -
+      (ipos.x - params.stride.x * start.x)) * ic;
+
+  ${VEC4_T[DTYPE]} sum = texelFetch(bias_in, ivec2(pos.z, 0), 0);
+  for (int y = start.y, ky = ky_start; y < end.y; ++y, ky += params.stride.y) {
+    for (int x = start.x, kx = kx_start; x < end.x; ++x, kx += kx_stride) {
+      for (int z4 = 0; z4 < ic / 4; ++z4, kx += 4) {
+        const ${VEC4_T[DTYPE]} in_texel = texelFetch(image_in, ivec3(x, y, z4), 0);
+        const ivec4 kxs = kx + ivec4(0, 1, 2, 3);
+
+        sum = fma(in_texel.xxxx, texelFetch(kernel_in, ivec2(kxs.x, ky), 0), sum);
+        sum = fma(in_texel.yyyy, texelFetch(kernel_in, ivec2(kxs.y, ky), 0), sum);
+        sum = fma(in_texel.zzzz, texelFetch(kernel_in, ivec2(kxs.z, ky), 0), sum);
+        sum = fma(in_texel.wwww, texelFetch(kernel_in, ivec2(kxs.w, ky), 0), sum);
+      }
+    }
+  }
+
+  imageStore(image_out, pos, sum);
+}
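The start/end window above is the heart of computing a transposed convolution as a gather: rather than scattering each input texel into a stride-spaced footprint, every invocation collects exactly the input positions whose footprint covers its output texel. A minimal CPU sketch of that arithmetic (a hypothetical helper written only to illustrate the shader's math, assuming dilation 1 so that overlay_region equals kernel_size):

```cpp
#include <algorithm>
#include <cmath>

// For one output column `out_x`, compute the half-open range [start, end) of
// input columns that contribute, mirroring the ceil/floor expressions in
// main() above. The kernel tap used for input x is (ipos - x * stride); the
// shader fetches it at the flipped index kernel - 1 - (ipos - x * stride),
// which reads the original weight tap because prepacking already flipped the
// weights along H and W.
void gather_window(
    int out_x, int padding, int kernel, int stride, int in_size,
    int& start, int& end) {
  const int ipos = out_x + padding;
  start = std::max(
      0, static_cast<int>(std::ceil(float(ipos - kernel + 1) / stride)));
  end = std::min(
      in_size, static_cast<int>(std::floor(float(ipos) / stride)) + 1);
}
```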
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.yaml
new file mode 100644
index 00000000000..ab2c82a901e
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.yaml
@@ -0,0 +1,18 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+conv_transpose2d:
+  parameter_names_with_default_values:
+    NDIM: 3
+    DTYPE: float
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+        SUFFIX: half
+      - VALUE: float
+        SUFFIX: float
+  shader_variants:
+    - NAME: conv_transpose2d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl
new file mode 100644
index 00000000000..22bdbb506f7
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#include "indexing_utils.h"
+
+layout(std430) buffer;
+
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[2][DTYPE]} image_out;
+layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer {
+  ${T[DTYPE]} data[];
+}
+buffer_in;
+
+// Corresponds to {1,4,6,36} in the example below.
+layout(set = 0, binding = 2) uniform PRECISION restrict GpuSizes {
+  ivec4 data;
+}
+gpu_sizes;
+
+// Corresponds to {3,3,7,10} in the example below.
+layout(set = 0, binding = 3) uniform PRECISION restrict OriginalSizes {
+  ivec4 data;
+}
+original_sizes;
+
+// Corresponds to {8,12} in the example below.
+layout(set = 0, binding = 4) uniform PRECISION restrict PaddedSizes {
+  ivec2 data;
+}
+padded_sizes;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+/*
+ * Computes special prepacking for a 2D transpose convolution. Each shader
+ * invocation calculates the input buffer location to read into the desired
+ * texel.
+ *
+ * For details, refer to conv2d_prepack_weights.glsl which uses a similar
+ * approach. For transpose, there are slight differences to reflect the data
+ * access pattern in the shader. First, the weight tensor is flipped along the
+ * H and W dims. Second, steps 3 and 4 are slightly different so that the
+ * splits are interleaved.
+ */
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+  const ivec4 coord = POS_TO_COORD_CHANNELS_PACKED(pos, gpu_sizes.data);
+
+  if (any(greaterThanEqual(coord, gpu_sizes.data))) {
+    return;
+  }
+
+  // As in usual staging shaders, map from GPU texel position to normal CPU
+  // buffer indices: (36,6) -> (4,6,36)
+  const int base_index = COORD_TO_BUFFER_IDX(coord, gpu_sizes.data);
+  const ivec4 p0 =
+      base_index + ivec4(0, 1, 2, 3) * STRIDE_CHANNELS_PACKED(gpu_sizes.data);
+
+  // Re-map the normal CPU buffer indices to special indices, through a series
+  // of mappings: reshape is a no-op to the underlying indices, so we only map
+  // for flip, pad, and permute.
+  const int Np = padded_sizes.data.y;
+  const int Cp = padded_sizes.data.x;
+  const int N = original_sizes.data.w;
+  const int C = original_sizes.data.z;
+  const int H = original_sizes.data.y;
+  const int W = original_sizes.data.x;
+
+  // Undo step 6 permute: (4,2,3,36) -> (2,4,3,36)
+  // In the following comments, a=b=c=3.
+  // Undo step 3 permute, part 1: (8,a,b,c,4) -> (8,a,c,b,4)
+  // Undo step 3 permute, part 2: (8,a,c,b,4) -> (8,c,a,b,4)
+  // Undo step 3 permute, part 3: (8,c,a,b,4) -> (8,c,a,4,b)
+  // Undo step 3 permute, part 4: (8,c,a,4,b) -> (8,c,4,a,b)
+  const ivec4 p1 = SWAP_ADJ_DIMS(p0, 4, (Cp / 4), (H * Np * W));
+  const ivec4 p2 = SWAP_ADJ_DIMS(p1, W, (Np / 4), 4);
+  const ivec4 p3 = SWAP_ADJ_DIMS(p2, H, (Np / 4), (W * 4));
+  const ivec4 p4 = SWAP_ADJ_DIMS(p3, W, 4, 1);
+  const ivec4 p5 = SWAP_ADJ_DIMS(p4, H, 4, W);
+
+  // Undo step 0 permute: (8,12,3,3) -> (12,8,3,3)
+  const ivec4 p6 = SWAP_ADJ_DIMS(p5, Cp, Np, (W * H));
+  // Undo step 0 flip: (2,3)
+  const ivec4 w = p6 % W;
+  const ivec4 h = p6 % (H * W) / W;
+  const ivec4 p7 = p6 + W - 1 - 2 * w + W * (H - 1 - 2 * h);
+
+  // Undo step 1 pad: (12,8,3,3) -> (10,7,3,3)
+  // For values in the padded region, write zero instead of buffer data.
+  const ivec4 c = p7 % (Cp * H * W) / (H * W);
+  const ivec4 n = p7 / (Cp * H * W);
+  const ivec4 p8 = p7 - n * (Cp - C) * H * W;
+  const ivec4 mask = ivec4(greaterThanEqual(c, ivec4(C))) |
+      ivec4(greaterThanEqual(n, ivec4(N)));
+
+  ${T[DTYPE]} val_x = mix(buffer_in.data[p8.x], 0, mask.x);
+  ${T[DTYPE]} val_y = mix(buffer_in.data[p8.y], 0, mask.y);
+  ${T[DTYPE]} val_z = mix(buffer_in.data[p8.z], 0, mask.z);
+  ${T[DTYPE]} val_w = mix(buffer_in.data[p8.w], 0, mask.w);
+
+  ${VEC4_T[DTYPE]} texel = ${VEC4_T[DTYPE]}(val_x, val_y, val_z, val_w);
+
+  imageStore(image_out, pos.xy, texel);
+}
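To make the numbered steps in the comments above concrete, here is a CPU-side sketch of the forward direction of the flip and pad stages only, using the {10,7,3,3} weight from the example sizes in the comments. Step 0's N/C permute and the later reshape/permute steps that interleave the N splits are deliberately omitted; this is an illustration of part of the mapping the shader inverts, not code from the tree:

```cpp
#include <vector>

// Flip a 4-D NCHW weight along H and W, then zero-pad N and C up to
// multiples of 4 (e.g. N=10 -> Np=12 and C=7 -> Cp=8, matching the {8,12}
// padded_sizes in the comments above).
std::vector<float> flip_and_pad(
    const std::vector<float>& w, int N, int C, int H, int W) {
  const int Np = (N + 3) / 4 * 4;
  const int Cp = (C + 3) / 4 * 4;
  std::vector<float> out(Np * Cp * H * W, 0.f);
  for (int n = 0; n < N; ++n) {
    for (int c = 0; c < C; ++c) {
      for (int h = 0; h < H; ++h) {
        for (int x = 0; x < W; ++x) {
          out[((n * Cp + c) * H + (H - 1 - h)) * W + (W - 1 - x)] =
              w[((n * C + c) * H + h) * W + x];
        }
      }
    }
  }
  return out;
}
```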
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.yaml
new file mode 100644
index 00000000000..a6cae5c6a15
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.yaml
@@ -0,0 +1,18 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+conv_transpose2d_prepack_weights:
+  parameter_names_with_default_values:
+    NDIM: 3
+    DTYPE: float
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+        SUFFIX: half
+      - VALUE: float
+        SUFFIX: float
+  shader_variants:
+    - NAME: conv_transpose2d_prepack_weights
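Both YAML files drive the shader codegen the same way: generate_variant_forall emits one SPIR-V variant per DTYPE, named NAME plus the SUFFIX. A sketch of the naming convention the runtime relies on (the enum here is hypothetical; the real lookup composes the name through get_conv2d_shader and apply_dtype_suffix, both visible further down in this diff):

```cpp
#include <string>

enum class Dtype { Half, Float };  // stand-in for the runtime's dtype enum

// e.g. variant_name(true, Dtype::Float)
//   == "conv_transpose2d_prepack_weights_float"
std::string variant_name(bool prepack_weights, Dtype dtype) {
  std::string name = "conv_transpose2d";
  if (prepack_weights) {
    name += "_prepack_weights";
  }
  name += (dtype == Dtype::Half) ? "_half" : "_float";
  return name;
}
```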
diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv2d.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv2d.cpp
index 03f9b40c2f3..5d4b36f03f3 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Conv2d.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Conv2d.cpp
@@ -28,13 +28,15 @@ void resize_conv2d_node(
   size_t ndim = self.sizes().size();
   std::vector<int64_t> new_out_sizes(ndim);
+  const bool transposed = graph->get_val(extra_args[4]).toBool();
 
   // Batch, Channel
   if (ndim == 4) {
     new_out_sizes.at(ndim - 4) = self.sizes().at(ndim - 4);
   }
   const auto weight_sizes = graph->get_val(extra_args[0]).toTensorRef().sizes;
-  new_out_sizes.at(ndim - 3) = weight_sizes.at(ndim - 4);
+  new_out_sizes.at(ndim - 3) =
+      transposed ? weight_sizes.at(ndim - 3) : weight_sizes.at(ndim - 4);
 
   // Height, Width
   const auto new_out_sizes_hw = calc_out_sizes_hw(
@@ -42,9 +44,8 @@
       self.sizes(),
       extra_args[0],
       /*kernel_size_only = */ false,
-      extra_args[1],
-      extra_args[2],
-      extra_args[3]);
+      {extra_args[1], extra_args[2], extra_args[3], extra_args[5]},
+      transposed);
 
   new_out_sizes.at(ndim - 2) = new_out_sizes_hw.at(0);
   new_out_sizes.at(ndim - 1) = new_out_sizes_hw.at(1);
@@ -79,9 +80,16 @@ ValueRef prepack_biases(ComputeGraph& graph, const ValueRef vref) {
   return v;
 }
 
-api::ShaderInfo get_conv2d_shader(const vTensor& t_out, bool prepack_weights) {
+api::ShaderInfo get_conv2d_shader(
+    const vTensor& t_out,
+    const bool prepack_weights,
+    const bool transposed) {
   std::stringstream kernel_name;
-  kernel_name << "conv2d";
+  if (transposed) {
+    kernel_name << "conv_transpose2d";
+  } else {
+    kernel_name << "conv2d";
+  }
   if (prepack_weights) {
     kernel_name << "_prepack_weights";
   }
@@ -90,7 +98,10 @@ api::ShaderInfo get_conv2d_shader(const vTensor& t_out, bool prepack_weights) {
   return VK_KERNEL_FROM_STR(kernel_name.str());
 }
 
-ValueRef prepack_weights(ComputeGraph& graph, const ValueRef vref) {
+ValueRef prepack_weights(
+    ComputeGraph& graph,
+    const ValueRef vref,
+    const bool transposed) {
   const auto original_sizes = graph.get_val(vref).toTensorRef().sizes;
 
   int64_t batch_padded =
@@ -101,7 +112,9 @@
   int64_t width = api::utils::val_at(-1, original_sizes);
 
   const auto final_sizes = std::vector<int64_t>{
-      4, batch_padded * height / 4, channels_padded * width};
+      4,
+      transposed ? channels_padded * height / 4 : batch_padded * height / 4,
+      transposed ? batch_padded * width : channels_padded * width};
 
   ValueRef v = graph.add_tensor(
       final_sizes,
@@ -113,7 +126,8 @@
   api::utils::uvec3 global_size = t.extents();
   api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
 
-  api::ShaderInfo shader = get_conv2d_shader(t, /*prepack_weights = */ true);
+  api::ShaderInfo shader =
+      get_conv2d_shader(t, /*prepack_weights = */ true, transposed);
 
   const auto padded_sizes = std::vector<int64_t>{batch_padded, channels_padded};
@@ -152,7 +166,8 @@ struct Conv2dParams final {
 Conv2dParams create_conv2d_params(
     ComputeGraph& graph,
     const ValueRef weight,
-    const KernelParams& p) {
+    const KernelParams& p,
+    const bool transposed) {
   const auto overlay_region = api::utils::make_ivec2({
       p.kernel_size.data[0] +
           (p.kernel_size.data[0] - 1) * (p.dilation.data[0] - 1),
@@ -160,12 +175,19 @@ Conv2dParams create_conv2d_params(
           (p.kernel_size.data[1] - 1) * (p.dilation.data[1] - 1),
   });
   const auto weight_sizes = graph.get_val(weight).toTensorRef().sizes;
-  const int32_t in_group_size = api::utils::safe_downcast<int32_t>(
-      api::utils::align_up(weight_sizes.at(1), INT64_C(4)));
+  const int32_t in_group_size =
+      api::utils::safe_downcast<int32_t>(api::utils::align_up(
+          transposed ? weight_sizes.at(0) : weight_sizes.at(1), INT64_C(4)));
 
   return {overlay_region, in_group_size};
 }
 
-void check_conv2d_params(const KernelParams& p) {
+void check_conv2d_params(const KernelParams& p, const bool transposed) {
+  if (transposed) {
+    if (p.dilation.data[0] > 1 || p.dilation.data[1] > 1) {
+      VK_THROW(
+          "aten.convolution.default: transposed = true, dilation > 1 is not supported yet!");
+    }
+  }
   if ((p.padding.data[0] > 0 && p.kernel_size.data[0] > 1 &&
        p.dilation.data[0] > 1) ||
       (p.padding.data[1] > 0 && p.kernel_size.data[1] > 1 &&
@@ -183,14 +205,17 @@
     const ValueRef stride,
     const ValueRef padding,
     const ValueRef dilation,
+    const ValueRef transposed,
+    const ValueRef output_padding,
     const ValueRef out) {
+  const bool transposed_val = graph.get_val(transposed).toBool();
+
   ValueRef arg_in = prepack_if_tensor_ref(graph, in);
-  ValueRef arg_weight = prepack_weights(graph, weight);
+  ValueRef arg_weight = prepack_weights(graph, weight, transposed_val);
   ValueRef arg_bias = prepack_biases(graph, bias);
 
   vTensor& t_in = graph.get_val(arg_in).toTensor();
   vTensor& t_out = graph.get_val(out).toTensor();
-  check_conv2d_args(t_in, t_out);
 
   api::utils::uvec3 global_size = t_out.virtual_extents();
@@ -204,12 +229,12 @@
       padding,
       dilation);
   Conv2dParams extra_params =
-      create_conv2d_params(graph, weight, kernel_params);
+      create_conv2d_params(graph, weight, kernel_params, transposed_val);
 
-  check_conv2d_params(kernel_params);
+  check_conv2d_params(kernel_params, transposed_val);
 
   api::ShaderInfo shader =
-      get_conv2d_shader(t_out, /*prepack_weights = */ false);
+      get_conv2d_shader(t_out, /*prepack_weights = */ false, transposed_val);
 
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
@@ -228,20 +253,25 @@
       },
       // Resizing
       resize_conv2d_node,
-      {weight, stride, padding, dilation}));
+      {weight, stride, padding, dilation, transposed, output_padding}));
 }
 
 void conv2d(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  const bool transposed = graph.get_val(args[6]).toBool();
-  if (transposed) {
-    VK_THROW("aten.convolution.default: transpose is not supported yet!");
-  }
-
   const int64_t groups = graph.get_val(args[8]).toInt();
   if (groups > 1) {
     VK_THROW("aten.convolution.default: groups > 1 is not supported yet!");
   }
 
   return add_conv2d_node(
-      graph, args[0], args[1], args[2], args[3], args[4], args[5], args[9]);
+      graph,
+      args[0],
+      args[1],
+      args[2],
+      args[3],
+      args[4],
+      args[5],
+      args[6],
+      args[7],
+      args[9]);
 }
 
 REGISTER_OPERATORS {
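The dimension swaps in resize_conv2d_node and create_conv2d_params above both come from PyTorch's weight layouts: Conv2d stores weights as (out_channels, in_channels/groups, kH, kW), while ConvTranspose2d stores them as (in_channels, out_channels/groups, kH, kW). A standalone sketch for a 4-D weight (hypothetical helpers, for illustration only):

```cpp
#include <cstdint>
#include <vector>

int64_t out_channels(const std::vector<int64_t>& weight_sizes, bool transposed) {
  // Conv2d: dim 0 is out_channels; ConvTranspose2d: dim 1 is.
  return transposed ? weight_sizes.at(1) : weight_sizes.at(0);
}

int64_t in_group_size(const std::vector<int64_t>& weight_sizes, bool transposed) {
  // Conv2d: dim 1 is in_channels/groups; ConvTranspose2d: dim 0 is.
  // Aligned up to a texel of 4, as in create_conv2d_params.
  const int64_t in = transposed ? weight_sizes.at(0) : weight_sizes.at(1);
  return (in + 3) / 4 * 4;
}
```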
diff --git a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
index d5f16cd98a8..a6ac2a1cb87 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp
@@ -40,10 +40,7 @@ void resize_max_pool2d_node(
       self.sizes(),
       extra_args[0],
       /*kernel_size_only = */ true,
-      extra_args[1],
-      extra_args[2],
-      extra_args[3],
-      extra_args[4]);
+      {extra_args[1], extra_args[2], extra_args[3], extra_args[4]});
 
   new_out_sizes.at(ndim - 2) = new_out_sizes_hw.at(0);
   new_out_sizes.at(ndim - 1) = new_out_sizes_hw.at(1);
diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp
index de55b296b9a..f1f3bfc6828 100644
--- a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp
@@ -56,50 +56,106 @@ int64_t calc_out_size(
   if (ceil_mode && (out_size - 1) * stride >= in_size + padding) {
     --out_size;
   }
+  VK_CHECK_COND(out_size >= 1);
   return out_size;
 }
 
 std::vector<int64_t> calc_out_sizes_hw(
-    ComputeGraph& graph,
     const std::vector<int64_t>& in_sizes,
-    const ValueRef weight,
-    const bool kernel_size_only,
-    const ValueRef stride,
-    const ValueRef padding,
-    const ValueRef dilation,
-    const ValueRef ceil_mode) {
+    const api::utils::ivec2& kernel_size,
+    const api::utils::ivec2& stride,
+    const api::utils::ivec2& padding,
+    const api::utils::ivec2& dilation,
+    const bool ceil_mode) {
   const int64_t ndim = in_sizes.size();
   std::vector<int64_t> out_sizes(2);
 
-  const auto kernel_vec =
-      make_ivec2_kernel_size(graph, weight, kernel_size_only);
-  const auto stride_vec = make_ivec2_from_list(graph, stride);
-  const auto padding_vec = make_ivec2_from_list(graph, padding);
-  const auto dilation_vec = make_ivec2_from_list(graph, dilation);
-  const bool ceil_mode_val =
-      ceil_mode == kDummyValueRef ? false : graph.get_val(ceil_mode).toBool();
-
   // Height
   out_sizes.at(0) = calc_out_size(
       in_sizes.at(ndim - 2),
-      kernel_vec.data[1],
-      stride_vec.data[1],
-      padding_vec.data[1],
-      dilation_vec.data[1],
-      ceil_mode_val);
+      kernel_size.data[1],
+      stride.data[1],
+      padding.data[1],
+      dilation.data[1],
+      ceil_mode);
   // Width
   out_sizes.at(1) = calc_out_size(
       in_sizes.at(ndim - 1),
-      kernel_vec.data[0],
-      stride_vec.data[0],
-      padding_vec.data[0],
-      dilation_vec.data[0],
-      ceil_mode_val);
+      kernel_size.data[0],
+      stride.data[0],
+      padding.data[0],
+      dilation.data[0],
+      ceil_mode);
 
-  VK_CHECK_COND(out_sizes.at(0) >= 1);
-  VK_CHECK_COND(out_sizes.at(1) >= 1);
+  return out_sizes;
+}
+
+int64_t calc_transpose_out_size(
+    const int64_t in_size,
+    const int64_t kernel,
+    const int64_t stride,
+    const int64_t padding,
+    const int64_t dilation,
+    const int64_t output_padding) {
+  int64_t out_size = (in_size - 1) * stride - 2 * padding +
+      dilation * (kernel - 1) + output_padding + 1;
+  VK_CHECK_COND(out_size >= 1);
+  return out_size;
+}
+
+std::vector<int64_t> calc_transpose_out_sizes_hw(
+    const std::vector<int64_t>& in_sizes,
+    const api::utils::ivec2& kernel_size,
+    const api::utils::ivec2& stride,
+    const api::utils::ivec2& padding,
+    const api::utils::ivec2& dilation,
+    const api::utils::ivec2& output_padding) {
+  const int64_t ndim = in_sizes.size();
+  std::vector<int64_t> out_sizes(2);
+
+  // Height
+  out_sizes.at(0) = calc_transpose_out_size(
+      in_sizes.at(ndim - 2),
+      kernel_size.data[1],
+      stride.data[1],
+      padding.data[1],
+      dilation.data[1],
+      output_padding.data[1]);
+  // Width
+  out_sizes.at(1) = calc_transpose_out_size(
+      in_sizes.at(ndim - 1),
+      kernel_size.data[0],
+      stride.data[0],
+      padding.data[0],
+      dilation.data[0],
+      output_padding.data[0]);
 
   return out_sizes;
 }
 
+std::vector<int64_t> calc_out_sizes_hw(
+    ComputeGraph& graph,
+    const std::vector<int64_t>& in_sizes,
+    const ValueRef weight,
+    const bool kernel_size_only,
+    const std::vector<ValueRef>& args,
+    const bool transposed) {
+  const auto kernel_size =
+      make_ivec2_kernel_size(graph, weight, kernel_size_only);
+  const auto stride = make_ivec2_from_list(graph, args[0]);
+  const auto padding = make_ivec2_from_list(graph, args[1]);
+  const auto dilation = make_ivec2_from_list(graph, args[2]);
+
+  if (transposed) {
+    const auto output_padding = make_ivec2_from_list(graph, args[3]);
+    return calc_transpose_out_sizes_hw(
+        in_sizes, kernel_size, stride, padding, dilation, output_padding);
+  } else {
+    Value& vref = graph.get_val(args[3]);
+    const bool ceil_mode = vref.isBool() ? vref.toBool() : false;
+    return calc_out_sizes_hw(
+        in_sizes, kernel_size, stride, padding, dilation, ceil_mode);
+  }
+}
+
 } // namespace vkcompute
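calc_transpose_out_size is the standard ConvTranspose2d shape formula, the inverse of the forward formula in calc_out_size up to the output_padding term, which disambiguates input sizes that a strided conv maps to the same output. Note also that the isBool() check in the unified wrapper is what lets pooling pass ceil_mode (a Bool) and non-transposed convolution pass output_padding (an int list) in the same args[3] slot without the latter being misread as ceil_mode. Checking the formula against the parameters used by the Python test added later in this diff (input 40x50, kernel 3x3, stride (1,2), padding (2,3), output_padding (0,1), dilation 1); the forward-formula mirror below follows the standard Conv2d shape rule, since calc_out_size's body sits outside this hunk's context:

```cpp
#include <cstdint>

// Compile-time mirrors of the two shape formulas, for a quick sanity check.
constexpr int64_t conv_out(
    int64_t in, int64_t k, int64_t s, int64_t p, int64_t d) {
  return (in + 2 * p - d * (k - 1) - 1) / s + 1;
}
constexpr int64_t conv_transpose_out(
    int64_t in, int64_t k, int64_t s, int64_t p, int64_t d, int64_t op) {
  return (in - 1) * s - 2 * p + d * (k - 1) + op + 1;
}

static_assert(conv_transpose_out(40, 3, 1, 2, 1, 0) == 38, "height");
static_assert(conv_transpose_out(50, 3, 2, 3, 1, 1) == 96, "width");
// Round trip: a conv with the same parameters maps 96 columns back to 50.
static_assert(conv_out(96, 3, 2, 3, 1) == 50, "inverse");
```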
diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h
index 923b3d8fd74..fafb00e126c 100644
--- a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h
+++ b/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h
@@ -36,9 +36,7 @@ std::vector<int64_t> calc_out_sizes_hw(
     const std::vector<int64_t>& in_sizes,
     const ValueRef weight,
     const bool kernel_size_only,
-    const ValueRef stride,
-    const ValueRef padding,
-    const ValueRef dilation,
-    const ValueRef ceil_mode = kDummyValueRef);
+    const std::vector<ValueRef>& args,
+    const bool transposed = false);
 
 } // namespace vkcompute
diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py
index d305fd19663..37b0b691b3f 100644
--- a/backends/vulkan/test/test_vulkan_delegate.py
+++ b/backends/vulkan/test/test_vulkan_delegate.py
@@ -523,3 +523,31 @@ def forward(self, x):
             sample_inputs,
             memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED],
         )
+
+    def test_vulkan_backend_conv_transpose2d(self):
+        class ConvTranspose2dModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv = torch.nn.ConvTranspose2d(
+                    in_channels=6,
+                    out_channels=8,
+                    kernel_size=(3, 3),
+                    padding=(2, 3),
+                    stride=(1, 2),
+                    output_padding=(0, 1),
+                    dilation=1,
+                    groups=1,
+                    bias=True,
+                )
+
+            def forward(self, x):
+                return self.conv(x)
+
+        conv_transpose2d_module = ConvTranspose2dModule()
+        sample_inputs = (torch.randn(size=(1, 6, 40, 50), dtype=torch.float32),)
+
+        self.lower_module_and_test_output(
+            conv_transpose2d_module,
+            sample_inputs,
+            memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED],
+        )
diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp
index 0f0edafe75a..caa94dd8f02 100644
--- a/backends/vulkan/test/utils/test_utils.cpp
+++ b/backends/vulkan/test/utils/test_utils.cpp
@@ -59,11 +59,17 @@ void record_conv2d_prepack_weights_op(
     api::VulkanBuffer& src_buffer,
     vTensor& v_dst,
     const std::vector<int64_t>& original_sizes,
-    const std::vector<int64_t>& padded_sizes) {
+    const std::vector<int64_t>& padded_sizes,
+    const bool transposed) {
   api::PipelineBarrier pipeline_barrier{};

   std::stringstream kernel_name;
-  kernel_name << "conv2d_prepack_weights";
+  if (transposed) {
+    kernel_name << "conv_transpose2d";
+  } else {
+    kernel_name << "conv2d";
+  }
+  kernel_name << "_prepack_weights";
   apply_dtype_suffix(kernel_name, v_dst);

   api::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name.str());
diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h
index 2d7d0b0746f..a1f3b93dc3a 100644
--- a/backends/vulkan/test/utils/test_utils.h
+++ b/backends/vulkan/test/utils/test_utils.h
@@ -86,7 +86,8 @@ void record_conv2d_prepack_weights_op(
     api::VulkanBuffer& src_buffer,
     vTensor& v_dst,
     const std::vector<int64_t>& original_sizes,
-    const std::vector<int64_t>& padded_sizes);
+    const std::vector<int64_t>& padded_sizes,
+    const bool transposed);
 
 void record_binary_op(
     api::Context* const context,
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index c8e58c25cd2..88e0e68120d 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -1173,11 +1173,12 @@ TEST(VulkanComputeGraphOpsTest, max_pool2d_smoke_test) {
       kernel);
 }
 
-TEST(VulkanComputeGraphOpsTest, conv2d_prepack_test) {
-  const auto original_sizes = std::vector<int64_t>{2, 3, 1, 2};
-  const auto padded_sizes = std::vector<int64_t>{4, 4};
-  const auto gpu_sizes = std::vector<int64_t>{4, 1, 8};
-
+void test_conv2d(
+    const std::vector<int64_t>& original_sizes,
+    const std::vector<int64_t>& padded_sizes,
+    const std::vector<int64_t>& gpu_sizes,
+    const bool transposed,
+    const std::vector<float>& data_out_expected) {
   vTensor vten = vTensor(
       api::context(),
       gpu_sizes,
@@ -1207,7 +1208,8 @@
       staging_buffer_in.buffer(),
       vten,
       original_sizes,
-      padded_sizes);
+      padded_sizes,
+      transposed);
   record_image_to_nchw_op(api::context(), vten, staging_buffer_out.buffer());
 
   // Execute command buffer
@@ -1219,10 +1221,26 @@
       staging_buffer_out, data_out.data(), sizeof(float) * out_numel);
 
   // Check data matches results copied from ATen-VK
-  std::vector<float> data_out_expected = {1, 3, 5,  0, 2, 4, 6, 0, 7, 9, 11,
-                                          0, 8, 10, 12, 0, 0, 0, 0, 0, 0, 0,
-                                          0, 0, 0,  0, 0, 0, 0, 0, 0, 0};
   for (int i = 0; i < vten.numel(); i++) {
     CHECK_VALUE(data_out, i, data_out_expected[i]);
   }
 }
+
+TEST(VulkanComputeGraphOpsTest, conv2d_prepack_test) {
+  test_conv2d(
+      /*original_sizes = */ {2, 3, 1, 2},
+      /*padded_sizes = */ {4, 4},
+      /*gpu_sizes = */ {4, 1, 8},
+      /*transposed = */ false,
+      /*data_out_expected = */ {1, 3, 5,  0, 2, 4, 6, 0, 7, 9, 11,
+                                0, 8, 10, 12, 0, 0, 0, 0, 0, 0, 0,
+                                0, 0, 0,  0, 0, 0, 0, 0, 0, 0});
+  test_conv2d(
+      /*original_sizes = */ {2, 3, 1, 2},
+      /*padded_sizes = */ {4, 4},
+      /*gpu_sizes = */ {4, 1, 8},
+      /*transposed = */ true,
+      /*data_out_expected = */ {2, 8, 0, 0, 1, 7,  0, 0, 4, 10, 0,
+                                0, 3, 9, 0, 0, 6, 12, 0, 0, 5, 11,
+                                0, 0, 0, 0, 0, 0,  0, 0, 0, 0});
+}
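The transposed expectation can be derived by hand. Assuming the staging buffer is seeded with 1..12 in NCHW order (consistent with the non-transposed expectation {1, 3, 5, ...} for the same 2x3x1x2 weight), each output texel of the transposed prepack holds the four batch entries weight[n][c][h][w'] for a fixed (c, h, w'), with w' walking the width in reverse and n >= N reading as zero padding. A hypothetical reference generator, inferred from this one test vector rather than from the shader's general index math:

```cpp
#include <vector>

std::vector<float> transposed_prepack_reference() {
  const int N = 2, C = 3, H = 1, W = 2;  // original weight sizes
  const int Np = 4, Cp = 4;              // N and C padded up to 4
  auto value_at = [&](int n, int c, int h, int w) -> float {
    if (n >= N || c >= C) {
      return 0.f;  // zero in the padded region
    }
    return 1.f + ((n * C + c) * H + h) * W + w;  // values 1..12, NCHW order
  };
  std::vector<float> out;
  for (int c = 0; c < Cp; ++c) {
    for (int h = H - 1; h >= 0; --h) {    // H flipped (trivial here: H == 1)
      for (int w = W - 1; w >= 0; --w) {  // W flipped
        for (int n = 0; n < Np; ++n) {    // batches interleaved within a texel
          out.push_back(value_at(n, c, h, w));
        }
      }
    }
  }
  // {2,8,0,0, 1,7,0,0, 4,10,0,0, 3,9,0,0, 6,12,0,0, 5,11,0,0, 0 x 8}
  return out;
}
```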