diff --git a/backends/vulkan/runtime/api/Tensor.h b/backends/vulkan/runtime/api/Tensor.h
index e1e28bf582c..110e94ab943 100644
--- a/backends/vulkan/runtime/api/Tensor.h
+++ b/backends/vulkan/runtime/api/Tensor.h
@@ -149,13 +149,14 @@ class vTensor final {
   // to be interpreted as a tensor with a different size.
   api::utils::uvec3 virtual_extents_;
 
-  // A Vulkan uniform buffer containing the tensor sizes that can be passed into
-  // a shader.
+  // A Vulkan uniform buffer containing the tensor sizes in WHCN that can be
+  // passed into a shader.
   std::shared_ptr<api::UniformParamsBuffer> cpu_sizes_uniform_;
 
-  // A Vulkan uniform buffer containing the GPU tensor sizes that can be passed
-  // into a shader. GPU sizes refers to the sizes of the tensor after padding
-  // has been applied to one dimension to align it to the next multiple of 4.
+  // A Vulkan uniform buffer containing the GPU tensor sizes in WHCN that can
+  // be passed into a shader. GPU sizes refers to the sizes of the tensor after
+  // padding has been applied to one dimension to align it to the next multiple
+  // of 4.
   std::shared_ptr<api::UniformParamsBuffer> gpu_sizes_uniform_;
 
   // A Vulkan uniform buffer containing the image extents of the underlying
diff --git a/backends/vulkan/runtime/api/Utils.h b/backends/vulkan/runtime/api/Utils.h
index f04c11ba030..3b0139b8efb 100644
--- a/backends/vulkan/runtime/api/Utils.h
+++ b/backends/vulkan/runtime/api/Utils.h
@@ -262,6 +262,12 @@ inline std::ostream& operator<<(std::ostream& os, const uvec3& v) {
   return os;
 }
 
+inline std::ostream& operator<<(std::ostream& os, const uvec4& v) {
+  os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", "
+     << v.data[3u] << ")";
+  return os;
+}
+
 //
 // std::vector Handling
 //
diff --git a/backends/vulkan/runtime/graph/Logging.h b/backends/vulkan/runtime/graph/Logging.h
index f2684081332..2c42b78fa5e 100644
--- a/backends/vulkan/runtime/graph/Logging.h
+++ b/backends/vulkan/runtime/graph/Logging.h
@@ -29,4 +29,8 @@ inline std::ostream& operator<<(std::ostream& os, const api::utils::uvec3& v) {
   return api::utils::operator<<(os, v);
 }
 
+inline std::ostream& operator<<(std::ostream& os, const api::utils::uvec4& v) {
+  return api::utils::operator<<(os, v);
+}
+
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute.glsl
new file mode 100644
index 00000000000..53883c68e3b
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/permute.glsl
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_type(DTYPE)}
+
+layout(std430) buffer;
+
+#include "indexing_utils.h"
+
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
+
+layout(set = 0, binding = 2) uniform PRECISION restrict OutExtents {
+  // tensor size in WHCN.
+  uvec4 data;
+}
+out_sizes;
+
+/*
+ * Params Buffer
+ */
+layout(set = 0, binding = 3) uniform PRECISION restrict Block {
+  // output dims
+  uvec4 out_ndims;
+  // x = output channels aligned to 4, y = input channels aligned to 4
+  uvec2 ch_info;
+}
+uBlock;
+
+/*
+ * Local Work Group
+ */
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const ivec3 posOut = ivec3(gl_GlobalInvocationID);
+
+  const ivec4 idx = to_tensor_idx_C_packed(posOut, out_sizes.data);
+  if (any(greaterThanEqual(idx, out_sizes.data))) {
+    return;
+  }
+
+  const int out_channel_4up = int(uBlock.ch_info.x);
+  const int in_channel_4up = int(uBlock.ch_info.y);
+  const int out_batch = int(out_sizes.data[3]);
+  const int max_dst_index = out_batch * out_channel_4up;
+  VEC4_T outval = VEC4_T(0.0);
+
+  for (int j = 0; j < 4; ++j) {
+    int dst_index = posOut.z * 4 + j;
+    if (dst_index >= max_dst_index) {
+      // out of range
+      break;
+    }
+
+    ivec4 v = ivec4(0); // holds b,c,h,w
+    v[uBlock.out_ndims[0]] = dst_index / out_channel_4up;
+    v[uBlock.out_ndims[1]] = dst_index % out_channel_4up;
+    v[uBlock.out_ndims[2]] = posOut.y;
+    v[uBlock.out_ndims[3]] = posOut.x;
+
+    int src_index = v[0] * in_channel_4up + v[1];
+    int w = v[3];
+    int h = v[2];
+
+    VEC4_T inval = VEC4_T(texelFetch(image_in, ivec3(w, h, src_index / 4), 0));
+    outval[j] = inval[src_index % 4];
+  }
+  imageStore(image_out, posOut, outval);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute.yaml b/backends/vulkan/runtime/graph/ops/glsl/permute.yaml
new file mode 100644
index 00000000000..77491a52856
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/permute.yaml
@@ -0,0 +1,10 @@
+permute:
+  parameter_names_with_default_values:
+    DTYPE: float
+    NDIM: 3
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+  shader_variants:
+    - NAME: permute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
new file mode 100644
index 00000000000..0f43631e84d
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+using api::utils::ivec3;
+using api::utils::uvec2;
+using api::utils::uvec4;
+
+void check_args(
+    const vTensor& in,
+    const IntListPtr& permute_dims,
+    const vTensor& out) {
+  VK_CHECK_COND(check_memory_layout_is(in, api::kChannelsPacked));
+  VK_CHECK_COND(check_memory_layout_is(out, api::kChannelsPacked));
+
+  int64_t in_dim = in.dim();
+  VK_CHECK_COND(
+      in_dim == permute_dims->size(),
+      "Input tensor dim size must match argument");
+}
+
+void add_permute_node(
+    ComputeGraph& graph,
+    ValueRef in,
+    ValueRef permute_dims_ref,
+    ValueRef out) {
+  vTensorPtr t_in = graph.get_tensor(in);
+  vTensorPtr t_out = graph.get_tensor(out);
+
+  IntListPtr permute_dims = graph.get_int_list(permute_dims_ref);
+
+  check_args(*t_in, permute_dims, *t_out);
+
+  uvec4 in_size{1u, 1u, 1u, 1u}, out_size{1u, 1u, 1u, 1u};
+  uvec4 out_dims{0u, 1u, 2u, 3u};
+
+  int64_t in_dim = t_in->dim();
+
+  std::vector<bool> seen(in_dim);
+  for (int i = 0; i < in_dim; i++) {
+    int64_t permute_dim = (*permute_dims)[i];
+    VK_CHECK_COND(
+        !seen[permute_dim], "Argument dim ", permute_dim, " is repeated");
+    seen[permute_dim] = true;
+
+    // Map to 4D tensor dims.
+    in_size.data[(4u - in_dim) + i] = t_in->size(i);
+    out_size.data[(4u - in_dim) + i] = t_in->size(permute_dim);
+    out_dims.data[(4u - in_dim) + i] = permute_dim + (4u - in_dim);
+  }
+
+  std::string kernel_name = "permute";
+  kernel_name.reserve(kShaderNameReserve);
+  add_dtype_suffix(kernel_name, *t_out);
+
+  uint32_t out_channels = out_size.data[1u];
+  uint32_t in_channels = in_size.data[1u];
+
+  uint32_t out_c_aligned = api::utils::align_up(out_channels, 4u);
+  uint32_t in_c_aligned = api::utils::align_up(in_channels, 4u);
+
+  const struct Block final {
+    uvec4 out_ndims;
+    uvec2 ch_info;
+  } params{
+      out_dims,
+      {out_c_aligned, in_c_aligned},
+  };
+
+  api::utils::uvec3 global_size = t_out->virtual_extents();
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  graph.execute_nodes().emplace_back(new ExecuteNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      global_size,
+      local_size,
+      {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}},
+      {t_out->gpu_sizes_ubo(), graph.create_params_buffer(params)}));
+}
+
+void permute(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  return add_permute_node(graph, args[0], args[1], args[2]);
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(aten.permute_copy.default, permute);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index 7ebe7bbcffa..49c3188174b 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -171,6 +171,29 @@ def get_select_int_inputs():
     return test_suite
 
 
+def get_permute_inputs():
+    test_suite = VkTestSuite(
+        [
+            ((9, 2, 9, 4), [0, 1, 2, 3]),
+            ((9, 2, 9, 4), [0, 1, 3, 2]),
+            ((9, 2, 9, 4), [0, 2, 1, 3]),
+            ((9, 2, 9, 4), [0, 2, 3, 1]),
+            ((9, 2, 9, 4), [0, 3, 1, 2]),
+            ((9, 2, 9, 4), [0, 3, 2, 1]),
+            ((9, 2, 9, 4), [3, 0, 1, 2]),
+            ((9, 2, 9, 4), [3, 2, 0, 1]),
+            ((9, 2, 9, 4), [2, 3, 0, 1]),
+            ((9, 2, 9, 4), [2, 0, 3, 1]),
+            ((9, 2, 9), [2, 0, 1]),
+            ((9, 2, 9), [1, 2, 0]),
+            ((9, 2), [0, 1]),
+            ((9, 2), [1, 0]),
+        ]
+    )
+    test_suite.layouts = ["api::kChannelsPacked"]
+    return test_suite
+
+
 test_suites = {
     "aten.add.Tensor": get_binary_elementwise_inputs(),
     "aten.sub.Tensor": get_binary_elementwise_inputs(),
@@ -183,4 +206,5 @@ def get_select_int_inputs():
     "aten.full.default": get_full_inputs(),
     "aten.select.int": get_select_int_inputs(),
     "aten.select_copy.int": get_select_int_inputs(),
+    "aten.permute_copy.default": get_permute_inputs(),
 }
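
The host-side loop in `add_permute_node` right-aligns an N-dimensional permutation into the fixed 4-slot NCHW layout the shader expects. Below is a minimal sketch of that arithmetic, in plain Python and not part of the patch (the helper name `remap_to_4d` is made up for illustration), using the `((9, 2, 9), [2, 0, 1])` case from `cases.py`:

```python
def remap_to_4d(sizes, permute_dims):
    # Mirrors the loop in add_permute_node: sizes and the permutation are
    # right-aligned into 4 slots; leading slots stay 1 (sizes) / identity (dims).
    offset = 4 - len(sizes)
    in_size, out_size, out_ndims = [1] * 4, [1] * 4, [0, 1, 2, 3]
    for i, p in enumerate(permute_dims):
        in_size[offset + i] = sizes[i]
        out_size[offset + i] = sizes[p]
        out_ndims[offset + i] = p + offset
    return in_size, out_size, out_ndims


# ((9, 2, 9), [2, 0, 1]) -> in_size [1, 9, 2, 9], out_size [1, 9, 9, 2],
# out_ndims [0, 3, 1, 2]: each slot of out_ndims records which padded input
# dim feeds that output dim, which is what uBlock.out_ndims consumes.
print(remap_to_4d((9, 2, 9), [2, 0, 1]))
```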
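For reviewing the shader's per-texel gather, a CPU reference that follows the same index math may help. This is a sketch under assumptions and not part of the patch (`permute_channels_packed` and `align_up` are illustrative names): it walks `dst_index` over batches times channels aligned to 4, scatters the output coordinate through the permutation the way the shader's `v[uBlock.out_ndims[k]]` assignments do, and checks against `np.transpose`, which has the same semantics as `aten.permute_copy` for these cases:

```python
import numpy as np


def align_up(x: int, m: int) -> int:
    return (x + m - 1) // m * m


def permute_channels_packed(x: np.ndarray, perm) -> np.ndarray:
    # x is NCHW; perm[k] is the input dim that output dim k is taken from.
    out_shape = tuple(x.shape[p] for p in perm)
    n_out, c_out, h_out, w_out = out_shape
    out_c4 = align_up(c_out, 4)

    out = np.zeros(out_shape, dtype=x.dtype)
    # dst_index enumerates (batch, channel) pairs with channels padded to a
    # multiple of 4, the CPU analogue of posOut.z * 4 + j in the shader.
    for dst_index in range(n_out * out_c4):
        b, c = dst_index // out_c4, dst_index % out_c4
        if c >= c_out:
            continue  # padding lane; the GPU texel slot holds a don't-care value
        for h in range(h_out):
            for w in range(w_out):
                v = [0, 0, 0, 0]  # output coords scattered into input (n, c, h, w)
                v[perm[0]] = b
                v[perm[1]] = c
                v[perm[2]] = h
                v[perm[3]] = w
                out[b, c, h, w] = x[v[0], v[1], v[2], v[3]]
    return out


if __name__ == "__main__":
    x = np.arange(9 * 2 * 9 * 4, dtype=np.float32).reshape(9, 2, 9, 4)
    for perm in ([0, 2, 3, 1], [3, 2, 0, 1], [2, 0, 3, 1]):
        assert np.array_equal(permute_channels_packed(x, perm), np.transpose(x, perm))
```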