From d77d5c1d008f785b0801ab9c565f68e5851ea993 Mon Sep 17 00:00:00 2001
From: Prathik Rao <prathik.rao@gmail.com>
Date: Mon, 21 Apr 2025 15:11:25 -0700
Subject: [PATCH 1/3] depth to space impl

---
 .../providers/webgpu/tensor/depth_to_space.cc | 145 ++++++++++++++++++
 .../providers/webgpu/tensor/depth_to_space.h  |  39 +++++
 .../webgpu/webgpu_execution_provider.cc       |   8 +-
 .../cpu/tensor/space_depth_ops_test.cc        | 106 +++++++++++++
 4 files changed, 294 insertions(+), 4 deletions(-)
 create mode 100644 onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc
 create mode 100644 onnxruntime/core/providers/webgpu/tensor/depth_to_space.h
diff --git a/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc b/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc
new file mode 100644
index 0000000000000..fc981b7cc6feb
--- /dev/null
+++ b/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc
@@ -0,0 +1,145 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/webgpu/shader_helper.h"
+#include "core/providers/webgpu/webgpu_supported_types.h"
+#include "core/providers/webgpu/tensor/depth_to_space.h"
+#include "core/providers/webgpu/webgpu_utils.h"
+
+namespace onnxruntime {
+namespace webgpu {
+
+#define WEBGPU_DEPTH_TO_SPACE_VERSIONED_KERNEL(start, end, domain, is_nhwc) \
+  ONNX_OPERATOR_VERSIONED_KERNEL_EX(                                        \
+      DepthToSpace,                                                         \
+      domain,                                                               \
+      start,                                                                \
+      end,                                                                  \
+      kWebGpuExecutionProvider,                                             \
+      (*KernelDefBuilder::Create())                                         \
+          .TypeConstraint("T", WebGpuSupportedFloatTypes()),                \
+      DepthToSpace<is_nhwc>);
+// The macro above passes a start/end version pair to the registration; the one below passes a single version.
+#define WEBGPU_DEPTH_TO_SPACE_KERNEL(version, domain, is_nhwc) \
+  ONNX_OPERATOR_KERNEL_EX(                                     \
+      DepthToSpace,                                            \
+      domain,                                                  \
+      version,                                                 \
+      kWebGpuExecutionProvider,                                \
+      (*KernelDefBuilder::Create())                            \
+          .TypeConstraint("T", WebGpuSupportedFloatTypes()),   \
+      DepthToSpace<is_nhwc>);
+
+WEBGPU_DEPTH_TO_SPACE_VERSIONED_KERNEL(11, 12, kOnnxDomain, false)
+WEBGPU_DEPTH_TO_SPACE_KERNEL(13, kOnnxDomain, false)
+// Same versions for kMSInternalNHWCDomain, instantiating the kernel with is_nhwc = true.
+WEBGPU_DEPTH_TO_SPACE_VERSIONED_KERNEL(11, 12, kMSInternalNHWCDomain, true)
+WEBGPU_DEPTH_TO_SPACE_KERNEL(13, kMSInternalNHWCDomain, true)
+
+// Writes the WGSL helper `perm` to `os`: for each dimension d it emits input.IndicesSet("a", perm[d], "i[d]").
+void AppendPermFunction(std::ostream& os, const ShaderVariableHelper& input, const int64_t* perm) {
+  os << "fn perm(i: input_indices_t) -> input_indices_t {\n  var a: input_indices_t;\n";
+  for (int dim = 0; dim < input.Rank(); ++dim) {
+    const std::string source = "i[" + std::to_string(dim) + "]";
+    os << "  " << input.IndicesSet("a", std::to_string(perm[dim]), source) << "\n";
+  }
+  os << "  return a;\n}\n";
+}
+
+// Emits perm() plus a main body that stores, at each output offset, the input element at perm(indices).
+Status DepthToSpaceProgram::GenerateShaderCode(ShaderHelper& shader) const {
+  const ShaderVariableHelper& input = shader.AddInput("input");
+  const ShaderVariableHelper& output = shader.AddOutput("output");
+  AppendPermFunction(shader.AdditionalImplementation(), input, perm_);
+
+  auto& body = shader.MainFunctionBody();
+  body << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size");
+  body << "  let indices = " << output.OffsetToIndices("global_idx") << ";\n";
+  body << "  let aIndices = perm(indices);\n";
+  body << "  " << output.SetByOffset("global_idx", input.GetByIndices("aIndices"));
+  return Status::OK();
+}
+
+template <bool is_nhwc>
+Status DepthToSpace<is_nhwc>::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const {
+  const auto* input = context.Input(0);
+  const TensorShape input_shape = input->Shape();
+
+  int64_t n, c, h, w;
+  int64_t shape[6];
+  int64_t perm[6];
+  // The op runs as a rank-6 transpose: the input buffer is viewed as `shape`, and axis i of the
+  // transposed view has extent shape[perm[i]]. The layout decides where C, H and W are read from.
+  n = input_shape[0];
+  c = input_shape[is_nhwc ? 3 : 1];
+  h = input_shape[is_nhwc ? 1 : 2];
+  w = input_shape[is_nhwc ? 2 : 3];
+
+  // Validate first: the divisions below would otherwise divide by zero or truncate the depth.
+  ORT_ENFORCE(blocksize_ > 0, "DepthToSpace: blocksize must be positive, got ", blocksize_);
+  const int64_t block_area = blocksize_ * blocksize_;
+  ORT_ENFORCE(c % block_area == 0, "DepthToSpace: input depth ", c, " is not a multiple of blocksize^2.");
+  const int64_t out_c = c / block_area;
+
+  if constexpr (is_nhwc) {
+    if (is_dcr_) {
+      const int64_t shape_values[] = {n, h, w, blocksize_, blocksize_, out_c};
+      const int64_t perm_values[] = {0, 1, 3, 2, 4, 5};
+      std::copy(shape_values, shape_values + 6, shape);
+      std::copy(perm_values, perm_values + 6, perm);
+    } else {
+      const int64_t shape_values[] = {n, h, w, out_c, blocksize_, blocksize_};
+      const int64_t perm_values[] = {0, 1, 4, 2, 5, 3};
+      std::copy(shape_values, shape_values + 6, shape);
+      std::copy(perm_values, perm_values + 6, perm);
+    }
+  } else {
+    if (is_dcr_) {
+      const int64_t shape_values[] = {n, blocksize_, blocksize_, out_c, h, w};
+      const int64_t perm_values[] = {0, 3, 4, 1, 5, 2};
+      std::copy(shape_values, shape_values + 6, shape);
+      std::copy(perm_values, perm_values + 6, perm);
+    } else {
+      const int64_t shape_values[] = {n, out_c, blocksize_, blocksize_, h, w};
+      const int64_t perm_values[] = {0, 1, 4, 2, 5, 3};
+      std::copy(shape_values, shape_values + 6, shape);
+      std::copy(perm_values, perm_values + 6, perm);
+    }
+  }
+  TensorShape input_override_shape(gsl::make_span(shape));
+
+  // Calculate the final 4D output shape
+  int64_t output_shape[4];
+  if constexpr (is_nhwc) {
+    const int64_t output_shape_values[] = {n, h * blocksize_, w * blocksize_, out_c};
+    std::copy(output_shape_values, output_shape_values + 4, output_shape);
+  } else {
+    const int64_t output_shape_values[] = {n, out_c, h * blocksize_, w * blocksize_};
+    std::copy(output_shape_values, output_shape_values + 4, output_shape);
+  }
+  TensorShape final_output_shape(gsl::make_span(output_shape));
+
+  auto* output = context.Output(0, final_output_shape);
+  int64_t output_size = output->Shape().Size();
+  if (output_size == 0) {
+    return Status::OK();
+  }
+
+  std::vector<int64_t> shape_after_permutation_vec(6);
+  for (int i = 0; i < 6; i++) {
+    shape_after_permutation_vec[i] = shape[perm[i]];
+  }
+  TensorShape output_override_shape(shape_after_permutation_vec);
+
+  DepthToSpaceProgram program{perm};
+  program
+      .AddInput({input, ProgramTensorMetadataDependency::TypeAndRank, input_override_shape, 1})
+      .AddOutput({output, ProgramTensorMetadataDependency::TypeAndRank, output_override_shape, 1})
+      .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
+      .CacheHint(absl::StrJoin(input_shape.GetDims(), "-"), blocksize_, is_dcr_ ? "DCR" : "CRD")
+      .AddUniformVariable({static_cast<uint32_t>(output_size)});
+  return context.RunProgram(program);
+}
+
+}  // namespace webgpu
+}  // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/core/providers/webgpu/tensor/depth_to_space.h b/onnxruntime/core/providers/webgpu/tensor/depth_to_space.h
new file mode 100644
index 0000000000000..153618b5d0237
--- /dev/null
+++ b/onnxruntime/core/providers/webgpu/tensor/depth_to_space.h
@@ -0,0 +1,39 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/providers/webgpu/program.h"
+#include "core/providers/webgpu/webgpu_kernel.h"
+
+namespace onnxruntime {
+namespace webgpu {
+
+class DepthToSpaceProgram final : public Program<DepthToSpaceProgram> {
+ public:
+  explicit DepthToSpaceProgram(const int64_t* perm) : Program{"DepthToSpace"}, perm_{perm} {}
+  Status GenerateShaderCode(ShaderHelper& sh) const override;
+  WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32});
+
+ private:
+  const int64_t* perm_;  // Borrowed, never copied: the caller's array must outlive this program.
+};
+
+template <bool is_nhwc>
+class DepthToSpace final : public WebGpuKernel {
+ public:
+  explicit DepthToSpace(const OpKernelInfo& info) : WebGpuKernel(info) {
+    blocksize_ = info.GetAttr<int64_t>("blocksize");
+    const std::string mode = info.GetAttrOrDefault<std::string>("mode", "DCR");
+    ORT_ENFORCE(mode == "DCR" || mode == "CRD", "DepthToSpace: mode must be 'DCR' or 'CRD', got ", mode);
+    is_dcr_ = (mode == "DCR");
+  }
+  Status ComputeInternal(ComputeContext& context) const override;
+
+ private:
+  int64_t blocksize_;  // Value of the "blocksize" attribute.
+  bool is_dcr_;        // True when "mode" is "DCR" (the default), false when it is "CRD".
+};
+
+}  // namespace webgpu
+}  // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
index b126ca823970a..f5f108121cb8d 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
@@ -581,10 +581,10 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 21, 22, Transpose)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 23, Transpose)>,
 
-      // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, DepthToSpace)>,
-      // BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, DepthToSpace)>,
-      // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSInternalNHWCDomain, 11, 12, DepthToSpace)>,
-      // BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSInternalNHWCDomain, 13, DepthToSpace)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, DepthToSpace)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, DepthToSpace)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSInternalNHWCDomain, 11, 12, DepthToSpace)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSInternalNHWCDomain, 13, DepthToSpace)>,
 
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, Conv)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 21, Conv)>,
diff --git a/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc b/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc
index a40b85b7754a3..f97de7a54bc99 100644
--- a/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc
@@ -468,5 +468,111 @@ TEST(TensorOpTest, DepthToSpaceTest_CRD_Batched) {
   test.Run();
 }
 
+TEST(TensorOpTest, DepthToSpaceTest_WebGPU_DefaultMode1) {
+  // No "mode" attribute is set, so the operator's default mode is exercised.
+  constexpr int64_t blocksize = 2;
+  constexpr int64_t N = 1, C = 8, H = 1, W = 1;
+  const std::vector<float> input_data = {0, 9, 18, 27, 36, 45, 54, 63};
+  const std::vector<float> expected = {0, 18, 36, 54, 9, 27, 45, 63};
+
+  OpTester test("DepthToSpace", 11);
+  test.AddAttribute("blocksize", blocksize);
+
+  test.AddInput<float>("input", {N, C, H, W}, input_data);
+  test.AddOutput<float>("output", {1, 2, 2, 2}, expected);
+
+  test.Run();
+}
+
+TEST(TensorOpTest, DepthToSpaceTest_WebGPU_DefaultMode2) {
+  // Batched default-mode case; each row of the literals below holds one batch.
+  constexpr int64_t blocksize = 2;
+  constexpr int64_t N = 2, C = 8, H = 1, W = 2;
+  const std::vector<float> input_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                                         16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+  const std::vector<float> expected = {0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15,
+                                       16, 20, 17, 21, 24, 28, 25, 29, 18, 22, 19, 23, 26, 30, 27, 31};
+
+  OpTester test("DepthToSpace", 11);
+  test.AddAttribute("blocksize", blocksize);
+
+  test.AddInput<float>("input", {N, C, H, W}, input_data);
+  test.AddOutput<float>("output", {2, 2, 2, 4}, expected);
+
+  test.Run();
+}
+
+TEST(TensorOpTest, DepthToSpaceTest_WebGPU_DCR1) {
+  // Single pixel with 8 channels, mode set to "DCR" explicitly.
+  constexpr int64_t blocksize = 2;
+  constexpr int64_t N = 1, C = 8, H = 1, W = 1;
+  const std::vector<float> input_data = {0, 9, 18, 27, 36, 45, 54, 63};
+  const std::vector<float> expected = {0, 18, 36, 54, 9, 27, 45, 63};
+
+  OpTester test("DepthToSpace", 11);
+  test.AddAttribute("blocksize", blocksize);
+  test.AddAttribute("mode", "DCR");
+
+  test.AddInput<float>("input", {N, C, H, W}, input_data);
+  test.AddOutput<float>("output", {1, 2, 2, 2}, expected);
+  test.Run();
+}
+
+TEST(TensorOpTest, DepthToSpaceTest_WebGPU_DCR2) {
+  // Batched case with mode set to "DCR" explicitly; each row of the literals below holds one batch.
+  constexpr int64_t blocksize = 2;
+  constexpr int64_t N = 2, C = 8, H = 1, W = 2;
+  const std::vector<float> input_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                                         16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+  const std::vector<float> expected = {0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15,
+                                       16, 20, 17, 21, 24, 28, 25, 29, 18, 22, 19, 23, 26, 30, 27, 31};
+
+  OpTester test("DepthToSpace", 11);
+  test.AddAttribute("blocksize", blocksize);
+  test.AddAttribute("mode", "DCR");
+
+  test.AddInput<float>("input", {N, C, H, W}, input_data);
+  test.AddOutput<float>("output", {2, 2, 2, 4}, expected);
+
+  test.Run();
+}
+
+TEST(TensorOpTest, DepthToSpaceTest_WebGPU_CRD1) {
+  // Single pixel in "CRD" mode: the expected output lists the elements in input order.
+  constexpr int64_t blocksize = 2;
+  constexpr int64_t N = 1, C = 8, H = 1, W = 1;
+  const std::vector<float> input_data = {0, 9, 18, 27, 36, 45, 54, 63};
+  const std::vector<float> expected = {0, 9, 18, 27, 36, 45, 54, 63};
+
+  OpTester test("DepthToSpace", 11);
+  test.AddAttribute("blocksize", blocksize);
+  test.AddAttribute("mode", "CRD");
+
+  test.AddInput<float>("input", {N, C, H, W}, input_data);
+  test.AddOutput<float>("output", {1, 2, 2, 2}, expected);
+  test.Run();
+}
+
+TEST(TensorOpTest, DepthToSpaceTest_WebGPU_CRD2) {
+  // Batched case in "CRD" mode; each row of the literals below holds one batch.
+  constexpr int64_t blocksize = 2;
+  constexpr int64_t N = 2, C = 8, H = 1, W = 2;
+  const std::vector<float> input_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                                         16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+  const std::vector<float> expected = {0, 2, 1, 3, 4, 6, 5, 7, 8, 10, 9, 11, 12, 14, 13, 15,
+                                       16, 18, 17, 19, 20, 22, 21, 23, 24, 26, 25, 27, 28, 30, 29, 31};
+
+  OpTester test("DepthToSpace", 11);
+  test.AddAttribute("blocksize", blocksize);
+  test.AddAttribute("mode", "CRD");
+
+  test.AddInput<float>("input", {N, C, H, W}, input_data);
+  test.AddOutput<float>("output", {2, 2, 2, 4}, expected);
+
+  test.Run();
+}
+
 }  // namespace test
 }  // namespace onnxruntime

From 31e1779f741ecc3a5c79a37a6b0154752884f522 Mon Sep 17 00:00:00 2001
From: Prathik Rao <prathik.rao@gmail.com>
Date: Mon, 21 Apr 2025 15:50:39 -0700
Subject: [PATCH 2/3] add rank==4 check

---
 onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc b/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc
index fc981b7cc6feb..f55cf012f8c2f 100644
--- a/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc
+++ b/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc
@@ -64,6 +64,8 @@ template <bool is_nhwc>
 Status DepthToSpace<is_nhwc>::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const {
   const auto* input = context.Input(0);
   const TensorShape input_shape = input->Shape();
+  int64_t input_rank = input_shape.NumDimensions();
+  ORT_ENFORCE(input_rank == 4, "Input must be rank 4.");
 
   int64_t n, c, h, w;
   int64_t shape[6];
@@ -133,8 +135,8 @@ Status DepthToSpace<is_nhwc>::ComputeInternal(onnxruntime::webgpu::ComputeContex
 
   DepthToSpaceProgram program{perm};
   program
-      .AddInput({input, ProgramTensorMetadataDependency::TypeAndRank, input_override_shape, 1})
-      .AddOutput({output, ProgramTensorMetadataDependency::TypeAndRank, output_override_shape, 1})
+      .AddInput({input, ProgramTensorMetadataDependency::Type, input_override_shape, 1})
+      .AddOutput({output, ProgramTensorMetadataDependency::None, output_override_shape, 1})
       .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
       .CacheHint(absl::StrJoin(input_shape.GetDims(), "-"), blocksize_, is_dcr_ ? "DCR" : "CRD")
       .AddUniformVariable({static_cast<uint32_t>(output_size)});

From aa74351bb99a3f7cc805acbbd21b609291f6fbed Mon Sep 17 00:00:00 2001
From: Prathik Rao <prathik.rao@gmail.com>
Date: Mon, 21 Apr 2025 16:32:11 -0700
Subject: [PATCH 3/3] add rank dependency

---
 onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc b/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc
index f55cf012f8c2f..e7f902cc08b40 100644
--- a/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc
+++ b/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc
@@ -135,8 +135,8 @@ Status DepthToSpace<is_nhwc>::ComputeInternal(onnxruntime::webgpu::ComputeContex
 
   DepthToSpaceProgram program{perm};
   program
-      .AddInput({input, ProgramTensorMetadataDependency::Type, input_override_shape, 1})
+      .AddInput({input, ProgramTensorMetadataDependency::TypeAndRank, input_override_shape, 1})
       .AddOutput({output, ProgramTensorMetadataDependency::None, output_override_shape, 1})
       .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
-      .CacheHint(absl::StrJoin(input_shape.GetDims(), "-"), blocksize_, is_dcr_ ? "DCR" : "CRD")
+      .CacheHint(absl::StrJoin(input_shape.GetDims(), "-"), blocksize_, is_dcr_ ? "DCR" : "CRD", is_nhwc ? "NHWC" : "NCHW")
       .AddUniformVariable({static_cast<uint32_t>(output_size)});