[WebGPU EP] Implements Depth-To-Space Operator #24489

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged

prathikr merged 3 commits into main from prathikrao/depth-to-space-webgpu-ep

Apr 22, 2025

onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc

-Original file line number
+Diff line change
@@ -0,0 +1,147 @@
+    // Copyright (c) Microsoft Corporation. All rights reserved.
+    // Licensed under the MIT License.
+    #include "core/providers/webgpu/shader_helper.h"
+    #include "core/providers/webgpu/webgpu_supported_types.h"
+    #include "core/providers/webgpu/tensor/depth_to_space.h"
+    #include "core/providers/webgpu/webgpu_utils.h"
+    namespace onnxruntime {
+    namespace webgpu {
+    // Registers DepthToSpace<is_nhwc> with the WebGPU execution provider for the opset range
+    // [start, end] in the given operator domain. The type constraint "T" is set to
+    // WebGpuSupportedFloatTypes().
+    #define WEBGPU_DEPTH_TO_SPACE_VERSIONED_KERNEL(start, end, domain, is_nhwc) \
+      ONNX_OPERATOR_VERSIONED_KERNEL_EX(                                        \
+          DepthToSpace,                                                         \
+          domain,                                                               \
+          start,                                                                \
+          end,                                                                  \
+          kWebGpuExecutionProvider,                                             \
+          (*KernelDefBuilder::Create())                                         \
+              .TypeConstraint("T", WebGpuSupportedFloatTypes()),                \
+          DepthToSpace<is_nhwc>);
+    // Registers DepthToSpace<is_nhwc> with the WebGPU execution provider for a single opset
+    // `version` in the given operator domain (non-versioned counterpart of the macro used for
+    // opset ranges). The type constraint "T" is set to WebGpuSupportedFloatTypes().
+    #define WEBGPU_DEPTH_TO_SPACE_KERNEL(version, domain, is_nhwc) \
+      ONNX_OPERATOR_KERNEL_EX(                                     \
+          DepthToSpace,                                            \
+          domain,                                                  \
+          version,                                                 \
+          kWebGpuExecutionProvider,                                \
+          (*KernelDefBuilder::Create())                            \
+              .TypeConstraint("T", WebGpuSupportedFloatTypes()),   \
+          DepthToSpace<is_nhwc>);
+    // Kernel registrations: opsets 11-12 and 13 in the ONNX domain use the is_nhwc=false
+    // instantiation; the same opsets in the internal NHWC domain use is_nhwc=true.
+    WEBGPU_DEPTH_TO_SPACE_VERSIONED_KERNEL(11, 12, kOnnxDomain, false)
+    WEBGPU_DEPTH_TO_SPACE_KERNEL(13, kOnnxDomain, false)
+    WEBGPU_DEPTH_TO_SPACE_VERSIONED_KERNEL(11, 12, kMSInternalNHWCDomain, true)
+    WEBGPU_DEPTH_TO_SPACE_KERNEL(13, kMSInternalNHWCDomain, true)
+    // Writes a WGSL helper function named `perm` to `os`. For every axis `k` below the rank of
+    // `input`, the emitted function copies component `k` of its argument into component
+    // `perm[k]` of the returned indices.
+    //
+    // `perm` must point to at least `input.Rank()` entries; only that many are read.
+    void AppendPermFunction(std::ostream& os, const ShaderVariableHelper& input, const int64_t* perm) {
+      os << "fn perm(i: input_indices_t) -> input_indices_t {\n";
+      os << "  var a: input_indices_t;\n";
+      const auto rank = input.Rank();
+      for (int axis = 0; axis < rank; ++axis) {
+        // One assignment statement per axis: a[perm[axis]] = i[axis].
+        const std::string target_axis = std::to_string(perm[axis]);
+        const std::string source_component = "i[" + std::to_string(axis) + "]";
+        os << "  " << input.IndicesSet("a", target_axis, source_component) << "\n";
+      }
+      os << "  return a;\n";
+      os << "}\n";
+    }
+    // Builds the shader for this program: declares the "input" and "output" variables, emits
+    // the `perm` helper, and writes a main body in which each invocation converts its global
+    // offset to output indices, maps them through `perm`, and stores the input element found
+    // at the mapped indices into the output at that offset.
+    Status DepthToSpaceProgram::GenerateShaderCode(ShaderHelper& shader) const {
+      const ShaderVariableHelper& src = shader.AddInput("input");
+      const ShaderVariableHelper& dst = shader.AddOutput("output");
+      // The helper function goes into the additional-implementation section of the shader.
+      AppendPermFunction(shader.AdditionalImplementation(), src, perm_);
+      auto&& body = shader.MainFunctionBody();
+      body << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size");
+      body << "  let indices = " << dst.OffsetToIndices("global_idx") << ";\n";
+      body << "  let aIndices = perm(indices);\n";
+      body << "  " << dst.SetByOffset("global_idx", src.GetByIndices("aIndices"));
+      return Status::OK();
+    }
+    // Computes DepthToSpace by running DepthToSpaceProgram as a rank-6 permutation.
+    //
+    // The rank-4 input is handed to the program with a rank-6 override shape that splits the
+    // channel dimension into (blocksize, blocksize, C / blocksize^2) for DCR mode or
+    // (C / blocksize^2, blocksize, blocksize) for CRD mode. The output is allocated with its
+    // final rank-4 shape but handed to the program with the permuted rank-6 override shape.
+    // When is_nhwc is true the input is read as [N, H, W, C], otherwise as [N, C, H, W].
+    //
+    // Enforces: input rank is 4, blocksize is positive, and C is a multiple of blocksize^2.
+    // Returns Status::OK() without dispatching when the output is empty; otherwise returns the
+    // status of context.RunProgram().
+    template <bool is_nhwc>
+    Status DepthToSpace<is_nhwc>::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const {
+      const auto* input = context.Input(0);
+      const TensorShape input_shape = input->Shape();
+      int64_t input_rank = input_shape.NumDimensions();
+      ORT_ENFORCE(input_rank == 4, "Input must be rank 4.");
+      // blocksize_ is used as a divisor below, so reject non-positive values up front.
+      ORT_ENFORCE(blocksize_ > 0, "blocksize must be positive.");
+      const int64_t block_area = blocksize_ * blocksize_;
+      const int64_t n = input_shape[0];
+      const int64_t c = is_nhwc ? input_shape[3] : input_shape[1];
+      const int64_t h = is_nhwc ? input_shape[1] : input_shape[2];
+      const int64_t w = is_nhwc ? input_shape[2] : input_shape[3];
+      // Without this check the integer division would silently truncate and the rank-6 view
+      // would no longer cover the whole input.
+      ORT_ENFORCE(c % block_area == 0, "Input channel count must be a multiple of blocksize * blocksize.");
+      const int64_t depth = c / block_area;
+      // Rank-6 view of the input and the axis permutation applied to it.
+      int64_t shape[6];
+      int64_t perm[6];
+      if (is_nhwc) {
+        if (is_dcr_) {
+          const int64_t shape_values[] = {n, h, w, blocksize_, blocksize_, depth};
+          const int64_t perm_values[] = {0, 1, 3, 2, 4, 5};
+          std::copy(shape_values, shape_values + 6, shape);
+          std::copy(perm_values, perm_values + 6, perm);
+        } else {
+          const int64_t shape_values[] = {n, h, w, depth, blocksize_, blocksize_};
+          const int64_t perm_values[] = {0, 1, 4, 2, 5, 3};
+          std::copy(shape_values, shape_values + 6, shape);
+          std::copy(perm_values, perm_values + 6, perm);
+        }
+      } else {
+        if (is_dcr_) {
+          const int64_t shape_values[] = {n, blocksize_, blocksize_, depth, h, w};
+          const int64_t perm_values[] = {0, 3, 4, 1, 5, 2};
+          std::copy(shape_values, shape_values + 6, shape);
+          std::copy(perm_values, perm_values + 6, perm);
+        } else {
+          const int64_t shape_values[] = {n, depth, blocksize_, blocksize_, h, w};
+          const int64_t perm_values[] = {0, 1, 4, 2, 5, 3};
+          std::copy(shape_values, shape_values + 6, shape);
+          std::copy(perm_values, perm_values + 6, perm);
+        }
+      }
+      std::vector<int64_t> shape_vec(shape, shape + 6);
+      TensorShape input_override_shape(shape_vec);
+      // Calculate the final 4D output shape
+      int64_t output_shape[4];
+      if (is_nhwc) {
+        const int64_t output_shape_values[] = {n, h * blocksize_, w * blocksize_, depth};
+        std::copy(output_shape_values, output_shape_values + 4, output_shape);
+      } else {
+        const int64_t output_shape_values[] = {n, depth, h * blocksize_, w * blocksize_};
+        std::copy(output_shape_values, output_shape_values + 4, output_shape);
+      }
+      TensorShape final_output_shape(gsl::make_span(output_shape));
+      auto* output = context.Output(0, final_output_shape);
+      int64_t output_size = output->Shape().Size();
+      if (output_size == 0) {
+        return Status::OK();
+      }
+      // Rank-6 shape of the output as seen by the shader: the input view with axes permuted.
+      std::vector<int64_t> shape_after_permutation_vec(6);
+      for (int i = 0; i < 6; i++) {
+        shape_after_permutation_vec[i] = shape[perm[i]];
+      }
+      TensorShape output_override_shape(shape_after_permutation_vec);
+      // `perm` is a local array; the program only borrows it, so it must stay alive until
+      // RunProgram() below has returned.
+      DepthToSpaceProgram program{perm};
+      // The generated shader text depends on `perm`, which is selected by both the mode and
+      // the layout. Both template instantiations share the program name, so the layout has to
+      // be part of the cache hint as well; otherwise an NHWC and an NCHW node with the same
+      // dims/blocksize/mode could be handed each other's cached shader.
+      program
+          .AddInput({input, ProgramTensorMetadataDependency::TypeAndRank, input_override_shape, 1})
+          .AddOutput({output, ProgramTensorMetadataDependency::None, output_override_shape, 1})
+          .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
+          .CacheHint(absl::StrJoin(input_shape.GetDims(), "-"), blocksize_, is_dcr_ ? "DCR" : "CRD", is_nhwc ? "NHWC" : "NCHW")
+          .AddUniformVariable({static_cast<uint32_t>(output_size)});
+      return context.RunProgram(program);
+    }
+    }  // namespace webgpu
+    }  // namespace onnxruntime

onnxruntime/core/providers/webgpu/tensor/depth_to_space.h

-Original file line number
+Diff line change
@@ -0,0 +1,39 @@
+    // Copyright (c) Microsoft Corporation. All rights reserved.
+    // Licensed under the MIT License.
+    #pragma once
+    #include "core/providers/webgpu/program.h"
+    #include "core/providers/webgpu/webgpu_kernel.h"
+    namespace onnxruntime {
+    namespace webgpu {
+    // WebGPU program that generates the DepthToSpace shader: a permutation of tensor axes
+    // described by `perm`.
+    class DepthToSpaceProgram final : public Program<DepthToSpaceProgram> {
+     public:
+      // `perm` is borrowed, not copied: the caller must keep the pointed-to array alive (and
+      // holding one entry per axis of the shader input) until shader generation has finished.
+      // The constructor is explicit so a bare pointer never converts to a program implicitly.
+      explicit DepthToSpaceProgram(const int64_t* perm) : Program{"DepthToSpace"}, perm_{perm} {}
+      // Emits the WGSL code for this program into `sh`.
+      Status GenerateShaderCode(ShaderHelper& sh) const override;
+      // Single uniform: the total number of output elements, as a 32-bit unsigned integer.
+      WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32});
+     private:
+      // Non-owning pointer to the axis permutation; read-only from the program's side.
+      const int64_t* perm_;
+    };
+    // WebGPU kernel for the DepthToSpace operator. The is_nhwc template flag selects whether
+    // the input is interpreted as [N, H, W, C] (true) or [N, C, H, W] (false).
+    template <bool is_nhwc>
+    class DepthToSpace final : public WebGpuKernel {
+     public:
+      // Reads the "blocksize" attribute and the optional "mode" attribute (default "DCR").
+      // Fails if blocksize is not positive or if mode is neither "DCR" nor "CRD"; previously
+      // any unrecognized mode was silently treated as CRD.
+      explicit DepthToSpace(const OpKernelInfo& info) : WebGpuKernel(info) {
+        blocksize_ = info.GetAttr<int64_t>("blocksize");
+        ORT_ENFORCE(blocksize_ > 0, "DepthToSpace: blocksize must be positive, got ", blocksize_);
+        const std::string mode = info.GetAttrOrDefault<std::string>("mode", "DCR");
+        ORT_ENFORCE(mode == "DCR" || mode == "CRD",
+                    "DepthToSpace: unsupported mode '", mode, "', expected 'DCR' or 'CRD'.");
+        is_dcr_ = (mode == "DCR");
+      }
+      // Runs the operator for one invocation; defined in the corresponding .cc file.
+      Status ComputeInternal(ComputeContext& context) const override;
+     private:
+      // Edge length of the square block moved from the channel axis into H and W.
+      int64_t blocksize_{0};
+      // True for mode "DCR", false for mode "CRD".
+      bool is_dcr_{true};
+    };
+    }  // namespace webgpu
+    }  // namespace onnxruntime

onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -581,10 +581,10 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
  
          BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 21, 22, Transpose)>,

          BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 23, Transpose)>,

          // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, DepthToSpace)>,

          // BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, DepthToSpace)>,

          // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSInternalNHWCDomain, 11, 12, DepthToSpace)>,

          // BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSInternalNHWCDomain, 13, DepthToSpace)>,

          BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, DepthToSpace)>,

          BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, DepthToSpace)>,

          BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSInternalNHWCDomain, 11, 12, DepthToSpace)>,

          BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSInternalNHWCDomain, 13, DepthToSpace)>,

          BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, Conv)>,

          BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 21, Conv)>,

onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc

-Original file line number
+Diff line change
@@ Expand Up / @@ -468,5 +468,111 @@ TEST(TensorOpTest, DepthToSpaceTest_CRD_Batched) { @@
       test.Run();
     }
+    // DepthToSpace (opset 11) without a "mode" attribute: a single pixel with 8 channels is
+    // expanded with blocksize 2 into a {1, 2, 2, 2} output.
+    TEST(TensorOpTest, DepthToSpaceTest_WebGPU_DefaultMode1) {
+      constexpr int64_t blocksize = 2;
+      constexpr int64_t N = 1, C = 8, H = 1, W = 1;
+      const std::vector<float> input_values = {0, 9, 18, 27, 36, 45, 54, 63};
+      const std::vector<float> expected_values = {0, 18, 36, 54, 9, 27, 45, 63};
+      OpTester test("DepthToSpace", 11);
+      test.AddAttribute("blocksize", blocksize);
+      test.AddInput<float>("input", {N, C, H, W}, input_values);
+      test.AddOutput<float>("output", {1, 2, 2, 2}, expected_values);
+      test.Run();
+    }
+    // DepthToSpace (opset 11) without a "mode" attribute on a batched {2, 8, 1, 2} input with
+    // blocksize 2; the input holds the values 0..31 in order and the output is {2, 2, 2, 4}.
+    TEST(TensorOpTest, DepthToSpaceTest_WebGPU_DefaultMode2) {
+      OpTester test("DepthToSpace", 11);
+      constexpr int64_t blocksize = 2;
+      test.AddAttribute("blocksize", blocksize);
+      constexpr int64_t N = 2, C = 8, H = 1, W = 2;
+      // Restored the element (29) that was missing from the wrapped initializer list.
+      std::vector<float> X = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                              16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+      test.AddInput<float>("input", {N, C, H, W}, X);
+      // Restored the element (30) that was missing from the wrapped initializer list.
+      std::vector<float> result = {0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15,
+                                   16, 20, 17, 21, 24, 28, 25, 29, 18, 22, 19, 23, 26, 30, 27, 31};
+      test.AddOutput<float>("output", {2, 2, 2, 4}, result);
+      test.Run();
+    }
+    // DepthToSpace (opset 11) with mode "DCR" set explicitly: a single pixel with 8 channels
+    // is expanded with blocksize 2 into a {1, 2, 2, 2} output.
+    TEST(TensorOpTest, DepthToSpaceTest_WebGPU_DCR1) {
+      constexpr int64_t blocksize = 2;
+      constexpr int64_t N = 1, C = 8, H = 1, W = 1;
+      const std::vector<float> input_values = {0, 9, 18, 27, 36, 45, 54, 63};
+      const std::vector<float> expected_values = {0, 18, 36, 54, 9, 27, 45, 63};
+      OpTester test("DepthToSpace", 11);
+      test.AddAttribute("blocksize", blocksize);
+      test.AddAttribute("mode", "DCR");
+      test.AddInput<float>("input", {N, C, H, W}, input_values);
+      test.AddOutput<float>("output", {1, 2, 2, 2}, expected_values);
+      test.Run();
+    }
+    // DepthToSpace (opset 11) with mode "DCR" on a batched {2, 8, 1, 2} input with blocksize 2;
+    // the input holds the values 0..31 in order and the output is {2, 2, 2, 4}.
+    TEST(TensorOpTest, DepthToSpaceTest_WebGPU_DCR2) {
+      OpTester test("DepthToSpace", 11);
+      constexpr int64_t blocksize = 2;
+      test.AddAttribute("blocksize", blocksize);
+      test.AddAttribute("mode", "DCR");
+      constexpr int64_t N = 2, C = 8, H = 1, W = 2;
+      // Restored the element (29) that was missing from the wrapped initializer list.
+      std::vector<float> X = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                              16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+      test.AddInput<float>("input", {N, C, H, W}, X);
+      // Restored the element (30) that was missing from the wrapped initializer list.
+      std::vector<float> result = {0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15,
+                                   16, 20, 17, 21, 24, 28, 25, 29, 18, 22, 19, 23, 26, 30, 27, 31};
+      test.AddOutput<float>("output", {2, 2, 2, 4}, result);
+      test.Run();
+    }
+    // DepthToSpace (opset 11) with mode "CRD": a single pixel with 8 channels is expanded
+    // with blocksize 2 into a {1, 2, 2, 2} output whose element order equals the input order.
+    TEST(TensorOpTest, DepthToSpaceTest_WebGPU_CRD1) {
+      constexpr int64_t blocksize = 2;
+      constexpr int64_t N = 1, C = 8, H = 1, W = 1;
+      const std::vector<float> input_values = {0, 9, 18, 27, 36, 45, 54, 63};
+      const std::vector<float> expected_values = {0, 9, 18, 27, 36, 45, 54, 63};
+      OpTester test("DepthToSpace", 11);
+      test.AddAttribute("blocksize", blocksize);
+      test.AddAttribute("mode", "CRD");
+      test.AddInput<float>("input", {N, C, H, W}, input_values);
+      test.AddOutput<float>("output", {1, 2, 2, 2}, expected_values);
+      test.Run();
+    }
+    // DepthToSpace (opset 11) with mode "CRD" on a batched {2, 8, 1, 2} input with blocksize 2;
+    // the input holds the values 0..31 in order and the output is {2, 2, 2, 4}.
+    TEST(TensorOpTest, DepthToSpaceTest_WebGPU_CRD2) {
+      OpTester test("DepthToSpace", 11);
+      constexpr int64_t blocksize = 2;
+      test.AddAttribute("blocksize", blocksize);
+      test.AddAttribute("mode", "CRD");
+      constexpr int64_t N = 2, C = 8, H = 1, W = 2;
+      // Restored the element (29) that was missing from the wrapped initializer list.
+      std::vector<float> X = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                              16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+      test.AddInput<float>("input", {N, C, H, W}, X);
+      // Restored the element (30) that was missing from the wrapped initializer list.
+      std::vector<float> result = {0, 2, 1, 3, 4, 6, 5, 7, 8, 10, 9, 11, 12, 14, 13, 15,
+                                   16, 18, 17, 19, 20, 22, 21, 23, 24, 26, 25, 27, 28, 30, 29, 31};
+      test.AddOutput<float>("output", {2, 2, 2, 4}, result);
+      test.Run();
+    }
     }  // namespace test
     }  // namespace onnxruntime

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[WebGPU EP] Implements Depth-To-Space Operator #24489

Uh oh!

Diff view

Diff view

There are no files selected for viewing

GitHub Actions / Optional Lint C++

GitHub Actions / Optional Lint C++

GitHub Actions / Optional Lint C++

Uh oh!