From d77d5c1d008f785b0801ab9c565f68e5851ea993 Mon Sep 17 00:00:00 2001 From: Prathik Rao Date: Mon, 21 Apr 2025 15:11:25 -0700 Subject: [PATCH 1/3] depth to space impl --- .../providers/webgpu/tensor/depth_to_space.cc | 145 ++++++++++++++++++ .../providers/webgpu/tensor/depth_to_space.h | 39 +++++ .../webgpu/webgpu_execution_provider.cc | 8 +- .../cpu/tensor/space_depth_ops_test.cc | 106 +++++++++++++ 4 files changed, 294 insertions(+), 4 deletions(-) create mode 100644 onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc create mode 100644 onnxruntime/core/providers/webgpu/tensor/depth_to_space.h diff --git a/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc b/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc new file mode 100644 index 0000000000000..fc981b7cc6feb --- /dev/null +++ b/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc @@ -0,0 +1,145 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/webgpu_supported_types.h" +#include "core/providers/webgpu/tensor/depth_to_space.h" +#include "core/providers/webgpu/webgpu_utils.h" + +namespace onnxruntime { +namespace webgpu { + +#define WEBGPU_DEPTH_TO_SPACE_VERSIONED_KERNEL(start, end, domain, is_nhwc) \ + ONNX_OPERATOR_VERSIONED_KERNEL_EX( \ + DepthToSpace, \ + domain, \ + start, \ + end, \ + kWebGpuExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", WebGpuSupportedFloatTypes()), \ + DepthToSpace); + +#define WEBGPU_DEPTH_TO_SPACE_KERNEL(version, domain, is_nhwc) \ + ONNX_OPERATOR_KERNEL_EX( \ + DepthToSpace, \ + domain, \ + version, \ + kWebGpuExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", WebGpuSupportedFloatTypes()), \ + DepthToSpace); + +WEBGPU_DEPTH_TO_SPACE_VERSIONED_KERNEL(11, 12, kOnnxDomain, false) +WEBGPU_DEPTH_TO_SPACE_KERNEL(13, kOnnxDomain, false) + +WEBGPU_DEPTH_TO_SPACE_VERSIONED_KERNEL(11, 12, kMSInternalNHWCDomain, true) +WEBGPU_DEPTH_TO_SPACE_KERNEL(13, kMSInternalNHWCDomain, true) + +void AppendPermFunction(std::ostream& os, const ShaderVariableHelper& input, const int64_t* perm) { + os << "fn perm(i: input_indices_t) -> input_indices_t {\n" + << " var a: input_indices_t;\n"; + for (int idx = 0; idx < input.Rank(); ++idx) { + os << " " << input.IndicesSet("a", std::to_string(perm[idx]), "i[" + std::to_string(idx) + "]") << "\n"; + } + os << " return a;\n" + << "}\n"; +} + +Status DepthToSpaceProgram::GenerateShaderCode(ShaderHelper& shader) const { + const ShaderVariableHelper& input = shader.AddInput("input"); + const ShaderVariableHelper& output = shader.AddOutput("output"); + + AppendPermFunction(shader.AdditionalImplementation(), input, perm_); + + shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size") + << " let indices = " << output.OffsetToIndices("global_idx") << ";\n" + << " let aIndices = perm(indices);\n" + << " " << output.SetByOffset("global_idx", input.GetByIndices("aIndices")); + + return Status::OK(); +} + +template +Status DepthToSpace::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const { + const auto* input = context.Input(0); + const TensorShape input_shape = input->Shape(); + + int64_t n, c, h, w; + int64_t shape[6]; + int64_t perm[6]; + if (is_nhwc) { + n = input_shape[0]; + h = input_shape[1]; + w = input_shape[2]; + c = input_shape[3]; + + if (is_dcr_) { + int64_t shape_values[] = {n, h, w, blocksize_, blocksize_, c / (blocksize_ * blocksize_)}; + int64_t perm_values[] = {0, 1, 3, 2, 4, 5}; + std::copy(shape_values, shape_values + 6, shape); + std::copy(perm_values, perm_values + 6, perm); + } else { + int64_t shape_values[] = {n, h, w, c / (blocksize_ * blocksize_), blocksize_, blocksize_}; + int64_t perm_values[] = {0, 1, 4, 2, 5, 3}; + std::copy(shape_values, shape_values + 6, shape); + std::copy(perm_values, perm_values + 6, perm); + } + } else { + n = input_shape[0]; + h = input_shape[2]; + w = input_shape[3]; + c = input_shape[1]; + + if (is_dcr_) { + int64_t shape_values[] = {n, blocksize_, blocksize_, c / (blocksize_ * blocksize_), h, w}; + int64_t perm_values[] = {0, 3, 4, 1, 5, 2}; + std::copy(shape_values, shape_values + 6, shape); + std::copy(perm_values, perm_values + 6, perm); + } else { + int64_t shape_values[] = {n, c / (blocksize_ * blocksize_), blocksize_, blocksize_, h, w}; + int64_t perm_values[] = {0, 1, 4, 2, 5, 3}; + std::copy(shape_values, shape_values + 6, shape); + std::copy(perm_values, perm_values + 6, perm); + } + } + + std::vector shape_vec(shape, shape + 6); + TensorShape input_override_shape(shape_vec); + + // Calculate the final 4D output shape + int64_t output_shape[4]; + if (is_nhwc) { + int64_t output_shape_values[] = {n, h * blocksize_, w * blocksize_, c / (blocksize_ * blocksize_)}; + std::copy(output_shape_values, output_shape_values + 4, output_shape); + } else { + int64_t output_shape_values[] = {n, c / (blocksize_ * blocksize_), h * blocksize_, w * blocksize_}; + std::copy(output_shape_values, output_shape_values + 4, output_shape); + } + TensorShape final_output_shape(gsl::make_span(output_shape)); + + auto* output = context.Output(0, final_output_shape); + int64_t output_size = output->Shape().Size(); + + if (output_size == 0) { + return Status::OK(); + } + + std::vector shape_after_permutation_vec(6); + for (int i = 0; i < 6; i++) { + shape_after_permutation_vec[i] = shape[perm[i]]; + } + TensorShape output_override_shape(shape_after_permutation_vec); + + DepthToSpaceProgram program{perm}; + program + .AddInput({input, ProgramTensorMetadataDependency::TypeAndRank, input_override_shape, 1}) + .AddOutput({output, ProgramTensorMetadataDependency::TypeAndRank, output_override_shape, 1}) + .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) + .CacheHint(absl::StrJoin(input_shape.GetDims(), "-"), blocksize_, is_dcr_ ? "DCR" : "CRD") + .AddUniformVariable({static_cast(output_size)}); + return context.RunProgram(program); +} + +} // namespace webgpu +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/webgpu/tensor/depth_to_space.h b/onnxruntime/core/providers/webgpu/tensor/depth_to_space.h new file mode 100644 index 0000000000000..153618b5d0237 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/tensor/depth_to_space.h @@ -0,0 +1,39 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/webgpu/program.h" +#include "core/providers/webgpu/webgpu_kernel.h" + +namespace onnxruntime { +namespace webgpu { + +class DepthToSpaceProgram final : public Program { + public: + DepthToSpaceProgram(int64_t* perm) : Program{"DepthToSpace"}, perm_{perm} {} + Status GenerateShaderCode(ShaderHelper& sh) const override; + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32}); + + private: + int64_t* perm_; +}; + +template +class DepthToSpace final : public WebGpuKernel { + public: + DepthToSpace(const OpKernelInfo& info) : WebGpuKernel(info) { + blocksize_ = info.GetAttr("blocksize"); + std::string mode = info.GetAttrOrDefault("mode", "DCR"); + is_dcr_ = (mode == "DCR"); + } + + Status ComputeInternal(ComputeContext& context) const override; + + private: + int64_t blocksize_; + bool is_dcr_; +}; + +} // namespace webgpu +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index b126ca823970a..f5f108121cb8d 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -581,10 +581,10 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc b/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc index a40b85b7754a3..f97de7a54bc99 100644 --- a/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc @@ -468,5 +468,111 @@ TEST(TensorOpTest, DepthToSpaceTest_CRD_Batched) { test.Run(); } +TEST(TensorOpTest, DepthToSpaceTest_WebGPU_DefaultMode1) { + OpTester test("DepthToSpace", 11); + constexpr int64_t blocksize = 2; + test.AddAttribute("blocksize", blocksize); + + constexpr int64_t N = 1, C = 8, H = 1, W = 1; + std::vector X = {0, 9, 18, 27, 36, 45, 54, 63}; + + test.AddInput("input", {N, C, H, W}, X); + + std::vector result = {0, 18, 36, 54, 9, 27, 45, 63}; + + test.AddOutput("output", {1, 2, 2, 2}, result); + test.Run(); +} + +TEST(TensorOpTest, DepthToSpaceTest_WebGPU_DefaultMode2) { + OpTester test("DepthToSpace", 11); + constexpr int64_t blocksize = 2; + test.AddAttribute("blocksize", blocksize); + + constexpr int64_t N = 2, C = 8, H = 1, W = 2; + std::vector X = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31}; + + test.AddInput("input", {N, C, H, W}, X); + + std::vector result = {0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15, 16, 20, 17, 21, 24, 28, 25, 29, 18, 22, 19, 23, 26, + 30, 27, 31}; + + test.AddOutput("output", {2, 2, 2, 4}, result); + test.Run(); +} + +TEST(TensorOpTest, DepthToSpaceTest_WebGPU_DCR1) { + OpTester test("DepthToSpace", 11); + constexpr int64_t blocksize = 2; + test.AddAttribute("blocksize", blocksize); + test.AddAttribute("mode", "DCR"); + + constexpr int64_t N = 1, C = 8, H = 1, W = 1; + std::vector X = {0, 9, 18, 27, 36, 45, 54, 63}; + + test.AddInput("input", {N, C, H, W}, X); + + std::vector result = {0, 18, 36, 54, 9, 27, 45, 63}; + + test.AddOutput("output", {1, 2, 2, 2}, result); + test.Run(); +} + +TEST(TensorOpTest, DepthToSpaceTest_WebGPU_DCR2) { + OpTester test("DepthToSpace", 11); + constexpr int64_t blocksize = 2; + test.AddAttribute("blocksize", blocksize); + test.AddAttribute("mode", "DCR"); + + constexpr int64_t N = 2, C = 8, H = 1, W = 2; + std::vector X = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31}; + + test.AddInput("input", {N, C, H, W}, X); + + std::vector result = {0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15, 16, 20, 17, 21, 24, 28, 25, 29, 18, 22, 19, 23, 26, + 30, 27, 31}; + + test.AddOutput("output", {2, 2, 2, 4}, result); + test.Run(); +} + +TEST(TensorOpTest, DepthToSpaceTest_WebGPU_CRD1) { + OpTester test("DepthToSpace", 11); + constexpr int64_t blocksize = 2; + test.AddAttribute("blocksize", blocksize); + test.AddAttribute("mode", "CRD"); + + constexpr int64_t N = 1, C = 8, H = 1, W = 1; + std::vector X = {0, 9, 18, 27, 36, 45, 54, 63}; + + test.AddInput("input", {N, C, H, W}, X); + + std::vector result = {0, 9, 18, 27, 36, 45, 54, 63}; + + test.AddOutput("output", {1, 2, 2, 2}, result); + test.Run(); +} + +TEST(TensorOpTest, DepthToSpaceTest_WebGPU_CRD2) { + OpTester test("DepthToSpace", 11); + constexpr int64_t blocksize = 2; + test.AddAttribute("blocksize", blocksize); + test.AddAttribute("mode", "CRD"); + + constexpr int64_t N = 2, C = 8, H = 1, W = 2; + std::vector X = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31}; + + test.AddInput("input", {N, C, H, W}, X); + + std::vector result = {0, 2, 1, 3, 4, 6, 5, 7, 8, 10, 9, 11, 12, 14, 13, 15, 16, 18, 17, 19, 20, 22, 21, 23, 24, 26, 25, 27, 28, + 30, 29, 31}; + + test.AddOutput("output", {2, 2, 2, 4}, result); + test.Run(); +} + } // namespace test } // namespace onnxruntime From 31e1779f741ecc3a5c79a37a6b0154752884f522 Mon Sep 17 00:00:00 2001 From: Prathik Rao Date: Mon, 21 Apr 2025 15:50:39 -0700 Subject: [PATCH 2/3] add rank==4 check --- onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc b/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc index fc981b7cc6feb..f55cf012f8c2f 100644 --- a/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc +++ b/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc @@ -64,6 +64,8 @@ template Status DepthToSpace::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const { const auto* input = context.Input(0); const TensorShape input_shape = input->Shape(); + int64_t input_rank = input_shape.NumDimensions(); + ORT_ENFORCE(input_rank == 4, "Input must be rank 4."); int64_t n, c, h, w; int64_t shape[6]; @@ -133,8 +135,8 @@ Status DepthToSpace::ComputeInternal(onnxruntime::webgpu::ComputeContex DepthToSpaceProgram program{perm}; program - .AddInput({input, ProgramTensorMetadataDependency::TypeAndRank, input_override_shape, 1}) - .AddOutput({output, ProgramTensorMetadataDependency::TypeAndRank, output_override_shape, 1}) + .AddInput({input, ProgramTensorMetadataDependency::Type, input_override_shape, 1}) + .AddOutput({output, ProgramTensorMetadataDependency::None, output_override_shape, 1}) .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) .CacheHint(absl::StrJoin(input_shape.GetDims(), "-"), blocksize_, is_dcr_ ? "DCR" : "CRD") .AddUniformVariable({static_cast(output_size)}); From aa74351bb99a3f7cc805acbbd21b609291f6fbed Mon Sep 17 00:00:00 2001 From: Prathik Rao Date: Mon, 21 Apr 2025 16:32:11 -0700 Subject: [PATCH 3/3] add rank dependency --- onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc b/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc index f55cf012f8c2f..e7f902cc08b40 100644 --- a/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc +++ b/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc @@ -135,7 +135,7 @@ Status DepthToSpace::ComputeInternal(onnxruntime::webgpu::ComputeContex DepthToSpaceProgram program{perm}; program - .AddInput({input, ProgramTensorMetadataDependency::Type, input_override_shape, 1}) + .AddInput({input, ProgramTensorMetadataDependency::TypeAndRank, input_override_shape, 1}) .AddOutput({output, ProgramTensorMetadataDependency::None, output_override_shape, 1}) .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) .CacheHint(absl::StrJoin(input_shape.GetDims(), "-"), blocksize_, is_dcr_ ? "DCR" : "CRD")