Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 147 additions & 0 deletions onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include <algorithm>
#include <array>
#include <string>
#include <vector>

#include "core/providers/webgpu/shader_helper.h"
#include "core/providers/webgpu/webgpu_supported_types.h"
#include "core/providers/webgpu/tensor/depth_to_space.h"
#include "core/providers/webgpu/webgpu_utils.h"

namespace onnxruntime {
namespace webgpu {

// Declares a version-ranged WebGPU kernel for DepthToSpace.
// The arguments are forwarded to ONNX_OPERATOR_VERSIONED_KERNEL_EX: the opset range
// (first_version, last_version), the operator domain, and the layout flag used to
// instantiate DepthToSpace<...>. Type "T" is constrained to WebGpuSupportedFloatTypes().
// The expansion already ends with ';', so call sites omit it.
#define WEBGPU_DEPTH_TO_SPACE_VERSIONED_KERNEL(first_version, last_version, op_domain, nhwc_flag) \
  ONNX_OPERATOR_VERSIONED_KERNEL_EX(                                                              \
      DepthToSpace,                                                                               \
      op_domain,                                                                                  \
      first_version,                                                                              \
      last_version,                                                                               \
      kWebGpuExecutionProvider,                                                                   \
      (*KernelDefBuilder::Create())                                                               \
          .TypeConstraint("T", WebGpuSupportedFloatTypes()),                                      \
      DepthToSpace<nhwc_flag>);

// Declares a single-version WebGPU kernel for DepthToSpace.
// The arguments are forwarded to ONNX_OPERATOR_KERNEL_EX: the opset version, the operator
// domain, and the layout flag used to instantiate DepthToSpace<...>. Type "T" is constrained
// to WebGpuSupportedFloatTypes(). The expansion already ends with ';', so call sites omit it.
#define WEBGPU_DEPTH_TO_SPACE_KERNEL(since_version, op_domain, nhwc_flag) \
  ONNX_OPERATOR_KERNEL_EX(                                                \
      DepthToSpace,                                                       \
      op_domain,                                                          \
      since_version,                                                      \
      kWebGpuExecutionProvider,                                           \
      (*KernelDefBuilder::Create())                                       \
          .TypeConstraint("T", WebGpuSupportedFloatTypes()),              \
      DepthToSpace<nhwc_flag>);

// kOnnxDomain kernels (opset 11-12 and opset 13), instantiated as DepthToSpace<false>.
WEBGPU_DEPTH_TO_SPACE_VERSIONED_KERNEL(11, 12, kOnnxDomain, false)
WEBGPU_DEPTH_TO_SPACE_KERNEL(13, kOnnxDomain, false)

// kMSInternalNHWCDomain kernels (opset 11-12 and opset 13), instantiated as DepthToSpace<true>.
WEBGPU_DEPTH_TO_SPACE_VERSIONED_KERNEL(11, 12, kMSInternalNHWCDomain, true)
WEBGPU_DEPTH_TO_SPACE_KERNEL(13, kMSInternalNHWCDomain, true)

// Writes a WGSL helper function `perm(i)` to `os`.
// The emitted function creates an index value `a` and, for every dimension d in
// [0, input.Rank()), stores i[d] into component perm[d] of `a`, then returns `a`.
// `perm` must point to at least input.Rank() entries.
void AppendPermFunction(std::ostream& os, const ShaderVariableHelper& input, const int64_t* perm) {
  os << "fn perm(i: input_indices_t) -> input_indices_t {\n";
  os << " var a: input_indices_t;\n";

  const auto rank = input.Rank();
  for (int dim = 0; dim < rank; ++dim) {
    const std::string target_component = std::to_string(perm[dim]);
    const std::string source_value = "i[" + std::to_string(dim) + "]";
    os << " " << input.IndicesSet("a", target_component, source_value) << "\n";
  }

  os << " return a;\n";
  os << "}\n";
}

// Generates the shader for the permuting copy.
// For each invocation within uniforms.output_size, the main body converts global_idx into
// output indices, maps them through the emitted `perm()` helper, reads the input at the
// mapped indices and stores the value at output offset global_idx.
Status DepthToSpaceProgram::GenerateShaderCode(ShaderHelper& shader) const {
  const ShaderVariableHelper& src = shader.AddInput("input");
  const ShaderVariableHelper& dst = shader.AddOutput("output");

  // Emit the `perm()` helper used by the main body below.
  auto& helper_code = shader.AdditionalImplementation();
  AppendPermFunction(helper_code, src, perm_);

  auto& main_body = shader.MainFunctionBody();
  main_body << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size");
  main_body << " let indices = " << dst.OffsetToIndices("global_idx") << ";\n";
  main_body << " let aIndices = perm(indices);\n";
  main_body << " " << dst.SetByOffset("global_idx", src.GetByIndices("aIndices"));

  return Status::OK();
}

// Runs DepthToSpace on the WebGPU device.
//
// The 4-D input is viewed as a 6-D tensor in which the channel axis is split into
// (blocksize, blocksize, C / blocksize^2) for DCR mode or (C / blocksize^2, blocksize, blocksize)
// for CRD mode. The shader copies elements while permuting those six axes, and the 4-D output
// is viewed as the permuted 6-D shape.
//
// is_nhwc selects which input axes are read as H, W and C (NHWC when true, NCHW when false).
// Fails (via ORT_ENFORCE) if the input is not rank 4, if blocksize is not positive, or if the
// channel count is not a multiple of blocksize * blocksize.
template <bool is_nhwc>
Status DepthToSpace<is_nhwc>::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const {
  const auto* input = context.Input(0);
  const TensorShape& input_shape = input->Shape();
  ORT_ENFORCE(input_shape.NumDimensions() == 4, "Input must be rank 4.");
  // blocksize_ is used as a divisor below; reject non-positive values up front.
  ORT_ENFORCE(blocksize_ > 0, "blocksize must be positive, got: ", blocksize_);

  // Positions of the C, H and W axes for the layout this kernel was instantiated with.
  constexpr size_t kChannelAxis = is_nhwc ? 3 : 1;
  constexpr size_t kHeightAxis = is_nhwc ? 1 : 2;
  constexpr size_t kWidthAxis = is_nhwc ? 2 : 3;

  const int64_t n = input_shape[0];
  const int64_t c = input_shape[kChannelAxis];
  const int64_t h = input_shape[kHeightAxis];
  const int64_t w = input_shape[kWidthAxis];

  const int64_t block_area = blocksize_ * blocksize_;
  // Without this check the integer division below would silently drop channels.
  ORT_ENFORCE(c % block_area == 0,
              "Input depth (", c, ") must be divisible by blocksize * blocksize (", block_area, ").");
  const int64_t out_c = c / block_area;

  // 6-D view of the input and the axis permutation handed to the shader generator.
  std::array<int64_t, 6> shape{};
  std::array<int64_t, 6> perm{};
  if constexpr (is_nhwc) {
    if (is_dcr_) {
      shape = {n, h, w, blocksize_, blocksize_, out_c};
      perm = {0, 1, 3, 2, 4, 5};
    } else {
      shape = {n, h, w, out_c, blocksize_, blocksize_};
      perm = {0, 1, 4, 2, 5, 3};
    }
  } else {
    if (is_dcr_) {
      shape = {n, blocksize_, blocksize_, out_c, h, w};
      perm = {0, 3, 4, 1, 5, 2};
    } else {
      shape = {n, out_c, blocksize_, blocksize_, h, w};
      perm = {0, 1, 4, 2, 5, 3};
    }
  }

  std::vector<int64_t> shape_vec(shape.begin(), shape.end());
  TensorShape input_override_shape(shape_vec);

  // Calculate the final 4D output shape
  int64_t output_shape[4];
  output_shape[0] = n;
  if constexpr (is_nhwc) {
    output_shape[1] = h * blocksize_;
    output_shape[2] = w * blocksize_;
    output_shape[3] = out_c;
  } else {
    output_shape[1] = out_c;
    output_shape[2] = h * blocksize_;
    output_shape[3] = w * blocksize_;
  }
  TensorShape final_output_shape(gsl::make_span(output_shape));

  auto* output = context.Output(0, final_output_shape);
  const int64_t output_size = output->Shape().Size();

  // Nothing to dispatch for an empty output.
  if (output_size == 0) {
    return Status::OK();
  }

  // The output is addressed through the 6-D shape obtained by applying perm to the input view.
  std::vector<int64_t> shape_after_permutation_vec(shape.size());
  for (size_t i = 0; i < perm.size(); ++i) {
    shape_after_permutation_vec[i] = shape[static_cast<size_t>(perm[i])];
  }
  TensorShape output_override_shape(shape_after_permutation_vec);

  // The program keeps only a pointer to perm, so perm must stay alive until RunProgram returns.
  DepthToSpaceProgram program{perm.data()};
  program
      .AddInput({input, ProgramTensorMetadataDependency::TypeAndRank, input_override_shape, 1})
      .AddOutput({output, ProgramTensorMetadataDependency::None, output_override_shape, 1})
      .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
      // The generated shader depends on perm, and the DCR permutation differs between NHWC and
      // NCHW, so the layout must be part of the cache hint alongside the mode.
      .CacheHint(absl::StrJoin(input_shape.GetDims(), "-"), blocksize_, is_dcr_ ? "DCR" : "CRD",
                 is_nhwc ? "NHWC" : "NCHW")
      .AddUniformVariable({static_cast<uint32_t>(output_size)});
  return context.RunProgram(program);
}

} // namespace webgpu
} // namespace onnxruntime
39 changes: 39 additions & 0 deletions onnxruntime/core/providers/webgpu/tensor/depth_to_space.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/webgpu/program.h"
#include "core/providers/webgpu/webgpu_kernel.h"

namespace onnxruntime {
namespace webgpu {

// WebGPU program that copies the input to the output while permuting its axes.
// It declares a single uint32 uniform, "output_size".
class DepthToSpaceProgram final : public Program<DepthToSpaceProgram> {
 public:
  // `perm` is the axis permutation consumed by GenerateShaderCode.
  // Only the pointer is stored (the array is not copied), so the caller must keep the array
  // alive and unchanged for as long as this program may generate shader code.
  explicit DepthToSpaceProgram(const int64_t* perm) : Program{"DepthToSpace"}, perm_{perm} {}

  Status GenerateShaderCode(ShaderHelper& sh) const override;

  WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32});

 private:
  // Borrowed, read-only view of the permutation; not owned by this object.
  const int64_t* perm_;
};

template <bool is_nhwc>
class DepthToSpace final : public WebGpuKernel {
public:
DepthToSpace(const OpKernelInfo& info) : WebGpuKernel(info) {
blocksize_ = info.GetAttr<int64_t>("blocksize");
std::string mode = info.GetAttrOrDefault<std::string>("mode", "DCR");

Check warning on line 27 in onnxruntime/core/providers/webgpu/tensor/depth_to_space.h

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Add #include <string> for string [build/include_what_you_use] [4] Raw Output: onnxruntime/core/providers/webgpu/tensor/depth_to_space.h:27: Add #include <string> for string [build/include_what_you_use] [4]
is_dcr_ = (mode == "DCR");
}

Status ComputeInternal(ComputeContext& context) const override;

private:
int64_t blocksize_;
bool is_dcr_;
};

} // namespace webgpu
} // namespace onnxruntime
Original file line number Diff line number Diff line change
Expand Up @@ -581,10 +581,10 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 21, 22, Transpose)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 23, Transpose)>,

// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, DepthToSpace)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, DepthToSpace)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSInternalNHWCDomain, 11, 12, DepthToSpace)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSInternalNHWCDomain, 13, DepthToSpace)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, DepthToSpace)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, DepthToSpace)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSInternalNHWCDomain, 11, 12, DepthToSpace)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSInternalNHWCDomain, 13, DepthToSpace)>,

BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, Conv)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 21, Conv)>,
Expand Down
106 changes: 106 additions & 0 deletions onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -468,5 +468,111 @@ TEST(TensorOpTest, DepthToSpaceTest_CRD_Batched) {
test.Run();
}

// No "mode" attribute is set: a 1x8x1x1 input with blocksize 2 becomes a 1x2x2x2 output.
TEST(TensorOpTest, DepthToSpaceTest_WebGPU_DefaultMode1) {
  constexpr int64_t blocksize = 2;
  constexpr int64_t N = 1, C = 8, H = 1, W = 1;

  const std::vector<float> input_values = {0, 9, 18, 27, 36, 45, 54, 63};
  const std::vector<float> expected_values = {0, 18, 36, 54, 9, 27, 45, 63};

  OpTester test("DepthToSpace", 11);
  test.AddAttribute("blocksize", blocksize);
  test.AddInput<float>("input", {N, C, H, W}, input_values);
  test.AddOutput<float>("output", {1, 2, 2, 2}, expected_values);
  test.Run();
}

// No "mode" attribute is set: a batched 2x8x1x2 input with blocksize 2 becomes 2x2x2x4.
TEST(TensorOpTest, DepthToSpaceTest_WebGPU_DefaultMode2) {
  constexpr int64_t blocksize = 2;
  constexpr int64_t N = 2, C = 8, H = 1, W = 2;

  const std::vector<float> input_values = {0, 1, 2, 3, 4, 5, 6, 7,
                                           8, 9, 10, 11, 12, 13, 14, 15,
                                           16, 17, 18, 19, 20, 21, 22, 23,
                                           24, 25, 26, 27, 28, 29, 30, 31};
  const std::vector<float> expected_values = {0, 4, 1, 5, 8, 12, 9, 13,
                                              2, 6, 3, 7, 10, 14, 11, 15,
                                              16, 20, 17, 21, 24, 28, 25, 29,
                                              18, 22, 19, 23, 26, 30, 27, 31};

  OpTester test("DepthToSpace", 11);
  test.AddAttribute("blocksize", blocksize);
  test.AddInput<float>("input", {N, C, H, W}, input_values);
  test.AddOutput<float>("output", {2, 2, 2, 4}, expected_values);
  test.Run();
}

// Explicit mode "DCR": a 1x8x1x1 input with blocksize 2 becomes a 1x2x2x2 output.
TEST(TensorOpTest, DepthToSpaceTest_WebGPU_DCR1) {
  constexpr int64_t blocksize = 2;
  constexpr int64_t N = 1, C = 8, H = 1, W = 1;

  const std::vector<float> input_values = {0, 9, 18, 27, 36, 45, 54, 63};
  const std::vector<float> expected_values = {0, 18, 36, 54, 9, 27, 45, 63};

  OpTester test("DepthToSpace", 11);
  test.AddAttribute("blocksize", blocksize);
  test.AddAttribute("mode", "DCR");
  test.AddInput<float>("input", {N, C, H, W}, input_values);
  test.AddOutput<float>("output", {1, 2, 2, 2}, expected_values);
  test.Run();
}

// Explicit mode "DCR": a batched 2x8x1x2 input with blocksize 2 becomes 2x2x2x4.
TEST(TensorOpTest, DepthToSpaceTest_WebGPU_DCR2) {
  constexpr int64_t blocksize = 2;
  constexpr int64_t N = 2, C = 8, H = 1, W = 2;

  const std::vector<float> input_values = {0, 1, 2, 3, 4, 5, 6, 7,
                                           8, 9, 10, 11, 12, 13, 14, 15,
                                           16, 17, 18, 19, 20, 21, 22, 23,
                                           24, 25, 26, 27, 28, 29, 30, 31};
  const std::vector<float> expected_values = {0, 4, 1, 5, 8, 12, 9, 13,
                                              2, 6, 3, 7, 10, 14, 11, 15,
                                              16, 20, 17, 21, 24, 28, 25, 29,
                                              18, 22, 19, 23, 26, 30, 27, 31};

  OpTester test("DepthToSpace", 11);
  test.AddAttribute("blocksize", blocksize);
  test.AddAttribute("mode", "DCR");
  test.AddInput<float>("input", {N, C, H, W}, input_values);
  test.AddOutput<float>("output", {2, 2, 2, 4}, expected_values);
  test.Run();
}

// Explicit mode "CRD": for a 1x8x1x1 input with blocksize 2 the element order is unchanged.
TEST(TensorOpTest, DepthToSpaceTest_WebGPU_CRD1) {
  constexpr int64_t blocksize = 2;
  constexpr int64_t N = 1, C = 8, H = 1, W = 1;

  const std::vector<float> input_values = {0, 9, 18, 27, 36, 45, 54, 63};
  const std::vector<float> expected_values = {0, 9, 18, 27, 36, 45, 54, 63};

  OpTester test("DepthToSpace", 11);
  test.AddAttribute("blocksize", blocksize);
  test.AddAttribute("mode", "CRD");
  test.AddInput<float>("input", {N, C, H, W}, input_values);
  test.AddOutput<float>("output", {1, 2, 2, 2}, expected_values);
  test.Run();
}

// Explicit mode "CRD": a batched 2x8x1x2 input with blocksize 2 becomes 2x2x2x4.
TEST(TensorOpTest, DepthToSpaceTest_WebGPU_CRD2) {
  constexpr int64_t blocksize = 2;
  constexpr int64_t N = 2, C = 8, H = 1, W = 2;

  const std::vector<float> input_values = {0, 1, 2, 3, 4, 5, 6, 7,
                                           8, 9, 10, 11, 12, 13, 14, 15,
                                           16, 17, 18, 19, 20, 21, 22, 23,
                                           24, 25, 26, 27, 28, 29, 30, 31};
  const std::vector<float> expected_values = {0, 2, 1, 3, 4, 6, 5, 7,
                                              8, 10, 9, 11, 12, 14, 13, 15,
                                              16, 18, 17, 19, 20, 22, 21, 23,
                                              24, 26, 25, 27, 28, 30, 29, 31};

  OpTester test("DepthToSpace", 11);
  test.AddAttribute("blocksize", blocksize);
  test.AddAttribute("mode", "CRD");
  test.AddInput<float>("input", {N, C, H, W}, input_values);
  test.AddOutput<float>("output", {2, 2, 2, 4}, expected_values);
  test.Run();
}

} // namespace test
} // namespace onnxruntime
Loading