Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions onnxruntime/core/providers/webgpu/math/cum_sum.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/webgpu/math/cum_sum.h"
#include "core/providers/webgpu/shader_helper.h"
#include "core/providers/webgpu/webgpu_supported_types.h"

namespace onnxruntime {
namespace webgpu {

// Registers the CumSum kernel class with the WebGPU execution provider for the
// ONNX domain, covering opset versions 11 through 13.
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
CumSum,
kOnnxDomain,
11, 13,
kWebGpuExecutionProvider,
(*KernelDefBuilder::Create())
// "T" is limited to whatever WebGpuSupportedFloatTypes() returns.
.TypeConstraint("T", WebGpuSupportedFloatTypes())
// "T2" accepts int32 and int64 tensors.
// NOTE(review): presumably "T2" is the type of the axis input (input 1) - confirm against the op schema.
.TypeConstraint("T2", {DataTypeImpl::GetTensorType<int32_t>(),
DataTypeImpl::GetTensorType<int64_t>()})
// Input 1 is requested in CPU memory; CumSum::ComputeInternal dereferences its data
// pointer directly to obtain the axis.
.InputMemoryType(OrtMemTypeCPU, 1),
CumSum);

// Registers the CumSum kernel class with the WebGPU execution provider for the
// ONNX domain at opset version 14 (non-versioned registration macro, so no end version is given).
ONNX_OPERATOR_KERNEL_EX(
CumSum,
kOnnxDomain,
14,
kWebGpuExecutionProvider,
(*KernelDefBuilder::Create())
// "T" is limited to whatever WebGpuSupportedFloatTypes() returns.
.TypeConstraint("T", WebGpuSupportedFloatTypes())
// "T2" accepts int32 and int64 tensors.
// NOTE(review): presumably "T2" is the type of the axis input (input 1) - confirm against the op schema.
.TypeConstraint("T2", {DataTypeImpl::GetTensorType<int32_t>(),
DataTypeImpl::GetTensorType<int64_t>()})
// Input 1 is requested in CPU memory; CumSum::ComputeInternal dereferences its data
// pointer directly to obtain the axis.
.InputMemoryType(OrtMemTypeCPU, 1),
CumSum);

// Writes the CumSum shader body into `shader`.
//
// The generated code handles one output element per `global_idx`: it converts the
// offset to indices, chooses a half-open range [first, last) along `uniforms.axis`,
// sums the input over that range and stores the result at `global_idx`.
//
// Range selection, driven by the `reverse` and `exclusive` uniforms (k is the
// element's own index along the axis):
//   reverse == 0 : [0, k + 1)                -> [0, k) when exclusive is non-zero
//   reverse == 1 : [k, input_shape[axis])    -> starts at k + 1 when exclusive == 1
//
// Always returns Status::OK().
Status CumSumProgram::GenerateShaderCode(ShaderHelper& shader) const {
  const ShaderVariableHelper& in_var = shader.AddInput("input", ShaderUsage::UseUniform);
  const ShaderVariableHelper& out_var = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias);

  auto& body = shader.MainFunctionBody();

  // Bounds guard, index decomposition and the accumulator.
  body << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size");
  body << "var input_indices = " << in_var.OffsetToIndices("global_idx") << ";\n";
  body << "var sum : output_value_t = 0;\n";

  // Lower bound: stays 0 for a forward scan; for a reverse scan it is the element's
  // own position along the axis, moved one step further when exclusive.
  body << "var first : i32 = 0;\n";
  body << "if (uniforms.reverse == 1) {\n";
  body << " first = i32(" << in_var.IndicesGet("input_indices", "uniforms.axis") << ");\n";
  body << " if (uniforms.exclusive == 1) { first += 1; }\n";
  body << "}\n\n";

  // Upper bound (exclusive): the axis length for a reverse scan; for a forward scan it
  // is the element's own position, extended by one unless the scan is exclusive.
  body << "var last : i32 = 0;\n";
  body << "if (uniforms.reverse == 1) {\n";
  body << " last = i32(" << GetElementAt("uniforms.input_shape", "uniforms.axis", in_var.Rank()) << ");\n";
  body << "} else {\n";
  body << " last = i32(" << in_var.IndicesGet("input_indices", "uniforms.axis") << ");\n";
  body << " if (uniforms.exclusive == 0) { last += 1; }\n";
  body << "}\n\n";

  // Walk the range, overwriting only the axis component of the indices each step.
  body << "for (var i : i32 = first; i < last; i++) {\n";
  body << " " << in_var.IndicesSet("input_indices", "uniforms.axis", "u32(i)") << ";\n";
  body << " sum = sum + " << in_var.GetByIndices("input_indices") << ";\n";
  body << "}\n";

  // Store the accumulated value for this element.
  body << out_var.SetByOffset("global_idx", "sum");

  return Status::OK();
}

// Computes the cumulative sum of input 0 along the axis supplied by input 1.
//
// Input 0 is the data tensor; the output has the same shape. Input 1 holds the axis
// and is read on the host (the kernel registration places it in CPU memory). The
// axis may be negative, in which case it counts from the last dimension.
//
// Fails via ORT_ENFORCE when the axis tensor is empty, is neither int32 nor int64,
// or holds a value outside [-input_rank, input_rank). Returns Status::OK() without
// dispatching anything when the output has no elements; otherwise returns the
// status of RunProgram.
Status CumSum::ComputeInternal(ComputeContext& context) const {
  const auto* input_tensor = context.Input(0);
  const TensorShape& input_shape = input_tensor->Shape();
  int64_t input_rank = input_shape.NumDimensions();

  const auto* axis_tensor = context.Input(1);
  // Element 0 is read below, so an empty axis tensor must be rejected first.
  ORT_ENFORCE(axis_tensor->Shape().Size() > 0, "Axis tensor must contain a value.");

  // The kernel is registered with T2 = {int32, int64}, so the element type has to be
  // checked before the buffer is interpreted; reading an int64 tensor through
  // Data<int>() is a type mismatch.
  int64_t axis = 0;
  if (axis_tensor->IsDataType<int32_t>()) {
    axis = static_cast<int64_t>(axis_tensor->Data<int32_t>()[0]);
  } else {
    ORT_ENFORCE(axis_tensor->IsDataType<int64_t>(), "Axis tensor must be of type int32 or int64.");
    axis = axis_tensor->Data<int64_t>()[0];
  }

  ORT_ENFORCE(-input_rank <= axis && axis < input_rank, "Axes attribute must be within range -input_rank <= axis < input_rank.");
  // Handle negative axis
  if (axis < 0) {
    axis += input_rank;
  }

  auto* output_tensor = context.Output(0, input_shape);
  int64_t output_size = output_tensor->Shape().Size();

  // Nothing to compute for an empty tensor; skip the dispatch entirely.
  if (output_size == 0) {
    return Status::OK();
  }

  CumSumProgram program{};
  program
      .AddInput({input_tensor})
      .AddOutput({output_tensor, ProgramTensorMetadataDependency::TypeAndRank})
      // One invocation per output element, rounded up to whole workgroups.
      .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
      // Order must match the uniform declaration in CumSumProgram:
      // output_size, axis, exclusive, reverse.
      .AddUniformVariables({{static_cast<uint32_t>(output_size)},
                            {static_cast<uint32_t>(axis)},
                            {static_cast<uint32_t>(exclusive_)},
                            {static_cast<uint32_t>(reverse_)}});
  return context.RunProgram(program);
}

} // namespace webgpu
} // namespace onnxruntime
39 changes: 39 additions & 0 deletions onnxruntime/core/providers/webgpu/math/cum_sum.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/webgpu/webgpu_kernel.h"
#include "core/providers/webgpu/program.h"

namespace onnxruntime {
namespace webgpu {

// Program description for the CumSum shader. The shader source itself is produced by
// GenerateShaderCode; this class only names the program and declares its uniforms.
class CumSumProgram final : public Program<CumSumProgram> {
public:
// Constructs the program under the name "CumSum".
CumSumProgram() : Program{"CumSum"} {}

// Emits the shader code for this program through `sh`.
Status GenerateShaderCode(ShaderHelper& sh) const override;

// Uniforms exposed to the shader, all Uint32. CumSum::ComputeInternal supplies the
// values in this same order:
//   output_size - number of output elements
//   axis        - axis to accumulate along (already made non-negative by the caller)
//   exclusive   - the kernel's `exclusive` attribute
//   reverse     - the kernel's `reverse` attribute
WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32},
{"axis", ProgramUniformVariableDataType::Uint32},
{"exclusive", ProgramUniformVariableDataType::Uint32},
{"reverse", ProgramUniformVariableDataType::Uint32});
};

class CumSum final : public WebGpuKernel {
public:
CumSum(const OpKernelInfo& info) : WebGpuKernel(info) {
exclusive_ = info.GetAttrOrDefault<int64_t>("exclusive", 0);
reverse_ = info.GetAttrOrDefault<int64_t>("reverse", 0);
}

Status ComputeInternal(ComputeContext& context) const override;

private:
int64_t exclusive_;
int64_t reverse_;
};

} // namespace webgpu
} // namespace onnxruntime
Original file line number Diff line number Diff line change
Expand Up @@ -713,8 +713,8 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSInternalNHWCDomain, 9, 13, BatchNormalization)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSInternalNHWCDomain, 14, 14, BatchNormalization)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSInternalNHWCDomain, 15, BatchNormalization)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 13, CumSum)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 14, CumSum)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 13, CumSum)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 14, CumSum)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 10, 12, uint8_t, DequantizeLinear)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 10, 12, int8_t, DequantizeLinear)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 10, 12, int32_t, DequantizeLinear)>,
Expand Down
Loading