diff --git a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc
new file mode 100644
index 0000000000000..eb7903e7903b6
--- /dev/null
+++ b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc
@@ -0,0 +1,168 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/webgpu/reduction/reduction_ops.h"
+#include <sstream>
+#include "core/framework/data_transfer_manager.h"
+#include "core/providers/webgpu/data_transfer.h"
+#include "core/providers/webgpu/shader_helper.h"
+#include "core/providers/webgpu/webgpu_supported_types.h"
+
+namespace onnxruntime {
+namespace webgpu {
+
+#define REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceOp, begin, end)              \
+  ONNX_OPERATOR_VERSIONED_KERNEL_EX(                                                   \
+      ReduceOp,                                                                        \
+      kOnnxDomain,                                                                     \
+      begin, end,                                                                      \
+      kWebGpuExecutionProvider,                                                        \
+      (*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedNumberTypes()), \
+      ReduceOp);
+
+#define REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceOp, version)                                                                  \
+  ONNX_OPERATOR_KERNEL_EX(                                                                                                     \
+      ReduceOp,                                                                                                                \
+      kOnnxDomain,                                                                                                             \
+      version,                                                                                                                 \
+      kWebGpuExecutionProvider,                                                                                                \
+      (*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedNumberTypes()).InputMemoryType(OrtMemTypeCPUInput, 1), \
+      ReduceOp);
+
+REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 1, 10);
+REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 11, 12);
+REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 13, 17);
+REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMean, 18);
+
+Status ReduceKernelProgram::GenerateShaderCode(ShaderHelper& shader) const {
+  const auto& input = shader.AddInput("input", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
+  const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
+  bool reduce_on_all_axes = no_op_with_empty_axes_ == false && axes_.empty();
+  std::string loop_header = code_[0];
+  std::string loop_body = "let current_element: input_value_t = " + input.GetByIndices("input_indices") + ";\n" + code_[1];
+  std::string loop_footer = code_[2];
+  const auto input_rank = input.Rank();
+  for (int i = 0, l = 0; i < input_rank; ++i) {
+    if (reduce_on_all_axes || std::find(axes_.begin(), axes_.end(), i) != axes_.end()) {
+      // This axis is reduced: nest the current loop body inside a WGSL loop over this dimension.
+      if (keepdims_) {
+        l++;
+      }
+      std::stringstream ss;
+      std::string index = "i" + std::to_string(i);
+      ss << "for (var " << index << " : u32 = 0; " << index << " < " << input.IndicesGet("uniforms.input_shape", i) << "; " << index << "++) {\n";
+      ss << input.IndicesSet("input_indices", i, index) << ";\n";
+      ss << loop_body << "\n";
+      ss << "}\n";
+      loop_body = ss.str();
+    } else {
+      // This axis is kept: its input index is taken directly from the corresponding output index.
+      std::stringstream ss;
+      ss << loop_header << "\n";
+      std::string index = "i" + std::to_string(i);
+      ss << "let " << index << " = " << output.IndicesGet("output_indices", l) << ";\n";
+      ss << input.IndicesSet("input_indices", i, index) << ";\n";
+      loop_header = ss.str();
+      l++;
+    }
+  }
+  std::stringstream input_indices_init_value;
+  for (int i = 0; i < input_rank - 1; ++i) {
+    input_indices_init_value << "0, ";
+  }
+  input_indices_init_value << "0";
+  shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size")
+                            << "let output_indices: output_indices_t = " << output.OffsetToIndices("global_idx") << ";\n"
+                            << "var input_indices: input_indices_t = input_indices_t(" << input_indices_init_value.str() << ");\n"
+                            << loop_header << loop_body << loop_footer;
+  shader.MainFunctionBody() << output.SetByOffset("global_idx", "output_value");
+  return Status::OK();
+}
+
+template <bool allow_multi_axes>
+Status ReduceKernel<allow_multi_axes>::ComputeInternal(ComputeContext& context) const {
+  const auto* input_tensor = context.Input<Tensor>(0);
+  InlinedVector<uint32_t> input_axes;
+  auto rank = input_tensor->Shape().NumDimensions();
+  auto transform_axis = [rank](int64_t axis) {
+    if (axis < 0) {
+      axis += rank;
+    }
+    if (axis < 0 || static_cast<size_t>(axis) >= rank) {
+      ORT_THROW("Axes values must be in the range [-rank, rank-1]. Got: ", axis);
+    }
+    return static_cast<uint32_t>(axis);
+  };
+  // Check if the axes input is provided and copy the axes values to input_axes.
+  if (context.InputCount() > 1) {
+    ORT_ENFORCE(axes_.empty(), "Axes attribute may not be specified when axes input is also provided.");
+    const Tensor* axes_tensor = context.Input<Tensor>(1);
+    auto size = static_cast<size_t>(axes_tensor->Shape()[0]);
+    const auto* data = axes_tensor->Data<int64_t>();
+    input_axes.reserve(size);
+    std::transform(data, data + size, std::back_inserter(input_axes), transform_axis);
+  } else {
+    input_axes.reserve(axes_.size());
+    std::transform(axes_.begin(), axes_.end(), std::back_inserter(input_axes), transform_axis);
+  }
+  if (input_axes.empty()) {
+    if (noop_with_empty_axes_ || rank == 0) {
+      // If axes is empty and noop_with_empty_axes_ is true, this is a no-op according to the spec.
+      // If the input tensor is a scalar, return the input tensor as is.
+      // This is not correct for ReduceLogSum and ReduceSumSquare.
+      // TODO: handle these cases separately.
+      auto output = context.Output(0, input_tensor->Shape());
+      if (output->DataRaw() != input_tensor->DataRaw()) {
+        ORT_RETURN_IF_ERROR(Info().GetDataTransferManager().CopyTensor(*input_tensor, *output));
+      }
+      return Status::OK();
+    } else {
+      // If axes is empty and noop_with_empty_axes_ is false, reduce over all axes.
+      input_axes.resize(rank);
+      std::iota(input_axes.begin(), input_axes.end(), 0);
+    }
+  }
+  const auto code = GetOpSpecificCode(input_tensor, input_axes.size());
+  // Compute the output shape.
+  std::vector<int64_t> output_shape;
+  for (size_t i = 0; i < input_tensor->Shape().NumDimensions(); ++i) {
+    if (std::find(input_axes.begin(), input_axes.end(), i) != input_axes.end()) {
+      if (keepdims_) {
+        output_shape.push_back(1);
+      }
+    } else {
+      output_shape.push_back(input_tensor->Shape()[i]);
+    }
+  }
+  TensorShape output_tensor_shape(output_shape);
+  int64_t output_size = output_tensor_shape.Size();
+  ReduceKernelProgram program(name_, keepdims_, noop_with_empty_axes_, input_axes, code);
+  program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank})
+      .AddOutput({context.Output(0, output_shape), ProgramTensorMetadataDependency::TypeAndRank})
+      .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
+      .AddUniformVariables({{static_cast<uint32_t>(output_size)},
+                            {static_cast<uint32_t>(noop_with_empty_axes_ ? 1 : 0)},
+                            {input_axes},
+                            {static_cast<uint32_t>(input_axes.size())}});
+
+  return context.RunProgram(program);
+}
+
+ReduceOpSpecificCode ReduceMean::GetOpSpecificCode(const Tensor* input_tensor, size_t axes_size) const {
+  const TensorShape& input_shape = input_tensor->Shape();
+  size_t input_rank = input_shape.NumDimensions();
+  std::stringstream ss;
+  ss << "var size: u32 = 1;\n"
+     << "for (var i: u32 = 0; i < uniforms.axes_size; i += 1) { \n"
+     << "  let index = " << GetElementAt("uniforms.axes", "i", axes_size) << ";\n"
+     << "  size = size * " << GetElementAt("uniforms.input_shape", "index", input_rank) << ";\n"
+     << "}\n"
+     << "let output_value = output_value_t(sum / f32(size));";
+  ReduceOpSpecificCode code({"var sum = f32(0);", "sum += f32(current_element);", ss.str()});
+  return code;
+}
+
+Status ReduceMean::ComputeInternal(ComputeContext& ctx) const {
+  return ReduceKernel<true>::ComputeInternal(ctx);
+}
+
+}  // namespace webgpu
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h
new file mode 100644
index 0000000000000..e93eb06f20886
--- /dev/null
+++ b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h
@@ -0,0 +1,62 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include "core/common/optional.h"
+#include "core/providers/webgpu/webgpu_supported_types.h"
+#include "core/providers/webgpu/webgpu_kernel.h"
+#include "core/providers/cpu/reduction/reduction_kernel_base.h"
+#include "core/providers/webgpu/program.h"
+#include "core/providers/webgpu/shader_helper.h"
+namespace onnxruntime {
+namespace webgpu {
+// ReduceOpSpecificCode is a 3-element array of strings that holds the op-specific code for a reduce operation.
+// The first element is the loop header, the second element is the loop body, and the third element is the loop footer.
+// The loop header is the code that is executed before the loop starts, the loop body is the code that is executed
+// for each reduced element, and the loop footer is the code that is executed after the loop ends.
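+//
+// For example, ReduceMean::GetOpSpecificCode (in reduction_ops.cc) supplies:
+//   loop header: "var sum = f32(0);"
+//   loop body:   "sum += f32(current_element);"
+//   loop footer: code that divides the accumulated sum by the number of reduced elements
+//                and assigns the result to "output_value".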
+typedef std::array<std::string, 3> ReduceOpSpecificCode;
+class ReduceKernelProgram final : public Program<ReduceKernelProgram> {
+ public:
+  ReduceKernelProgram(std::string name, bool keepdims, bool no_op_with_empty_axes, const InlinedVector<uint32_t>& axes, ReduceOpSpecificCode code) : Program{name}, keepdims_(keepdims), no_op_with_empty_axes_(no_op_with_empty_axes), axes_(axes.begin(), axes.end()), code_(code) {}
+  Status GenerateShaderCode(ShaderHelper& shader) const override;
+  WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32},
+                                          {"no_op_with_empty_axes", ProgramUniformVariableDataType::Uint32},
+                                          {"axes", ProgramUniformVariableDataType::Uint32},
+                                          {"axes_size", ProgramUniformVariableDataType::Uint32});
+
+ private:
+  const bool keepdims_;
+  const bool no_op_with_empty_axes_;
+  InlinedVector<uint32_t> axes_;
+  ReduceOpSpecificCode code_;
+};
+
+template <bool allow_multi_axes = true>
+class ReduceKernel : public WebGpuKernel, public ReduceKernelBase<allow_multi_axes> {
+ protected:
+  using ReduceKernelBase<allow_multi_axes>::axes_;
+  using ReduceKernelBase<allow_multi_axes>::noop_with_empty_axes_;
+  using ReduceKernelBase<allow_multi_axes>::keepdims_;
+  using ReduceKernelBase<allow_multi_axes>::select_last_index_;
+
+  ReduceKernel(const OpKernelInfo& info, std::string name, optional<int64_t> keepdims_override = {})
+      : WebGpuKernel(info),
+        ReduceKernelBase<allow_multi_axes>(info, keepdims_override),
+        name_(name) {
+  }
+  Status ComputeInternal(ComputeContext& ctx) const;
+  virtual ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor, size_t axes_size) const = 0;
+
+ private:
+  std::string name_;
+};
+
+class ReduceMean final : public ReduceKernel<true> {
+ public:
+  ReduceMean(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceMean") {}
+  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor, size_t axes_size) const override;
+  Status ComputeInternal(ComputeContext& ctx) const override;
+};
+
+}  // namespace webgpu
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
index d44cf4674d8a3..4950d94dea4c4 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
@@ -516,10 +516,10 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
   // BuildKernelCreateInfo,
   // BuildKernelCreateInfo,
-  // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceMean)>,
-  // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ReduceMean)>,
-  // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 17, ReduceMean)>,
-  // BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ReduceMean)>,
+  BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceMean)>,
+  BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ReduceMean)>,
+  BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 17, ReduceMean)>,
+  BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ReduceMean)>,
   BuildKernelCreateInfo,
   BuildKernelCreateInfo,
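
Note (not part of the diff): a minimal sketch of how a follow-up reduction could reuse this framework. ReduceSum below is hypothetical and not added by this change; it assumes the accumulator naming convention (sum, current_element, output_value) that ReduceMean's GetOpSpecificCode establishes, and it would live alongside ReduceMean in reduction_ops.h/.cc plus matching kernel registrations.

// Hypothetical sketch only: a sum reduction that supplies just the three WGSL fragments.
class ReduceSum final : public ReduceKernel<true> {
 public:
  ReduceSum(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceSum") {}
  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* /*input_tensor*/, size_t /*axes_size*/) const override {
    // Header initializes the accumulator, body accumulates each reduced element,
    // footer writes the final value; a plain sum needs no post-processing.
    return ReduceOpSpecificCode({"var sum = f32(0);",
                                 "sum += f32(current_element);",
                                 "let output_value = output_value_t(sum);"});
  }
  Status ComputeInternal(ComputeContext& ctx) const override {
    return ReduceKernel<true>::ComputeInternal(ctx);
  }
};

The per-op work is limited to the three WGSL fragments; axes normalization, index arithmetic, and dispatch sizing stay in the shared ReduceKernel::ComputeInternal.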