diff --git a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc
index eb7903e7903b6..255ad9cdf66c6 100644
--- a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc
@@ -34,11 +34,28 @@ REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 11, 12);
 REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 13, 17);
 REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMean, 18);
 
+REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 1, 10);
+REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 11, 11);
+REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 12, 12);
+REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 13, 17);
+REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMax, 18);
+
+REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSum, 1, 10);
+REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSum, 11, 12);
+REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceSum, 13);
+
 Status ReduceKernelProgram::GenerateShaderCode(ShaderHelper& shader) const {
-  const auto& input = shader.AddInput("input", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
   const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
+  if (is_input_empty_) {
+    shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size")
+                              << code_[0]
+                              << code_[2]
+                              << output.SetByOffset("global_idx", "output_value");
+    return Status::OK();
+  }
+  const auto& input = shader.AddInput("input", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
   bool reduce_on_all_axes = no_op_with_empty_axes_ == false && axes_.empty();
-  std::string loop_header = code_[0];
+  std::string loop_header = code_[0].find("first_element") == std::string::npos ? code_[0] : "let first_element = " + input.GetByIndices("input_indices") + ";\n" + code_[0] + "\n";
   std::string loop_body = "let current_element: input_value_t = " + input.GetByIndices("input_indices") + ";\n" + code_[1];
   std::string loop_footer = code_[2];
   const auto input_rank = input.Rank();
@@ -56,10 +73,10 @@ Status ReduceKernelProgram::GenerateShaderCode(ShaderHelper& shader) const {
       loop_body = ss.str();
     } else {
       std::stringstream ss;
-      ss << loop_header << "\n";
       std::string index = "i" + std::to_string(i);
       ss << "let " << index << " = " << output.IndicesGet("output_indices", l) << ";\n";
       ss << input.IndicesSet("input_indices", i, index) << ";\n";
+      ss << loop_header << "\n";
       loop_header = ss.str();
       l++;
     }
@@ -80,6 +97,7 @@ Status ReduceKernelProgram::GenerateShaderCode(ShaderHelper& shader) const {
 template <bool allow_multi_axes>
 Status ReduceKernel<allow_multi_axes>::ComputeInternal(ComputeContext& context) const {
   const auto* input_tensor = context.Input(0);
+  ORT_RETURN_IF_ERROR(CheckInput(input_tensor));
   InlinedVector<uint32_t> input_axes;
   auto rank = input_tensor->Shape().NumDimensions();
   auto transform_axis = [rank](int64_t axis) {
@@ -95,10 +113,12 @@ Status ReduceKernel<allow_multi_axes>::ComputeInternal(ComputeContext& context)
   if (context.InputCount() > 1) {
     ORT_ENFORCE(axes_.empty(), "Axes attribute may not be specified when axes input is also provided.");
     const Tensor* axes_tensor = context.Input(1);
-    auto size = static_cast<size_t>(axes_tensor->Shape()[0]);
-    const auto* data = axes_tensor->Data<int64_t>();
-    input_axes.reserve(size);
-    std::transform(data, data + size, std::back_inserter(input_axes), transform_axis);
+    if (nullptr != axes_tensor) {
+      auto size = static_cast<size_t>(axes_tensor->Shape()[0]);
+      const auto* data = axes_tensor->Data<int64_t>();
+      input_axes.reserve(size);
+      std::transform(data, data + size, std::back_inserter(input_axes), transform_axis);
+    }
   } else {
     input_axes.reserve(axes_.size());
     std::transform(axes_.begin(), axes_.end(), std::back_inserter(input_axes), transform_axis);
@@ -120,10 +140,12 @@ Status ReduceKernel<allow_multi_axes>::ComputeInternal(ComputeContext& context)
       std::iota(input_axes.begin(), input_axes.end(), 0);
     }
   }
-  const auto code = GetOpSpecificCode(input_tensor, input_axes.size());
+  const auto code = GetOpSpecificCode(input_tensor);
   // Compute output shape
   std::vector<int64_t> output_shape;
+  bool is_input_empty = false;
   for (size_t i = 0; i < input_tensor->Shape().NumDimensions(); ++i) {
+    is_input_empty |= input_tensor->Shape()[i] == 0;
     if (std::find(input_axes.begin(), input_axes.end(), i) != input_axes.end()) {
       if (keepdims_) {
         output_shape.push_back(1);
@@ -134,34 +156,68 @@ Status ReduceKernel<allow_multi_axes>::ComputeInternal(ComputeContext& context)
   }
   TensorShape output_tensor_shape(output_shape);
   int64_t output_size = output_tensor_shape.Size();
-  ReduceKernelProgram program("ReduceMean", keepdims_, noop_with_empty_axes_, input_axes, code);
-  program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank})
+  if (output_size == 0) {
+    ORT_IGNORE_RETURN_VALUE(context.Output(0, output_tensor_shape));
+    return Status::OK();
+  }
+
+  auto input_rank = input_tensor->Shape().NumDimensions();
+  // reduce_axes element is either 1 or 0 depending on whether the axis is reduced or not
+  std::vector<uint32_t> reduce_axes;
+  reduce_axes.resize(input_rank, 0);
+  for (auto axis : input_axes) {
+    reduce_axes[axis] = 1;
+  }
+
+  ReduceKernelProgram program(name_, keepdims_, noop_with_empty_axes_, input_axes, code, is_input_empty);
+  if (!is_input_empty) {
+    program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank});
+  }
+
+  program.CacheHint(is_input_empty)
      .AddOutput({context.Output(0, output_shape), ProgramTensorMetadataDependency::TypeAndRank})
      .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
      .AddUniformVariables({{static_cast<uint32_t>(output_size)},
                            {static_cast<uint32_t>(noop_with_empty_axes_ ? 1 : 0)},
-                           {input_axes},
-                           {static_cast<uint32_t>(input_axes.size())}});
+                           {reduce_axes}});
   return context.RunProgram(program);
 }
 
-ReduceOpSpecificCode ReduceMean::GetOpSpecificCode(const Tensor* input_tensor, size_t axes_size) const {
+ReduceOpSpecificCode ReduceMean::GetOpSpecificCode(const Tensor* input_tensor) const {
   const TensorShape& input_shape = input_tensor->Shape();
   size_t input_rank = input_shape.NumDimensions();
+  std::string loop_header = "var sum = f32(0);";
+  std::string loop_body = "sum += f32(current_element);";
   std::stringstream ss;
   ss << "var size: u32 = 1;\n"
-     << "for (var i: u32 = 0; i < uniforms.axes_size; i += 1) { \n"
-     << "  let index = " << GetElementAt("uniforms.axes", "i", axes_size) << ";\n"
-     << "  size = size * " << GetElementAt("uniforms.input_shape", "index", input_rank) << ";\n"
+     << "for (var i: u32 = 0; i < " << input_rank << "; i += 1) { \n"
+     << "  let index_reduced_or_not = " << GetElementAt("uniforms.reduce_axes", "i", input_rank) << ";\n"
+     << "  if (index_reduced_or_not == 1) { \n"
+     << "    size = size * " << GetElementAt("uniforms.input_shape", "i", input_rank) << ";\n"
+     << "  }\n"
     << "}\n"
     << "let output_value = output_value_t(sum / f32(size));";
-  ReduceOpSpecificCode code({"var sum = f32(0);", "sum += f32(current_element);", ss.str()});
+  std::string loop_footer = ss.str();
+  ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
   return code;
 }
 
-Status ReduceMean::ComputeInternal(ComputeContext& ctx) const {
-  return ReduceKernel<true>::ComputeInternal(ctx);
+ReduceOpSpecificCode ReduceMax::GetOpSpecificCode(const Tensor* input_tensor) const {
+  ORT_UNUSED_PARAMETER(input_tensor);
+  std::string loop_header = "var max_element = first_element;";
+  std::string loop_body = "max_element = max(max_element, current_element);";
+  std::string loop_footer = "let output_value = output_value_t(max_element);";
+  ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
+  return code;
+}
+
+ReduceOpSpecificCode ReduceSum::GetOpSpecificCode(const Tensor* input_tensor) const {
+  ORT_UNUSED_PARAMETER(input_tensor);
+  std::string loop_header = "var sum = f32(0);";
+  std::string loop_body = "sum += f32(current_element);";
+  std::string loop_footer = "let output_value = output_value_t(sum);";
+  ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
+  return code;
 }
 
 }  // namespace webgpu
diff --git a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h
index e93eb06f20886..1c7dba89b7144 100644
--- a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h
+++ b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h
@@ -13,22 +13,23 @@ namespace webgpu {
 // reduceOpSpecificCode is a 3-element array of strings that represent the op specific code for the reduce operation.
 // The first element is the loop header, the second element is the loop body, and the third element is the loop footer.
 // The loop header is the code that is executed before the loop starts. The loop body is the code that is executed for each element in the loop.
-// The loop footer is the code that is executed after the loop ends.
+// The loop footer is the code that is executed after the loop ends. The loop body should contain the code that accumulates the result of the reduction, and
+// the loop footer should contain the code that assigns the result of the reduction to output_value.
 typedef std::array<std::string, 3> ReduceOpSpecificCode;
 class ReduceKernelProgram final : public Program<ReduceKernelProgram> {
  public:
-  ReduceKernelProgram(std::string name, bool keepdims, bool no_op_with_empty_axes, const InlinedVector<uint32_t>& axes, ReduceOpSpecificCode code) : Program{name}, keepdims_(keepdims), no_op_with_empty_axes_(no_op_with_empty_axes), axes_(axes.begin(), axes.end()), code_(code) {}
+  ReduceKernelProgram(std::string name, bool keepdims, bool no_op_with_empty_axes, const InlinedVector<uint32_t>& axes, ReduceOpSpecificCode code, bool is_input_empty) : Program{name}, keepdims_(keepdims), no_op_with_empty_axes_(no_op_with_empty_axes), axes_(axes.begin(), axes.end()), code_(code), is_input_empty_(is_input_empty) {}
   Status GenerateShaderCode(ShaderHelper& wgpuShaderModuleAddRef) const override;
   WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32},
                                           {"no_op_with_empty_axes", ProgramUniformVariableDataType::Uint32},
-                                          {"axes", ProgramUniformVariableDataType::Uint32},
-                                          {"axes_size", ProgramUniformVariableDataType::Uint32});
+                                          {"reduce_axes", ProgramUniformVariableDataType::Uint32});
 
  private:
   const bool keepdims_;
   const bool no_op_with_empty_axes_;
   InlinedVector<uint32_t> axes_;
   ReduceOpSpecificCode code_;
+  bool is_input_empty_;
 };
 
 template <bool allow_multi_axes = true>
@@ -39,23 +40,41 @@ class ReduceKernel : public WebGpuKernel, public ReduceKernelBase<allow_multi_axes> {
   using ReduceKernelBase<allow_multi_axes>::keepdims_;
   using ReduceKernelBase<allow_multi_axes>::select_last_index_;
 
-  ReduceKernel(const OpKernelInfo& info, std::string name, optional<bool> keepdims_override = {})
+  ReduceKernel(const OpKernelInfo& info, std::string name, bool allow_empty_input = false, optional<bool> keepdims_override = {})
       : WebGpuKernel(info),
         ReduceKernelBase<allow_multi_axes>(info, keepdims_override),
-        name_(name) {
+        name_(name),
+        allow_empty_input_(allow_empty_input) {
   }
 
   Status ComputeInternal(ComputeContext& ctx) const;
 
-  virtual ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor, size_t axes_size) const = 0;
+  virtual ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const = 0;
+
+  Status CheckInput(const Tensor* input_tensor) const {
+    ORT_ENFORCE(input_tensor != nullptr && (input_tensor->Shape().Size() > 0 || allow_empty_input_), "Input tensor cannot be null or empty");
+    return Status::OK();
+  }
 
  private:
   std::string name_;
+  bool allow_empty_input_;
 };
 
 class ReduceMean final : public ReduceKernel<true> {
  public:
-  ReduceMean(const OpKernelInfo& info) : ReduceKernel(info, "ReduceMean") {}
-  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor, size_t axes_size) const override;
-  Status ComputeInternal(ComputeContext& ctx) const override;
+  ReduceMean(const OpKernelInfo& info) : ReduceKernel(info, "ReduceMean", true) {}
+  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
+};
+
+class ReduceMax final : public ReduceKernel<true> {
+ public:
+  ReduceMax(const OpKernelInfo& info) : ReduceKernel(info, "ReduceMax") {}
+  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
+};
+
+class ReduceSum final : public ReduceKernel<true> {
+ public:
+  ReduceSum(const OpKernelInfo& info) : ReduceKernel(info, "ReduceSum", true) {}
+  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
 };
 
 }  // namespace webgpu
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
index a2b8709e0e075..dcb7c9083ece9 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
@@ -512,11 +512,11 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
       BuildKernelCreateInfo,
       BuildKernelCreateInfo,
       BuildKernelCreateInfo,
-      // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceMax)>,
-      // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 11, ReduceMax)>,
-      // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 12, 12, ReduceMax)>,
-      // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 17, ReduceMax)>,
-      // BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ReduceMax)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceMax)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 11, ReduceMax)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 12, 12, ReduceMax)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 17, ReduceMax)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ReduceMax)>,
       BuildKernelCreateInfo,
       BuildKernelCreateInfo,
@@ -538,9 +538,9 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
       // BuildKernelCreateInfo,
       // BuildKernelCreateInfo,
-      // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceSum)>,
-      // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ReduceSum)>,
-      // BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, ReduceSum)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceSum)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ReduceSum)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, ReduceSum)>,
       // BuildKernelCreateInfo,
       // BuildKernelCreateInfo,
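
Note on the generated shader: the three ReduceOpSpecificCode snippets (loop header, loop body, loop footer) are spliced by ReduceKernelProgram::GenerateShaderCode into a per-output-element loop over the reduced axes. The WGSL below is a hand-written sketch of roughly what the assembled kernel looks like for ReduceMax over axis 1 of a 2x3 input with keepdims (output shape 2x1). It is illustrative only, not the code ShaderHelper actually emits: the 2x3 shape, f32 element type, output_size of 2, and workgroup size of 64 are all hard-coded assumptions standing in for the uniforms struct and the generated indices helpers.

```wgsl
@group(0) @binding(0) var<storage, read> input : array<f32, 6>;        // 2x3 input, row-major
@group(0) @binding(1) var<storage, read_write> output : array<f32, 2>; // 2x1 output

@compute @workgroup_size(64)
fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
  let global_idx = gid.x;
  // GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size"), with output_size = 2 inlined
  if (global_idx >= 2u) {
    return;
  }
  let i0 = global_idx;                                // index along the kept axis, from output indices
  let first_element = input[i0 * 3u];                 // injected because code_[0] mentions "first_element"
  var max_element = first_element;                    // loop_header (code_[0])
  for (var j : u32 = 0u; j < 3u; j++) {               // loop over the reduced axis
    let current_element : f32 = input[i0 * 3u + j];   // loop_body prologue (GetByIndices)
    max_element = max(max_element, current_element);  // loop_body (code_[1])
  }
  let output_value = f32(max_element);                // loop_footer (code_[2]); output_value_t is f32 here
  output[global_idx] = output_value;                  // output.SetByOffset("global_idx", "output_value")
}
```

In the real program the hard-coded bounds come from uniforms.input_shape and the new reduce_axes uniform (a 0/1 flag per axis) introduced by this change, which is also what lets ReduceMean compute the divisor `size` inside the loop footer without the old axes/axes_size pair.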