Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 75 additions & 19 deletions onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,28 @@
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 13, 17);
REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMean, 18);

REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 1, 10);
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 11, 11);
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 12, 12);
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 13, 17);
REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMax, 18);

REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSum, 1, 10);
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSum, 11, 12);
REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceSum, 13);

Status ReduceKernelProgram::GenerateShaderCode(ShaderHelper& shader) const {
const auto& input = shader.AddInput("input", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
if (is_input_empty_) {
shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size")
<< code_[0]
<< code_[2]
<< output.SetByOffset("global_idx", "output_value");
return Status::OK();
}
const auto& input = shader.AddInput("input", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
bool reduce_on_all_axes = no_op_with_empty_axes_ == false && axes_.empty();
std::string loop_header = code_[0];
std::string loop_header = code_[0].find("first_element") == std::string::npos ? code_[0] : "let first_element = " + input.GetByIndices("input_indices") + ";\n" + code_[0] + "\n";
std::string loop_body = "let current_element: input_value_t = " + input.GetByIndices("input_indices") + ";\n" + code_[1];
std::string loop_footer = code_[2];
const auto input_rank = input.Rank();
Expand All @@ -56,10 +73,10 @@
loop_body = ss.str();
} else {
std::stringstream ss;
ss << loop_header << "\n";
std::string index = "i" + std::to_string(i);
ss << "let " << index << " = " << output.IndicesGet("output_indices", l) << ";\n";
ss << input.IndicesSet("input_indices", i, index) << ";\n";
ss << loop_header << "\n";
loop_header = ss.str();
l++;
}
Expand All @@ -80,6 +97,7 @@
template <bool allow_multi_axes>
Status ReduceKernel<allow_multi_axes>::ComputeInternal(ComputeContext& context) const {
const auto* input_tensor = context.Input(0);
ORT_RETURN_IF_ERROR(CheckInput(input_tensor));
InlinedVector<uint32_t> input_axes;
auto rank = input_tensor->Shape().NumDimensions();
auto transform_axis = [rank](int64_t axis) {
Expand All @@ -95,10 +113,12 @@
if (context.InputCount() > 1) {
ORT_ENFORCE(axes_.empty(), "Axes attribute may not be specified when axes input is also provided.");
const Tensor* axes_tensor = context.Input<Tensor>(1);
auto size = static_cast<size_t>(axes_tensor->Shape()[0]);
const auto* data = axes_tensor->Data<int64_t>();
input_axes.reserve(size);
std::transform(data, data + size, std::back_inserter(input_axes), transform_axis);
if (nullptr != axes_tensor) {
auto size = static_cast<size_t>(axes_tensor->Shape()[0]);
const auto* data = axes_tensor->Data<int64_t>();
input_axes.reserve(size);
std::transform(data, data + size, std::back_inserter(input_axes), transform_axis);
}
} else {
input_axes.reserve(axes_.size());
std::transform(axes_.begin(), axes_.end(), std::back_inserter(input_axes), transform_axis);
Expand All @@ -120,10 +140,12 @@
std::iota(input_axes.begin(), input_axes.end(), 0);
}
}
const auto code = GetOpSpecificCode(input_tensor, input_axes.size());
const auto code = GetOpSpecificCode(input_tensor);
// Compute output shape
std::vector<int64_t> output_shape;
bool is_input_empty = false;
for (size_t i = 0; i < input_tensor->Shape().NumDimensions(); ++i) {
is_input_empty |= input_tensor->Shape()[i] == 0;
if (std::find(input_axes.begin(), input_axes.end(), i) != input_axes.end()) {
if (keepdims_) {
output_shape.push_back(1);
Expand All @@ -134,34 +156,68 @@
}
TensorShape output_tensor_shape(output_shape);
int64_t output_size = output_tensor_shape.Size();
ReduceKernelProgram program("ReduceMean", keepdims_, noop_with_empty_axes_, input_axes, code);
program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank})
if (output_size == 0) {
ORT_IGNORE_RETURN_VALUE(context.Output(0, output_tensor_shape));
return Status::OK();
}

auto input_rank = input_tensor->Shape().NumDimensions();
// reduce_axes element is either 1 or 0 depending on whether the axis is reduced or not
std::vector<uint32_t> reduce_axes;

Check warning on line 166 in onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Add #include <vector> for vector<> [build/include_what_you_use] [4] Raw Output: onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc:166: Add #include <vector> for vector<> [build/include_what_you_use] [4]
reduce_axes.resize(input_rank, 0);
for (auto axis : input_axes) {
reduce_axes[axis] = 1;
}

ReduceKernelProgram program(name_, keepdims_, noop_with_empty_axes_, input_axes, code, is_input_empty);
if (!is_input_empty) {
program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank});
}

program.CacheHint(is_input_empty)
.AddOutput({context.Output(0, output_shape), ProgramTensorMetadataDependency::TypeAndRank})
.SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
.AddUniformVariables({{static_cast<uint32_t>(output_size)},
{static_cast<uint32_t>(noop_with_empty_axes_ ? 1 : 0)},
{input_axes},
{static_cast<uint32_t>(input_axes.size())}});
{reduce_axes}});

return context.RunProgram(program);
}

// Builds the WGSL code fragments for ReduceMean.
// Returns {loop_header, loop_body, loop_footer}:
//  - header declares the f32 accumulator,
//  - body adds each visited element,
//  - footer divides by the number of reduced elements (computed from
//    uniforms.reduce_axes, whose i-th entry is 1 iff axis i is reduced)
//    and assigns output_value.
// NOTE: the pasted diff contained both the pre- and post-change lines of this
// function (duplicate signature and both the old `uniforms.axes` loop and the
// new `uniforms.reduce_axes` loop); this is the reconstructed post-change body.
ReduceOpSpecificCode ReduceMean::GetOpSpecificCode(const Tensor* input_tensor) const {
  const TensorShape& input_shape = input_tensor->Shape();
  size_t input_rank = input_shape.NumDimensions();
  std::string loop_header = "var sum = f32(0);";
  std::string loop_body = "sum += f32(current_element);";
  std::stringstream ss;
  // Multiply the lengths of every reduced axis to get the element count that
  // the sum must be divided by.
  ss << "var size: u32 = 1;\n"
     << "for (var i: u32 = 0; i < " << input_rank << "; i += 1) { \n"
     << "  let index_reduced_or_not = " << GetElementAt("uniforms.reduce_axes", "i", input_rank) << ";\n"
     << "  if (index_reduced_or_not == 1) { \n"
     << "    size = size * " << GetElementAt("uniforms.input_shape", "i", input_rank) << ";\n"
     << "  }\n"
     << "}\n"
     << "let output_value = output_value_t(sum / f32(size));";
  std::string loop_footer = ss.str();
  ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
  return code;
}

Status ReduceMean::ComputeInternal(ComputeContext& ctx) const {
return ReduceKernel<true>::ComputeInternal(ctx);
// Builds the WGSL code fragments for ReduceMax.
// The header seeds the running maximum with `first_element` (declared by the
// shader generator before the loop), the body folds each visited element in
// with max(), and the footer assigns the result to output_value.
ReduceOpSpecificCode ReduceMax::GetOpSpecificCode(const Tensor* input_tensor) const {
  ORT_UNUSED_PARAMETER(input_tensor);
  return ReduceOpSpecificCode{
      "var max_element = first_element;",
      "max_element = max(max_element, current_element);",
      "let output_value = output_value_t(max_element);"};
}
// Builds the WGSL code fragments for ReduceSum.
// The header declares an f32 accumulator initialized to zero, the body adds
// each visited element, and the footer assigns the accumulated sum to
// output_value.
// NOTE: the pasted diff had a reviewdog lint annotation interleaved into this
// function's body; it has been removed so the function is valid C++ again.
ReduceOpSpecificCode ReduceSum::GetOpSpecificCode(const Tensor* input_tensor) const {
  ORT_UNUSED_PARAMETER(input_tensor);
  std::string loop_header = "var sum = f32(0);";
  std::string loop_body = "sum += f32(current_element);";
  std::string loop_footer = "let output_value = output_value_t(sum);";
  ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
  return code;
}

} // namespace webgpu
Expand Down
39 changes: 29 additions & 10 deletions onnxruntime/core/providers/webgpu/reduction/reduction_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,22 +13,23 @@ namespace webgpu {
// ReduceOpSpecificCode is a 3-element array of strings that represents the op-specific code for the reduce operation.
// The first element is the loop header, the second element is the loop body, and the third element is the loop footer.
// The loop header is the code that is executed before the loop starts. The loop body is the code that is executed for each element in the loop.
// The loop footer is the code that is executed after the loop ends.
// The loop footer is the code that is executed after the loop ends. The loop body should contain the code that accumulates the result of the reduction and
// the loop footer should contain the code that assigns output_value the result of the reduction.
typedef std::array<std::string, 3> ReduceOpSpecificCode;
// WebGPU program that generates and runs the reduction shader.
// The op-specific WGSL fragments (loop header/body/footer) are injected via
// `code`; `is_input_empty` selects the degenerate shader path that skips the
// reduction loop entirely.
// NOTE: the pasted diff contained both the pre- and post-change constructor
// and uniform declarations (duplicate ctor, and both the old `axes`/`axes_size`
// uniforms and the new `reduce_axes` uniform); this is the reconstructed
// post-change class. The GenerateShaderCode parameter is also renamed from the
// misleading `wgpuShaderModuleAddRef` to `shader`.
class ReduceKernelProgram final : public Program<ReduceKernelProgram> {
 public:
  ReduceKernelProgram(std::string name, bool keepdims, bool no_op_with_empty_axes, const InlinedVector<uint32_t>& axes, ReduceOpSpecificCode code, bool is_input_empty)
      : Program{name},
        keepdims_(keepdims),
        no_op_with_empty_axes_(no_op_with_empty_axes),
        axes_(axes.begin(), axes.end()),
        code_(code),
        is_input_empty_(is_input_empty) {}
  Status GenerateShaderCode(ShaderHelper& shader) const override;
  // reduce_axes[i] is 1 when axis i is reduced, 0 otherwise.
  WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32},
                                          {"no_op_with_empty_axes", ProgramUniformVariableDataType::Uint32},
                                          {"reduce_axes", ProgramUniformVariableDataType::Uint32});

 private:
  const bool keepdims_;
  const bool no_op_with_empty_axes_;
  InlinedVector<uint32_t> axes_;
  ReduceOpSpecificCode code_;
  bool is_input_empty_;
};

template <bool allow_multi_axes = true>
Expand All @@ -39,23 +40,41 @@ class ReduceKernel : public WebGpuKernel, public ReduceKernelBase<allow_multi_ax
using ReduceKernelBase<allow_multi_axes>::keepdims_;
using ReduceKernelBase<allow_multi_axes>::select_last_index_;

ReduceKernel(const OpKernelInfo& info, std::string name, optional<int64_t> keepdims_override = {})
ReduceKernel(const OpKernelInfo& info, std::string name, bool allow_empty_input = false, optional<int64_t> keepdims_override = {})
: WebGpuKernel(info),
ReduceKernelBase<allow_multi_axes>(info, keepdims_override),
name_(name) {
name_(name),
allow_empty_input_(allow_empty_input) {
}
Status ComputeInternal(ComputeContext& ctx) const;
virtual ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor, size_t axes_size) const = 0;
virtual ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const = 0;

// Validates the input tensor before running the reduction.
// A null tensor is always rejected; a tensor with zero elements is accepted
// only when the kernel opted in via allow_empty_input_ (ReduceMean and
// ReduceSum pass true; ReduceMax uses the default false).
Status CheckInput(const Tensor* input_tensor) const {
  ORT_ENFORCE(input_tensor != nullptr && (input_tensor->Shape().Size() > 0 || allow_empty_input_), "Input tensor cannot be null or empty");
  return Status::OK();
}

private:
std::string name_;
bool allow_empty_input_;
};

// ReduceMean: mean over the selected axes.
// Passes allow_empty_input = true so CheckInput accepts zero-element inputs.
// NOTE: the pasted diff contained both the pre-change declarations (old ctor,
// GetOpSpecificCode(Tensor*, size_t) and an overridden ComputeInternal) and
// the post-change ones; this is the reconstructed post-change class.
class ReduceMean final : public ReduceKernel<true> {
 public:
  ReduceMean(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceMean", true) {}
  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
};

// ReduceMax: maximum over the selected axes.
// Uses the default allow_empty_input = false — presumably because the maximum
// of zero elements has no identity value; TODO confirm against CheckInput
// callers.
class ReduceMax final : public ReduceKernel<true> {
 public:
  ReduceMax(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceMax") {}
  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
};

// ReduceSum: sum over the selected axes.
// Passes allow_empty_input = true so CheckInput accepts zero-element inputs.
class ReduceSum final : public ReduceKernel<true> {
 public:
  ReduceSum(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceSum", true) {}
  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
};

} // namespace webgpu
Expand Down
16 changes: 8 additions & 8 deletions onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
Original file line number Diff line number Diff line change
Expand Up @@ -512,11 +512,11 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, Squeeze)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, Squeeze)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, Squeeze)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceMax)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 11, ReduceMax)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 12, 12, ReduceMax)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 17, ReduceMax)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ReduceMax)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceMax)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 11, ReduceMax)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 12, 12, ReduceMax)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 17, ReduceMax)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ReduceMax)>,

BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceMean)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ReduceMean)>,
Expand All @@ -538,9 +538,9 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 17, ReduceProd)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ReduceProd)>,

// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceSum)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ReduceSum)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, ReduceSum)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceSum)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ReduceSum)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, ReduceSum)>,

// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceL1)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ReduceL1)>,
Expand Down
Loading