126 changes: 9 additions & 117 deletions onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc
@@ -38,50 +38,12 @@
 REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 11, 11);
 REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 12, 12);
 REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 18, 19);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMax, 20);
-
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 11, 11);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 12, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 18, 19);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMin, 20);
+REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMax, 18);
 
 REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSum, 1, 10);
 REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSum, 11, 12);
 REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceSum, 13);
 
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceProd, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceProd, 11, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceProd, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceProd, 18);
-
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL1, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL1, 11, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL1, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceL1, 18);
-
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL2, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL2, 11, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL2, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceL2, 18);
-
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSum, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSum, 11, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSum, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceLogSum, 18);
-
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSumSquare, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSumSquare, 11, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSumSquare, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceSumSquare, 18);
-
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSumExp, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSumExp, 11, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSumExp, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceLogSumExp, 18);
-
 Status ReduceKernelProgram::GenerateShaderCode(ShaderHelper& shader) const {
   const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
   if (is_input_empty_) {
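Reading aid for this hunk: REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(Op, begin, end) registers a kernel for the inclusive opset range [begin, end], while the unversioned REGISTER_UNARY_ELEMENTWISE_KERNEL(Op, n) registers it from opset n onward. The hunk therefore collapses ReduceMax's separate 18-19 and 20+ registrations into a single 18+ registration and drops ReduceMin (and, further down, the other reduce ops) entirely. As a rough sketch, such macros conventionally wrap onnxruntime's standard registration macros; the actual macro body is not shown in this diff, so the expansion below is an assumption for orientation only:

// Hypothetical sketch -- the real macro definition is not part of this diff.
// Assumes the usual onnxruntime provider registration pattern.
#define REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceOp, begin, end) \
  ONNX_OPERATOR_VERSIONED_KERNEL_EX(                                      \
      ReduceOp, kOnnxDomain, begin, end, kWebGpuExecutionProvider,        \
      (*KernelDefBuilder::Create())                                       \
          .TypeConstraint("T", /* provider-supported tensor types */),    \
      ReduceOp);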
@@ -164,28 +126,14 @@
   if (input_axes.empty()) {
     if (noop_with_empty_axes_ || rank == 0) {
       // If axes is empty and noop_with_empty_axes_ is true, it is a no-op according to the spec
-      // If input tensor is a scalar and it's not a ReduceLogSum or ReduceSumSquare, return the input tensor as is.
-      if (rank == 0 && (name_ == "ReduceLogSum" || name_ == "ReduceSumSquare" || name_ == "ReduceL1" || name_ == "ReduceL2")) {
-        // For ReduceLogSum with scalar input, output = log(input)
-        // For ReduceSumSquare with scalar input, output = input * input
-        auto output = context.Output(0, input_tensor->Shape());
-        // We need to run the operation even for scalar inputs for these ops
-        const auto code = GetOpSpecificCode(input_tensor);
-        ReduceKernelProgram program(name_, keepdims_, noop_with_empty_axes_, input_axes, code, false);
-        std::vector<uint32_t> reduce_axes = {0};
-        program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank})
-            .AddOutput({output, ProgramTensorMetadataDependency::TypeAndRank})
-            .SetDispatchGroupSize(1)
-            .AddUniformVariables({{1}, {static_cast<uint32_t>(noop_with_empty_axes_ ? 1 : 0)}, {reduce_axes}});
-        return context.RunProgram(program);
-      } else {
-        // For other ops, or when axes is empty with noop_with_empty_axes_ true, just copy the input
-        auto output = context.Output(0, input_tensor->Shape());
-        if (output->DataRaw() != input_tensor->DataRaw()) {
-          ORT_RETURN_IF_ERROR(Info().GetDataTransferManager().CopyTensor(*input_tensor, *output));
-        }
-        return Status::OK();
-      }
+      // If input tensor is a scalar, return the input tensor as is.
+      // This is not correct for ReduceLogSum and ReduceSumSquare
+      // TODO handle these cases separately.
+      auto output = context.Output(0, input_tensor->Shape());
+      if (output->DataRaw() != input_tensor->DataRaw()) {
+        ORT_RETURN_IF_ERROR(Info().GetDataTransferManager().CopyTensor(*input_tensor, *output));
+      }
+      return Status::OK();
     } else {
       // If axes is empty and noop_with_empty_axes_ is false, it is a reduction over all axes
       input_axes.resize(rank);

[cpplint] reviewdog 🐶 annotation on onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc:131: Missing username in TODO; it should look like "// TODO(my_username): Stuff." [readability/todo] [2]
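The TODO above is the crux of this hunk: for a scalar input x, the reverted fast path now returns x unchanged, but per the deleted comments ReduceLogSum should produce log(x) and ReduceSumSquare should produce x * x (and the deleted branch also covered ReduceL1/ReduceL2, whose scalar results are abs(x)). A minimal standalone check of that discrepancy, independent of the onnxruntime code and using an arbitrary example value:

// Illustration of why the scalar identity copy is wrong for
// ReduceLogSum / ReduceSumSquare; the value 2.0f is just an example.
#include <cmath>
#include <cstdio>

int main() {
  float x = 2.0f;                    // scalar input tensor
  float identity = x;                // what the reverted code returns
  float log_sum = std::log(x);       // expected ReduceLogSum: log(2) ~= 0.693
  float sum_square = x * x;          // expected ReduceSumSquare: 4
  std::printf("identity=%f log_sum=%f sum_square=%f\n",
              identity, log_sum, sum_square);
  return 0;
}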
@@ -263,14 +211,6 @@
   ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
   return code;
 }
-ReduceOpSpecificCode ReduceMin::GetOpSpecificCode(const Tensor* input_tensor) const {
-  ORT_UNUSED_PARAMETER(input_tensor);
-  std::string loop_header = "var min_element = first_element;";
-  std::string loop_body = "min_element = min(min_element, current_element);";
-  std::string loop_footer = "let output_value = output_value_t(min_element);";
-  ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
-  return code;
-}
 ReduceOpSpecificCode ReduceSum::GetOpSpecificCode(const Tensor* input_tensor) const {
   ORT_UNUSED_PARAMETER(input_tensor);
   std::string loop_header = "var sum = f32(0);";
@@ -279,54 +219,6 @@
   ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
   return code;
 }
-ReduceOpSpecificCode ReduceProd::GetOpSpecificCode(const Tensor* input_tensor) const {
-  ORT_UNUSED_PARAMETER(input_tensor);
-  std::string loop_header = "var prod = f32(1);";
-  std::string loop_body = "prod *= f32(current_element);";
-  std::string loop_footer = "let output_value = output_value_t(prod);";
-  ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
-  return code;
-}
-ReduceOpSpecificCode ReduceL1::GetOpSpecificCode(const Tensor* input_tensor) const {
-  ORT_UNUSED_PARAMETER(input_tensor);
-  std::string loop_header = "var l1 = f32(0);";
-  std::string loop_body = "l1 += abs(f32(current_element));";
-  std::string loop_footer = "let output_value = output_value_t(l1);";
-  ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
-  return code;
-}
-ReduceOpSpecificCode ReduceL2::GetOpSpecificCode(const Tensor* input_tensor) const {
-  ORT_UNUSED_PARAMETER(input_tensor);
-  std::string loop_header = "var l2 = f32(0);";
-  std::string loop_body = "let t = f32(current_element); l2 += (t * t);";
-  std::string loop_footer = "l2 = sqrt(l2); let output_value = output_value_t(l2);";
-  ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
-  return code;
-}
-ReduceOpSpecificCode ReduceLogSum::GetOpSpecificCode(const Tensor* input_tensor) const {
-  ORT_UNUSED_PARAMETER(input_tensor);
-  std::string loop_header = "var sum = f32(0);";
-  std::string loop_body = "sum += f32(current_element);";
-  std::string loop_footer = "let log_sum = log(sum); let output_value = output_value_t(log_sum);";
-  ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
-  return code;
-}
-ReduceOpSpecificCode ReduceSumSquare::GetOpSpecificCode(const Tensor* input_tensor) const {
-  ORT_UNUSED_PARAMETER(input_tensor);
-  std::string loop_header = "var sum_square = f32(0);";
-  std::string loop_body = "let t = f32(current_element); sum_square += (t * t);";
-  std::string loop_footer = "let output_value = output_value_t(sum_square);";
-  ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
-  return code;
-}
-ReduceOpSpecificCode ReduceLogSumExp::GetOpSpecificCode(const Tensor* input_tensor) const {
-  ORT_UNUSED_PARAMETER(input_tensor);
-  std::string loop_header = "var sum_exp = f32(0);";
-  std::string loop_body = "sum_exp += exp(f32(current_element));";
-  std::string loop_footer = "let log_sum_exp = log(sum_exp); let output_value = output_value_t(log_sum_exp);";
-  ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
-  return code;
-}
 
 }  // namespace webgpu
 }  // namespace onnxruntime
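For orientation: each GetOpSpecificCode override supplies three WGSL fragments (loop_header declares the accumulator, loop_body folds in current_element, loop_footer produces output_value) that GenerateShaderCode splices around the reduction loop. The sketch below shows that composition in miniature; only the fragment strings are taken from the diff, while the function name, loop bounds, and indexing are illustrative assumptions rather than the actual shader template:

#include <iostream>
#include <sstream>
#include <string>

// Hypothetical sketch of how the three fragments might be assembled into the
// generated WGSL reduction loop. The surrounding template is an assumption.
std::string ComposeReduceLoop(const std::string& loop_header,
                              const std::string& loop_body,
                              const std::string& loop_footer) {
  std::ostringstream wgsl;
  wgsl << "let first_element = input[0];\n"  // seed, used e.g. by min/max ops
       << loop_header << "\n"                // declare accumulator
       << "for (var i = 0u; i < uniforms.reduce_size; i++) {\n"
       << "  let current_element = input[i];\n"
       << "  " << loop_body << "\n"          // fold one element
       << "}\n"
       << loop_footer << "\n"                // finalize output_value
       << "output[global_idx] = output_value;\n";
  return wgsl.str();
}

int main() {
  // Fragments copied from ReduceSum::GetOpSpecificCode above.
  std::cout << ComposeReduceLoop("var sum = f32(0);",
                                 "sum += f32(current_element);",
                                 "let output_value = output_value_t(sum);");
  return 0;
}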
42 changes: 0 additions & 42 deletions onnxruntime/core/providers/webgpu/reduction/reduction_ops.h
@@ -71,53 +71,11 @@ class ReduceMax final : public ReduceKernel<true> {
   ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
 };
 
-class ReduceMin final : public ReduceKernel<true> {
- public:
-  ReduceMin(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceMin") {}
-  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
-};
-
 class ReduceSum final : public ReduceKernel<true> {
  public:
   ReduceSum(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceSum", true) {}
   ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
 };
 
-class ReduceProd final : public ReduceKernel<true> {
- public:
-  ReduceProd(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceProd", true) {}
-  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
-};
-
-class ReduceL1 final : public ReduceKernel<true> {
- public:
-  ReduceL1(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceL1", true) {}
-  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
-};
-
-class ReduceL2 final : public ReduceKernel<true> {
- public:
-  ReduceL2(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceL2", true) {}
-  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
-};
-
-class ReduceLogSum final : public ReduceKernel<true> {
- public:
-  ReduceLogSum(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceLogSum", true) {}
-  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
-};
-
-class ReduceSumSquare final : public ReduceKernel<true> {
- public:
-  ReduceSumSquare(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceSumSquare", true) {}
-  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
-};
-
-class ReduceLogSumExp final : public ReduceKernel<true> {
- public:
-  ReduceLogSumExp(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceLogSumExp", true) {}
-  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
-};
-
 }  // namespace webgpu
 }  // namespace onnxruntime
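Taken together with the .cc hunks, each op removed by this PR consists of exactly three pieces spread across the two files, so restoring one later is mechanical. Collected verbatim from the deletions above, using ReduceMin as the example (note from the deleted constructor that ReduceMin, unlike the other ops here, did not pass the trailing true argument to ReduceKernel<true>):

// 1. reduction_ops.h -- class declaration:
class ReduceMin final : public ReduceKernel<true> {
 public:
  ReduceMin(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceMin") {}
  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
};

// 2. reduction_ops.cc -- the GetOpSpecificCode override with its WGSL fragments:
//      loop_header: "var min_element = first_element;"
//      loop_body:   "min_element = min(min_element, current_element);"
//      loop_footer: "let output_value = output_value_t(min_element);"
//
// 3. reduction_ops.cc -- the opset registrations (1-10, 11, 12, 13-17, 18-19, 20).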