Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 75 additions & 19 deletions onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,28 @@
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 13, 17);
REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMean, 18);

REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 1, 10);
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 11, 11);
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 12, 12);
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 13, 17);
REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMax, 18);

REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSum, 1, 10);
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSum, 11, 12);
REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceSum, 13);

Status ReduceKernelProgram::GenerateShaderCode(ShaderHelper& shader) const {
const auto& input = shader.AddInput("input", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
if (is_input_empty_) {
shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size")
<< code_[0]
<< code_[2]
<< output.SetByOffset("global_idx", "output_value");
return Status::OK();
}
const auto& input = shader.AddInput("input", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
bool reduce_on_all_axes = no_op_with_empty_axes_ == false && axes_.empty();
std::string loop_header = code_[0];
std::string loop_header = code_[0].find("first_element") == std::string::npos ? code_[0] : "let first_element = " + input.GetByIndices("input_indices") + ";\n" + code_[0] + "\n";
std::string loop_body = "let current_element: input_value_t = " + input.GetByIndices("input_indices") + ";\n" + code_[1];
std::string loop_footer = code_[2];
const auto input_rank = input.Rank();
Expand All @@ -56,10 +73,10 @@
loop_body = ss.str();
} else {
std::stringstream ss;
ss << loop_header << "\n";
std::string index = "i" + std::to_string(i);
ss << "let " << index << " = " << output.IndicesGet("output_indices", l) << ";\n";
ss << input.IndicesSet("input_indices", i, index) << ";\n";
ss << loop_header << "\n";
loop_header = ss.str();
l++;
}
Expand All @@ -80,6 +97,7 @@
template <bool allow_multi_axes>
Status ReduceKernel<allow_multi_axes>::ComputeInternal(ComputeContext& context) const {
const auto* input_tensor = context.Input(0);
ORT_RETURN_IF_ERROR(CheckInput(input_tensor));
InlinedVector<uint32_t> input_axes;
auto rank = input_tensor->Shape().NumDimensions();
auto transform_axis = [rank](int64_t axis) {
Expand All @@ -95,10 +113,12 @@
if (context.InputCount() > 1) {
ORT_ENFORCE(axes_.empty(), "Axes attribute may not be specified when axes input is also provided.");
const Tensor* axes_tensor = context.Input<Tensor>(1);
auto size = static_cast<size_t>(axes_tensor->Shape()[0]);
const auto* data = axes_tensor->Data<int64_t>();
input_axes.reserve(size);
std::transform(data, data + size, std::back_inserter(input_axes), transform_axis);
if (nullptr != axes_tensor) {
auto size = static_cast<size_t>(axes_tensor->Shape()[0]);
const auto* data = axes_tensor->Data<int64_t>();
input_axes.reserve(size);
std::transform(data, data + size, std::back_inserter(input_axes), transform_axis);
}
} else {
input_axes.reserve(axes_.size());
std::transform(axes_.begin(), axes_.end(), std::back_inserter(input_axes), transform_axis);
Expand All @@ -120,10 +140,12 @@
std::iota(input_axes.begin(), input_axes.end(), 0);
}
}
const auto code = GetOpSpecificCode(input_tensor, input_axes.size());
const auto code = GetOpSpecificCode(input_tensor);
// Compute output shape
std::vector<int64_t> output_shape;
bool is_input_empty = false;
for (size_t i = 0; i < input_tensor->Shape().NumDimensions(); ++i) {
is_input_empty |= input_tensor->Shape()[i] == 0;
if (std::find(input_axes.begin(), input_axes.end(), i) != input_axes.end()) {
if (keepdims_) {
output_shape.push_back(1);
Expand All @@ -134,34 +156,68 @@
}
TensorShape output_tensor_shape(output_shape);
int64_t output_size = output_tensor_shape.Size();
ReduceKernelProgram program("ReduceMean", keepdims_, noop_with_empty_axes_, input_axes, code);
program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank})
if (output_size == 0) {
ORT_IGNORE_RETURN_VALUE(context.Output(0, output_tensor_shape));
return Status::OK();
}

auto input_rank = input_tensor->Shape().NumDimensions();
// reduce_axes element is either 1 or 0 depending on whether the axis is reduced or not
std::vector<uint32_t> reduce_axes;

Check warning on line 166 in onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Add #include <vector> for vector<> [build/include_what_you_use] [4] Raw Output: onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc:166: Add #include <vector> for vector<> [build/include_what_you_use] [4]
reduce_axes.resize(input_rank, 0);
for (auto axis : input_axes) {
reduce_axes[axis] = 1;
}

ReduceKernelProgram program(name_, keepdims_, noop_with_empty_axes_, input_axes, code, is_input_empty);
if (!is_input_empty) {
program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank});
}

program.CacheHint(is_input_empty)
.AddOutput({context.Output(0, output_shape), ProgramTensorMetadataDependency::TypeAndRank})
.SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
.AddUniformVariables({{static_cast<uint32_t>(output_size)},
{static_cast<uint32_t>(noop_with_empty_axes_ ? 1 : 0)},
{input_axes},
{static_cast<uint32_t>(input_axes.size())}});
{reduce_axes}});

return context.RunProgram(program);
}

// Builds the WGSL code fragments for ReduceMean.
// Returns {loop_header, loop_body, loop_footer}:
//  - header declares the f32 accumulator,
//  - body adds each visited element,
//  - footer divides by the number of reduced elements (computed from
//    uniforms.reduce_axes, whose i-th entry is 1 iff axis i is reduced)
//    and assigns output_value.
// NOTE: the pasted diff contained both the pre- and post-change lines of this
// function (duplicate signature and both the old `uniforms.axes` loop and the
// new `uniforms.reduce_axes` loop); this is the reconstructed post-change body.
ReduceOpSpecificCode ReduceMean::GetOpSpecificCode(const Tensor* input_tensor) const {
  const TensorShape& input_shape = input_tensor->Shape();
  size_t input_rank = input_shape.NumDimensions();
  std::string loop_header = "var sum = f32(0);";
  std::string loop_body = "sum += f32(current_element);";
  std::stringstream ss;
  // Multiply the lengths of every reduced axis to get the element count that
  // the sum must be divided by.
  ss << "var size: u32 = 1;\n"
     << "for (var i: u32 = 0; i < " << input_rank << "; i += 1) { \n"
     << "  let index_reduced_or_not = " << GetElementAt("uniforms.reduce_axes", "i", input_rank) << ";\n"
     << "  if (index_reduced_or_not == 1) { \n"
     << "    size = size * " << GetElementAt("uniforms.input_shape", "i", input_rank) << ";\n"
     << "  }\n"
     << "}\n"
     << "let output_value = output_value_t(sum / f32(size));";
  std::string loop_footer = ss.str();
  ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
  return code;
}

Status ReduceMean::ComputeInternal(ComputeContext& ctx) const {
return ReduceKernel<true>::ComputeInternal(ctx);
// Builds the WGSL code fragments for ReduceMax.
// The header seeds the running maximum with `first_element` (declared by the
// shader generator before the loop), the body folds each visited element in
// with max(), and the footer assigns the result to output_value.
ReduceOpSpecificCode ReduceMax::GetOpSpecificCode(const Tensor* input_tensor) const {
  ORT_UNUSED_PARAMETER(input_tensor);
  return ReduceOpSpecificCode{
      "var max_element = first_element;",
      "max_element = max(max_element, current_element);",
      "let output_value = output_value_t(max_element);"};
}
// Builds the WGSL code fragments for ReduceSum.
// The header declares an f32 accumulator initialized to zero, the body adds
// each visited element, and the footer assigns the accumulated sum to
// output_value.
// NOTE: the pasted diff had a reviewdog lint annotation interleaved into this
// function's body; it has been removed so the function is valid C++ again.
ReduceOpSpecificCode ReduceSum::GetOpSpecificCode(const Tensor* input_tensor) const {
  ORT_UNUSED_PARAMETER(input_tensor);
  std::string loop_header = "var sum = f32(0);";
  std::string loop_body = "sum += f32(current_element);";
  std::string loop_footer = "let output_value = output_value_t(sum);";
  ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
  return code;
}

} // namespace webgpu
Expand Down
39 changes: 29 additions & 10 deletions onnxruntime/core/providers/webgpu/reduction/reduction_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,22 +13,23 @@ namespace webgpu {
// ReduceOpSpecificCode is a 3-element array of strings that represents the op-specific code for the reduce operation.
// The first element is the loop header, the second element is the loop body, and the third element is the loop footer.
// The loop header is the code that is executed before the loop starts. The loop body is the code that is executed for each element in the loop.
// The loop footer is the code that is executed after the loop ends.
// The loop footer is the code that is executed after the loop ends. The loop body should contain the code that accumulates the result of the reduction and
// the loop footer should contain the code that assigns output_value the result of the reduction.
typedef std::array<std::string, 3> ReduceOpSpecificCode;
// WebGPU program that generates and runs the reduction shader.
// The op-specific WGSL fragments (loop header/body/footer) are injected via
// `code`; `is_input_empty` selects the degenerate shader path that skips the
// reduction loop entirely.
// NOTE: the pasted diff contained both the pre- and post-change constructor
// and uniform declarations (duplicate ctor, and both the old `axes`/`axes_size`
// uniforms and the new `reduce_axes` uniform); this is the reconstructed
// post-change class. The GenerateShaderCode parameter is also renamed from the
// misleading `wgpuShaderModuleAddRef` to `shader`.
class ReduceKernelProgram final : public Program<ReduceKernelProgram> {
 public:
  ReduceKernelProgram(std::string name, bool keepdims, bool no_op_with_empty_axes, const InlinedVector<uint32_t>& axes, ReduceOpSpecificCode code, bool is_input_empty)
      : Program{name},
        keepdims_(keepdims),
        no_op_with_empty_axes_(no_op_with_empty_axes),
        axes_(axes.begin(), axes.end()),
        code_(code),
        is_input_empty_(is_input_empty) {}
  Status GenerateShaderCode(ShaderHelper& shader) const override;
  // reduce_axes[i] is 1 when axis i is reduced, 0 otherwise.
  WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32},
                                          {"no_op_with_empty_axes", ProgramUniformVariableDataType::Uint32},
                                          {"reduce_axes", ProgramUniformVariableDataType::Uint32});

 private:
  const bool keepdims_;
  const bool no_op_with_empty_axes_;
  InlinedVector<uint32_t> axes_;
  ReduceOpSpecificCode code_;
  bool is_input_empty_;
};

template <bool allow_multi_axes = true>
Expand All @@ -39,23 +40,41 @@ class ReduceKernel : public WebGpuKernel, public ReduceKernelBase<allow_multi_ax
using ReduceKernelBase<allow_multi_axes>::keepdims_;
using ReduceKernelBase<allow_multi_axes>::select_last_index_;

ReduceKernel(const OpKernelInfo& info, std::string name, optional<int64_t> keepdims_override = {})
ReduceKernel(const OpKernelInfo& info, std::string name, bool allow_empty_input = false, optional<int64_t> keepdims_override = {})
: WebGpuKernel(info),
ReduceKernelBase<allow_multi_axes>(info, keepdims_override),
name_(name) {
name_(name),
allow_empty_input_(allow_empty_input) {
}
Status ComputeInternal(ComputeContext& ctx) const;
virtual ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor, size_t axes_size) const = 0;
virtual ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const = 0;

// Validates the input tensor before running the reduction.
// A null tensor is always rejected; a tensor with zero elements is accepted
// only when the kernel opted in via allow_empty_input_ (ReduceMean and
// ReduceSum pass true; ReduceMax uses the default false).
Status CheckInput(const Tensor* input_tensor) const {
  ORT_ENFORCE(input_tensor != nullptr && (input_tensor->Shape().Size() > 0 || allow_empty_input_), "Input tensor cannot be null or empty");
  return Status::OK();
}

private:
std::string name_;
bool allow_empty_input_;
};

// ReduceMean: mean over the selected axes.
// Passes allow_empty_input = true so CheckInput accepts zero-element inputs.
// NOTE: the pasted diff contained both the pre-change declarations (old ctor,
// GetOpSpecificCode(Tensor*, size_t) and an overridden ComputeInternal) and
// the post-change ones; this is the reconstructed post-change class.
class ReduceMean final : public ReduceKernel<true> {
 public:
  ReduceMean(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceMean", true) {}
  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
};

// ReduceMax: maximum over the selected axes.
// Uses the default allow_empty_input = false — presumably because the maximum
// of zero elements has no identity value; TODO confirm against CheckInput
// callers.
class ReduceMax final : public ReduceKernel<true> {
 public:
  ReduceMax(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceMax") {}
  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
};

// ReduceSum: sum over the selected axes.
// Passes allow_empty_input = true so CheckInput accepts zero-element inputs.
class ReduceSum final : public ReduceKernel<true> {
 public:
  ReduceSum(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceSum", true) {}
  ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
};

} // namespace webgpu
Expand Down
16 changes: 8 additions & 8 deletions onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
Original file line number Diff line number Diff line change
Expand Up @@ -512,11 +512,11 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, Squeeze)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, Squeeze)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, Squeeze)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceMax)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 11, ReduceMax)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 12, 12, ReduceMax)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 17, ReduceMax)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ReduceMax)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceMax)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 11, ReduceMax)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 12, 12, ReduceMax)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 17, ReduceMax)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ReduceMax)>,

BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceMean)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ReduceMean)>,
Expand All @@ -538,9 +538,9 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 17, ReduceProd)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ReduceProd)>,

// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceSum)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ReduceSum)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, ReduceSum)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceSum)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ReduceSum)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, ReduceSum)>,

// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceL1)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ReduceL1)>,
Expand Down
Loading