diff --git a/onnxruntime/core/providers/webgpu/nn/pool.cc b/onnxruntime/core/providers/webgpu/nn/pool.cc index d650392b71fb5..6698a831b0c1a 100644 --- a/onnxruntime/core/providers/webgpu/nn/pool.cc +++ b/onnxruntime/core/providers/webgpu/nn/pool.cc @@ -84,14 +84,8 @@ Status PoolProgram::GenerateShaderCode(ShaderHelper& shader) const { constexpr const size_t kStringInitialSize = 128; if (is_max_pool_) { - std::string f16_min = "f16(-65504)"; - - SS(f32_min_ss, kStringInitialSize); - f32_min_ss << "f32(" << std::numeric_limits::lowest() << ")"; - std::string f32_min = SS_GET(f32_min_ss); - SS(var_decl_ss, kStringInitialSize); - var_decl_ss << " var value = " << (is_float16_ ? f16_min : f32_min) << ";\n"; + var_decl_ss << " var value = " << (is_float16_ ? "-65504.0h" : "-3.4028234663852886e+38f") << ";\n"; var_decl_code = SS_GET(var_decl_ss); sampling_code = " value = max(value, x_val);\n"; diff --git a/onnxruntime/core/providers/webgpu/program.cc b/onnxruntime/core/providers/webgpu/program.cc index 9c0f1e85b3021..df23f5d1dc73c 100644 --- a/onnxruntime/core/providers/webgpu/program.cc +++ b/onnxruntime/core/providers/webgpu/program.cc @@ -49,17 +49,18 @@ ProgramUniformVariableValue::ProgramUniformVariableValue(ProgramUniformVariableD memcpy(data.data(), ptr, length * element_byte_size); } -std::ostream& operator<<(std::ostream& os, ProgramUniformVariableDataType type) { - os << ProgramUniformVariableDataTypeName[std::underlying_type::type(type)]; - return os; -} +#define DEFINE_ENUM_STREAM_OP(StreamType, EnumType, EnumNameArray) \ + StreamType& operator<<(StreamType& os, EnumType type) { \ + os << EnumNameArray[std::underlying_type::type(type)]; \ + return os; \ + } -std::ostream& operator<<(std::ostream& os, ProgramConstantDataType type) { - os << ProgramConstantDataTypeName[std::underlying_type::type(type)]; - return os; -} +DEFINE_ENUM_STREAM_OP(std::ostream, ProgramUniformVariableDataType, ProgramUniformVariableDataTypeName) +DEFINE_ENUM_STREAM_OP(OStringStream, ProgramUniformVariableDataType, ProgramUniformVariableDataTypeName) +DEFINE_ENUM_STREAM_OP(std::ostream, ProgramConstantDataType, ProgramConstantDataTypeName) +DEFINE_ENUM_STREAM_OP(OStringStream, ProgramConstantDataType, ProgramConstantDataTypeName) -std::ostream& operator<<(std::ostream& os, ProgramTensorMetadataDependency dep) { +OStringStream& operator<<(OStringStream& os, ProgramTensorMetadataDependency dep) { bool first = true; if ((dep & ProgramTensorMetadataDependency::Type) == ProgramTensorMetadataDependency::Type) { os << "Type"; @@ -109,10 +110,7 @@ constexpr std::string_view ProgramVariableDataTypeName[] = { "i4x8", // Int4x8 }; -std::ostream& operator<<(std::ostream& os, ProgramVariableDataType type) { - os << ProgramVariableDataTypeName[std::underlying_type::type(type)]; - return os; -} +DEFINE_ENUM_STREAM_OP(OStringStream, ProgramVariableDataType, ProgramVariableDataTypeName) #endif int NumberOfComponents(ProgramVariableDataType type) { diff --git a/onnxruntime/core/providers/webgpu/program.h b/onnxruntime/core/providers/webgpu/program.h index 858966431ac22..fa6ca1a8f6ba5 100644 --- a/onnxruntime/core/providers/webgpu/program.h +++ b/onnxruntime/core/providers/webgpu/program.h @@ -23,6 +23,8 @@ #include "core/common/safeint.h" #include "core/framework/tensor.h" +#include "core/providers/webgpu/string_utils.h" + namespace onnxruntime { namespace webgpu { class ShaderHelper; @@ -37,6 +39,7 @@ enum class ProgramUniformVariableDataType { Int32, }; std::ostream& operator<<(std::ostream& os, ProgramUniformVariableDataType); +OStringStream& operator<<(OStringStream& os, ProgramUniformVariableDataType); constexpr size_t ProgramUniformVariableDataTypeSize[] = {sizeof(float), sizeof(uint16_t), sizeof(uint32_t), sizeof(int32_t)}; @@ -80,6 +83,7 @@ enum class ProgramConstantDataType { Bool }; std::ostream& operator<<(std::ostream& os, ProgramConstantDataType); +OStringStream& operator<<(OStringStream& os, ProgramConstantDataType); constexpr std::string_view ProgramConstantDataTypeName[] = {"f32", "f16", "u32", "i32", "bool"}; @@ -158,7 +162,7 @@ enum class ProgramTensorMetadataDependency : int { TypeAndRank = Type | Rank, TypeAndShape = Type | Shape, }; -std::ostream& operator<<(std::ostream& os, ProgramTensorMetadataDependency); +OStringStream& operator<<(OStringStream& os, ProgramTensorMetadataDependency); #if defined(__GNUC__) #pragma GCC diagnostic push @@ -216,7 +220,7 @@ enum class ProgramVariableDataType { // if you add a new type here, you also need to update ProgramVariableDataTypeName }; #ifndef NDEBUG -std::ostream& operator<<(std::ostream& os, ProgramVariableDataType); +OStringStream& operator<<(OStringStream& os, ProgramVariableDataType); #endif int NumberOfComponents(ProgramVariableDataType type); diff --git a/onnxruntime/core/providers/webgpu/program_cache_key.cc b/onnxruntime/core/providers/webgpu/program_cache_key.cc index 371539b136010..d4a6b2bf1d812 100644 --- a/onnxruntime/core/providers/webgpu/program_cache_key.cc +++ b/onnxruntime/core/providers/webgpu/program_cache_key.cc @@ -17,7 +17,7 @@ namespace webgpu { namespace { // append the info of an input or output to the cachekey -void AppendTensorInfo(std::ostream& ss, +void AppendTensorInfo(OStringStream& ss, const TensorShape& tensor_shape, ProgramVariableDataType var_type, ProgramTensorMetadataDependency dependency, diff --git a/onnxruntime/core/providers/webgpu/shader_helper.cc b/onnxruntime/core/providers/webgpu/shader_helper.cc index 50b875e1c629f..07e9e9e793c56 100644 --- a/onnxruntime/core/providers/webgpu/shader_helper.cc +++ b/onnxruntime/core/providers/webgpu/shader_helper.cc @@ -34,8 +34,8 @@ ShaderHelper::ShaderHelper(const ProgramBase& program, dispatch_group_size_z_{dispatch_group_size_z}, program_{program}, program_metadata_{program_metadata}, - additional_implementation_ss_{&additional_implementation_}, - body_ss_{&body_} {} + additional_implementation_ss_{kStringInitialSizeShaderSourceCodeAdditionalImplementation}, + body_ss_{kStringInitialSizeShaderSourceCodeMain} {} Status ShaderHelper::Init() { // dispatch group size is normalized so no need to validate it here @@ -59,8 +59,6 @@ Status ShaderHelper::Init() { // init body string stream bool is_1d_dispatch = dispatch_group_size_y_ == 1 && dispatch_group_size_z_ == 1; bool use_indirect_dispatch = program_.IndirectDispatchTensor() != nullptr; - body_.reserve(4096); - additional_implementation_.reserve(1024); // append header for main function so it is ready for user to append main function body body_ss_ << "@compute @workgroup_size(workgroup_size_x, workgroup_size_y, workgroup_size_z)\n" @@ -384,7 +382,7 @@ Status ShaderHelper::ValidateIndices() const { return Status::OK(); } -Status ShaderHelper::GenerateSourceCode(std::string& code, std::vector& shape_uniform_ranks) const { +Status ShaderHelper::GenerateSourceCode(std::string& code, std::vector& shape_uniform_ranks) { SS(ss, kStringInitialSizeShaderSourceCode); // @@ -633,12 +631,12 @@ Status ShaderHelper::GenerateSourceCode(std::string& code, std::vector& sha // // Additional Implementation // - ss << additional_implementation_; + ss << SS_GET(additional_implementation_ss_); // // Main Function Body // - ss << body_; + ss << SS_GET(body_ss_); ss << "\n" "}\n"; diff --git a/onnxruntime/core/providers/webgpu/shader_helper.h b/onnxruntime/core/providers/webgpu/shader_helper.h index 85ca52fe6307b..5137c735feb38 100644 --- a/onnxruntime/core/providers/webgpu/shader_helper.h +++ b/onnxruntime/core/providers/webgpu/shader_helper.h @@ -108,7 +108,7 @@ class ShaderHelper final { private: template // ConstantType is one of {ProgramConstant, ProgramOverridableConstantValue, ProgramOverridableConstantDefinition} - void WriteConstantValue(std::ostream& ss, const ConstantType& constant) const { + void WriteConstantValue(OStringStream& ss, const ConstantType& constant) const { switch (constant.type) { case ProgramConstantDataType::Float16: ss << constant.f16.ToFloat(); @@ -156,7 +156,7 @@ class ShaderHelper final { // \param code The generated full WGSL source code. // \param shape_uniform_ranks The ranks for variables that need a uniform for the shape. // - Status GenerateSourceCode(std::string& code, std::vector& shape_uniform_ranks) const; + Status GenerateSourceCode(std::string& code, std::vector& shape_uniform_ranks); friend class ProgramManager; const WebGpuContext& webgpu_context_; @@ -175,9 +175,7 @@ class ShaderHelper final { std::vector> input_vars_; std::vector> output_vars_; std::vector> indices_vars_; - std::string additional_implementation_; OStringStream additional_implementation_ss_; - std::string body_; OStringStream body_ss_; }; diff --git a/onnxruntime/core/providers/webgpu/shader_variable.cc b/onnxruntime/core/providers/webgpu/shader_variable.cc index aa1f6c9a0ec0b..611a75065d509 100644 --- a/onnxruntime/core/providers/webgpu/shader_variable.cc +++ b/onnxruntime/core/providers/webgpu/shader_variable.cc @@ -150,7 +150,7 @@ ShaderVariableHelper::ShaderVariableHelper(std::string_view name, ProgramVariabl ORT_ENFORCE(num_components_ > 0, "Invalid number of components for variable ", name_); } -void ShaderIndicesHelper::Impl(std::ostream& ss) const { +void ShaderIndicesHelper::Impl(OStringStream& ss) const { // Start generating code const std::string shape = (usage_ & ShaderUsage::UseUniform) ? "uniforms." + name_ + "_shape" : name_ + "_shape"; @@ -249,7 +249,7 @@ void ShaderIndicesHelper::Impl(std::ostream& ss) const { } } -void ShaderVariableHelper::Impl(std::ostream& ss) const { +void ShaderVariableHelper::Impl(OStringStream& ss) const { ShaderIndicesHelper::Impl(ss); // Implementation of "fn set_{name}" diff --git a/onnxruntime/core/providers/webgpu/shader_variable.h b/onnxruntime/core/providers/webgpu/shader_variable.h index 8e921d6deafbb..c62eff0209df8 100644 --- a/onnxruntime/core/providers/webgpu/shader_variable.h +++ b/onnxruntime/core/providers/webgpu/shader_variable.h @@ -130,7 +130,7 @@ class ShaderIndicesHelper { protected: ORT_DISALLOW_COPY_AND_ASSIGNMENT(ShaderIndicesHelper); - void Impl(std::ostream& ss) const; + void Impl(OStringStream& ss) const; std::string_view IndicesType() const; @@ -197,7 +197,7 @@ class ShaderVariableHelper : public ShaderIndicesHelper { private: ORT_DISALLOW_COPY_AND_ASSIGNMENT(ShaderVariableHelper); - void Impl(std::ostream& ss) const; + void Impl(OStringStream& ss) const; std::string GetByOffsetImpl(std::string_view offset) const; std::string SetByOffsetImpl(std::string_view offset, std::string_view value) const; diff --git a/onnxruntime/core/providers/webgpu/string_macros.h b/onnxruntime/core/providers/webgpu/string_macros.h index 7821d9c49a171..3ecdcffa605a3 100644 --- a/onnxruntime/core/providers/webgpu/string_macros.h +++ b/onnxruntime/core/providers/webgpu/string_macros.h @@ -6,13 +6,10 @@ #include "core/providers/webgpu/string_utils.h" // macro "SS" - declare an ostream variable and its string buffer -#define SS(ss, reserve_size) \ - std::string ss##_str; \ - ss##_str.reserve(reserve_size); \ - ::onnxruntime::webgpu::OStringStream ss(&ss##_str) +#define SS(ss, reserve_size) ::onnxruntime::webgpu::OStringStream ss(reserve_size) // macro "SS_GET" - get the string from the ostream -#define SS_GET(ss) ss##_str +#define SS_GET(ss) (std::move(ss).str()) // macro "SS_APPEND" - use function call style to append to the ostream #define SS_APPEND(ss, ...) ::onnxruntime::webgpu::detail::OStringStreamAppend(ss, __VA_ARGS__) diff --git a/onnxruntime/core/providers/webgpu/string_utils.h b/onnxruntime/core/providers/webgpu/string_utils.h index a976d952e385b..1ed65fbc8d509 100644 --- a/onnxruntime/core/providers/webgpu/string_utils.h +++ b/onnxruntime/core/providers/webgpu/string_utils.h @@ -5,14 +5,15 @@ #include "core/common/make_string.h" +#include +#include + #ifdef _MSC_VER #pragma warning(push) // C4702: unreachable code #pragma warning(disable : 4702) #endif // _MSC_VER -#include - #ifdef _MSC_VER #pragma warning(pop) #endif // _MSC_VER @@ -22,32 +23,102 @@ namespace webgpu { constexpr const size_t kStringInitialSizeSetByOffsetImpl = 128; constexpr const size_t kStringInitialSizeGetByOffsetImpl = 128; -constexpr const size_t kStringInitialSizeShaderSourceCode = 2048; -#ifndef NDEBUG +constexpr const size_t kStringInitialSizeShaderSourceCode = 4096; +constexpr const size_t kStringInitialSizeShaderSourceCodeAdditionalImplementation = 1024; +constexpr const size_t kStringInitialSizeShaderSourceCodeMain = 3068; constexpr const size_t kStringInitialSizeCacheKey = 512; -#else -constexpr const size_t kStringInitialSizeCacheKey = 256; -#endif -using OStringStream = absl::strings_internal::OStringStream; +namespace detail { + +// A simpler and faster ostringstream implementation than absl::strings_internal::OStringStream +// +// This FastOStringStream class is intended to be used in very performance critical paths. It does +// not inherit from std::ostream so that it can avoid the following overheads: +// - locale handling and formatting +// - state management (e.g. error handling, badbit, EOF, I/O sync) +// - unnecessary heap allocations +// - virtual function calls +// +// This class is majorly used for generating shader source code and program cache keys. +// +class FastOStringStream { + public: + explicit FastOStringStream(size_t reserve_size) { + str_.reserve(reserve_size); + } + + std::string str() && { + return std::move(str_); + } + + // String types + FastOStringStream& operator<<(const char* s) { + str_.append(s); + return *this; + } + + FastOStringStream& operator<<(const std::string& s) { + str_.append(s); + return *this; + } + + FastOStringStream& operator<<(std::string_view s) { + str_.append(s); + return *this; + } + + // Character + FastOStringStream& operator<<(char c) { + str_.push_back(c); + return *this; + } + + // Integer types + template + std::enable_if_t && !std::is_same_v, FastOStringStream&> + operator<<(T value) { + std::array buffer; + auto [ptr, ec] = std::to_chars(buffer.data(), buffer.data() + buffer.size(), value); + str_.append(buffer.data(), ptr - buffer.data()); + return *this; + } + + // Floating point types + template + std::enable_if_t, FastOStringStream&> + operator<<(T value) { + std::array buffer; + auto [ptr, ec] = std::to_chars(buffer.data(), buffer.data() + buffer.size(), value); + str_.append(buffer.data(), ptr - buffer.data()); + return *this; + } + + private: + std::string str_; +}; + +} // namespace detail + +using OStringStream = detail::FastOStringStream; namespace detail { -inline void OStringStreamAppendImpl(std::ostream& /*ss*/) noexcept { + +inline void OStringStreamAppendImpl(OStringStream& /*ss*/) noexcept { } template -inline void OStringStreamAppendImpl(std::ostream& ss, const T& t) noexcept { +inline void OStringStreamAppendImpl(OStringStream& ss, const T& t) noexcept { ss << t; } template -inline void OStringStreamAppendImpl(std::ostream& ss, const T& t, const Args&... args) noexcept { +inline void OStringStreamAppendImpl(OStringStream& ss, const T& t, const Args&... args) noexcept { OStringStreamAppendImpl(ss, t); OStringStreamAppendImpl(ss, args...); } template -inline void OStringStreamAppend(std::ostream& ss, const Args&... args) { +inline void OStringStreamAppend(OStringStream& ss, const Args&... args) { return OStringStreamAppendImpl(ss, ::onnxruntime::detail::if_char_array_make_ptr_t(args)...); } diff --git a/onnxruntime/core/providers/webgpu/tensor/concat.cc b/onnxruntime/core/providers/webgpu/tensor/concat.cc index 283a9e5fe8262..75453b991a0cd 100644 --- a/onnxruntime/core/providers/webgpu/tensor/concat.cc +++ b/onnxruntime/core/providers/webgpu/tensor/concat.cc @@ -38,7 +38,7 @@ WEBGPU_CONCAT_VERSIONED_KERNEL(4, 10) WEBGPU_CONCAT_VERSIONED_KERNEL(11, 12) WEBGPU_CONCAT_KERNEL(13) -void AppendCalculateInputIndexFunction(std::ostream& os, size_t input_count) { +void AppendCalculateInputIndexFunction(OStringStream& os, size_t input_count) { os << "fn calculate_input_index(global_idx: u32) -> u32 {\n" << " for (var i = 1u; i < " << input_count << "; i = i + 1u) {\n" << " if (global_idx < " << GetElementAt("uniforms.offsets", "i", input_count) << ") {\n" @@ -49,7 +49,7 @@ void AppendCalculateInputIndexFunction(std::ostream& os, size_t input_count) { << "}\n"; } -void AppendAssignOutputDataFunction(std::ostream& os, gsl::span inputs, const ShaderVariableHelper& output, size_t axis, size_t input_count) { +void AppendAssignOutputDataFunction(OStringStream& os, gsl::span inputs, const ShaderVariableHelper& output, size_t axis, size_t input_count) { os << "fn assign_output_data(global_idx: u32, input_index: u32) {\n"; for (size_t i = 0; i < inputs.size(); ++i) { if (i == 0) { diff --git a/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc b/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc index e7f902cc08b40..9b54eca641dd7 100644 --- a/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc +++ b/onnxruntime/core/providers/webgpu/tensor/depth_to_space.cc @@ -36,7 +36,7 @@ WEBGPU_DEPTH_TO_SPACE_KERNEL(13, kOnnxDomain, false) WEBGPU_DEPTH_TO_SPACE_VERSIONED_KERNEL(11, 12, kMSInternalNHWCDomain, true) WEBGPU_DEPTH_TO_SPACE_KERNEL(13, kMSInternalNHWCDomain, true) -void AppendPermFunction(std::ostream& os, const ShaderVariableHelper& input, const int64_t* perm) { +void AppendPermFunction(OStringStream& os, const ShaderVariableHelper& input, const int64_t* perm) { os << "fn perm(i: input_indices_t) -> input_indices_t {\n" << " var a: input_indices_t;\n"; for (int idx = 0; idx < input.Rank(); ++idx) { diff --git a/onnxruntime/core/providers/webgpu/tensor/resize_impl.cc b/onnxruntime/core/providers/webgpu/tensor/resize_impl.cc index 75a7f859c965f..2fa2c52794948 100644 --- a/onnxruntime/core/providers/webgpu/tensor/resize_impl.cc +++ b/onnxruntime/core/providers/webgpu/tensor/resize_impl.cc @@ -30,7 +30,7 @@ std::string GetSafeIntegerDivision(ResizeCoordinateTransformationMode transform_ } } -void TransformCoordinate(std::ostream& os, ResizeCoordinateTransformationMode transform_coordinate) { +void TransformCoordinate(OStringStream& os, ResizeCoordinateTransformationMode transform_coordinate) { std::string params; std::string body; switch (transform_coordinate) { @@ -110,7 +110,7 @@ std::string GetCoordinateCaller(ResizeCoordinateTransformationMode transform_coo return caller_ss.str(); } -void CalcNearestPixel(std::ostream& os, ResizeNearestMode mode) { +void CalcNearestPixel(OStringStream& os, ResizeNearestMode mode) { std::string params = "x_original: f32"; std::string body; switch (mode) { diff --git a/onnxruntime/core/providers/webgpu/tensor/split.cc b/onnxruntime/core/providers/webgpu/tensor/split.cc index f6de34dcf120c..25d829c4e8bad 100644 --- a/onnxruntime/core/providers/webgpu/tensor/split.cc +++ b/onnxruntime/core/providers/webgpu/tensor/split.cc @@ -11,7 +11,7 @@ namespace webgpu { namespace { // Helper function to calculate the output index based on the input index and the sizes of the splits. -void CalculateOutputIndex(std::ostream& os, size_t output_count) { +void CalculateOutputIndex(OStringStream& os, size_t output_count) { os << "fn calculate_output_index(index: u32) -> u32 {\n" << " for (var i: u32 = 0u; i < " << output_count << "u; i += 1u ) {\n" << " if (index < " << GetElementAt("uniforms.sizes_in_split_axis", "i", output_count) << ") {\n" @@ -23,7 +23,7 @@ void CalculateOutputIndex(std::ostream& os, size_t output_count) { } // Helper function to write the buffer data for each output. -void WriteBufferData(std::ostream& os, const ShaderVariableHelper& input, +void WriteBufferData(OStringStream& os, const ShaderVariableHelper& input, gsl::span outputs) { os << "fn write_buffer_data(output_number: u32, global_idx: u32, indices: output_0_indices_t) {\n"; for (size_t i = 0; i < outputs.size(); ++i) {