diff --git a/onnxruntime/contrib_ops/webgpu/bert/fast_gelu.cc b/onnxruntime/contrib_ops/webgpu/bert/fast_gelu.cc
index a5cae7e7f6747..29ea4f81dd5e1 100644
--- a/onnxruntime/contrib_ops/webgpu/bert/fast_gelu.cc
+++ b/onnxruntime/contrib_ops/webgpu/bert/fast_gelu.cc
@@ -50,7 +50,7 @@ Status FastGelu::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const {
   const auto* bias = context.Input(1);
   auto* output = context.Output(0, input->Shape());
 
-  uint32_t data_size = gsl::narrow<uint32_t>(output->Shape().Size());
+  uint32_t data_size = onnxruntime::narrow<uint32_t>(output->Shape().Size());
   if (data_size == 0) {
     return Status::OK();
   }
@@ -60,7 +60,7 @@ Status FastGelu::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const {
 
   int bias_components = 1;
   if (bias != nullptr) {
-    bias_size = gsl::narrow<uint32_t>(bias->Shape().Size());
+    bias_size = onnxruntime::narrow<uint32_t>(bias->Shape().Size());
     if (bias_size % 4 == 0) {
       bias_components = 4;
       bias_size = bias_size / 4;
diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
index c1b025b10e067..1e95d3d9610ff 100644
--- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
+++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
@@ -98,7 +98,7 @@ Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAttentionParameters& parameters,
   program.AddOutputs({{present_key, ProgramTensorMetadataDependency::Rank, components},
                       {present_value, ProgramTensorMetadataDependency::Rank, components}})
       .AddIndices(valid_present_shape);
-  program.SetDispatchGroupSize(gsl::narrow<uint32_t>((valid_kv_size + 63) / 64))
+  program.SetDispatchGroupSize(onnxruntime::narrow<uint32_t>((valid_kv_size + 63) / 64))
       .SetWorkgroupSize(64)
       .CacheHint(has_past, parameters.qkv_format_, parameters.past_present_share_buffer_)
       .AddUniformVariables({{static_cast<uint32_t>(valid_kv_size)},
diff --git a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc
index bc8b7493fc916..20e1583e0da8f 100644
--- a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc
+++ b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc
@@ -66,11 +66,11 @@ Status RotaryEmbedding::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const {
   const auto* sin_cache = context.Input(3);
   auto* output = context.Output(0, input_shape);
 
-  const auto batch_size = gsl::narrow<uint32_t>(input->Shape()[0]);
-  const auto batch_stride = gsl::narrow<uint32_t>(input_shape.SizeFromDimension(1));
-  const auto sequence_length = gsl::narrow<uint32_t>(input_shape[input_shape.NumDimensions() - 2]);
+  const auto batch_size = onnxruntime::narrow<uint32_t>(input->Shape()[0]);
+  const auto batch_stride = onnxruntime::narrow<uint32_t>(input_shape.SizeFromDimension(1));
+  const auto sequence_length = onnxruntime::narrow<uint32_t>(input_shape[input_shape.NumDimensions() - 2]);
   const auto hidden_size = batch_stride / sequence_length;
-  const auto half_rotary_embedding_dim = gsl::narrow<uint32_t>(cos_cache->Shape()[1]);
+  const auto half_rotary_embedding_dim = onnxruntime::narrow<uint32_t>(cos_cache->Shape()[1]);
   const auto head_size = rotary_embedding_dim_ == 0 ? half_rotary_embedding_dim * 2 : hidden_size / num_heads_;
 
   // Rotary embeddings will be calculated in a pair-wise fashion. In accordance, use the shape
@@ -85,11 +85,11 @@ Status RotaryEmbedding::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const {
   std::vector<uint32_t> global_dims(rank);
   std::vector<uint32_t> global_strides(rank);
   for (size_t j = 0; j < rank; ++j) {
-    global_dims[j] = gsl::narrow<uint32_t>(global_shape[j]);
-    global_strides[j] = gsl::narrow<uint32_t>(global_shape.SizeFromDimension(j + 1));
+    global_dims[j] = onnxruntime::narrow<uint32_t>(global_shape[j]);
+    global_strides[j] = onnxruntime::narrow<uint32_t>(global_shape.SizeFromDimension(j + 1));
   }
 
-  const auto output_size = gsl::narrow<uint32_t>(global_shape.Size());
+  const auto output_size = onnxruntime::narrow<uint32_t>(global_shape.Size());
 
   RotaryEmbeddingProgram program{interleaved_};
   const auto input_output_strides = input_shape.NumDimensions() == 3
diff --git a/onnxruntime/contrib_ops/webgpu/bert/skip_layer_norm.cc b/onnxruntime/contrib_ops/webgpu/bert/skip_layer_norm.cc
index a1840257d734f..d5d4632c01e2a 100644
--- a/onnxruntime/contrib_ops/webgpu/bert/skip_layer_norm.cc
+++ b/onnxruntime/contrib_ops/webgpu/bert/skip_layer_norm.cc
@@ -122,7 +122,7 @@ Status SkipLayerNorm<simplified>::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const {
   }
 
   const bool is_fp16 = x->GetElementType() == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
-  const uint32_t hidden_size = gsl::narrow<uint32_t>(x_shape[x_shape.NumDimensions() - 1]);
+  const uint32_t hidden_size = onnxruntime::narrow<uint32_t>(x_shape[x_shape.NumDimensions() - 1]);
   const int components = GetMaxComponents(hidden_size);
   const bool has_input_skip_bias_sum = input_skip_bias_sum != nullptr;
 
@@ -133,7 +133,7 @@ Status SkipLayerNorm<simplified>::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const {
       .AddInputs({{skip, ProgramTensorMetadataDependency::Type, components}})
       .AddInputs({{gamma, ProgramTensorMetadataDependency::Type, components}})
       .AddOutputs({{output, ProgramTensorMetadataDependency::None, components}})
-      .SetDispatchGroupSize(gsl::narrow<uint32_t>(ceil(1.0 * data_size / hidden_size)))
+      .SetDispatchGroupSize(onnxruntime::narrow<uint32_t>(ceil(1.0 * data_size / hidden_size)))
       .AddUniformVariables({
           {static_cast<uint32_t>(components)},
       })
diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc
index 6720a6072f7bb..05cbfb1f99c48 100644
--- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc
+++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc
@@ -277,9 +277,9 @@ Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor* scales,
   Tensor a_quant = context.CreateGPUTensor(DataTypeImpl::GetType<uint32_t>(), a_quant_shape);
   TensorShapeVector a_scales_dims({1, 1, M, K / kBlockSizeA});
   Tensor a_scale = context.CreateGPUTensor(a->DataType(), a_scales_dims);
-  quantize_program.AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow<int>(kVec4Components)}})
-      .AddOutputs({{&a_quant, ProgramTensorMetadataDependency::Rank, a_quant.Shape(), gsl::narrow<int>(1)},
-                   {&a_scale, ProgramTensorMetadataDependency::Rank, a_scale.Shape(), gsl::narrow<int>(1)}})
+  quantize_program.AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, static_cast<int>(kVec4Components)}})
+      .AddOutputs({{&a_quant, ProgramTensorMetadataDependency::Rank, a_quant.Shape(), 1},
+                   {&a_scale, ProgramTensorMetadataDependency::Rank, a_scale.Shape(), 1}})
      .AddUniformVariable({static_cast<uint32_t>(M * K / kVec4Components)});
   ORT_RETURN_IF_ERROR(context.RunProgram(quantize_program));
 
@@ -290,16 +290,16 @@ Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor* scales,
   mul_program.SetDispatchGroupSize(
       (M + kTileSize - 1) / kTileSize,
       (N + kTileSize - 1) / kTileSize, 1);
-  mul_program.AddInputs({{&a_quant, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow<int>(kVec4Components)},
-                         {&a_scale, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow<int>(1)},
-                         {b, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow<int>(kVec2Components * kU32Components)},
-                         {scales, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow<int>(1)}})
+  mul_program.AddInputs({{&a_quant, ProgramTensorMetadataDependency::TypeAndRank, static_cast<int>(kVec4Components)},
+                         {&a_scale, ProgramTensorMetadataDependency::TypeAndRank, 1},
+                         {b, ProgramTensorMetadataDependency::TypeAndRank, static_cast<int>(kVec2Components * kU32Components)},
+                         {scales, ProgramTensorMetadataDependency::TypeAndRank, 1}})
       .AddUniformVariables({{static_cast<uint32_t>(M)},
                             {static_cast<uint32_t>(N)},
                             {static_cast<uint32_t>(K)},
                             {static_cast<uint32_t>(K / 8)},
                             {static_cast<uint32_t>(K / 16)}})
-      .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, gsl::narrow<int>(kVec4Components)})
+      .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, static_cast<int>(kVec4Components)})
       .CacheHint("Block" + std::to_string(block_size));
   return context.RunProgram(mul_program);
 }
diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc
index e10a7f551eec9..cce10a59fbd4b 100644
--- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc
+++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc
@@ -372,7 +372,7 @@ Status MatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const {
     }
   } else {
     const std::string quantized_data_type = QuantizedDataType(a.NumComponents());
-    const int output_element_number = y.NumComponents() * gsl::narrow<int>(output_number_);
+    const int output_element_number = y.NumComponents() * onnxruntime::narrow<int>(output_number_);
     const uint32_t shared_memory_size = output_number_ * WORKGROUP_SIZE;
     std::string offset = "workgroup_idx * " + std::to_string(output_number_);
 
@@ -548,16 +548,16 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const {
   TensorShape b_shape({N_, K_});
   ORT_RETURN_IF_ERROR(helper.Compute(a->Shape(), b_shape, false, true));
   auto* y = context.Output(0, helper.OutputShape());
-  const uint32_t data_size = gsl::narrow<uint32_t>(y->Shape().Size());
+  const uint32_t data_size = onnxruntime::narrow<uint32_t>(y->Shape().Size());
   if (data_size == 0) {
     return Status::OK();
   }
 
-  const uint32_t batch_count = gsl::narrow<uint32_t>(helper.OutputOffsets().size());
-  const uint32_t M = gsl::narrow<uint32_t>(helper.M());
-  const uint32_t N = gsl::narrow<uint32_t>(helper.N());
-  const uint32_t K = gsl::narrow<uint32_t>(helper.K());
-  const uint32_t block_size = gsl::narrow<uint32_t>(block_size_);
+  const uint32_t batch_count = onnxruntime::narrow<uint32_t>(helper.OutputOffsets().size());
+  const uint32_t M = onnxruntime::narrow<uint32_t>(helper.M());
+  const uint32_t N = onnxruntime::narrow<uint32_t>(helper.N());
+  const uint32_t K = onnxruntime::narrow<uint32_t>(helper.K());
+  const uint32_t block_size = onnxruntime::narrow<uint32_t>(block_size_);
   constexpr uint32_t nbits = 4;
 
   const uint32_t n_blocks_per_col = (K + block_size - 1) / block_size;
@@ -584,7 +584,7 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const {
   const uint32_t tile_m = M > kMinMForTileOptimization ? 4 : 1;
   const bool has_subgroup = context.Device().HasFeature(wgpu::FeatureName::Subgroups);
   const bool use_subgroup = has_subgroup && context.AdapterInfo().vendor == std::string_view{"intel"} && components_a == 4 && block_size == 32;
-  MatMulNBitsProgram program{output_number, block_size, tile_m, gsl::narrow<int>(components_b), has_zero_points, use_subgroup};
+  MatMulNBitsProgram program{output_number, block_size, tile_m, static_cast<int>(components_b), has_zero_points, use_subgroup};
   if (M > kMinMForTileOptimization && block_size == 32) {
     components = 1;
     constexpr uint32_t workgroup_size = 64;
@@ -614,10 +614,10 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const {
   TensorShape reshaped_y_shape{batch_count, M, N / components};
   program
-      .AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, reshaped_a_shape, gsl::narrow<int>(components_a)},
-                  {b, ProgramTensorMetadataDependency::TypeAndRank, reshaped_b_shape, gsl::narrow<int>(components_b * 4 /** b will be accessed as uint32 which includes 4 uint8. So here we need to multiply by 4.*/)},
+      .AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, reshaped_a_shape, static_cast<int>(components_a)},
+                  {b, ProgramTensorMetadataDependency::TypeAndRank, reshaped_b_shape, static_cast<int>(components_b * 4 /** b will be accessed as uint32 which includes 4 uint8. So here we need to multiply by 4.*/)},
                   {scales, ProgramTensorMetadataDependency::None}})
-      .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, gsl::narrow<int>(components)})
+      .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, static_cast<int>(components)})
       .AddUniformVariable({block_size});
 
   if (has_zero_points) {
     program.AddInput({zero_points, ProgramTensorMetadataDependency::None, {(zero_points->Shape().Size() + 3) / 4}, 4});
diff --git a/onnxruntime/contrib_ops/webgpu/quantization/subgroup_matrix_matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/subgroup_matrix_matmul_nbits.cc
index 2944a4d61b8ef..cb024d2a758a9 100644
--- a/onnxruntime/contrib_ops/webgpu/quantization/subgroup_matrix_matmul_nbits.cc
+++ b/onnxruntime/contrib_ops/webgpu/quantization/subgroup_matrix_matmul_nbits.cc
@@ -185,13 +185,13 @@ Status ApplySubgroupMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor* scales,
   mul_program.SetDispatchGroupSize(
       (N + kTileSizeB - 1) / kTileSizeB,
       (M + kTileSizeA - 1) / kTileSizeA,
       1);
-  mul_program.AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow<int>(1)},
-                         {b, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow<int>(kU32Components)},
-                         {scales, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow<int>(1)}})
+  mul_program.AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, 1},
+                         {b, ProgramTensorMetadataDependency::TypeAndRank, static_cast<int>(kU32Components)},
+                         {scales, ProgramTensorMetadataDependency::TypeAndRank, 1}})
       .AddUniformVariables({{static_cast<uint32_t>(M)},
                             {static_cast<uint32_t>(N)},
                             {static_cast<uint32_t>(K)}})
-      .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, y_shape, gsl::narrow<int>(1)});
+      .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, y_shape, 1});
   return context.RunProgram(mul_program);
 }
diff --git a/onnxruntime/core/providers/webgpu/generator/range.cc b/onnxruntime/core/providers/webgpu/generator/range.cc
index a0b65f08a5b4e..99c5a1c1b5566 100644
--- a/onnxruntime/core/providers/webgpu/generator/range.cc
+++ b/onnxruntime/core/providers/webgpu/generator/range.cc
@@ -23,7 +23,7 @@ Status Range::ComputeInternal(ComputeContext& context) const {
     return Status::OK();
   }
 
-  uint32_t output_size = gsl::narrow<uint32_t>(n);
+  uint32_t output_size = onnxruntime::narrow<uint32_t>(n);
   RangeProgram program{};
 #if defined(__GNUC__)
 #pragma GCC diagnostic push
diff --git a/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc b/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc
index 75866513e2c7d..8a22e45f17047 100644
--- a/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc
+++ b/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc
@@ -141,7 +141,7 @@ Status BinaryElementwise::ComputeInternal(ComputeContext& context) const {
     }
   }
 
-  uint32_t vec_size = gsl::narrow<uint32_t>((size + 3) / 4);
+  uint32_t vec_size = onnxruntime::narrow<uint32_t>((size + 3) / 4);
   BinaryElementwiseProgram program{kernel_name_,
                                    expression_,
                                    is_broadcast,
diff --git a/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc b/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc
index eaaad206ebaf5..189d7baafce6a 100644
--- a/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc
+++ b/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc
@@ -27,7 +27,7 @@ Status UnaryElementwise::ComputeInternal(ComputeContext& context) const {
   if (size == 0) {
     return Status::OK();
   }
-  uint32_t vec_size = gsl::narrow<uint32_t>((size + 3) / 4);
+  uint32_t vec_size = onnxruntime::narrow<uint32_t>((size + 3) / 4);
   UnaryElementwiseProgram program{kernel_name_, expression_, additional_impl_, additional_usage_};
   program
       .AddInputs({{input_tensor, ProgramTensorMetadataDependency::Type, {vec_size}, 4}})
diff --git a/onnxruntime/core/providers/webgpu/nn/layer_norm.cc b/onnxruntime/core/providers/webgpu/nn/layer_norm.cc
index 64172021e82f1..28ad686909a47 100644
--- a/onnxruntime/core/providers/webgpu/nn/layer_norm.cc
+++ b/onnxruntime/core/providers/webgpu/nn/layer_norm.cc
@@ -23,7 +23,7 @@ static size_t NormalizeAxis(int64_t axis, size_t tensor_rank) {
   if (axis < -rank || axis >= rank) {
     ORT_THROW("invalid axis: ", axis);
   }
-  return gsl::narrow<size_t>(axis < 0 ? axis + rank : axis);
+  return onnxruntime::narrow<size_t>(axis < 0 ? axis + rank : axis);
 }
 
 static std::string SumVector(std::string x, int components) {
@@ -92,10 +92,10 @@ Status LayerNorm<simplified>::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const {
   const bool is_fp16 = x->GetElementType() == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
 
   const size_t axis = NormalizeAxis(axis_, x_shape.NumDimensions());
-  const uint32_t norm_count = gsl::narrow<uint32_t>(x_shape.SizeToDimension(axis));
+  const uint32_t norm_count = onnxruntime::narrow<uint32_t>(x_shape.SizeToDimension(axis));
   const int64_t norm_size = x_shape.SizeFromDimension(axis);
   const int components = GetMaxComponents(norm_size);
-  const uint32_t norm_size_vectorized = gsl::narrow<uint32_t>((norm_size + components - 1) / components);
+  const uint32_t norm_size_vectorized = onnxruntime::narrow<uint32_t>((norm_size + components - 1) / components);
 
   const auto scale_size = scale->Shape().Size();
   const auto bias_size = (bias) ? bias->Shape().Size() : 0;
diff --git a/onnxruntime/core/providers/webgpu/program_manager.cc b/onnxruntime/core/providers/webgpu/program_manager.cc
index 1fdd312d4f0d8..7a4a873a1adf3 100644
--- a/onnxruntime/core/providers/webgpu/program_manager.cc
+++ b/onnxruntime/core/providers/webgpu/program_manager.cc
@@ -24,14 +24,14 @@ Status ProgramManager::NormalizeDispatchGroupSize(uint32_t& x, uint32_t& y, uint32_t& z) const {
   auto limit_per_dimension = limits_.maxComputeWorkgroupsPerDimension;
   if (x > limit_per_dimension || y > limit_per_dimension || z > limit_per_dimension) {
-    auto size = static_cast<double>(x) * static_cast<double>(y) * static_cast<double>(z);
-    uint32_t dispatch_avg = gsl::narrow<uint32_t>(std::ceil(std::sqrt(size)));
+    double size = static_cast<double>(x) * static_cast<double>(y) * static_cast<double>(z);
+    double dispatch_avg = std::ceil(std::sqrt(size));
     if (dispatch_avg > limit_per_dimension) {
-      dispatch_avg = gsl::narrow<uint32_t>(std::ceil(std::cbrt(size)));
+      dispatch_avg = std::ceil(std::cbrt(size));
       ORT_RETURN_IF(dispatch_avg > limit_per_dimension,
                     "The dispatch group size exceeds WebGPU maximum.");
-      x = y = z = dispatch_avg;
+      x = y = z = static_cast<uint32_t>(dispatch_avg);
     } else {
-      x = y = dispatch_avg;
+      x = y = static_cast<uint32_t>(dispatch_avg);
       z = 1;
     }
   }
diff --git a/onnxruntime/core/providers/webgpu/shader_variable.cc b/onnxruntime/core/providers/webgpu/shader_variable.cc
index 5e5920f582251..f8e1e0b3b8d2b 100644
--- a/onnxruntime/core/providers/webgpu/shader_variable.cc
+++ b/onnxruntime/core/providers/webgpu/shader_variable.cc
@@ -91,7 +91,7 @@ ShaderIndicesHelper::ShaderIndicesHelper(std::string_view name, ProgramVariableDataType type, ShaderUsage usage, const TensorShape& dims)
     : name_(name),
       type_(type),
       num_components_{NumberOfComponents(type)},
-      rank_{gsl::narrow<int>(dims.NumDimensions())},
+      rank_{static_cast<int>(dims.NumDimensions())},
       dims_{dims},
       usage_(usage),
       indices_type_{GetIndicesType(rank_)},
diff --git a/onnxruntime/core/providers/webgpu/tensor/cast.cc b/onnxruntime/core/providers/webgpu/tensor/cast.cc
index 8b5bede34e6d0..7f92ea4ed3776 100644
--- a/onnxruntime/core/providers/webgpu/tensor/cast.cc
+++ b/onnxruntime/core/providers/webgpu/tensor/cast.cc
@@ -69,7 +69,7 @@ Status Cast::ComputeInternal(ComputeContext& context) const {
   if (size == 0) {
     return Status::OK();
   }
-  uint32_t vec_size = gsl::narrow<uint32_t>((size + 3) / 4);
+  uint32_t vec_size = onnxruntime::narrow<uint32_t>((size + 3) / 4);
 
   CastProgram program{to_};
   program
diff --git a/onnxruntime/core/providers/webgpu/tensor/cast.h b/onnxruntime/core/providers/webgpu/tensor/cast.h
index ef5c4d5d0dabe..925cd200f0aba 100644
--- a/onnxruntime/core/providers/webgpu/tensor/cast.h
+++ b/onnxruntime/core/providers/webgpu/tensor/cast.h
@@ -26,7 +26,7 @@ class Cast final : public WebGpuKernel {
     int64_t to;
     Status status = info.GetAttr("to", &to);
     ORT_ENFORCE(status.IsOK(), "Attribute to is not set.");
-    to_ = gsl::narrow<int32_t>(to);
+    to_ = onnxruntime::narrow<int32_t>(to);
 
     // ignore attribute 'saturate' as float8 is not supported in WebGPU
   }
diff --git a/onnxruntime/core/providers/webgpu/tensor/concat.cc b/onnxruntime/core/providers/webgpu/tensor/concat.cc
index 5ed8099fde05e..5cfd6c78f8929 100644
--- a/onnxruntime/core/providers/webgpu/tensor/concat.cc
+++ b/onnxruntime/core/providers/webgpu/tensor/concat.cc
@@ -104,7 +104,7 @@ Status Concat::ComputeInternal(ComputeContext& context) const {
     return Status::OK();
   }
 
-  uint32_t output_size = gsl::narrow_cast<uint32_t>(prepare.output_tensor->Shape().Size());
+  uint32_t output_size = onnxruntime::narrow<uint32_t>(prepare.output_tensor->Shape().Size());
 
   size_t axis = static_cast<size_t>(prepare.axis);
   ConcatProgram program{axis};
diff --git a/onnxruntime/core/providers/webgpu/tensor/expand.cc b/onnxruntime/core/providers/webgpu/tensor/expand.cc
index 809616660aa9e..9bdebe2c1e0d3 100644
--- a/onnxruntime/core/providers/webgpu/tensor/expand.cc
+++ b/onnxruntime/core/providers/webgpu/tensor/expand.cc
@@ -42,7 +42,7 @@ Status Expand::ComputeInternal(ComputeContext& context) const {
                                                        : 1;
   const int components_o = output_shape.IsScalar() ? 1 : output_shape[output_shape.NumDimensions() - 1] % 4 == 0 ? 4
                                                                                                                  : 1;
-  uint32_t data_size = gsl::narrow<uint32_t>(output_shape.Size() / components_o);
+  uint32_t data_size = onnxruntime::narrow<uint32_t>(output_shape.Size() / components_o);
 
   ExpandProgram program{};
   program
diff --git a/onnxruntime/core/providers/webgpu/tensor/gather.cc b/onnxruntime/core/providers/webgpu/tensor/gather.cc
index 9f6e5f2420d86..39d07991f3c5a 100644
--- a/onnxruntime/core/providers/webgpu/tensor/gather.cc
+++ b/onnxruntime/core/providers/webgpu/tensor/gather.cc
@@ -42,7 +42,7 @@ Status GatherProgram::GenerateShaderCode(ShaderHelper& shader) const {
 Status Gather::ComputeInternal(ComputeContext& context) const {
   Prepare p;
   ORT_RETURN_IF_ERROR(PrepareForCompute(&context.KernelContext(), p));
-  uint32_t data_size = gsl::narrow<uint32_t>(p.output_tensor->Shape().Size());
+  uint32_t data_size = onnxruntime::narrow<uint32_t>(p.output_tensor->Shape().Size());
   if (data_size == 0) {
     return Status::OK();
   }
diff --git a/onnxruntime/core/providers/webgpu/tensor/pad.cc b/onnxruntime/core/providers/webgpu/tensor/pad.cc
index 9ee13aada67fe..6a8bc6554b772 100644
--- a/onnxruntime/core/providers/webgpu/tensor/pad.cc
+++ b/onnxruntime/core/providers/webgpu/tensor/pad.cc
@@ -130,7 +130,7 @@ Status Pad::ComputeInternal(ComputeContext& context) const {
   }
 
   auto* output_tensor = context.Output(0, output_shape);
-  uint32_t output_size = gsl::narrow<uint32_t>(output_shape.Size());
+  uint32_t output_size = onnxruntime::narrow<uint32_t>(output_shape.Size());
   if (output_size == 0) {
     // Do not need to fill output, return
     return Status::OK();
diff --git a/onnxruntime/core/providers/webgpu/tensor/resize_impl.cc b/onnxruntime/core/providers/webgpu/tensor/resize_impl.cc
index 455e7dc54bf1d..f68ace3c1d8a1 100644
--- a/onnxruntime/core/providers/webgpu/tensor/resize_impl.cc
+++ b/onnxruntime/core/providers/webgpu/tensor/resize_impl.cc
@@ -211,7 +211,7 @@ Status ResizeNearestImpl(ComputeContext& context,
                          onnxruntime::ResizeNearestMode nearest_mode) {
   TensorShape output_shape(output_dims);
   auto* output_tensor = context.Output(0, output_shape);
-  uint32_t output_size = gsl::narrow<uint32_t>(output_shape.Size());
+  uint32_t output_size = onnxruntime::narrow<uint32_t>(output_shape.Size());
 
   ResizeNearestProgram program{coordinate_transform_mode, nearest_mode, extrapolation_enabled, rank};
   program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank})
@@ -299,7 +299,7 @@ Status ResizeBilinearImpl(ComputeContext& context,
                          onnxruntime::ResizeCoordinateTransformationMode coordinate_transform_mode) {
   TensorShape output_shape(output_dims);
   auto* output_tensor = context.Output(0, output_shape);
-  uint32_t output_size = gsl::narrow<uint32_t>(output_shape.Size());
+  uint32_t output_size = onnxruntime::narrow<uint32_t>(output_shape.Size());
 
   ResizeBilinearProgram program{coordinate_transform_mode, extrapolation_enabled, rank};
   program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank})
@@ -413,7 +413,7 @@ Status ResizeTrilinearImpl(ComputeContext& context,
                          onnxruntime::ResizeCoordinateTransformationMode coordinate_transform_mode) {
   TensorShape output_shape(output_dims);
   auto* output_tensor = context.Output(0, output_shape);
-  uint32_t output_size = gsl::narrow<uint32_t>(output_shape.Size());
+  uint32_t output_size = onnxruntime::narrow<uint32_t>(output_shape.Size());
 
   ResizeTrilinearProgram program{coordinate_transform_mode, extrapolation_enabled, rank};
   program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank})
@@ -534,7 +534,7 @@ Status ResizeBiCubicImpl(ComputeContext& context,
                         onnxruntime::ResizeCoordinateTransformationMode coordinate_transform_mode) {
   TensorShape output_shape(output_dims);
   auto* output_tensor = context.Output(0, output_shape);
-  uint32_t output_size = gsl::narrow<uint32_t>(output_shape.Size());
+  uint32_t output_size = onnxruntime::narrow<uint32_t>(output_shape.Size());
 
   ResizeBiCubicProgram program{coordinate_transform_mode, extrapolation_enabled, exclude_outside, rank};
   program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank})
diff --git a/onnxruntime/core/providers/webgpu/tensor/split.cc b/onnxruntime/core/providers/webgpu/tensor/split.cc
index 83bf832cc5b11..d93b75fa21c16 100644
--- a/onnxruntime/core/providers/webgpu/tensor/split.cc
+++ b/onnxruntime/core/providers/webgpu/tensor/split.cc
@@ -107,7 +107,7 @@ Status Split::ComputeInternal(ComputeContext& context) const {
   ORT_RETURN_IF_ERROR(PrepareForCompute(input_shape, num_outputs, axis, before_dims, after_dims_including_split_axis,
                                         after_dims_excluding_split, split_sizes));
 
-  SplitProgram program{gsl::narrow_cast<uint32_t>(axis)};
+  SplitProgram program{static_cast<uint32_t>(axis)};
   program.AddInput({input, ProgramTensorMetadataDependency::TypeAndRank});
 
   auto output_dimensions = input_shape.AsShapeVector();
@@ -120,7 +120,7 @@ Status Split::ComputeInternal(ComputeContext& context) const {
     program.AddOutput({output, ProgramTensorMetadataDependency::Rank});
   }
 
-  uint32_t input_size = gsl::narrow<uint32_t>(input_shape.Size());
+  uint32_t input_size = onnxruntime::narrow<uint32_t>(input_shape.Size());
   // Early return if the input tensor is empty.
   if (input_size == 0) {
     return Status::OK();
@@ -130,7 +130,7 @@ Status Split::ComputeInternal(ComputeContext& context) const {
   std::vector<uint32_t> sizes_in_split_axis;
   // sizes_in_split_axis are the cumulative sizes of the splits in the split axis.
   for (auto split_size : split_sizes) {
-    previous_sum += gsl::narrow<uint32_t>(split_size);
+    previous_sum += onnxruntime::narrow<uint32_t>(split_size);
     sizes_in_split_axis.push_back(previous_sum);
   }
 
diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.cc b/onnxruntime/core/providers/webgpu/tensor/transpose.cc
index 24b98e9533d17..0df7d1ae9fa2f 100644
--- a/onnxruntime/core/providers/webgpu/tensor/transpose.cc
+++ b/onnxruntime/core/providers/webgpu/tensor/transpose.cc
@@ -105,7 +105,7 @@ Status Transpose::DoTranspose(onnxruntime::webgpu::ComputeContext& context,
                               const Tensor& input, Tensor& output) {
   const auto& input_shape = input.Shape();
   const auto& input_dims = input_shape.GetDims();
-  int32_t rank = gsl::narrow_cast<int32_t>(input_shape.NumDimensions());
+  int32_t rank = static_cast<int32_t>(input_shape.NumDimensions());
 
   TensorShapeVector output_dims(rank);
 
@@ -131,7 +131,7 @@ Status Transpose::DoTranspose(onnxruntime::webgpu::ComputeContext& context,
     new_output_shape = TensorShape({new_input_shape[1], new_input_shape[0]});
   }
 
-  uint32_t output_size = gsl::narrow_cast<uint32_t>(input_shape.Size());
+  uint32_t output_size = onnxruntime::narrow<uint32_t>(input_shape.Size());
   TransposeProgram program{permutations, use_shared};
 
   if (use_shared) {
@@ -156,7 +156,7 @@ Status Transpose::DoTranspose(onnxruntime::webgpu::ComputeContext& context,
 Status Transpose::ComputeInternal(ComputeContext& context) const {
   const auto* input_tensor = context.Input(0);
   const TensorShape& input_shape = input_tensor->Shape();
-  int32_t rank = gsl::narrow_cast<int32_t>(input_shape.NumDimensions());
+  int32_t rank = static_cast<int32_t>(input_shape.NumDimensions());
 
   TensorShapeVector output_dims(rank);
   InlinedVector<size_t> default_perm(rank);
diff --git a/onnxruntime/core/providers/webgpu/tensor/where.cc b/onnxruntime/core/providers/webgpu/tensor/where.cc
index e8cdabb9dbe40..d7272ec525296 100644
--- a/onnxruntime/core/providers/webgpu/tensor/where.cc
+++ b/onnxruntime/core/providers/webgpu/tensor/where.cc
@@ -127,7 +127,7 @@ Status Where::ComputeInternal(ComputeContext& context) const {
   ORT_RETURN_IF_ERROR(ComputeOutputShape(cond_shape, x_shape, y_shape, output_shape));
   auto* output_tensor = context.Output(0, output_shape);
   constexpr int component = 4;
-  uint32_t vec_size = gsl::narrow_cast<uint32_t>((output_shape.Size() + 3) / component);
+  uint32_t vec_size = onnxruntime::narrow<uint32_t>((output_shape.Size() + 3) / component);
   const auto is_broadcast = !(x_shape == y_shape && y_shape == cond_shape);
 
   WhereProgram program{is_broadcast};
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc
index 14c12ac247080..97144573dde2d 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_context.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc
@@ -322,9 +322,9 @@ Status WebGpuContext::Run(ComputeContext& context, const ProgramBase& program) {
       std::vector<uint32_t> dims(expected_rank);
       std::vector<uint32_t> stride(expected_rank - 1);
       for (size_t j = 0; j < expected_rank; ++j) {
-        dims[j] = gsl::narrow<uint32_t>(shape[j]);
+        dims[j] = onnxruntime::narrow<uint32_t>(shape[j]);
         if (j < expected_rank - 1) {
-          stride[j] = gsl::narrow<uint32_t>(shape.SizeFromDimension(j + 1));
+          stride[j] = onnxruntime::narrow<uint32_t>(shape.SizeFromDimension(j + 1));
         }
       }
 
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml
index bb6c210161952..a0f22fcfce14e 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml
@@ -105,3 +105,31 @@ stages:
         onnxruntime_webgpu_external_dawn_test.exe --no_proc_table
       displayName: Run tests (onnxruntime_webgpu_external_dawn_test)
       workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo'
+
+- stage: webgpu_minimal_build_edge
+  dependsOn: []
+  jobs:
+    - template: templates/jobs/win-ci-vs-2022-job.yml
+      parameters:
+        BuildConfig: 'RelWithDebInfo'
+        EnvSetupScript: setup_env.bat
+        buildArch: x64
+        additionalBuildFlags: >-
+          --build_shared_lib
+          --disable_exceptions
+          --disable_rtti
+          --enable_msvc_static_runtime
+          --enable_reduced_operator_type_support
+          --skip_tests
+          --use_binskim_compliant_compile_flags
+          --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF onnxruntime_DISABLE_SPARSE_TENSORS=ON onnxruntime_DISABLE_OPTIONAL_TYPE=ON
+          --minimal_build extended
+          --use_webgpu
+        msbuildPlatform: x64
+        isX86: false
+        job_name_suffix: x64_RelWithDebInfo
+        RunOnnxRuntimeTests: false
+        ORT_EP_NAME: WebGPU
+        EnablePython: false
+        WITH_CACHE: true
+        MachinePool: onnxruntime-Win2022-VS2022-webgpu-A10
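
Note for reviewers: the reason for the `gsl::narrow` to `onnxruntime::narrow` swap is the new `webgpu_minimal_build_edge` CI stage above, which builds with `--disable_exceptions`. `gsl::narrow` reports a lossy conversion by throwing `gsl::narrowing_error`, which cannot be used in a no-exception build; `onnxruntime::narrow` keeps the same runtime check but terminates instead of throwing when exceptions are off. The sketch below illustrates the pattern with a hypothetical `narrow_sketch` helper; it is not the actual ONNX Runtime implementation (see core/common/narrow.h for that), just a minimal standalone model of the behavior.

```cpp
// Minimal, self-contained sketch of a narrowing check that still works when
// exceptions are disabled. `narrow_sketch` is a hypothetical stand-in for
// onnxruntime::narrow, not the real implementation.
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <stdexcept>
#include <type_traits>

#if defined(ORT_NO_EXCEPTIONS)  // set for --disable_exceptions builds
[[noreturn]] inline void on_narrowing_error() noexcept {
  std::fputs("narrowing error\n", stderr);
  std::abort();  // no throw available: report and fail fast
}
#else
[[noreturn]] inline void on_narrowing_error() {
  throw std::runtime_error("narrowing error");
}
#endif

// Like gsl::narrow: a static_cast that verifies the value survives the round
// trip, with an extra sign check for mixed-signedness pairs where the round
// trip alone can be fooled (e.g. int -1 <-> unsigned 0xFFFFFFFF).
template <typename T, typename U>
T narrow_sketch(U u) {
  const T t = static_cast<T>(u);
  if (static_cast<U>(t) != u ||
      (std::is_signed_v<T> != std::is_signed_v<U> && (t < T{}) != (u < U{}))) {
    on_narrowing_error();
  }
  return t;
}

int main() {
  // OK: 1024 fits into uint32_t.
  std::printf("%u\n", narrow_sketch<uint32_t>(int64_t{1024}));
  // Would trip the check: -1 does not round-trip through uint32_t.
  // narrow_sketch<uint32_t>(int64_t{-1});
  return 0;
}
```

Plain `static_cast` remains the right tool in the handful of call sites above where the value is a small compile-time constant (`kVec4Components`, component counts and the like), since no runtime check is needed there.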