4 changes: 2 additions & 2 deletions onnxruntime/contrib_ops/webgpu/bert/fast_gelu.cc
@@ -50,7 +50,7 @@ Status FastGelu::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) c
   const auto* bias = context.Input(1);
   auto* output = context.Output(0, input->Shape());
 
-  uint32_t data_size = gsl::narrow<uint32_t>(output->Shape().Size());
+  uint32_t data_size = onnxruntime::narrow<uint32_t>(output->Shape().Size());
   if (data_size == 0) {
    return Status::OK();
   }
@@ -60,7 +60,7 @@ Status FastGelu::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) c
   int bias_components = 1;
 
   if (bias != nullptr) {
-    bias_size = gsl::narrow<uint32_t>(bias->Shape().Size());
+    bias_size = onnxruntime::narrow<uint32_t>(bias->Shape().Size());
     if (bias_size % 4 == 0) {
       bias_components = 4;
       bias_size = bias_size / 4;
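The recurring change in this PR swaps gsl::narrow for onnxruntime::narrow (from core/common/narrow.h) wherever a runtime range check is still wanted. Both reject lossy conversions; the practical difference, as I understand it, is that gsl::narrow throws gsl::narrowing_error while onnxruntime::narrow is noexcept and terminates on failure, which keeps the check usable in builds compiled without exception support. A minimal sketch of such a checked cast, under those assumptions (checked_narrow is a hypothetical stand-in, not the actual ORT implementation):

#include <cstdlib>
#include <type_traits>

// Checked narrowing conversion: like static_cast, but fails loudly when the
// value does not survive the round trip. Being noexcept, it also works in
// builds compiled with exceptions disabled.
template <class T, class U>
constexpr T checked_narrow(U u) noexcept {
  const T t = static_cast<T>(u);
  if (static_cast<U>(t) != u) {
    std::abort();  // gsl::narrow would throw gsl::narrowing_error instead
  }
  if constexpr (std::is_signed_v<T> != std::is_signed_v<U>) {
    // Catch sign flips that survive the round trip, e.g. int{-1} -> unsigned.
    if ((t < T{}) != (u < U{})) {
      std::abort();
    }
  }
  return t;
}

// Usage mirroring the call sites above:
//   uint32_t data_size = checked_narrow<uint32_t>(output->Shape().Size());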
2 changes: 1 addition & 1 deletion onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
@@ -98,7 +98,7 @@ Status CopyKVCache(onnxruntime::webgpu::ComputeContext& context, const WebgpuAtt
   program.AddOutputs({{present_key, ProgramTensorMetadataDependency::Rank, components},
                       {present_value, ProgramTensorMetadataDependency::Rank, components}})
       .AddIndices(valid_present_shape);
-  program.SetDispatchGroupSize(gsl::narrow<uint32_t>(valid_kv_size + 63 / 64))
+  program.SetDispatchGroupSize(onnxruntime::narrow<uint32_t>(valid_kv_size + 63 / 64))
       .SetWorkgroupSize(64)
       .CacheHint(has_past, parameters.qkv_format_, parameters.past_present_share_buffer_)
       .AddUniformVariables({{static_cast<uint32_t>(valid_kv_size)},
14 changes: 7 additions & 7 deletions onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc
@@ -66,11 +66,11 @@ Status RotaryEmbedding::ComputeInternal(onnxruntime::webgpu::ComputeContext& con
   const auto* sin_cache = context.Input<Tensor>(3);
   auto* output = context.Output(0, input_shape);
 
-  const auto batch_size = gsl::narrow<uint32_t>(input->Shape()[0]);
-  const auto batch_stride = gsl::narrow<uint32_t>(input_shape.SizeFromDimension(1));
-  const auto sequence_length = gsl::narrow<uint32_t>(input_shape[input_shape.NumDimensions() - 2]);
+  const auto batch_size = onnxruntime::narrow<uint32_t>(input->Shape()[0]);
+  const auto batch_stride = onnxruntime::narrow<uint32_t>(input_shape.SizeFromDimension(1));
+  const auto sequence_length = onnxruntime::narrow<uint32_t>(input_shape[input_shape.NumDimensions() - 2]);
   const auto hidden_size = batch_stride / sequence_length;
-  const auto half_rotary_embedding_dim = gsl::narrow<uint32_t>(cos_cache->Shape()[1]);
+  const auto half_rotary_embedding_dim = onnxruntime::narrow<uint32_t>(cos_cache->Shape()[1]);
   const auto head_size = rotary_embedding_dim_ == 0 ? half_rotary_embedding_dim * 2 : hidden_size / num_heads_;
 
   // Rotary embeddings will be calculated in a pair-wise fashion. In accordance, use the shape
@@ -85,11 +85,11 @@ Status RotaryEmbedding::ComputeInternal(onnxruntime::webgpu::ComputeContext& con
   std::vector<uint32_t> global_dims(rank);
   std::vector<uint32_t> global_strides(rank);
   for (size_t j = 0; j < rank; ++j) {
-    global_dims[j] = gsl::narrow<uint32_t>(global_shape[j]);
-    global_strides[j] = gsl::narrow<uint32_t>(global_shape.SizeFromDimension(j + 1));
+    global_dims[j] = onnxruntime::narrow<uint32_t>(global_shape[j]);
+    global_strides[j] = onnxruntime::narrow<uint32_t>(global_shape.SizeFromDimension(j + 1));
   }
 
-  const auto output_size = gsl::narrow<const uint32_t>(global_shape.Size());
+  const auto output_size = onnxruntime::narrow<const uint32_t>(global_shape.Size());
   RotaryEmbeddingProgram program{interleaved_};
   const auto input_output_strides =
       input_shape.NumDimensions() == 3
4 changes: 2 additions & 2 deletions onnxruntime/contrib_ops/webgpu/bert/skip_layer_norm.cc
@@ -122,7 +122,7 @@ Status SkipLayerNorm<simplified>::ComputeInternal(onnxruntime::webgpu::ComputeCo
   }
 
   const bool is_fp16 = x->GetElementType() == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
-  const uint32_t hidden_size = gsl::narrow<uint32_t>(x_shape[x_shape.NumDimensions() - 1]);
+  const uint32_t hidden_size = onnxruntime::narrow<uint32_t>(x_shape[x_shape.NumDimensions() - 1]);
   const int components = GetMaxComponents(hidden_size);
   const bool has_input_skip_bias_sum = input_skip_bias_sum != nullptr;
 
@@ -133,7 +133,7 @@ Status SkipLayerNorm<simplified>::ComputeInternal(onnxruntime::webgpu::ComputeCo
       .AddInputs({{skip, ProgramTensorMetadataDependency::Type, components}})
       .AddInputs({{gamma, ProgramTensorMetadataDependency::Type, components}})
       .AddOutputs({{output, ProgramTensorMetadataDependency::None, components}})
-      .SetDispatchGroupSize(gsl::narrow<uint32_t>(ceil(1.0 * data_size / hidden_size)))
+      .SetDispatchGroupSize(onnxruntime::narrow<uint32_t>(ceil(1.0 * data_size / hidden_size)))
       .AddUniformVariables({
           {static_cast<uint32_t>(components)},
       })
16 changes: 8 additions & 8 deletions onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc
@@ -277,9 +277,9 @@ Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor
   Tensor a_quant = context.CreateGPUTensor(DataTypeImpl::GetType<uint32_t>(), a_quant_shape);
   TensorShapeVector a_scales_dims({1, 1, M, K / kBlockSizeA});
   Tensor a_scale = context.CreateGPUTensor(a->DataType(), a_scales_dims);
-  quantize_program.AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow<int>(kVec4Components)}})
-      .AddOutputs({{&a_quant, ProgramTensorMetadataDependency::Rank, a_quant.Shape(), gsl::narrow<int>(1)},
-                   {&a_scale, ProgramTensorMetadataDependency::Rank, a_scale.Shape(), gsl::narrow<int>(1)}})
+  quantize_program.AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, static_cast<int>(kVec4Components)}})
+      .AddOutputs({{&a_quant, ProgramTensorMetadataDependency::Rank, a_quant.Shape(), 1},
+                   {&a_scale, ProgramTensorMetadataDependency::Rank, a_scale.Shape(), 1}})
       .AddUniformVariable({static_cast<uint32_t>(M * K / kVec4Components)});
   ORT_RETURN_IF_ERROR(context.RunProgram(quantize_program));
 
@@ -290,16 +290,16 @@ Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor
   mul_program.SetDispatchGroupSize(
       (M + kTileSize - 1) / kTileSize,
       (N + kTileSize - 1) / kTileSize, 1);
-  mul_program.AddInputs({{&a_quant, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow<int>(kVec4Components)},
-                         {&a_scale, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow<int>(1)},
-                         {b, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow<int>(kVec2Components * kU32Components)},
-                         {scales, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow<int>(1)}})
+  mul_program.AddInputs({{&a_quant, ProgramTensorMetadataDependency::TypeAndRank, static_cast<int>(kVec4Components)},
+                         {&a_scale, ProgramTensorMetadataDependency::TypeAndRank, 1},
+                         {b, ProgramTensorMetadataDependency::TypeAndRank, static_cast<int>(kVec2Components * kU32Components)},
+                         {scales, ProgramTensorMetadataDependency::TypeAndRank, 1}})
       .AddUniformVariables({{static_cast<uint32_t>(M)},
                             {static_cast<uint32_t>(N)},
                             {static_cast<uint32_t>(K)},
                             {static_cast<uint32_t>(K / 8)},
                             {static_cast<uint32_t>(K / 16)}})
-      .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, gsl::narrow<int>(kVec4Components)})
+      .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, static_cast<int>(kVec4Components)})
       .CacheHint("Block" + std::to_string(block_size));
   return context.RunProgram(mul_program);
 }
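Alongside the gsl::narrow → onnxruntime::narrow swaps, the quantization paths drop checked casts entirely where the operand is a compile-time constant (kVec4Components, kU32Components) or a literal 1: the conversion is provably lossless, so a plain static_cast, or no cast at all, suffices. A small illustration, with hypothetical constant values:

#include <cstddef>

constexpr size_t kVec4Components = 4;  // hypothetical value, for illustration

// No runtime check needed: the value is known at compile time to fit in int.
constexpr int components = static_cast<int>(kVec4Components);
static_assert(components == 4, "lossless by construction");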
22 changes: 11 additions & 11 deletions onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc
@@ -372,7 +372,7 @@ Status MatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const {
     }
   } else {
     const std::string quantized_data_type = QuantizedDataType(a.NumComponents());
-    const int output_element_number = y.NumComponents() * gsl::narrow<int>(output_number_);
+    const int output_element_number = y.NumComponents() * onnxruntime::narrow<int>(output_number_);
 
     const uint32_t shared_memory_size = output_number_ * WORKGROUP_SIZE;
     std::string offset = "workgroup_idx * " + std::to_string(output_number_);
@@ -548,16 +548,16 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context
   TensorShape b_shape({N_, K_});
   ORT_RETURN_IF_ERROR(helper.Compute(a->Shape(), b_shape, false, true));
   auto* y = context.Output(0, helper.OutputShape());
-  const uint32_t data_size = gsl::narrow<uint32_t>(y->Shape().Size());
+  const uint32_t data_size = onnxruntime::narrow<uint32_t>(y->Shape().Size());
   if (data_size == 0) {
     return Status::OK();
   }
 
-  const uint32_t batch_count = gsl::narrow<uint32_t>(helper.OutputOffsets().size());
-  const uint32_t M = gsl::narrow<uint32_t>(helper.M());
-  const uint32_t N = gsl::narrow<uint32_t>(helper.N());
-  const uint32_t K = gsl::narrow<uint32_t>(helper.K());
-  const uint32_t block_size = gsl::narrow<uint32_t>(block_size_);
+  const uint32_t batch_count = onnxruntime::narrow<uint32_t>(helper.OutputOffsets().size());
+  const uint32_t M = onnxruntime::narrow<uint32_t>(helper.M());
+  const uint32_t N = onnxruntime::narrow<uint32_t>(helper.N());
+  const uint32_t K = onnxruntime::narrow<uint32_t>(helper.K());
+  const uint32_t block_size = onnxruntime::narrow<uint32_t>(block_size_);
   constexpr uint32_t nbits = 4;
 
   const uint32_t n_blocks_per_col = (K + block_size - 1) / block_size;
@@ -584,7 +584,7 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context
   const uint32_t tile_m = M > kMinMForTileOptimization ? 4 : 1;
   const bool has_subgroup = context.Device().HasFeature(wgpu::FeatureName::Subgroups);
   const bool use_subgroup = has_subgroup && context.AdapterInfo().vendor == std::string_view{"intel"} && components_a == 4 && block_size == 32;
-  MatMulNBitsProgram program{output_number, block_size, tile_m, gsl::narrow<int>(components_b), has_zero_points, use_subgroup};
+  MatMulNBitsProgram program{output_number, block_size, tile_m, static_cast<int>(components_b), has_zero_points, use_subgroup};
   if (M > kMinMForTileOptimization && block_size == 32) {
     components = 1;
     constexpr uint32_t workgroup_size = 64;
@@ -614,10 +614,10 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context
   TensorShape reshaped_y_shape{batch_count, M, N / components};
 
   program
-      .AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, reshaped_a_shape, gsl::narrow<int>(components_a)},
-                  {b, ProgramTensorMetadataDependency::TypeAndRank, reshaped_b_shape, gsl::narrow<int>(components_b * 4 /** b will be accessed as uint32 which includs 4 uint8. So here we need to multiply 4.*/)},
+      .AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, reshaped_a_shape, static_cast<int>(components_a)},
+                  {b, ProgramTensorMetadataDependency::TypeAndRank, reshaped_b_shape, static_cast<int>(components_b * 4 /** b will be accessed as uint32 which includs 4 uint8. So here we need to multiply 4.*/)},
                   {scales, ProgramTensorMetadataDependency::None}})
-      .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, gsl::narrow<int>(components)})
+      .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, static_cast<int>(components)})
       .AddUniformVariable({block_size});
   if (has_zero_points) {
     program.AddInput({zero_points, ProgramTensorMetadataDependency::None, {(zero_points->Shape().Size() + 3) / 4}, 4});
@@ -185,13 +185,13 @@ Status ApplySubgroupMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Te
   mul_program.SetDispatchGroupSize(
       (N + kTileSizeB - 1) / kTileSizeB,
       (M + kTileSizeA - 1) / kTileSizeA, 1);
-  mul_program.AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow<int>(1)},
-                         {b, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow<int>(kU32Components)},
-                         {scales, ProgramTensorMetadataDependency::TypeAndRank, gsl::narrow<int>(1)}})
+  mul_program.AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, 1},
+                         {b, ProgramTensorMetadataDependency::TypeAndRank, static_cast<int>(kU32Components)},
+                         {scales, ProgramTensorMetadataDependency::TypeAndRank, 1}})
       .AddUniformVariables({{static_cast<uint32_t>(M)},
                             {static_cast<uint32_t>(N)},
                             {static_cast<uint32_t>(K)}})
-      .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, y_shape, gsl::narrow<int>(1)});
+      .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, y_shape, 1});
   return context.RunProgram(mul_program);
 }
 
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/webgpu/generator/range.cc
@@ -23,7 +23,7 @@ Status Range<T>::ComputeInternal(ComputeContext& context) const {
     return Status::OK();
   }
 
-  uint32_t output_size = gsl::narrow<uint32_t>(n);
+  uint32_t output_size = onnxruntime::narrow<uint32_t>(n);
   RangeProgram program{};
 #if defined(__GNUC__)
 #pragma GCC diagnostic push
@@ -141,7 +141,7 @@ Status BinaryElementwise::ComputeInternal(ComputeContext& context) const {
     }
   }
 
-  uint32_t vec_size = gsl::narrow<uint32_t>((size + 3) / 4);
+  uint32_t vec_size = onnxruntime::narrow<uint32_t>((size + 3) / 4);
   BinaryElementwiseProgram program{kernel_name_,
                                    expression_,
                                    is_broadcast,
@@ -27,7 +27,7 @@ Status UnaryElementwise::ComputeInternal(ComputeContext& context) const {
   if (size == 0) {
     return Status::OK();
   }
-  uint32_t vec_size = gsl::narrow<uint32_t>((size + 3) / 4);
+  uint32_t vec_size = onnxruntime::narrow<uint32_t>((size + 3) / 4);
   UnaryElementwiseProgram program{kernel_name_, expression_, additional_impl_, additional_usage_};
   program
       .AddInputs({{input_tensor, ProgramTensorMetadataDependency::Type, {vec_size}, 4}})
6 changes: 3 additions & 3 deletions onnxruntime/core/providers/webgpu/nn/layer_norm.cc
@@ -23,7 +23,7 @@ static size_t NormalizeAxis(int64_t axis, size_t tensor_rank) {
   if (axis < -rank && axis >= rank) {
     ORT_THROW("invalid axis: ", axis);
   }
-  return gsl::narrow<size_t>(axis < 0 ? axis + rank : axis);
+  return onnxruntime::narrow<size_t>(axis < 0 ? axis + rank : axis);
 }
 
 static std::string SumVector(std::string x, int components) {
@@ -92,10 +92,10 @@ Status LayerNorm<simplified>::ComputeInternal(onnxruntime::webgpu::ComputeContex
   const bool is_fp16 = x->GetElementType() == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
 
   const size_t axis = NormalizeAxis(axis_, x_shape.NumDimensions());
-  const uint32_t norm_count = gsl::narrow<uint32_t>(x_shape.SizeToDimension(axis));
+  const uint32_t norm_count = onnxruntime::narrow<uint32_t>(x_shape.SizeToDimension(axis));
   const int64_t norm_size = x_shape.SizeFromDimension(axis);
   const int components = GetMaxComponents(norm_size);
-  const uint32_t norm_size_vectorized = gsl::narrow<uint32_t>((norm_size + components - 1) / components);
+  const uint32_t norm_size_vectorized = onnxruntime::narrow<uint32_t>((norm_size + components - 1) / components);
 
   const auto scale_size = scale->Shape().Size();
   const auto bias_size = (bias) ? bias->Shape().Size() : 0;
10 changes: 5 additions & 5 deletions onnxruntime/core/providers/webgpu/program_manager.cc
@@ -24,14 +24,14 @@ Status ProgramManager::NormalizeDispatchGroupSize(uint32_t& x, uint32_t& y, uint
 
   auto limit_per_dimension = limits_.maxComputeWorkgroupsPerDimension;
   if (x > limit_per_dimension || y > limit_per_dimension || z > limit_per_dimension) {
-    auto size = static_cast<double>(x) * static_cast<double>(y) * static_cast<double>(z);
-    uint32_t dispatch_avg = gsl::narrow<uint32_t>(std::ceil(std::sqrt(size)));
+    double size = static_cast<double>(x) * static_cast<double>(y) * static_cast<double>(z);
+    double dispatch_avg = std::ceil(std::sqrt(size));
     if (dispatch_avg > limit_per_dimension) {
-      dispatch_avg = gsl::narrow<uint32_t>(std::ceil(std::cbrt(size)));
+      dispatch_avg = std::ceil(std::cbrt(size));
       ORT_RETURN_IF(dispatch_avg > limit_per_dimension, "The dispatch group size exceeds WebGPU maximum.");
-      x = y = z = dispatch_avg;
+      x = y = z = static_cast<uint32_t>(dispatch_avg);
     } else {
-      x = y = dispatch_avg;
+      x = y = static_cast<uint32_t>(dispatch_avg);
       z = 1;
     }
   }
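The reworked NormalizeDispatchGroupSize keeps the averaged dispatch size in double so the limit comparison runs before any conversion to uint32_t; previously gsl::narrow ran first and could fail on an out-of-range value before the limit check had a chance to report it. A self-contained sketch of the same strategy (not the ORT function itself; the limit value and NormalizeDispatch name are illustrative):

#include <cmath>
#include <cstdint>
#include <iostream>

// Spread an oversized dispatch across 2 or 3 dimensions, checking limits in
// double before converting anything to uint32_t.
bool NormalizeDispatch(uint32_t& x, uint32_t& y, uint32_t& z, uint32_t limit) {
  if (x <= limit && y <= limit && z <= limit) return true;
  const double size = static_cast<double>(x) * y * z;
  double avg = std::ceil(std::sqrt(size));  // try a 2-D split first
  if (avg > limit) {
    avg = std::ceil(std::cbrt(size));       // fall back to a 3-D split
    if (avg > limit) return false;          // genuinely too large
    x = y = z = static_cast<uint32_t>(avg);
  } else {
    x = y = static_cast<uint32_t>(avg);
    z = 1;
  }
  return true;
}

int main() {
  uint32_t x = 1u << 20, y = 1, z = 1;  // 1M workgroups in one dimension
  if (NormalizeDispatch(x, y, z, 65535u)) {
    std::cout << x << " x " << y << " x " << z << "\n";  // prints 1024 x 1024 x 1
  }
}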
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/webgpu/shader_variable.cc
@@ -91,7 +91,7 @@ ShaderIndicesHelper::ShaderIndicesHelper(std::string_view name, ProgramVariableD
     : name_(name),
       type_(type),
       num_components_{NumberOfComponents(type)},
-      rank_{gsl::narrow<int>(dims.NumDimensions())},
+      rank_{static_cast<int>(dims.NumDimensions())},
       dims_{dims},
       usage_(usage),
       indices_type_{GetIndicesType(rank_)},
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/webgpu/tensor/cast.cc
@@ -69,7 +69,7 @@ Status Cast::ComputeInternal(ComputeContext& context) const {
   if (size == 0) {
     return Status::OK();
   }
-  uint32_t vec_size = gsl::narrow<uint32_t>((size + 3) / 4);
+  uint32_t vec_size = onnxruntime::narrow<uint32_t>((size + 3) / 4);
 
   CastProgram program{to_};
   program
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/webgpu/tensor/cast.h
@@ -26,7 +26,7 @@ class Cast final : public WebGpuKernel {
     int64_t to;
     Status status = info.GetAttr("to", &to);
     ORT_ENFORCE(status.IsOK(), "Attribute to is not set.");
-    to_ = gsl::narrow<int32_t>(to);
+    to_ = onnxruntime::narrow<int32_t>(to);
 
     // ignore attribute 'saturate' as float8 is not supported in WebGPU
   }
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/webgpu/tensor/concat.cc
@@ -104,7 +104,7 @@ Status Concat::ComputeInternal(ComputeContext& context) const {
     return Status::OK();
   }
 
-  uint32_t output_size = gsl::narrow_cast<int32_t>(prepare.output_tensor->Shape().Size());
+  uint32_t output_size = onnxruntime::narrow<int32_t>(prepare.output_tensor->Shape().Size());
 
   size_t axis = static_cast<size_t>(prepare.axis);
   ConcatProgram program{axis};
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/webgpu/tensor/expand.cc
@@ -42,7 +42,7 @@ Status Expand::ComputeInternal(ComputeContext& context) const {
                                 : 1;
   const int components_o = output_shape.IsScalar() ? 1 : output_shape[output_shape.NumDimensions() - 1] % 4 == 0 ? 4
                                                                                                                  : 1;
-  uint32_t data_size = gsl::narrow<uint32_t>(output_shape.Size() / components_o);
+  uint32_t data_size = onnxruntime::narrow<uint32_t>(output_shape.Size() / components_o);
 
   ExpandProgram program{};
   program
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/webgpu/tensor/gather.cc
@@ -42,7 +42,7 @@ Status GatherProgram::GenerateShaderCode(ShaderHelper& shader) const {
 Status Gather::ComputeInternal(ComputeContext& context) const {
   Prepare p;
   ORT_RETURN_IF_ERROR(PrepareForCompute(&context.KernelContext(), p));
-  uint32_t data_size = gsl::narrow<uint32_t>(p.output_tensor->Shape().Size());
+  uint32_t data_size = onnxruntime::narrow<uint32_t>(p.output_tensor->Shape().Size());
   if (data_size == 0) {
     return Status::OK();
   }
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/webgpu/tensor/pad.cc
@@ -130,7 +130,7 @@ Status Pad::ComputeInternal(ComputeContext& context) const {
   }
 
   auto* output_tensor = context.Output(0, output_shape);
-  uint32_t output_size = gsl::narrow<uint32_t>(output_shape.Size());
+  uint32_t output_size = onnxruntime::narrow<uint32_t>(output_shape.Size());
   if (output_size == 0) {
     // Do not need to fill output, return
     return Status::OK();