Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
2b9ec20
Update `Optional[x]` -> `x | None` and `Union[x, y]` to `x | y`
hmellor Oct 11, 2025
ca98f39
Fix interface for docs build
hmellor Oct 11, 2025
19598a8
Fix weird things for docs build
hmellor Oct 11, 2025
17b1453
Fix type detection for docs/cli
hmellor Oct 11, 2025
ce4aab9
Merge remote-tracking branch 'upstream/main' into update-typing-syntax
hmellor Oct 11, 2025
4ac5c2d
Fix some fused moe typing
hmellor Oct 11, 2025
62b15cf
Fix type checking for tensor schema
hmellor Oct 11, 2025
c152c3e
Fix type checking for get kwargs
hmellor Oct 11, 2025
a844ec4
Fix quant utils tensor typing
hmellor Oct 11, 2025
c7320a1
Type hint using the class not the method
hmellor Oct 11, 2025
08f50d7
Merge remote-tracking branch 'upstream/main' into update-typing-syntax
hmellor Oct 11, 2025
baabaf9
pre-commit
hmellor Oct 11, 2025
f8acf6f
Merge branch 'main' into update-typing-syntax
hmellor Oct 11, 2025
55a70a1
Merge branch 'main' into update-typing-syntax
hmellor Oct 12, 2025
5725dca
Fix more `Tensor` typing
hmellor Oct 12, 2025
db15339
Remove rule ignores for rules that are no longer in `ruff`
hmellor Oct 12, 2025
e399aaf
Remove rule skip that doesn't fail anymore
hmellor Oct 12, 2025
d509777
Remove skip for rule UP045 (`Optional` -> `|`)
hmellor Oct 12, 2025
18c0cae
Fix a couple more unions
hmellor Oct 12, 2025
d8f8b2e
`tuple[list[str]]` to `list[str]` in Qwen MoE `packed_modules_mapping`
hmellor Oct 12, 2025
64850ed
Merge branch 'main' into update-typing-syntax
hmellor Oct 12, 2025
f0ed6e6
pre-commit from main
hmellor Oct 12, 2025
73bda1a
Remove `from __future__ import annotations` because we no longer need it
hmellor Oct 12, 2025
8984b34
Fix some more unions
hmellor Oct 12, 2025
ef333a0
Fix the rest of the unions
hmellor Oct 12, 2025
bf62308
Use forward reference for generic type
hmellor Oct 12, 2025
1777484
Fix a bunch of forward references
hmellor Oct 12, 2025
6eb5320
Add `from __future__ import annotations` back to lm format enforcer b…
hmellor Oct 12, 2025
4ffcc9c
Fix bad forward reference
hmellor Oct 12, 2025
83dbc75
Fix another bad forward import
hmellor Oct 12, 2025
a8cf5af
More forward references
hmellor Oct 12, 2025
76ee29b
Hopefully final fix for forward references
hmellor Oct 12, 2025
71123b8
Merge branch 'main' into update-typing-syntax
hmellor Oct 12, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
27 changes: 13 additions & 14 deletions benchmarks/backend_request_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import time
import traceback
from dataclasses import dataclass, field
from typing import Optional, Union

import aiohttp
import huggingface_hub.constants
Expand All @@ -28,13 +27,13 @@ class RequestFuncInput:
prompt_len: int
output_len: int
model: str
model_name: Optional[str] = None
logprobs: Optional[int] = None
extra_body: Optional[dict] = None
multi_modal_content: Optional[dict | list[dict]] = None
model_name: str | None = None
logprobs: int | None = None
extra_body: dict | None = None
multi_modal_content: dict | list[dict] | None = None
ignore_eos: bool = False
language: Optional[str] = None
request_id: Optional[str] = None
language: str | None = None
request_id: str | None = None


@dataclass
Expand All @@ -52,7 +51,7 @@ class RequestFuncOutput:

async def async_request_tgi(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
Expand Down Expand Up @@ -133,7 +132,7 @@ async def async_request_tgi(

async def async_request_trt_llm(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
Expand Down Expand Up @@ -204,7 +203,7 @@ async def async_request_trt_llm(

async def async_request_deepspeed_mii(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), (
Expand Down Expand Up @@ -267,7 +266,7 @@ async def async_request_deepspeed_mii(

async def async_request_openai_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), (
Expand Down Expand Up @@ -367,7 +366,7 @@ async def async_request_openai_completions(

async def async_request_openai_chat_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("chat/completions", "profile")), (
Expand Down Expand Up @@ -476,7 +475,7 @@ async def async_request_openai_chat_completions(

async def async_request_openai_audio(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
# Lazy import without PlaceholderModule to avoid vllm dep.
import soundfile
Expand Down Expand Up @@ -610,7 +609,7 @@ def get_tokenizer(
tokenizer_mode: str = "auto",
trust_remote_code: bool = False,
**kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
if pretrained_model_name_or_path is not None and not os.path.exists(
pretrained_model_name_or_path
):
Expand Down
5 changes: 2 additions & 3 deletions benchmarks/benchmark_prefix_caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
import json
import random
import time
from typing import Optional

from transformers import PreTrainedTokenizerBase

Expand Down Expand Up @@ -80,7 +79,7 @@ def sample_requests_from_dataset(
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: tuple[int, int],
fixed_output_len: Optional[int],
fixed_output_len: int | None,
) -> list[Request]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
Expand Down Expand Up @@ -128,7 +127,7 @@ def sample_requests_from_random(
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: tuple[int, int],
fixed_output_len: Optional[int],
fixed_output_len: int | None,
prefix_len: int,
) -> list[Request]:
requests = []
Expand Down
3 changes: 1 addition & 2 deletions benchmarks/benchmark_prioritization.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import json
import random
import time
from typing import Optional

from transformers import AutoTokenizer, PreTrainedTokenizerBase

Expand All @@ -24,7 +23,7 @@ def sample_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
fixed_output_len: int | None,
) -> list[tuple[str, int, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
Expand Down
7 changes: 3 additions & 4 deletions benchmarks/benchmark_serving_structured_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
import warnings
from collections.abc import AsyncGenerator
from dataclasses import dataclass
from typing import Optional

import datasets
import numpy as np
Expand Down Expand Up @@ -316,7 +315,7 @@ def calculate_metrics(
tokenizer: PreTrainedTokenizerBase,
selected_percentile_metrics: list[str],
selected_percentiles: list[float],
goodput_config_dict: Optional[dict[str, float]] = None,
goodput_config_dict: dict[str, float] | None = None,
) -> tuple[BenchmarkMetrics, list[int]]:
actual_output_lens: list[int] = []
total_input = 0
Expand Down Expand Up @@ -436,9 +435,9 @@ async def benchmark(
selected_percentile_metrics: list[str],
selected_percentiles: list[str],
ignore_eos: bool,
max_concurrency: Optional[int],
max_concurrency: int | None,
structured_output_ratio: float,
goodput_config_dict: Optional[dict[str, float]] = None,
goodput_config_dict: dict[str, float] | None = None,
):
if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend]
Expand Down
16 changes: 8 additions & 8 deletions benchmarks/benchmark_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import os
import time
from types import TracebackType
from typing import Any, Optional, Union
from typing import Any


def convert_to_pytorch_benchmark_format(
Expand Down Expand Up @@ -92,7 +92,7 @@ class TimeCollector:
def __init__(self, scale: int) -> None:
self.cnt: int = 0
self._sum: int = 0
self._max: Optional[int] = None
self._max: int | None = None
self.scale = scale
self.start_time: int = time.monotonic_ns()

Expand All @@ -104,22 +104,22 @@ def collect(self, v: int) -> None:
else:
self._max = max(self._max, v)

def avg(self) -> Union[float, str]:
def avg(self) -> float | str:
return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"

def max(self) -> Union[float, str]:
def max(self) -> float | str:
return self._max / self.scale if self._max else "N/A"

def dump_avg_max(self) -> list[Union[float, str]]:
def dump_avg_max(self) -> list[float | str]:
return [self.avg(), self.max()]

def __enter__(self) -> None:
self.start_time = time.monotonic_ns()

def __exit__(
self,
exc_type: Optional[type[BaseException]],
exc_value: Optional[BaseException],
exc_traceback: Optional[TracebackType],
exc_type: type[BaseException] | None,
exc_value: BaseException | None,
exc_traceback: TracebackType | None,
) -> None:
self.collect(time.monotonic_ns() - self.start_time)
3 changes: 1 addition & 2 deletions benchmarks/cutlass_benchmarks/sparse_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
import itertools
import pickle as pkl
import time
from collections.abc import Iterable
from typing import Callable
from collections.abc import Callable, Iterable

import torch
import torch.utils.benchmark as TBenchmark
Expand Down
11 changes: 5 additions & 6 deletions benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
import itertools
import pickle as pkl
import time
from collections.abc import Iterable
from typing import Callable, Optional
from collections.abc import Callable, Iterable

import torch
import torch.utils.benchmark as TBenchmark
Expand Down Expand Up @@ -53,7 +52,7 @@ def bench_int8(
n: int,
label: str,
sub_label: str,
bench_kernels: Optional[list[str]] = None,
bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
"""Benchmark INT8-based kernels."""
assert dtype == torch.int8
Expand Down Expand Up @@ -108,7 +107,7 @@ def bench_fp8(
n: int,
label: str,
sub_label: str,
bench_kernels: Optional[list[str]] = None,
bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
"""Benchmark FP8-based kernels."""
assert dtype == torch.float8_e4m3fn
Expand Down Expand Up @@ -183,7 +182,7 @@ def bench(
n: int,
label: str,
sub_label: str,
bench_kernels: Optional[list[str]] = None,
bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
if dtype == torch.int8:
return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
Expand All @@ -201,7 +200,7 @@ def print_timers(timers: Iterable[TMeasurement]):
def run(
dtype: torch.dtype,
MKNs: Iterable[tuple[int, int, int]],
bench_kernels: Optional[list[str]] = None,
bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
results = []
for m, k, n in MKNs:
Expand Down
9 changes: 4 additions & 5 deletions benchmarks/fused_kernels/layernorm_rms_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@

import pickle as pkl
import time
from collections.abc import Iterable
from collections.abc import Callable, Iterable
from dataclasses import dataclass
from itertools import product
from typing import Callable, Optional

import torch
import torch.utils.benchmark as TBenchmark
Expand Down Expand Up @@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]:
def unfused_int8_impl(
rms_norm_layer: RMSNorm,
x: torch.Tensor,
residual: Optional[torch.Tensor],
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
):
# Norm
Expand All @@ -68,7 +67,7 @@ def unfused_int8_impl(
def unfused_fp8_impl(
rms_norm_layer: RMSNorm,
x: torch.Tensor,
residual: Optional[torch.Tensor],
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
):
# Norm
Expand All @@ -85,7 +84,7 @@ def unfused_fp8_impl(
def fused_impl(
rms_norm_layer: RMSNorm, # this stores the weights
x: torch.Tensor,
residual: Optional[torch.Tensor],
residual: torch.Tensor | None,
quant_dtype: torch.dtype,
):
out, _ = ops.rms_norm_dynamic_per_token_quant(
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/kernels/bench_per_token_quant_fp8.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from typing import Callable
from collections.abc import Callable
from unittest.mock import patch

import pandas as pd
Expand Down
6 changes: 3 additions & 3 deletions benchmarks/kernels/benchmark_device_communicators.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
import json
import os
import time
from collections.abc import Callable
from contextlib import nullcontext
from typing import Callable, Optional

import torch
import torch.distributed as dist
Expand Down Expand Up @@ -264,12 +264,12 @@ def benchmark_allreduce(
def benchmark_allreduce_single(
self,
sequence_length: int,
allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
should_use_fn: Callable[[torch.Tensor], bool],
context,
num_warmup: int,
num_trials: int,
) -> Optional[float]:
) -> float | None:
"""Benchmark method with CUDA graph optimization."""
try:
# Create test tensor (2D: sequence_length x hidden_size)
Expand Down
Loading