diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py
index d31e67057d8f..fbe5f744148e 100644
--- a/benchmarks/kernels/benchmark_activation.py
+++ b/benchmarks/kernels/benchmark_activation.py
@@ -8,10 +8,9 @@
 
 import vllm.model_executor.layers.activation  # noqa F401
 from vllm.model_executor.custom_op import CustomOp
-from vllm.platforms import current_platform
 from vllm.triton_utils import triton
 from vllm.utils.argparse_utils import FlexibleArgumentParser
-from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
 
 batch_size_range = [1, 16, 128]
 seq_len_range = [1, 16, 64, 1024, 4096]
@@ -30,7 +29,7 @@ def benchmark_activation(
     device = "cuda"
     num_tokens = batch_size * seq_len
     dim = intermediate_size
-    current_platform.seed_everything(42)
+    set_random_seed(42)
     torch.set_default_device(device)
 
     if func_name == "gelu_and_mul":
diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py
index 6fa5c248670e..2292d2f87288 100644
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -6,9 +6,8 @@
 import torch
 
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
-from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
 
 
 @torch.inference_mode()
@@ -22,7 +21,7 @@ def main(
     num_warmup_iters: int = 5,
     num_iters: int = 100,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     torch.set_default_device("cuda")
 
     layer = RMSNorm(hidden_size).to(dtype=dtype)
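Every call site in this patch follows the same shape: seed first, then set the default device, then allocate inputs, so repeated runs measure identical tensors. A minimal sketch of that pattern, assuming a CUDA device and an installed vLLM; the shapes and dtype are illustrative, not taken from any one benchmark:

```python
import torch

from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.utils.torch_utils import set_random_seed


@torch.inference_mode()
def bench_rmsnorm(num_tokens: int = 1024, hidden_size: int = 4096) -> None:
    # Seed *before* allocating inputs so every invocation sees the same data.
    set_random_seed(0)
    torch.set_default_device("cuda")

    layer = RMSNorm(hidden_size).to(dtype=torch.float16)
    x = torch.randn(num_tokens, hidden_size, dtype=torch.float16)
    layer(x)  # warmup / timed iterations would loop here
```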
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index f82ec5d2b832..26a281f4e4fb 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -24,6 +24,7 @@
 from vllm.transformers_utils.config import get_config
 from vllm.triton_utils import triton
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import set_random_seed
 
 FP8_DTYPE = current_platform.fp8_dtype()
 
@@ -431,7 +432,7 @@ def merge_unique_dicts(list1, list2):
 class BenchmarkWorker:
     def __init__(self, seed: int) -> None:
         torch.set_default_device("cuda")
-        current_platform.seed_everything(seed)
+        set_random_seed(seed)
         self.seed = seed
         # Get the device ID to allocate tensors and kernels
         # on the respective GPU. This is required for Ray to work
@@ -451,7 +452,7 @@ def benchmark(
         block_quant_shape: list[int] = None,
         use_deep_gemm: bool = False,
     ) -> tuple[dict[str, int], float]:
-        current_platform.seed_everything(self.seed)
+        set_random_seed(self.seed)
         dtype_str = _get_config_dtype_str(
             dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
         )
diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
index b8913a217c60..77b77a15b53a 100644
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -18,6 +18,7 @@
 from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
 from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import set_random_seed
 
 FP8_DTYPE = current_platform.fp8_dtype()
 
@@ -261,7 +262,7 @@ def run(input: tuple):
 class BenchmarkWorker:
     def __init__(self, seed: int) -> None:
         torch.set_default_device("cuda")
-        current_platform.seed_everything(seed)
+        set_random_seed(seed)
         self.seed = seed
         # Get the device ID to allocate tensors and kernels
         # on the respective GPU. This is required for Ray to work
@@ -279,7 +280,7 @@ def benchmark(
         use_int8_w8a16: bool,
         use_customized_permute: bool = False,
     ) -> tuple[dict[str, int], float]:
-        current_platform.seed_everything(self.seed)
+        set_random_seed(self.seed)
 
         permute_time = benchmark_permute(
             num_tokens,
diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py
index 09de5fa822f8..3e0365135778 100644
--- a/benchmarks/kernels/benchmark_mrope.py
+++ b/benchmarks/kernels/benchmark_mrope.py
@@ -37,9 +37,9 @@
 import torch
 
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import set_random_seed
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
@@ -94,7 +94,7 @@ def benchmark_mrope(
     benchmark_iter: int = 100,
     csv_writer=None,
 ):
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     torch.set_default_device(device)
     # the parameters to compute the q k v size based on tp_size
     mrope_helper_class = get_rope(
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index 46ab2a5fe5e9..be871d3d1aa0 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -13,6 +13,7 @@
 from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
     create_kv_caches_with_random,
+    set_random_seed,
 )
 
 logger = init_logger(__name__)
@@ -38,7 +39,7 @@ def main(
     device: str = "cuda",
     kv_cache_dtype: str | None = None,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
 
     scale = float(1.0 / (head_size**0.5))
     query = torch.empty(
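The two MoE benchmarks above share a worker idiom worth noting: the seed is stored at construction and re-applied at the top of every `benchmark()` call, so each tuning configuration is measured against identical random inputs. A minimal sketch of that idiom; the matmul workload and CUDA-event timing are illustrative stand-ins, and the real workers additionally pin the Ray-assigned GPU:

```python
import torch

from vllm.utils.torch_utils import set_random_seed


class BenchmarkWorker:
    def __init__(self, seed: int) -> None:
        torch.set_default_device("cuda")
        set_random_seed(seed)
        self.seed = seed

    def benchmark(self, num_tokens: int, hidden_size: int) -> float:
        # Re-seed so every call regenerates exactly the same inputs,
        # keeping timings comparable across tuning configurations.
        set_random_seed(self.seed)
        x = torch.randn(num_tokens, hidden_size, dtype=torch.float16)
        w = torch.randn(hidden_size, hidden_size, dtype=torch.float16)

        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        x @ w
        end.record()
        torch.cuda.synchronize()
        return start.elapsed_time(end)  # latency in milliseconds
```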
diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py
index 3c2ac9128947..9a21cfe94e5b 100644
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@@ -6,9 +6,8 @@
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
-from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
 
 
 @torch.inference_mode()
@@ -23,7 +22,7 @@ def main(
     num_warmup_iters: int = 5,
     num_iters: int = 100,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     torch.set_default_device("cuda")
 
     x = torch.randn(num_tokens, hidden_size, dtype=dtype)
diff --git a/benchmarks/kernels/benchmark_reshape_and_cache.py b/benchmarks/kernels/benchmark_reshape_and_cache.py
index 0d3aef0c630b..99067d8ac371 100644
--- a/benchmarks/kernels/benchmark_reshape_and_cache.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache.py
@@ -8,11 +8,11 @@
 
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
     create_kv_caches_with_random,
+    set_random_seed,
 )
 
 logger = init_logger(__name__)
@@ -36,7 +36,7 @@ def run_benchmark(
     if kv_cache_dtype == "fp8" and head_size % 16:
         raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
 
-    current_platform.seed_everything(42)
+    set_random_seed(42)
     torch.set_default_device(device)
 
     # create random key / value tensors [T, H, D].
diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
index 12f17ea575d9..bca66f301127 100644
--- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@@ -11,11 +11,11 @@
     triton_reshape_and_cache_flash,
 )
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
     create_kv_caches_with_random_flash,
+    set_random_seed,
 )
 
 logger = init_logger(__name__)
@@ -49,7 +49,7 @@ def run_benchmark(
     if implementation == "triton" and kv_cache_layout == "HND":
         return float("nan")  # Triton does not support HND layout yet.
 
-    current_platform.seed_everything(42)
+    set_random_seed(42)
     torch.set_default_device(device)
 
     # create random key / value tensors [T, H, D].
diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
index de01ff197eab..da32bc30cb2a 100644
--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@@ -23,9 +23,9 @@
 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
     persistent_masked_m_silu_mul_quant,
 )
-from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
+from vllm.utils.torch_utils import set_random_seed
 
 
 @triton.jit
@@ -207,7 +207,7 @@ def benchmark(
 ):
     def generate_data(seed_offset=0):
         """Generate input data with given seed offset"""
-        current_platform.seed_everything(42 + seed_offset)
+        set_random_seed(42 + seed_offset)
         y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
 
         if gen_strategy == "random_imbalanced":
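`benchmark_silu_mul_fp8_quant.py` also shows the seed-offset idiom: bumping the base seed by an offset yields inputs that are fresh per variant yet deterministic run-to-run. A small self-contained sketch of that idiom, with illustrative shapes:

```python
import torch

from vllm.utils.torch_utils import set_random_seed


def generate_data(seed_offset: int = 0) -> torch.Tensor:
    """Regenerate deterministic input data for a given seed offset."""
    set_random_seed(42 + seed_offset)
    return torch.rand((8, 256, 1024), dtype=torch.bfloat16, device="cuda")


a = generate_data(seed_offset=0)
b = generate_data(seed_offset=0)
assert torch.equal(a, b)  # same offset -> bitwise-identical inputs

c = generate_data(seed_offset=1)
assert not torch.equal(a, c)  # different offset -> fresh but reproducible data
```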
diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md
index 11a3b3133abf..6a4b5fd6b882 100644
--- a/docs/design/plugin_system.md
+++ b/docs/design/plugin_system.md
@@ -154,4 +154,4 @@ The interface for the model/module may change during vLLM's development. If you
 !!! warning "Deprecations"
     - `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
     - `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
-    - `seed_everything` platform interface is deprecated. It will be removed in v0.14.0 or later. Please use `vllm.utils.torch_utils.set_random_seed` instead.
+    - `seed_everything` platform interface is deprecated. It will be removed in v0.15.0 or later. Please use `vllm.utils.torch_utils.set_random_seed` instead.
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 6f21f47b9dd1..3bea498f1b87 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -11,6 +11,7 @@
 
 import numpy as np
 import torch
+from typing_extensions import deprecated
 
 from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.logger import init_logger
@@ -365,6 +366,10 @@ def inference_mode(cls):
         return torch.inference_mode(mode=True)
 
     @classmethod
+    @deprecated(
+        "`seed_everything` is deprecated. It will be removed in v0.15.0 or later. "
+        "Please use `vllm.utils.torch_utils.set_random_seed` instead."
+    )
     def seed_everything(cls, seed: int | None = None) -> None:
         """
         Set the seed of each random module.
@@ -372,10 +377,6 @@ def seed_everything(cls, seed: int | None = None) -> None:
 
         Loosely based on: https://github.com/Lightning-AI/pytorch-lightning/blob/2.4.0/src/lightning/fabric/utilities/seed.py#L20
         """
-        logger.info_once(
-            "`seed_everything` is deprecated. It will be removed in v0.14.0 or later. "
-            "Please use `vllm.utils.torch_utils.set_random_seed` instead."
-        )
         if seed is not None:
             random.seed(seed)
             np.random.seed(seed)
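The swap from `logger.info_once` to `typing_extensions.deprecated` (PEP 702) turns the notice from an ad-hoc log line into a decorator that emits a `DeprecationWarning` at call time and is understood by type checkers. A minimal sketch of that behavior; the `set_random_seed` body here is an assumption modeled on the remaining `seed_everything` code, and the `torch.manual_seed` call is my addition, not confirmed by this diff:

```python
import random
import warnings

import numpy as np
import torch
from typing_extensions import deprecated


def set_random_seed(seed: int) -> None:
    # Assumed stand-in for vllm.utils.torch_utils.set_random_seed.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)  # assumption: seeds torch RNGs on all devices


@deprecated(
    "`seed_everything` is deprecated. It will be removed in v0.15.0 or later. "
    "Please use `vllm.utils.torch_utils.set_random_seed` instead."
)
def seed_everything(seed: int | None = None) -> None:
    if seed is not None:
        set_random_seed(seed)


with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    seed_everything(42)  # the decorator warns at call time
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```

Unlike the old runtime-only log, type checkers that implement PEP 702 (pyright, for example) can flag call sites of a `@deprecated` function statically, so downstream plugins see the migration before running anything.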