diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md index 0fd448c2153c..11a3b3133abf 100644 --- a/docs/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -154,3 +154,4 @@ The interface for the model/module may change during vLLM's development. If you !!! warning "Deprecations" - `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0. - `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead. + - `seed_everything` platform interface is deprecated. It will be removed in v0.14.0 or later. Please use `vllm.utils.torch_utils.set_random_seed` instead. diff --git a/tests/compile/distributed/test_async_tp.py b/tests/compile/distributed/test_async_tp.py index 2eb18e25c98b..e02f038b4edf 100644 --- a/tests/compile/distributed/test_async_tp.py +++ b/tests/compile/distributed/test_async_tp.py @@ -26,6 +26,7 @@ ) from vllm.platforms import current_platform from vllm.utils.system_utils import update_environment_variables +from vllm.utils.torch_utils import set_random_seed from ...models.registry import HF_EXAMPLE_MODELS from ...utils import ( @@ -301,7 +302,7 @@ def async_tp_pass_on_test_model( dtype: torch.dtype, dynamic: bool, ): - current_platform.seed_everything(0) + set_random_seed(0) device = torch.device(f"cuda:{local_rank}") torch.cuda.set_device(device) diff --git a/tests/compile/distributed/test_fusion_all_reduce.py b/tests/compile/distributed/test_fusion_all_reduce.py index fc8d1f98ebf8..d0a194c2b044 100644 --- a/tests/compile/distributed/test_fusion_all_reduce.py +++ b/tests/compile/distributed/test_fusion_all_reduce.py @@ -32,6 +32,7 @@ ) from vllm.platforms import current_platform from vllm.utils.system_utils import update_environment_variables +from vllm.utils.torch_utils import set_random_seed from ...utils import has_module_attribute, multi_gpu_test from ..backend import TestBackend @@ -263,7 +264,7 @@ def all_reduce_fusion_pass_on_test_model( enable_rms_norm_custom_op, enable_quant_fp8_custom_op, ): - current_platform.seed_everything(0) + set_random_seed(0) device = torch.device(f"cuda:{local_rank}") torch.cuda.set_device(device) diff --git a/tests/compile/distributed/test_sequence_parallelism.py b/tests/compile/distributed/test_sequence_parallelism.py index d9fdc3acc3d6..35916ba99652 100644 --- a/tests/compile/distributed/test_sequence_parallelism.py +++ b/tests/compile/distributed/test_sequence_parallelism.py @@ -31,6 +31,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp from vllm.platforms import current_platform from vllm.utils.system_utils import update_environment_variables +from vllm.utils.torch_utils import set_random_seed from ...utils import multi_gpu_test from ..backend import TestBackend @@ -232,7 +233,7 @@ def sequence_parallelism_pass_on_test_model( fuse_norm_quant: bool, dynamic: bool, ): - current_platform.seed_everything(0) + set_random_seed(0) device = torch.device(f"cuda:{local_rank}") torch.cuda.set_device(device) diff --git a/tests/kernels/attention/test_aiter_flash_attn.py b/tests/kernels/attention/test_aiter_flash_attn.py index 8f58c470d217..68ffb1ee34ad 100644 --- a/tests/kernels/attention/test_aiter_flash_attn.py +++ b/tests/kernels/attention/test_aiter_flash_attn.py @@ -8,6 +8,7 @@ import vllm.v1.attention.backends.rocm_aiter_fa # noqa: F401 from vllm.attention.utils.fa_utils import 
is_flash_attn_varlen_func_available from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed NUM_HEADS = [(4, 4), (8, 2)] HEAD_SIZES = [128, 256] @@ -104,7 +105,7 @@ def test_varlen_with_paged_kv( if not is_flash_attn_varlen_func_available(): pytest.skip("flash_attn_varlen_func required to run this test.") torch.set_default_device("cuda") - current_platform.seed_everything(0) + set_random_seed(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index 96bdcf16d568..24b058ed24fa 100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -13,6 +13,7 @@ from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.platforms import current_platform from vllm.utils.mem_utils import get_max_shared_memory_bytes +from vllm.utils.torch_utils import set_random_seed FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. @@ -150,7 +151,7 @@ def test_paged_attention( global PARTITION_SIZE - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) scale = float(1.0 / (head_size**0.5)) num_query_heads, num_kv_heads = num_heads diff --git a/tests/kernels/attention/test_cache.py b/tests/kernels/attention/test_cache.py index 3f76033254d3..19892ce26b6b 100644 --- a/tests/kernels/attention/test_cache.py +++ b/tests/kernels/attention/test_cache.py @@ -9,6 +9,7 @@ from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck from vllm import _custom_ops as ops from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed COPYING_DIRECTION = [("cuda", "cpu"), ("cuda", "cuda"), ("cpu", "cuda")] DTYPES = [torch.bfloat16, torch.float] @@ -64,7 +65,7 @@ def test_reshape_and_cache( ) -> None: if kv_cache_dtype == "fp8" and head_size % 16: pytest.skip() - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) torch.cuda.set_device(device) # Create a random slot mapping. 
@@ -185,7 +186,7 @@ def test_reshape_and_cache_flash( kv_cache_layout: str, implementation: str, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) torch.cuda.set_device(device) assert implementation in ["cuda", "triton"] @@ -355,7 +356,7 @@ def test_swap_blocks( if kv_cache_dtype == "fp8" and head_size % 16: pytest.skip() - current_platform.seed_everything(seed) + set_random_seed(seed) src_device = device if direction[0] == "cuda" else "cpu" dst_device = device if direction[1] == "cuda" else "cpu" @@ -444,7 +445,7 @@ def test_fp8_e4m3_conversion( seed: int, device: str, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) low = -224.0 high = 224.0 @@ -507,7 +508,7 @@ def test_concat_and_cache_mla( device: str, kv_cache_dtype: str, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) torch.cuda.set_device(device) @@ -584,7 +585,7 @@ def test_concat_and_cache_ds_mla( if dtype.itemsize != 2: pytest.skip("ds_mla only supports 16-bit input") kv_cache_dtype = "fp8_ds_mla" - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) torch.cuda.set_device(device) @@ -695,7 +696,7 @@ def test_swap_blocks_mla( device: str, kv_cache_dtype: str, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) torch.cuda.set_device(device) @@ -947,7 +948,7 @@ def test_concat_and_cache_mla_cpu( ) -> None: device = "cpu" kv_cache_dtype = "auto" - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) total_slots = num_blocks * block_size diff --git a/tests/kernels/attention/test_cascade_flash_attn.py b/tests/kernels/attention/test_cascade_flash_attn.py index d86041d71feb..80c5c853debb 100755 --- a/tests/kernels/attention/test_cascade_flash_attn.py +++ b/tests/kernels/attention/test_cascade_flash_attn.py @@ -6,6 +6,7 @@ import torch from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed from vllm.v1.attention.backends.flash_attn import cascade_attention, merge_attn_states try: @@ -39,7 +40,7 @@ def test_merge_kernel( dtype: torch.dtype, ): torch.set_default_device("cuda") - current_platform.seed_everything(0) + set_random_seed(0) num_query_heads = num_heads[0] num_kv_heads = num_heads[1] assert num_query_heads % num_kv_heads == 0 @@ -103,7 +104,7 @@ def test_cascade( f'to: "{fa_version_unsupported_reason(fa_version)}"' ) - current_platform.seed_everything(0) + set_random_seed(0) window_size = (-1, -1) scale = head_size**-0.5 diff --git a/tests/kernels/attention/test_cpu_attn.py b/tests/kernels/attention/test_cpu_attn.py index be5d66197f6e..ef0099f635a5 100644 --- a/tests/kernels/attention/test_cpu_attn.py +++ b/tests/kernels/attention/test_cpu_attn.py @@ -8,6 +8,7 @@ import torch from vllm.platforms import CpuArchEnum, current_platform +from vllm.utils.torch_utils import set_random_seed from vllm.v1.attention.backends.cpu_attn import _get_attn_isa if not current_platform.is_cpu(): @@ -190,7 +191,7 @@ def varlen_with_paged_kv( use_sink: bool, isa: str, ) -> None: - current_platform.seed_everything(0) + set_random_seed(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] diff --git a/tests/kernels/attention/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py index bbd5df5419f8..2714cd81819e 100644 --- a/tests/kernels/attention/test_flash_attn.py +++ 
b/tests/kernels/attention/test_flash_attn.py @@ -6,6 +6,7 @@ import torch from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed try: from vllm.vllm_flash_attn import ( @@ -129,7 +130,7 @@ def test_varlen_with_paged_kv( "Flash attention with quantized inputs is only " "supported on version 3 with bfloat16 base type" ) - current_platform.seed_everything(0) + set_random_seed(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] diff --git a/tests/kernels/attention/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py index eedeec33e0d4..570bf7fc865a 100644 --- a/tests/kernels/attention/test_flashinfer.py +++ b/tests/kernels/attention/test_flashinfer.py @@ -5,6 +5,7 @@ import pytest from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed try: import flashinfer @@ -101,7 +102,7 @@ def test_flashinfer_decode_with_paged_kv( sliding_window: int | None, ) -> None: torch.set_default_device("cuda") - current_platform.seed_everything(0) + set_random_seed(0) num_seqs = len(kv_lens) num_query_heads = num_heads[0] num_kv_heads = num_heads[1] @@ -196,7 +197,7 @@ def test_flashinfer_prefill_with_paged_kv( sliding_window: int | None, ) -> None: torch.set_default_device("cuda") - current_platform.seed_everything(0) + set_random_seed(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] @@ -299,7 +300,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( ) -> None: pytest.skip("TODO: fix the accuracy issue") torch.set_default_device("cuda") - current_platform.seed_everything(0) + set_random_seed(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] @@ -409,7 +410,7 @@ def test_flashinfer_decode_with_paged_fp8_kv( ) -> None: # test doesn't work for num_heads = (16,16) torch.set_default_device("cuda") - current_platform.seed_everything(0) + set_random_seed(0) num_seqs = len(kv_lens) num_query_heads = num_heads[0] num_kv_heads = num_heads[1] diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py index 220d827b9d5f..1edb3dd1671f 100644 --- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py +++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py @@ -10,6 +10,7 @@ ) from vllm.platforms import current_platform from vllm.utils.math_utils import round_up +from vllm.utils.torch_utils import set_random_seed if not current_platform.is_device_capability_family(100): pytest.skip( @@ -80,7 +81,7 @@ def test_flashinfer_trtllm_decode_with_baseline( has_sinks: bool, ) -> None: torch.set_default_device("cuda") - current_platform.seed_everything(42) + set_random_seed(42) q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes q_quant_dtype = q_quant_dtype or dtype @@ -279,7 +280,7 @@ def test_flashinfer_trtllm_prefill_with_baseline( has_sinks: bool, ) -> None: torch.set_default_device("cuda") - current_platform.seed_everything(42) + set_random_seed(42) q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes q_quant_dtype = q_quant_dtype or dtype diff --git a/tests/kernels/attention/test_lightning_attn.py b/tests/kernels/attention/test_lightning_attn.py index ec938caff2c6..37fd85ccec04 100644 --- a/tests/kernels/attention/test_lightning_attn.py +++ b/tests/kernels/attention/test_lightning_attn.py @@ -5,7 +5,7 @@ import torch from vllm.model_executor.layers.lightning_attn import 
linear_decode_forward_triton -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed NUM_HEADS = [4, 8] HEAD_SIZES = [64] @@ -124,7 +124,7 @@ def test_linear_decode_forward_triton( torch.set_default_device("cuda") torch.manual_seed(42) torch.cuda.manual_seed_all(42) - current_platform.seed_everything(42) + set_random_seed(42) base = 0.01 q = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype) k = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype) @@ -167,7 +167,7 @@ def test_linear_decode_forward_triton_with_padding( torch.set_default_device("cuda") torch.manual_seed(42) torch.cuda.manual_seed_all(42) - current_platform.seed_everything(42) + set_random_seed(42) batch_size = 4 base = 0.01 @@ -231,7 +231,7 @@ def test_lightning_attention_reference( torch.set_default_device("cuda") torch.manual_seed(42) torch.cuda.manual_seed_all(42) - current_platform.seed_everything(42) + set_random_seed(42) base = 0.01 q = base * torch.randn(batch_size, num_heads, seq_len, head_size, dtype=dtype) diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py index 7405e4d41da9..32aba1a45747 100644 --- a/tests/kernels/attention/test_mha_attn.py +++ b/tests/kernels/attention/test_mha_attn.py @@ -19,6 +19,7 @@ from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cuda import CudaPlatform from vllm.platforms.rocm import RocmPlatform +from vllm.utils.torch_utils import set_random_seed @pytest.fixture(autouse=True) @@ -123,7 +124,7 @@ def test_mha_attn_forward( dtype: torch.dtype, device: str, ): - current_platform.seed_everything(0) + set_random_seed(0) torch.set_default_device(device) torch.set_default_dtype(dtype) @@ -168,7 +169,7 @@ def test_mha_attn_varlen_forward( dtype: torch.dtype, device: str, ): - current_platform.seed_everything(0) + set_random_seed(0) torch.set_default_device(device) torch.set_default_dtype(dtype) diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py index e041e8c8d2ff..b2c955b4901a 100644 --- a/tests/kernels/attention/test_prefix_prefill.py +++ b/tests/kernels/attention/test_prefix_prefill.py @@ -13,7 +13,7 @@ from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode from vllm.attention.ops.prefix_prefill import context_attention_fwd from vllm.platforms import current_platform -from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE +from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed NUM_HEADS = [64] NUM_QUERIES_PER_KV = [1, 64] @@ -125,7 +125,7 @@ def test_contexted_kv_attention( ): pytest.skip("ROCm custom paged attention does not support fp8_e5m2 KV cache") - current_platform.seed_everything(0) + set_random_seed(0) torch.set_default_device(device) # Need this, otherwise when we capture the graph the process @@ -346,7 +346,7 @@ def test_contexted_kv_attention_alibi( ): pytest.skip("ROCm custom paged attention does not support fp8_e5m2 KV cache") - current_platform.seed_everything(0) + set_random_seed(0) torch.set_default_device(device) # Need this, otherwise when we capture the graph the process diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py index 7fb08e5780f5..55e3593481cb 100644 --- a/tests/kernels/attention/test_triton_unified_attention.py +++ b/tests/kernels/attention/test_triton_unified_attention.py @@ -8,6 +8,7 @@ from vllm.attention.ops.triton_unified_attention 
import unified_attention from vllm.platforms import current_platform from vllm.utils.math_utils import next_power_of_2 +from vllm.utils.torch_utils import set_random_seed NUM_HEADS = [(4, 4), (8, 2)] HEAD_SIZES = [128, 256] @@ -113,7 +114,7 @@ def test_triton_unified_attn( ) -> None: torch.set_default_device("cuda") - current_platform.seed_everything(0) + set_random_seed(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] diff --git a/tests/kernels/core/test_activation.py b/tests/kernels/core/test_activation.py index e8777ec4f59e..1055c4745d4e 100644 --- a/tests/kernels/core/test_activation.py +++ b/tests/kernels/core/test_activation.py @@ -18,7 +18,7 @@ SiluAndMul, SwigluOAIAndMul, ) -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing @@ -52,7 +52,7 @@ def test_act_and_mul( seed: int, device: str, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) x = torch.randn(num_tokens, 2 * d, dtype=dtype) if activation == "silu_and_mul": @@ -129,7 +129,7 @@ def test_activation( seed: int, device: str, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) x = torch.randn(num_tokens, d, dtype=dtype) layer = activation[0]() diff --git a/tests/kernels/core/test_fused_qk_norm_rope.py b/tests/kernels/core/test_fused_qk_norm_rope.py index 05d61ec02fd2..02cd470e7704 100644 --- a/tests/kernels/core/test_fused_qk_norm_rope.py +++ b/tests/kernels/core/test_fused_qk_norm_rope.py @@ -8,6 +8,7 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed DTYPES = [torch.bfloat16, torch.float16] IS_NEOX = [True, False] @@ -64,7 +65,7 @@ def test_fused_qk_norm_rope_matches_reference( rotary_ratio: float, ): torch.set_default_device(device) - current_platform.seed_everything(seed) + set_random_seed(seed) num_heads, num_kv_heads, head_dim = 16, 4, 128 num_tokens = 4 diff --git a/tests/kernels/core/test_layernorm.py b/tests/kernels/core/test_layernorm.py index 49bd77f6795f..5ad032cbacac 100644 --- a/tests/kernels/core/test_layernorm.py +++ b/tests/kernels/core/test_layernorm.py @@ -7,7 +7,7 @@ from tests.kernels.quant_utils import FP8_DTYPE from tests.kernels.utils import opcheck from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing @@ -34,7 +34,7 @@ def test_rms_norm( device: str, strided_input: bool, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) layer = RMSNorm(hidden_size).to(dtype=dtype) layer.weight.data.normal_(mean=1.0, std=0.1) @@ -88,7 +88,7 @@ def test_fused_rms_norm_quant( device: str, strided_input: bool, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) weight = torch.empty(hidden_size, dtype=dtype).normal_(mean=1.0, std=0.1) diff --git a/tests/kernels/core/test_mrope.py b/tests/kernels/core/test_mrope.py index ba5d593b2d35..c091ea49d92c 100644 --- a/tests/kernels/core/test_mrope.py +++ 
b/tests/kernels/core/test_mrope.py @@ -10,6 +10,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.platforms import current_platform from vllm.transformers_utils.config import get_config +from vllm.utils.torch_utils import set_random_seed device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -24,7 +25,7 @@ def generate_test_data( device: torch.device, ): """Generate test data for given configuration.""" - current_platform.seed_everything(42) + set_random_seed(42) # Create 2D positions (3, num_tokens) for multimodal case positions = torch.randint( 0, max_position_embeddings // 4, (3, num_tokens), device=device diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index d18f01314c8f..c7715ca35e72 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -9,7 +9,7 @@ from tests.kernels.allclose_default import get_default_atol, get_default_rtol from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed IS_NEOX_STYLE = [True, False] DTYPES = [torch.bfloat16, torch.float] @@ -79,7 +79,7 @@ def test_rotary_embedding( if rotary_dim is None: rotary_dim = head_size - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size diff --git a/tests/kernels/mamba/test_causal_conv1d.py b/tests/kernels/mamba/test_causal_conv1d.py index 4647b97c4771..d16205694971 100644 --- a/tests/kernels/mamba/test_causal_conv1d.py +++ b/tests/kernels/mamba/test_causal_conv1d.py @@ -12,7 +12,7 @@ causal_conv1d_fn, causal_conv1d_update, ) -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed def causal_conv1d_ref( @@ -154,7 +154,7 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, ity if itype == torch.bfloat16: rtol, atol = 1e-2, 5e-2 # set seed - current_platform.seed_everything(0) + set_random_seed(0) batch = 2 x = torch.randn(batch, dim, seqlen, device=device, dtype=itype) x_ref = x.clone() @@ -201,7 +201,7 @@ def test_causal_conv1d_update_with_batch_gather( rtol, atol = 1e-2, 5e-2 # set seed - current_platform.seed_everything(0) + set_random_seed(0) padding = 5 if with_padding else 0 padded_batch_size = batch_size + padding @@ -278,7 +278,7 @@ def test_causal_conv1d_varlen( if itype == torch.bfloat16: rtol, atol = 1e-2, 5e-2 # set seed - current_platform.seed_everything(0) + set_random_seed(0) seqlens = [] batch_size = batch padding = 3 if with_padding else 0 diff --git a/tests/kernels/mamba/test_mamba_mixer2.py b/tests/kernels/mamba/test_mamba_mixer2.py index 6fca33acd48a..98879ff6ed7f 100644 --- a/tests/kernels/mamba/test_mamba_mixer2.py +++ b/tests/kernels/mamba/test_mamba_mixer2.py @@ -12,8 +12,8 @@ initialize_model_parallel, ) from vllm.model_executor.layers.mamba.mamba_mixer2 import Mixer2RMSNormGated -from vllm.platforms import current_platform from vllm.utils.system_utils import update_environment_variables +from vllm.utils.torch_utils import set_random_seed @multi_gpu_test(num_gpus=2) @@ -68,7 +68,7 @@ def mixer2_gated_norm_tensor_parallel( dtype: torch.dtype, device: str, ): - current_platform.seed_everything(0) + set_random_seed(0) device = torch.device(f"cuda:{local_rank}") torch.cuda.set_device(device) diff --git a/tests/kernels/mamba/test_mamba_ssm.py b/tests/kernels/mamba/test_mamba_ssm.py index 
50e48aad6eba..f50ab5344b15 100644 --- a/tests/kernels/mamba/test_mamba_ssm.py +++ b/tests/kernels/mamba/test_mamba_ssm.py @@ -13,7 +13,7 @@ selective_scan_fn, selective_state_update, ) -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed def selective_state_update_ref( @@ -271,7 +271,7 @@ def test_selective_scan( rtolw = max(rtolw, rtol) atolw = max(atolw, atol) # set seed - current_platform.seed_everything(0) + set_random_seed(0) batch_size = 1 dim = 4 dstate = 8 @@ -401,7 +401,7 @@ def test_selective_state_update(dim, dstate, has_z, itype): if torch.version.hip: atol *= 2 # set seed - current_platform.seed_everything(0) + set_random_seed(0) batch_size = 1 state = torch.randn(batch_size, dim, dstate, dtype=itype, device=device) x = torch.randn(batch_size, dim, device=device, dtype=itype) @@ -438,7 +438,7 @@ def test_selective_state_update_varlen(dim, dstate, has_z, itype, max_seq_len): if torch.version.hip: atol *= 2 # set seed - current_platform.seed_everything(0) + set_random_seed(0) batch_size = 4 token_counts = torch.randint(1, max_seq_len + 1, (batch_size,), device=device) total_tokens = int(token_counts.sum().item()) @@ -857,7 +857,7 @@ def test_selective_state_update_with_num_accepted_tokens( if torch.version.hip: atol *= 2 - current_platform.seed_everything(0) + set_random_seed(0) batch_size = 4 tokens_per_seq = torch.randint(1, max_seq_len + 1, (batch_size,), device=device) @@ -983,7 +983,7 @@ def test_selective_state_update_varlen_with_num_accepted( if torch.version.hip: atol *= 2 - current_platform.seed_everything(0) + set_random_seed(0) batch_size = 4 tokens_per_seq = torch.randint(1, max_seq_len + 1, (batch_size,), device=device) diff --git a/tests/kernels/mamba/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py index 0b0b82e484a1..40aa3d017d78 100644 --- a/tests/kernels/mamba/test_mamba_ssm_ssd.py +++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py @@ -9,7 +9,7 @@ from vllm.model_executor.layers.mamba.ops.ssd_combined import ( mamba_chunk_scan_combined_varlen, ) -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed from vllm.v1.attention.backends.mamba2_attn import compute_varlen_chunk_metadata # Added by the IBM Team, 2024 @@ -82,7 +82,7 @@ def ssd_minimal_discrete(X, A, B, C, block_len, initial_states=None): def generate_random_inputs(batch_size, seqlen, n_heads, d_head, itype, device="cuda"): - current_platform.seed_everything(0) + set_random_seed(0) A = -torch.exp(torch.rand(n_heads, dtype=itype, device=device)) dt = F.softplus( torch.randn(batch_size, seqlen, n_heads, dtype=itype, device=device) - 4 diff --git a/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py b/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py index 95db6327c4f1..08e50c52cbed 100644 --- a/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py +++ b/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py @@ -10,7 +10,7 @@ from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.fused_moe.config import FUSED_MOE_UNQUANTIZED_CONFIG -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed from .common import ( Config, @@ -40,7 +40,7 @@ def rank_worker( config: Config, weights: WeightTensors, ): - current_platform.seed_everything(pgi.rank) + set_random_seed(pgi.rank) # sanity check from vllm import envs diff --git a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py 
b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py index a3e264c5f5e2..3cdc7b82130b 100644 --- a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py +++ b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py @@ -9,7 +9,7 @@ import torch from vllm.config import VllmConfig -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed from .common import Config, RankTensors, WeightTensors, make_modular_kernel from .parallel_utils import ProcessGroupInfo, parallel_launch_with_config @@ -82,7 +82,7 @@ def rank_worker( config: Config, weights: WeightTensors, ): - current_platform.seed_everything(pgi.rank) + set_random_seed(pgi.rank) # sanity check from vllm import envs diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py index 2ef170f1ab30..c9d425b5b990 100644 --- a/tests/kernels/moe/test_batched_moe.py +++ b/tests/kernels/moe/test_batched_moe.py @@ -21,6 +21,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.platforms import current_platform from vllm.triton_utils import tl +from vllm.utils.torch_utils import set_random_seed MNK_FACTORS = [ (1, 128, 128), @@ -115,7 +116,7 @@ def test_batched_mm( ): """Note: float8_e4m3fn is not supported on CUDA architecture < 89, and those tests will be skipped on unsupported hardware.""" - current_platform.seed_everything(7) + set_random_seed(7) use_fp8_w8a8 = dtype == torch.float8_e4m3fn @@ -252,7 +253,7 @@ def test_fused_moe_batched_experts( ): """Note: float8_e4m3fn is not supported on CUDA architecture < 89, and those tests will be skipped on unsupported hardware.""" - current_platform.seed_everything(7) + set_random_seed(7) use_fp8_w8a8 = dtype == torch.float8_e4m3fn diff --git a/tests/kernels/moe/test_cpu_fused_moe.py b/tests/kernels/moe/test_cpu_fused_moe.py index 4dda45a6c740..f2c1d0382981 100644 --- a/tests/kernels/moe/test_cpu_fused_moe.py +++ b/tests/kernels/moe/test_cpu_fused_moe.py @@ -8,6 +8,7 @@ from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight from vllm.model_executor.layers.activation import SiluAndMul, SwigluOAIAndMul from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed if not current_platform.is_cpu(): pytest.skip("skipping CPU-only tests", allow_module_level=True) @@ -114,7 +115,7 @@ def test_cpu_fused_moe( act: str, isa: str, ): - current_platform.seed_everything(0) + set_random_seed(0) topk_num = max(expert_num // 2, 1) up_dim = 2 * intermediate_size diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index 0160694d7bb5..4a57affdfbf4 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -20,6 +20,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed NUM_EXPERTS = [40, 64] TOP_KS = [6, 8] @@ -277,7 +278,7 @@ def test_cutlass_moe_8_bit_no_graph( workspace_init, ep_size: int | None = None, ): - current_platform.seed_everything(7) + set_random_seed(7) monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") with set_current_vllm_config(vllm_config): mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, per_out_ch) @@ -332,7 +333,7 @@ def test_cutlass_moe_8_bit_cuda_graph( monkeypatch, workspace_init, ): - 
current_platform.seed_everything(7) + set_random_seed(7) monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") with set_current_vllm_config(vllm_config): dtype = torch.half @@ -469,7 +470,7 @@ def test_run_cutlass_moe_fp8( ep_size: int, workspace_init, ): - current_platform.seed_everything(7) + set_random_seed(7) with set_current_vllm_config(vllm_config): mt = MOETensors8Bit.make_moe_tensors_8bit( m, k, n, e, per_act_token, per_out_channel diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index f427734ef09e..8987b688ab4a 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -22,13 +22,13 @@ ) from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel -from vllm.platforms import current_platform from vllm.utils.deep_gemm import ( get_mk_alignment_for_contiguous_layout, is_deep_gemm_e8m0_used, is_deep_gemm_supported, ) from vllm.utils.import_utils import has_deep_ep, has_deep_gemm +from vllm.utils.torch_utils import set_random_seed from vllm.v1.worker.workspace import init_workspace_manager from ...utils import multi_gpu_test @@ -367,7 +367,7 @@ def _test_deepep_deepgemm_moe( device = torch.device(f"cuda:{pgi.local_rank}") init_workspace_manager(device) - current_platform.seed_everything(pgi.rank) + set_random_seed(pgi.rank) w1 = w1.to(device=torch.cuda.current_device()) w2 = w2.to(device=torch.cuda.current_device()) @@ -456,7 +456,7 @@ def test_ht_deepep_deepgemm_moe( """ m, n, k = mnk - current_platform.seed_everything(7) + set_random_seed(7) if topk > num_experts: pytest.skip(f"Skipping test: topk={topk} > E={num_experts}") @@ -531,7 +531,7 @@ def test_ll_deepep_deepgemm_moe( assert not is_deep_gemm_e8m0_used() m, n, k = mnk - current_platform.seed_everything(7) + set_random_seed(7) if topk > num_experts: pytest.skip(f"Skipping test: topk={topk} > E={num_experts}") diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py index e698ca92a151..e57e0d72067e 100644 --- a/tests/kernels/moe/test_deepep_moe.py +++ b/tests/kernels/moe/test_deepep_moe.py @@ -20,8 +20,8 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8, ) -from vllm.platforms import current_platform from vllm.utils.import_utils import has_deep_ep +from vllm.utils.torch_utils import set_random_seed from vllm.v1.worker.workspace import init_workspace_manager from ...utils import multi_gpu_test @@ -446,7 +446,7 @@ def test_deep_ep_moe( low_latency_mode = False use_fp8_dispatch = False - current_platform.seed_everything(7) + set_random_seed(7) world_size, dp_size = world_dp_size config = TestConfig(dtype=dtype, topk=topk, m=m, k=k, n=n, num_experts=num_experts) @@ -507,7 +507,7 @@ def test_low_latency_deep_ep_moe( f"hidden sizes {DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES}" ) - current_platform.seed_everything(7) + set_random_seed(7) world_size, dp_size = world_dp_size config = TestConfig(dtype=dtype, topk=topk, m=m, k=k, n=n, num_experts=num_experts) diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py index bf4ef2d30466..29fdbffe470a 100644 --- a/tests/kernels/moe/test_flashinfer.py +++ b/tests/kernels/moe/test_flashinfer.py @@ -22,6 +22,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import input_to_float8 from vllm.model_executor.models.llama4 import Llama4MoE from 
vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed try: from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe @@ -158,7 +159,7 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph( ): if not current_platform.has_device_capability(100): pytest.skip("Test is only supported for sm >= 100") - current_platform.seed_everything(7) + set_random_seed(7) monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") with set_current_vllm_config(vllm_config): td = TestData.make_moe_tensors_8bit(m, k, n, e, reorder=True) @@ -222,7 +223,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph( monkeypatch, workspace_init, ): - current_platform.seed_everything(7) + set_random_seed(7) monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") with set_current_vllm_config(vllm_config): td = TestData.make_moe_tensors_8bit( diff --git a/tests/kernels/moe/test_flashinfer_moe.py b/tests/kernels/moe/test_flashinfer_moe.py index 133a8a4a30a6..1262eea70bab 100644 --- a/tests/kernels/moe/test_flashinfer_moe.py +++ b/tests/kernels/moe/test_flashinfer_moe.py @@ -23,6 +23,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe +from vllm.utils.torch_utils import set_random_seed if not has_flashinfer_cutlass_fused_moe() or not current_platform.has_device_capability( 100 @@ -60,7 +61,7 @@ def test_flashinfer_fp4_moe_no_graph( activation: str, workspace_init, ): - current_platform.seed_everything(7) + set_random_seed(7) with set_current_vllm_config( VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) ): diff --git a/tests/kernels/moe/test_grouped_topk.py b/tests/kernels/moe/test_grouped_topk.py index c02183852532..96d793d85dc2 100644 --- a/tests/kernels/moe/test_grouped_topk.py +++ b/tests/kernels/moe/test_grouped_topk.py @@ -19,6 +19,7 @@ fused_grouped_topk, ) from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed @pytest.mark.skipif( @@ -52,7 +53,7 @@ def test_grouped_topk( ) get_cached_compilation_config.cache_clear() - current_platform.seed_everything(0) + set_random_seed(0) hidden_states = torch.randn((n_token, n_hidden), dtype=dtype, device="cuda") gating_output = torch.randn((n_token, n_expert), dtype=dtype, device="cuda") e_score_correction_bias = torch.randn( diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index 6ebf1016c166..ec31e66140a1 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -15,7 +15,7 @@ from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx -from vllm.utils.torch_utils import cuda_device_count_stateless +from vllm.utils.torch_utils import cuda_device_count_stateless, set_random_seed from vllm.v1.worker.workspace import init_workspace_manager from .modular_kernel_tools.common import ( @@ -82,7 +82,7 @@ def rank_worker( device = torch.device(f"cuda:{pgi.local_rank}") init_workspace_manager(device) - current_platform.seed_everything(pgi.rank) + set_random_seed(pgi.rank) # sanity check from vllm import envs diff --git a/tests/kernels/moe/test_modular_oai_triton_moe.py b/tests/kernels/moe/test_modular_oai_triton_moe.py index 1abb08f878b2..8733ba4d8e31 100644 --- 
a/tests/kernels/moe/test_modular_oai_triton_moe.py +++ b/tests/kernels/moe/test_modular_oai_triton_moe.py @@ -34,6 +34,7 @@ ) from vllm.model_executor.layers.utils import shuffle_weight from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed MNK = [ (1, 512, 384), @@ -211,7 +212,7 @@ def test_oai_triton_moe( unfused: bool, workspace_init, ): - current_platform.seed_everything(0) + set_random_seed(0) ( w1, w2, diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index fd6ce6bfbd78..0c814bbde4cb 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -60,6 +60,7 @@ from vllm.model_executor.models.mixtral import MixtralMoE from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types +from vllm.utils.torch_utils import set_random_seed from vllm.v1.worker.workspace import init_workspace_manager NUM_EXPERTS = [8, 64, 192] @@ -234,7 +235,7 @@ def test_fused_moe( monkeypatch, workspace_init, ): - current_platform.seed_everything(7) + set_random_seed(7) monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size)) diff --git a/tests/kernels/moe/test_moe_align_block_size.py b/tests/kernels/moe/test_moe_align_block_size.py index 1abfc11fb460..652a2ee21614 100644 --- a/tests/kernels/moe/test_moe_align_block_size.py +++ b/tests/kernels/moe/test_moe_align_block_size.py @@ -14,12 +14,13 @@ ) from vllm.platforms import current_platform from vllm.utils.math_utils import round_up +from vllm.utils.torch_utils import set_random_seed NUM_TOKENS = [1, 3, 256, 2256, 4096] NUM_EXPERTS = [32, 160, 256, 257] TOP_KS = [1, 2, 16, 32] BLOCK_SIZES = [32, 128] -current_platform.seed_everything(0) +set_random_seed(0) def _group_tokens_by_expert( diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py index 12dd322dccc5..45127ce0ac63 100644 --- a/tests/kernels/moe/test_moe_permute_unpermute.py +++ b/tests/kernels/moe/test_moe_permute_unpermute.py @@ -17,11 +17,12 @@ moe_unpermute, ) from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed NUM_EXPERTS = [16, 64, 256] TOP_KS = [2, 6, 8] EP_SIZE = [1, 4, 16] -current_platform.seed_everything(0) +set_random_seed(0) if current_platform.is_rocm(): pytest.skip( @@ -226,7 +227,7 @@ def test_moe_permute_unpermute( n_local_expert, expert_map, _ = determine_expert_map(ep_size, ep_rank, n_expert) expert_map = expert_map.cuda() start_expert = n_local_expert * ep_rank - current_platform.seed_everything(0) + set_random_seed(0) hidden_states = torch.randn((n_token, n_hidden), device="cuda").to(dtype) gating_output = torch.randn((n_token, n_expert), device="cuda").to(dtype) topk_weights, topk_ids, token_expert_indices = fused_topk( diff --git a/tests/kernels/moe/test_nvfp4_moe.py b/tests/kernels/moe/test_nvfp4_moe.py index e67bd76a1618..fd7388e1cff8 100644 --- a/tests/kernels/moe/test_nvfp4_moe.py +++ b/tests/kernels/moe/test_nvfp4_moe.py @@ -16,6 +16,7 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed if not current_platform.has_device_capability(100): pytest.skip( @@ -42,7 +43,7 @@ def test_cutlass_fp4_moe_no_graph( m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype, workspace_init ): - current_platform.seed_everything(7) + set_random_seed(7) with 
set_current_vllm_config( VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) ): diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index dd4eb4da913b..3a5801ae4996 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -14,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv +from vllm.utils.torch_utils import set_random_seed from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch @@ -290,7 +291,7 @@ def test_cutlass_moe_pplx( world_dp_size: tuple[int, int], use_internode: bool, ): - current_platform.seed_everything(7) + set_random_seed(7) with set_current_vllm_config(vllm_config): dtype = torch.half diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index 35e554e16cb3..c08a54f0e9f6 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -44,8 +44,8 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate, ) -from vllm.platforms import current_platform from vllm.utils.math_utils import round_up +from vllm.utils.torch_utils import set_random_seed from vllm.v1.worker.workspace import init_workspace_manager from ...utils import multi_gpu_test @@ -184,7 +184,7 @@ def test_fused_moe_batched_experts( dtype: torch.dtype, workspace_init, ): - current_platform.seed_everything(7) + set_random_seed(7) a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 @@ -491,7 +491,7 @@ def test_pplx_prepare_finalize_slow( if per_act_token_quant and block_shape is not None: pytest.skip("Skip illegal quantization combination") - current_platform.seed_everything(7) + set_random_seed(7) m, n, k = mnk world_size, dp_size = world_dp_size device = "cuda" @@ -809,7 +809,7 @@ def test_pplx_moe_slow( block_shape: list[int] | None, use_internode: bool, ): - current_platform.seed_everything(7) + set_random_seed(7) m, n, k = mnk world_size, dp_size = world_dp_size @@ -888,7 +888,7 @@ def format_result(msg, ex=None): new_vllm_config.parallel_config.enable_expert_parallel = True _set_vllm_config(new_vllm_config, pgi.world_size, pgi.rank, pgi.local_rank) - current_platform.seed_everything(7) + set_random_seed(7) combos = itertools.product( PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES, [False, True], [None, [128, 128]] ) @@ -982,7 +982,7 @@ def test_pplx_prepare_finalize( world_dp_size: tuple[int, int], use_internode: bool, ): - current_platform.seed_everything(7) + set_random_seed(7) world_size, dp_size = world_dp_size parallel_launch( world_size * dp_size, @@ -1005,7 +1005,7 @@ def test_pplx_moe( use_internode: bool, use_shared_experts: bool, ): - current_platform.seed_everything(7) + set_random_seed(7) world_size, dp_size = world_dp_size parallel_launch( world_size, diff --git a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py index b220205759e2..62b7ecb17fbe 100644 --- a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py +++ b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py @@ -13,6 +13,7 @@ from vllm.platforms import current_platform from vllm.utils.deep_gemm import DeepGemmQuantScaleFMT, has_deep_gemm from vllm.utils.math_utils import cdiv, round_up +from vllm.utils.torch_utils import 
set_random_seed if current_platform.is_fp8_fnuz(): pytest.skip( @@ -201,7 +202,7 @@ def token_random(E, T, H2, tokens_per_expert): @torch.inference_mode() def test_silu_mul_fp8_quant_deep_gemm(E: int, T: int, H: int, fp8_type: torch.dtype): group_size = 128 - current_platform.seed_everything(42) + set_random_seed(42) tokens_per_expert = torch.randint( low=0, diff --git a/tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py b/tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py index ace0794fea69..cca02928b498 100644 --- a/tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py +++ b/tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py @@ -11,6 +11,7 @@ from vllm.platforms import current_platform from vllm.triton_utils import triton from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used +from vllm.utils.torch_utils import set_random_seed FLOAT8_DTYPE = torch.float8_e4m3fn GROUP_SIZE = 128 @@ -72,7 +73,7 @@ def reference(x: torch.Tensor, use_ue8m0: bool) -> tuple[torch.Tensor, torch.Ten reason="ROCm does not support DeepGemm.", ) def test_silu_mul_fp8_quant_deep_gemm(T: int, N: int): - current_platform.seed_everything(42) + set_random_seed(42) input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda") diff --git a/tests/kernels/quantization/test_awq_triton.py b/tests/kernels/quantization/test_awq_triton.py index 069bd7435534..337bc177e6df 100644 --- a/tests/kernels/quantization/test_awq_triton.py +++ b/tests/kernels/quantization/test_awq_triton.py @@ -13,7 +13,7 @@ awq_dequantize_triton, awq_gemm_triton, ) -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed device = "cuda" @@ -86,7 +86,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size): zeros_cols = qweight_cols zeros_dtype = torch.int32 - current_platform.seed_everything(0) + set_random_seed(0) qweight = torch.randint( 0, @@ -141,7 +141,7 @@ def test_gemm(N, K, M, splitK, group_size): qzeros_rows = scales_rows qzeros_cols = qweight_cols - current_platform.seed_everything(0) + set_random_seed(0) input = torch.rand((input_rows, input_cols), dtype=input_dtype, device=device) qweight = torch.randint( diff --git a/tests/kernels/quantization/test_cutlass_w4a8_moe.py b/tests/kernels/quantization/test_cutlass_w4a8_moe.py index a855f7333b61..de0e347d8fe7 100644 --- a/tests/kernels/quantization/test_cutlass_w4a8_moe.py +++ b/tests/kernels/quantization/test_cutlass_w4a8_moe.py @@ -17,6 +17,7 @@ ) from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types +from vllm.utils.torch_utils import set_random_seed IS_SUPPORTED_BY_GPU = ( current_platform.is_cuda() and current_platform.get_device_capability()[0] >= 9 @@ -248,7 +249,7 @@ def compute_moe_reference_output(setup: MoETestSetup) -> torch.Tensor: @pytest.mark.parametrize("random_zero", [True, False]) def test_cutlass_w4a8_moe_mm_end_to_end(shape, random_zero): num_experts, N, K = shape - current_platform.seed_everything(42) + set_random_seed(42) setup = make_moe_test_setup( num_experts=num_experts, K=K, N=N, max_blocks=64, random_zero=random_zero ) @@ -308,7 +309,7 @@ def forward(self, a: torch.Tensor) -> torch.Tensor: reason="W4A8 Grouped GEMM is not supported on this GPU type.", ) def test_cutlass_w4a8_moe_mm_cuda_graph(): - current_platform.seed_everything(42) + set_random_seed(42) # Fixed config for CUDA graph test (single parameter point). 
num_experts = 8 K = 512 diff --git a/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py b/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py index 1e5c7dafb0f5..94fa38b5aae4 100644 --- a/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py +++ b/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py @@ -12,6 +12,7 @@ from vllm import _custom_ops as ops from vllm.platforms import current_platform from vllm.utils.flashinfer import flashinfer_scaled_fp4_mm +from vllm.utils.torch_utils import set_random_seed if not current_platform.has_device_capability(100): pytest.skip( @@ -72,7 +73,7 @@ def test_flashinfer_nvfp4_gemm( if backend == "trtllm" and dtype == torch.float16: pytest.skip("Only torch.bfloat16 is supported for TRTLLM FP4 GEMM operations") - current_platform.seed_everything(seed) + set_random_seed(seed) m, n, packed_k = shape k = packed_k * 2 block_size = 16 diff --git a/tests/kernels/quantization/test_flashinfer_scaled_mm.py b/tests/kernels/quantization/test_flashinfer_scaled_mm.py index b30821b6895b..2c945ffcc4cd 100644 --- a/tests/kernels/quantization/test_flashinfer_scaled_mm.py +++ b/tests/kernels/quantization/test_flashinfer_scaled_mm.py @@ -6,6 +6,7 @@ from vllm import _custom_ops as ops from vllm.platforms import current_platform from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm +from vllm.utils.torch_utils import set_random_seed if not current_platform.has_device_capability(100): pytest.skip( @@ -38,7 +39,7 @@ def test_flashinfer_fp8_gemm( device: str, autotune: bool, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) m, n, k = shape a = torch.randn((m, k), dtype=dtype, device=device) b = torch.randn((n, k), dtype=dtype, device=device) / k diff --git a/tests/kernels/quantization/test_fp8_quant.py b/tests/kernels/quantization/test_fp8_quant.py index 19aa21b96a57..452206495299 100644 --- a/tests/kernels/quantization/test_fp8_quant.py +++ b/tests/kernels/quantization/test_fp8_quant.py @@ -11,7 +11,7 @@ ref_dynamic_per_token_quant, ) from tests.kernels.utils import opcheck -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed DTYPES = [torch.bfloat16, torch.float] HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193] @@ -51,7 +51,7 @@ def opcheck_fp8_quant( def test_dynamic_per_token_fp8_quant( num_tokens: int, hidden_size: int, dtype: torch.dtype, scale_ub: bool, seed: int ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) x = ( torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") + 1e-6 @@ -81,7 +81,7 @@ def test_dynamic_per_token_fp8_quant( def test_dynamic_per_tensor_fp8_quant( num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") @@ -101,7 +101,7 @@ def test_dynamic_per_tensor_fp8_quant( @torch.inference_mode() @pytest.mark.parametrize("seed", SEEDS) def test_fp8_quant_large(seed: int) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) num_tokens = 1024000 # Mistral-Nemo's max_position_embeddings hidden_size = 1152 # Smallest hidden_size to reproduce the error diff --git a/tests/kernels/quantization/test_fp8_quant_group.py b/tests/kernels/quantization/test_fp8_quant_group.py index f5e1cde94b6e..989bcf81a01c 100644 --- a/tests/kernels/quantization/test_fp8_quant_group.py +++ b/tests/kernels/quantization/test_fp8_quant_group.py @@ -7,7 +7,7 @@ from 
vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed @pytest.mark.parametrize( @@ -30,7 +30,7 @@ def test_quantfp8_group_functionality( Tests both CUDA and native implementations, column-major scales, and verifies consistency between implementations. """ - current_platform.seed_everything(seed) + set_random_seed(seed) x = torch.randn((batch_size, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8 expected_num_groups = (hidden_dim + group_size - 1) // group_size @@ -83,7 +83,7 @@ def test_quantfp8_group_functionality( @pytest.mark.parametrize("use_ue8m0", [True, False]) @torch.inference_mode() def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) group_size = 64 @@ -136,7 +136,7 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None: @pytest.mark.parametrize("seed", [42]) @torch.inference_mode() def test_quantfp8_group_edge_cases(seed: int) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) batch_size = 16 group_size = 64 diff --git a/tests/kernels/quantization/test_gguf.py b/tests/kernels/quantization/test_gguf.py index 0988ba01759f..912d5fee4e59 100644 --- a/tests/kernels/quantization/test_gguf.py +++ b/tests/kernels/quantization/test_gguf.py @@ -11,7 +11,7 @@ import vllm._custom_ops as ops from vllm.model_executor.layers.fused_moe import fused_experts from vllm.model_executor.layers.quantization.gguf import _fused_moe_gguf -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample") GGUF_SAMPLE_MOE = snapshot_download("SzymonOzog/test-gguf-moe-sample") @@ -91,7 +91,7 @@ def test_dequantize( @pytest.mark.parametrize("quant_type", QUANT_TYPES) @torch.inference_mode() def test_mmvq(hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType): - current_platform.seed_everything(0) + set_random_seed(0) tensors = get_gguf_sample_tensors(hidden_size, quant_type) x = torch.rand((1, hidden_size), dtype=dtype, device="cuda") @@ -134,7 +134,7 @@ def test_mmq( dtype: torch.dtype, quant_type: GGMLQuantizationType, ): - current_platform.seed_everything(0) + set_random_seed(0) tensors = get_gguf_sample_tensors(hidden_size, quant_type) x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda") @@ -169,7 +169,7 @@ def test_moe( quant_type: GGMLQuantizationType, top_k: int, ): - current_platform.seed_everything(0) + set_random_seed(0) H, E = 1024, 256 x = torch.rand((num_tokens, H), dtype=dtype, device="cuda") diff --git a/tests/kernels/quantization/test_int8_quant.py b/tests/kernels/quantization/test_int8_quant.py index 48e947db5fa7..cb2cd55facfd 100644 --- a/tests/kernels/quantization/test_int8_quant.py +++ b/tests/kernels/quantization/test_int8_quant.py @@ -7,7 +7,7 @@ from tests.kernels.quant_utils import ref_dynamic_per_token_quant from tests.kernels.utils import opcheck from vllm._custom_ops import scaled_int8_quant -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed DTYPES = [torch.bfloat16, torch.float] HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193] @@ -46,7 +46,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True): def test_dynamic_scaled_int8_quant( num_tokens: int, hidden_size: int, 
dtype: torch.dtype, seed: int ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 @@ -70,7 +70,7 @@ def test_dynamic_scaled_int8_quant( def test_dynamic_scaled_int8_azp_quant( num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) int8_traits = torch.iinfo(torch.int8) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300 @@ -111,7 +111,7 @@ def test_dynamic_scaled_int8_azp_quant( def test_static_scaled_int8_quant( num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int, scale: float ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) int8_traits = torch.iinfo(torch.int8) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 @@ -144,7 +144,7 @@ def test_static_scaled_int8_azp_quant( scale: float, azp: int, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) int8_traits = torch.iinfo(torch.int8) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300 diff --git a/tests/kernels/quantization/test_mxfp4_qutlass.py b/tests/kernels/quantization/test_mxfp4_qutlass.py index 0bacbef2046b..0ad8e48ab159 100644 --- a/tests/kernels/quantization/test_mxfp4_qutlass.py +++ b/tests/kernels/quantization/test_mxfp4_qutlass.py @@ -24,6 +24,7 @@ from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed if not torch.cuda.is_available(): pytest.skip("CUDA required for these tests.", allow_module_level=True) @@ -205,7 +206,7 @@ def _forward_quantize_ref( @pytest.fixture(autouse=True) def _seed_each_test(): - current_platform.seed_everything(0) + set_random_seed(0) np.random.seed(0) torch.random.manual_seed(0) diff --git a/tests/kernels/quantization/test_nvfp4_quant.py b/tests/kernels/quantization/test_nvfp4_quant.py index 12f1008ecf27..d17c69663fbd 100644 --- a/tests/kernels/quantization/test_nvfp4_quant.py +++ b/tests/kernels/quantization/test_nvfp4_quant.py @@ -6,6 +6,7 @@ from vllm import _custom_ops as ops from vllm.platforms import current_platform from vllm.scalar_type import scalar_types +from vllm.utils.torch_utils import set_random_seed if not current_platform.has_device_capability(100): pytest.skip( @@ -134,7 +135,7 @@ def test_quantize_to_fp4( seed: int, device: str, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) m, n = shape @@ -156,7 +157,7 @@ def test_quantize_to_fp4( @torch.inference_mode() def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None: dtype = torch.float16 - current_platform.seed_everything(42) + set_random_seed(42) torch.set_default_device("cuda:0") m, n = pad_shape diff --git a/tests/kernels/quantization/test_nvfp4_qutlass.py b/tests/kernels/quantization/test_nvfp4_qutlass.py index 3824a080f504..bb25c4ab9aaf 100644 --- a/tests/kernels/quantization/test_nvfp4_qutlass.py +++ b/tests/kernels/quantization/test_nvfp4_qutlass.py @@ -25,6 +25,7 @@ from vllm._custom_ops import fusedQuantizeNv from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed if not torch.cuda.is_available(): pytest.skip("CUDA required for these 
tests.", allow_module_level=True) @@ -193,7 +194,7 @@ def _forward_quantize_ref(x: torch.Tensor, h: torch.Tensor, rot_size: int): @pytest.fixture(autouse=True) def _seed_each_test(): - current_platform.seed_everything(0) + set_random_seed(0) np.random.seed(0) torch.random.manual_seed(0) diff --git a/tests/kernels/quantization/test_nvfp4_scaled_mm.py b/tests/kernels/quantization/test_nvfp4_scaled_mm.py index 434564737c88..e7e16817593b 100644 --- a/tests/kernels/quantization/test_nvfp4_scaled_mm.py +++ b/tests/kernels/quantization/test_nvfp4_scaled_mm.py @@ -6,6 +6,7 @@ from vllm import _custom_ops as ops from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed if not current_platform.has_device_capability(100): pytest.skip( @@ -59,7 +60,7 @@ def test_nvfp4_gemm( seed: int, device: str, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) m, n, packed_k = shape k = packed_k * 2 block_size = 16 diff --git a/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py b/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py index 4617464a3978..1c9140007f38 100644 --- a/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py +++ b/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py @@ -11,6 +11,7 @@ from vllm._custom_ops import scaled_fp4_quant from vllm.model_executor.layers.activation import SiluAndMul from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed if not current_platform.has_device_capability(100): pytest.skip( @@ -33,7 +34,7 @@ def test_silu_mul_nvfp4_quant( dtype: torch.dtype, shape: tuple[int, int], ) -> None: - current_platform.seed_everything(42) + set_random_seed(42) device = "cuda:0" torch.set_default_device(device) diff --git a/tests/kernels/quantization/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py index 6633a8bbd3c6..1cef5eb93a5c 100644 --- a/tests/kernels/quantization/test_triton_scaled_mm.py +++ b/tests/kernels/quantization/test_triton_scaled_mm.py @@ -11,6 +11,7 @@ import torch from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed device = "cuda" @@ -85,7 +86,7 @@ def test_scaled_mm( ): is_floating_point_type = lambda t: torch.tensor([1, 1], dtype=t).is_floating_point() - current_platform.seed_everything(0) + set_random_seed(0) # NOTE: There are cases, where if the matrix is large enough, an output # like 65504.4 can be produced, and can easily turn into inf when diff --git a/tests/kernels/test_apply_repetition_penalties.py b/tests/kernels/test_apply_repetition_penalties.py index a4619f5846b1..8270cf885f60 100644 --- a/tests/kernels/test_apply_repetition_penalties.py +++ b/tests/kernels/test_apply_repetition_penalties.py @@ -9,6 +9,7 @@ apply_repetition_penalties_torch, ) from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed NUM_SEQS = [1, 2, 3, 4, 8, 13, 17, 32, 37, 256, 1023, 1024, 1025] # [stress, stress, stress, Qwen, llama 4] @@ -38,7 +39,7 @@ def test_apply_repetition_penalties( Test the apply_repetition_penalties custom op against a reference implementation. 
""" - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device("cuda:0") # Create test data @@ -95,7 +96,7 @@ def test_apply_repetition_penalties_zero_seqs() -> None: dtype = torch.float32 seed = 0 - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device("cuda:0") # Create test data diff --git a/tests/kernels/test_fla_layernorm_guard.py b/tests/kernels/test_fla_layernorm_guard.py index f944c6dcfa73..2ece5497cb06 100644 --- a/tests/kernels/test_fla_layernorm_guard.py +++ b/tests/kernels/test_fla_layernorm_guard.py @@ -10,7 +10,7 @@ layernorm_fn, rms_norm_ref, ) -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed def layer_norm_ref( @@ -114,7 +114,7 @@ def test_layer_norm_fwd_basic( is_rms_norm: bool, ) -> None: """Test basic layer norm forward pass without z (gate) tensor.""" - current_platform.seed_everything(seed) + set_random_seed(seed) device = torch.device("cuda:0") # Create inputs @@ -156,7 +156,7 @@ def test_layer_norm_fwd_with_gate( is_rms_norm: bool, ) -> None: """Test layer norm forward pass with z (gate) tensor.""" - current_platform.seed_everything(42) + set_random_seed(42) device = torch.device("cuda:0") # Create inputs @@ -213,7 +213,7 @@ def test_layer_norm_fwd_with_groups( f"hidden_size {hidden_size} not divisible by group_size {group_size}" ) - current_platform.seed_everything(42) + set_random_seed(42) device = torch.device("cuda:0") # Create inputs @@ -253,7 +253,7 @@ def test_layer_norm_rows_per_block( dtype: torch.dtype, ) -> None: """Test that rows_per_block logic works correctly for various M sizes.""" - current_platform.seed_everything(42) + set_random_seed(42) device = torch.device("cuda:0") hidden_size = 1024 @@ -278,7 +278,7 @@ def test_layer_norm_rows_per_block( def test_strided_input(dtype: torch.dtype) -> None: """Test that the kernel handles non-contiguous (strided) inputs correctly.""" - current_platform.seed_everything(42) + set_random_seed(42) device = torch.device("cuda:0") num_tokens = 128 hidden_size = 1024 @@ -318,7 +318,7 @@ def test_output_buffer_provided( dtype: torch.dtype, ) -> None: """Test that the kernel works when an output buffer is provided.""" - current_platform.seed_everything(42) + set_random_seed(42) device = torch.device("cuda:0") # Create inputs @@ -359,7 +359,7 @@ def test_multidimensional_input( dtype: torch.dtype, ) -> None: """Test that the autograd function handles multidimensional inputs.""" - current_platform.seed_everything(42) + set_random_seed(42) device = torch.device("cuda:0") hidden_size = shape[-1] diff --git a/tests/lora/test_fused_moe_lora_kernel.py b/tests/lora/test_fused_moe_lora_kernel.py index 91c8b861c3c5..a4d314be095c 100644 --- a/tests/lora/test_fused_moe_lora_kernel.py +++ b/tests/lora/test_fused_moe_lora_kernel.py @@ -18,8 +18,8 @@ get_tensor_model_parallel_world_size, ) from vllm.lora.ops.triton_ops import fused_moe_lora -from vllm.platforms import current_platform from vllm.utils.network_utils import get_open_port +from vllm.utils.torch_utils import set_random_seed @pytest.fixture(autouse=True) @@ -265,7 +265,7 @@ def test_fused_moe_lora_kernel( seed, ): torch.set_default_device(device) - current_platform.seed_everything(seed) + set_random_seed(seed) # the number of randomly generated sentences. 
     num_sequences = 10
     # generate data
@@ -358,7 +358,7 @@ def test_fused_moe_lora_kernel_fully_sharded(
     seed,
     column_parallel,
 ):
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     # the number of randomly generated sentences.
     num_sequences = 10
     # generate data
@@ -415,7 +415,7 @@ def use_fused_moe_lora_kernel_tensor_parallel(
     def _get_shard_slice(shard_size):
         return slice(local_rank * shard_size, (local_rank + 1) * shard_size)

-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
     device = torch.device(f"cuda:{local_rank}")
     torch.cuda.set_device(device)
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index dfec4236835a..611204b22f47 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -43,8 +43,8 @@
     VocabParallelEmbedding,
     get_masked_input_and_mask,
 )
-from vllm.model_executor.utils import set_random_seed
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed

 from .utils import DummyLoRAManager
diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py
index e4df9751077d..5083f500c5cd 100644
--- a/tests/lora/test_punica_ops.py
+++ b/tests/lora/test_punica_ops.py
@@ -9,7 +9,7 @@
 import vllm.lora.ops.triton_ops as triton_ops
 from vllm.lora.ops.triton_ops import LoRAKernelMeta
 from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed

 from .utils import PunicaTensors, assert_close, generate_data_for_nslices
@@ -395,7 +395,7 @@ def test_kernels(
     Tests LoRA kernels.
     """
     torch.set_default_device(device)
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)

     if op_type == "shrink":
         check_lora_shrink_kernel(
@@ -447,7 +447,7 @@ def test_kernels_hidden_size(
     Tests SGMV and LoRA kernels.
     """
     torch.set_default_device(device)
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)

     if op_type == "shrink":
         check_lora_shrink_kernel(
diff --git a/tests/models/test_vision.py b/tests/models/test_vision.py
index 82ba958a58c4..24e49e9d61c8 100644
--- a/tests/models/test_vision.py
+++ b/tests/models/test_vision.py
@@ -21,6 +21,7 @@
 from vllm.platforms import current_platform
 from vllm.utils.network_utils import get_open_port
 from vllm.utils.system_utils import update_environment_variables
+from vllm.utils.torch_utils import set_random_seed

 pytestmark = pytest.mark.cpu_test
@@ -98,7 +99,7 @@ def run_dp_sharded_vision_model_vs_direct(
     """

     # Set random seed for reproducibility
-    current_platform.seed_everything(0)
+    set_random_seed(0)
     device = f"{current_platform.device_name}:{local_rank}"
     current_platform.set_device(device)
@@ -284,7 +285,7 @@ def run_dp_sharded_mrope_vision_model_vs_direct(
     calling the model directly.
""" # Set random seed for reproducibility - current_platform.seed_everything(0) + set_random_seed(0) device = f"{current_platform.device_name}:{local_rank}" current_platform.set_device(device) torch.set_default_device(device) @@ -408,7 +409,7 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker( ): """Test run_dp_sharded_mrope_vision_model with uneven load distribution.""" # Set up distributed environment - current_platform.seed_everything(123) + set_random_seed(123) device = f"{current_platform.device_name}:{local_rank}" current_platform.set_device(device) torch.set_default_device(device) diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py index f4f40babaf79..6e08b9316be4 100644 --- a/tests/v1/attention/test_attention_backends.py +++ b/tests/v1/attention/test_attention_backends.py @@ -19,7 +19,11 @@ from vllm.config import ModelConfig from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv -from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, is_torch_equal_or_newer +from vllm.utils.torch_utils import ( + STR_DTYPE_TO_TORCH_DTYPE, + is_torch_equal_or_newer, + set_random_seed, +) from vllm.v1.attention.backends.utils import ( CommonAttentionMetadata, set_kv_cache_layout, @@ -320,7 +324,7 @@ def _test_backend_correctness( multiple GPUs. This tests that backends work correctly with different head counts. """ - current_platform.seed_everything(42) + set_random_seed(42) hf_config_override = None if tensor_parallel_size > 1: diff --git a/tests/v1/kv_offload/test_cpu_gpu.py b/tests/v1/kv_offload/test_cpu_gpu.py index 3516c0013879..4d9d54e038e8 100644 --- a/tests/v1/kv_offload/test_cpu_gpu.py +++ b/tests/v1/kv_offload/test_cpu_gpu.py @@ -7,6 +7,7 @@ import torch from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandlers @@ -62,7 +63,7 @@ def test_transfer( seed: int, device: str, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) # create per-layer GPU KV caches based on available attn_backends attn_backends_list = BACKENDS_TO_TEST diff --git a/tests/v1/tpu/test_mha_attn.py b/tests/v1/tpu/test_mha_attn.py index 84968dee6b60..b7ebf5919607 100644 --- a/tests/v1/tpu/test_mha_attn.py +++ b/tests/v1/tpu/test_mha_attn.py @@ -15,6 +15,7 @@ from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.attention.selector import _cached_get_attn_backend from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed @pytest.fixture(autouse=True) @@ -63,7 +64,7 @@ def test_mha_attn_forward( head_size: int, device: str, ): - current_platform.seed_everything(0) + set_random_seed(0) # These are expected to be f32 q = torch.randn(batch_size, seq_len, num_heads * head_size, device=device) k = torch.randn(batch_size, seq_len, num_kv_heads * head_size, device=device) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 59f1ac705829..4ab5aa66c315 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -26,6 +26,7 @@ from vllm.sampling_params import SamplingParams from vllm.utils.mem_constants import GiB_bytes from vllm.utils.system_utils import update_environment_variables +from 
 from vllm.v1.core.kv_cache_utils import estimate_max_model_len, get_kv_cache_configs
 from vllm.v1.core.sched.output import CachedRequestData, NewRequestData, SchedulerOutput
 from vllm.v1.kv_cache_interface import (
@@ -776,7 +777,7 @@ def test_hybrid_attention_mamba_tensor_shapes():
     will not corrupt an attention block and vice versa
     """

-    current_platform.seed_everything(42)
+    set_random_seed(42)

     update_environment_variables(
         {
diff --git a/vllm/model_executor/__init__.py b/vllm/model_executor/__init__.py
index b50f0cb3a61a..8d79940b858f 100644
--- a/vllm/model_executor/__init__.py
+++ b/vllm/model_executor/__init__.py
@@ -2,10 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from vllm.model_executor.parameter import BasevLLMParameter, PackedvLLMParameter
-from vllm.model_executor.utils import set_random_seed

 __all__ = [
-    "set_random_seed",
     "BasevLLMParameter",
     "PackedvLLMParameter",
 ]
diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py
index b89371d98754..d4e87707c847 100644
--- a/vllm/model_executor/utils.py
+++ b/vllm/model_executor/utils.py
@@ -10,12 +10,6 @@
 from vllm.utils.torch_utils import is_torch_equal_or_newer


-def set_random_seed(seed: int | None) -> None:
-    from vllm.platforms import current_platform
-
-    current_platform.seed_everything(seed)
-
-
 def set_weight_attrs(
     weight: torch.Tensor,
     weight_attrs: dict[str, Any] | None,
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index d4b40045df38..833b66d5b80f 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -372,6 +372,10 @@ def seed_everything(cls, seed: int | None = None) -> None:
         Loosely based on:
         https://github.com/Lightning-AI/pytorch-lightning/blob/2.4.0/src/lightning/fabric/utilities/seed.py#L20
         """
+        logger.info_once(
+            "`seed_everything` is deprecated. It will be removed in v0.14.0 or later. "
+            "Please use `vllm.utils.torch_utils.set_random_seed` instead."
+        )
         if seed is not None:
             random.seed(seed)
             np.random.seed(seed)
diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py
index db596052a04d..ca0cecc4a0ad 100644
--- a/vllm/utils/torch_utils.py
+++ b/vllm/utils/torch_utils.py
@@ -3,6 +3,7 @@
 import contextlib
 import importlib.metadata
 import os
+import random
 import threading
 from collections.abc import Callable, Collection
 from functools import lru_cache
@@ -278,6 +279,13 @@ def kv_cache_dtype_str_to_dtype(
     return STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype]


+def set_random_seed(seed: int | None) -> None:
+    if seed is not None:
+        random.seed(seed)
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+
+
 def create_kv_caches_with_random_flash(
     num_blocks: int,
     block_size: int,
@@ -290,9 +298,7 @@ def create_kv_caches_with_random_flash(
     device: str | None = "cuda",
     cache_layout: str | None = "NHD",
 ) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
-    from vllm.platforms import current_platform
-
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)

     dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
     generic_kv_cache_shape = (num_blocks, 2, block_size, num_heads, head_size)
@@ -335,9 +341,8 @@ def create_kv_caches_with_random(
         raise ValueError(
             f"Does not support key cache of type fp8 with head_size {head_size}"
         )
-    from vllm.platforms import current_platform

-    current_platform.seed_everything(seed)
+    set_random_seed(seed)

     dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py
index e54b995ab908..654f58834a15 100644
--- a/vllm/v1/worker/cpu_worker.py
+++ b/vllm/v1/worker/cpu_worker.py
@@ -10,10 +10,10 @@
 from vllm import envs
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.model_executor.utils import set_random_seed
 from vllm.platforms import CpuArchEnum, current_platform
 from vllm.platforms.cpu import CpuPlatform, LogicalCPUInfo
 from vllm.profiler.wrapper import TorchProfilerWrapper
+from vllm.utils.torch_utils import set_random_seed
 from vllm.v1.worker.cpu_model_runner import CPUModelRunner
 from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index e5f1fa9179ec..c8441c09b2f9 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -34,7 +34,6 @@
 )
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
-from vllm.model_executor import set_random_seed
 from vllm.model_executor.models.interfaces import is_mixture_of_experts
 from vllm.model_executor.warmup.kernel_warmup import kernel_warmup
 from vllm.platforms import current_platform
@@ -43,6 +42,7 @@
 from vllm.tasks import SupportedTask
 from vllm.utils.mem_constants import GiB_bytes
 from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
+from vllm.utils.torch_utils import set_random_seed
 from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
 from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py
index ab22d0af63a5..3ece4c58214a 100644
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -20,12 +20,11 @@
 )
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
-from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
 from vllm.platforms.tpu import USE_TPU_INFERENCE
 from vllm.tasks import SupportedTask
 from vllm.utils.math_utils import cdiv
-from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
 from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
 from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py
index 1faa1a24ff0e..fe0850771dd0 100644
--- a/vllm/v1/worker/xpu_worker.py
+++ b/vllm/v1/worker/xpu_worker.py
@@ -9,9 +9,9 @@
 from vllm.config import VllmConfig
 from vllm.distributed import get_world_group
 from vllm.logger import init_logger
-from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
 from vllm.profiler.wrapper import TorchProfilerWrapper
+from vllm.utils.torch_utils import set_random_seed
 from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
 from vllm.v1.worker.xpu_model_runner import XPUModelRunner
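
For reference, a minimal sketch of the call pattern this patch standardizes on. It is not part of the patch itself; the test name and tensor shape below are illustrative only, and it assumes a CUDA device is available.

```python
# Illustrative sketch: seed via the helper added in vllm/utils/torch_utils.py
# above instead of the deprecated current_platform.seed_everything().
import torch

from vllm.utils.torch_utils import set_random_seed


def test_example_kernel() -> None:  # hypothetical test, not in the tree
    # Previously: current_platform.seed_everything(0)
    set_random_seed(0)  # seeds Python's random, NumPy, and torch in one call
    x = torch.rand(16, 1024, dtype=torch.bfloat16, device="cuda")
    assert x.shape == (16, 1024)
```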