Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/design/plugin_system.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,3 +154,4 @@ The interface for the model/module may change during vLLM's development. If you
!!! warning "Deprecations"
- `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
- `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.attention.backends.registry.register_backend` to add a new attention backend to `AttentionBackendEnum` instead.
- `seed_everything` platform interface is deprecated. It will be removed in v0.14.0 or later. Please use `vllm.utils.torch_utils.set_random_seed` instead.
3 changes: 2 additions & 1 deletion tests/compile/distributed/test_async_tp.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
)
from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables
from vllm.utils.torch_utils import set_random_seed

from ...models.registry import HF_EXAMPLE_MODELS
from ...utils import (
Expand Down Expand Up @@ -301,7 +302,7 @@ def async_tp_pass_on_test_model(
dtype: torch.dtype,
dynamic: bool,
):
current_platform.seed_everything(0)
set_random_seed(0)

device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
Expand Down
3 changes: 2 additions & 1 deletion tests/compile/distributed/test_fusion_all_reduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
)
from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables
from vllm.utils.torch_utils import set_random_seed

from ...utils import has_module_attribute, multi_gpu_test
from ..backend import TestBackend
Expand Down Expand Up @@ -263,7 +264,7 @@ def all_reduce_fusion_pass_on_test_model(
enable_rms_norm_custom_op,
enable_quant_fp8_custom_op,
):
current_platform.seed_everything(0)
set_random_seed(0)

device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
Expand Down
3 changes: 2 additions & 1 deletion tests/compile/distributed/test_sequence_parallelism.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
from vllm.platforms import current_platform
from vllm.utils.system_utils import update_environment_variables
from vllm.utils.torch_utils import set_random_seed

from ...utils import multi_gpu_test
from ..backend import TestBackend
Expand Down Expand Up @@ -232,7 +233,7 @@ def sequence_parallelism_pass_on_test_model(
fuse_norm_quant: bool,
dynamic: bool,
):
current_platform.seed_everything(0)
set_random_seed(0)

device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
Expand Down
3 changes: 2 additions & 1 deletion tests/kernels/attention/test_aiter_flash_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import vllm.v1.attention.backends.rocm_aiter_fa # noqa: F401
from vllm.attention.utils.fa_utils import is_flash_attn_varlen_func_available
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed

NUM_HEADS = [(4, 4), (8, 2)]
HEAD_SIZES = [128, 256]
Expand Down Expand Up @@ -104,7 +105,7 @@ def test_varlen_with_paged_kv(
if not is_flash_attn_varlen_func_available():
pytest.skip("flash_attn_varlen_func required to run this test.")
torch.set_default_device("cuda")
current_platform.seed_everything(0)
set_random_seed(0)
num_seqs = len(seq_lens)
query_lens = [x[0] for x in seq_lens]
kv_lens = [x[1] for x in seq_lens]
Expand Down
3 changes: 2 additions & 1 deletion tests/kernels/attention/test_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.platforms import current_platform
from vllm.utils.mem_utils import get_max_shared_memory_bytes
from vllm.utils.torch_utils import set_random_seed

FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability.
Expand Down Expand Up @@ -150,7 +151,7 @@ def test_paged_attention(

global PARTITION_SIZE

current_platform.seed_everything(seed)
set_random_seed(seed)
torch.set_default_device(device)
scale = float(1.0 / (head_size**0.5))
num_query_heads, num_kv_heads = num_heads
Expand Down
17 changes: 9 additions & 8 deletions tests/kernels/attention/test_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed

COPYING_DIRECTION = [("cuda", "cpu"), ("cuda", "cuda"), ("cpu", "cuda")]
DTYPES = [torch.bfloat16, torch.float]
Expand Down Expand Up @@ -64,7 +65,7 @@ def test_reshape_and_cache(
) -> None:
if kv_cache_dtype == "fp8" and head_size % 16:
pytest.skip()
current_platform.seed_everything(seed)
set_random_seed(seed)
torch.set_default_device(device)
torch.cuda.set_device(device)
# Create a random slot mapping.
Expand Down Expand Up @@ -185,7 +186,7 @@ def test_reshape_and_cache_flash(
kv_cache_layout: str,
implementation: str,
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
torch.set_default_device(device)
torch.cuda.set_device(device)
assert implementation in ["cuda", "triton"]
Expand Down Expand Up @@ -355,7 +356,7 @@ def test_swap_blocks(
if kv_cache_dtype == "fp8" and head_size % 16:
pytest.skip()

current_platform.seed_everything(seed)
set_random_seed(seed)

src_device = device if direction[0] == "cuda" else "cpu"
dst_device = device if direction[1] == "cuda" else "cpu"
Expand Down Expand Up @@ -444,7 +445,7 @@ def test_fp8_e4m3_conversion(
seed: int,
device: str,
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)

low = -224.0
high = 224.0
Expand Down Expand Up @@ -507,7 +508,7 @@ def test_concat_and_cache_mla(
device: str,
kv_cache_dtype: str,
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
torch.set_default_device(device)
torch.cuda.set_device(device)

Expand Down Expand Up @@ -584,7 +585,7 @@ def test_concat_and_cache_ds_mla(
if dtype.itemsize != 2:
pytest.skip("ds_mla only supports 16-bit input")
kv_cache_dtype = "fp8_ds_mla"
current_platform.seed_everything(seed)
set_random_seed(seed)
torch.set_default_device(device)
torch.cuda.set_device(device)

Expand Down Expand Up @@ -695,7 +696,7 @@ def test_swap_blocks_mla(
device: str,
kv_cache_dtype: str,
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
torch.set_default_device(device)
torch.cuda.set_device(device)

Expand Down Expand Up @@ -947,7 +948,7 @@ def test_concat_and_cache_mla_cpu(
) -> None:
device = "cpu"
kv_cache_dtype = "auto"
current_platform.seed_everything(seed)
set_random_seed(seed)
torch.set_default_device(device)

total_slots = num_blocks * block_size
Expand Down
5 changes: 3 additions & 2 deletions tests/kernels/attention/test_cascade_flash_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import torch

from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.attention.backends.flash_attn import cascade_attention, merge_attn_states

try:
Expand Down Expand Up @@ -39,7 +40,7 @@ def test_merge_kernel(
dtype: torch.dtype,
):
torch.set_default_device("cuda")
current_platform.seed_everything(0)
set_random_seed(0)
num_query_heads = num_heads[0]
num_kv_heads = num_heads[1]
assert num_query_heads % num_kv_heads == 0
Expand Down Expand Up @@ -103,7 +104,7 @@ def test_cascade(
f'to: "{fa_version_unsupported_reason(fa_version)}"'
)

current_platform.seed_everything(0)
set_random_seed(0)

window_size = (-1, -1)
scale = head_size**-0.5
Expand Down
3 changes: 2 additions & 1 deletion tests/kernels/attention/test_cpu_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import torch

from vllm.platforms import CpuArchEnum, current_platform
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.attention.backends.cpu_attn import _get_attn_isa

if not current_platform.is_cpu():
Expand Down Expand Up @@ -190,7 +191,7 @@ def varlen_with_paged_kv(
use_sink: bool,
isa: str,
) -> None:
current_platform.seed_everything(0)
set_random_seed(0)
num_seqs = len(seq_lens)
query_lens = [x[0] for x in seq_lens]
kv_lens = [x[1] for x in seq_lens]
Expand Down
3 changes: 2 additions & 1 deletion tests/kernels/attention/test_flash_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import torch

from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed

try:
from vllm.vllm_flash_attn import (
Expand Down Expand Up @@ -129,7 +130,7 @@ def test_varlen_with_paged_kv(
"Flash attention with quantized inputs is only "
"supported on version 3 with bfloat16 base type"
)
current_platform.seed_everything(0)
set_random_seed(0)
num_seqs = len(seq_lens)
query_lens = [x[0] for x in seq_lens]
kv_lens = [x[1] for x in seq_lens]
Expand Down
9 changes: 5 additions & 4 deletions tests/kernels/attention/test_flashinfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pytest

from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed

try:
import flashinfer
Expand Down Expand Up @@ -101,7 +102,7 @@ def test_flashinfer_decode_with_paged_kv(
sliding_window: int | None,
) -> None:
torch.set_default_device("cuda")
current_platform.seed_everything(0)
set_random_seed(0)
num_seqs = len(kv_lens)
num_query_heads = num_heads[0]
num_kv_heads = num_heads[1]
Expand Down Expand Up @@ -196,7 +197,7 @@ def test_flashinfer_prefill_with_paged_kv(
sliding_window: int | None,
) -> None:
torch.set_default_device("cuda")
current_platform.seed_everything(0)
set_random_seed(0)
num_seqs = len(seq_lens)
query_lens = [x[0] for x in seq_lens]
kv_lens = [x[1] for x in seq_lens]
Expand Down Expand Up @@ -299,7 +300,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
) -> None:
pytest.skip("TODO: fix the accuracy issue")
torch.set_default_device("cuda")
current_platform.seed_everything(0)
set_random_seed(0)
num_seqs = len(seq_lens)
query_lens = [x[0] for x in seq_lens]
kv_lens = [x[1] for x in seq_lens]
Expand Down Expand Up @@ -409,7 +410,7 @@ def test_flashinfer_decode_with_paged_fp8_kv(
) -> None:
# test doesn't work for num_heads = (16,16)
torch.set_default_device("cuda")
current_platform.seed_everything(0)
set_random_seed(0)
num_seqs = len(kv_lens)
num_query_heads = num_heads[0]
num_kv_heads = num_heads[1]
Expand Down
5 changes: 3 additions & 2 deletions tests/kernels/attention/test_flashinfer_trtllm_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
)
from vllm.platforms import current_platform
from vllm.utils.math_utils import round_up
from vllm.utils.torch_utils import set_random_seed

if not current_platform.is_device_capability_family(100):
pytest.skip(
Expand Down Expand Up @@ -80,7 +81,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
has_sinks: bool,
) -> None:
torch.set_default_device("cuda")
current_platform.seed_everything(42)
set_random_seed(42)

q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes
q_quant_dtype = q_quant_dtype or dtype
Expand Down Expand Up @@ -279,7 +280,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
has_sinks: bool,
) -> None:
torch.set_default_device("cuda")
current_platform.seed_everything(42)
set_random_seed(42)

q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes
q_quant_dtype = q_quant_dtype or dtype
Expand Down
8 changes: 4 additions & 4 deletions tests/kernels/attention/test_lightning_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import torch

from vllm.model_executor.layers.lightning_attn import linear_decode_forward_triton
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed

NUM_HEADS = [4, 8]
HEAD_SIZES = [64]
Expand Down Expand Up @@ -124,7 +124,7 @@ def test_linear_decode_forward_triton(
torch.set_default_device("cuda")
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)
current_platform.seed_everything(42)
set_random_seed(42)
base = 0.01
q = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
k = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
Expand Down Expand Up @@ -167,7 +167,7 @@ def test_linear_decode_forward_triton_with_padding(
torch.set_default_device("cuda")
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)
current_platform.seed_everything(42)
set_random_seed(42)

batch_size = 4
base = 0.01
Expand Down Expand Up @@ -231,7 +231,7 @@ def test_lightning_attention_reference(
torch.set_default_device("cuda")
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)
current_platform.seed_everything(42)
set_random_seed(42)

base = 0.01
q = base * torch.randn(batch_size, num_heads, seq_len, head_size, dtype=dtype)
Expand Down
5 changes: 3 additions & 2 deletions tests/kernels/attention/test_mha_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from vllm.platforms.cpu import CpuPlatform
from vllm.platforms.cuda import CudaPlatform
from vllm.platforms.rocm import RocmPlatform
from vllm.utils.torch_utils import set_random_seed


@pytest.fixture(autouse=True)
Expand Down Expand Up @@ -123,7 +124,7 @@ def test_mha_attn_forward(
dtype: torch.dtype,
device: str,
):
current_platform.seed_everything(0)
set_random_seed(0)
torch.set_default_device(device)
torch.set_default_dtype(dtype)

Expand Down Expand Up @@ -168,7 +169,7 @@ def test_mha_attn_varlen_forward(
dtype: torch.dtype,
device: str,
):
current_platform.seed_everything(0)
set_random_seed(0)
torch.set_default_device(device)
torch.set_default_dtype(dtype)

Expand Down
6 changes: 3 additions & 3 deletions tests/kernels/attention/test_prefix_prefill.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode
from vllm.attention.ops.prefix_prefill import context_attention_fwd
from vllm.platforms import current_platform
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed

NUM_HEADS = [64]
NUM_QUERIES_PER_KV = [1, 64]
Expand Down Expand Up @@ -125,7 +125,7 @@ def test_contexted_kv_attention(
):
pytest.skip("ROCm custom paged attention does not support fp8_e5m2 KV cache")

current_platform.seed_everything(0)
set_random_seed(0)
torch.set_default_device(device)

# Need this, otherwise when we capture the graph the process
Expand Down Expand Up @@ -346,7 +346,7 @@ def test_contexted_kv_attention_alibi(
):
pytest.skip("ROCm custom paged attention does not support fp8_e5m2 KV cache")

current_platform.seed_everything(0)
set_random_seed(0)
torch.set_default_device(device)

# Need this, otherwise when we capture the graph the process
Expand Down
Loading