Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
9318236
add pre-commit check as first CI step to catch linting issues early
tahsintunan Sep 1, 2025
250b7e4
replace arbitrary use of big llama models with smaller models
tahsintunan Sep 1, 2025
1c8e033
split test_models into separate basic correctness and sliding window …
tahsintunan Sep 1, 2025
9ecbd1e
skip test_collective_rpc if num_gpu < tp
tahsintunan Sep 1, 2025
115af2e
enable prefix caching in test_sampling_params_e2e
tahsintunan Sep 1, 2025
82aa2b3
refactor shutdown test to use explicit server termination
tahsintunan Sep 1, 2025
1749e88
remove pre-commit check from buildkite
tahsintunan Sep 2, 2025
62e37dc
Merge branch 'main' into ci-tiny-models
tahsintunan Sep 2, 2025
9fefe3e
replace models with hmellor/tiny-random-LlamaForCausalLM
tahsintunan Sep 2, 2025
16ac0fe
Merge branch 'main' into ci-tiny-models
tahsintunan Sep 2, 2025
8bae25d
remove tiny-random-llama from test_basic_correctness due to tokenizat…
tahsintunan Sep 2, 2025
3429a6b
fix memory profiling test flakiness
tahsintunan Sep 2, 2025
86b35f9
use small model to fix CI timeout
tahsintunan Sep 2, 2025
b13a504
use opt-125m for TP correctness tests
tahsintunan Sep 3, 2025
3f103c3
use meta-llama for SP tests
tahsintunan Sep 3, 2025
b892976
Use opt-125m for pytorch checkpoint test
tahsintunan Sep 13, 2025
eaf9786
use tiny-random-LlamaForCausalLM in SP tests
tahsintunan Sep 13, 2025
8c1707f
Merge branch 'main' into ci-tiny-models
njhill Sep 18, 2025
af5a75b
Merge branch 'main' into ci-tiny-models
tahsintunan Sep 19, 2025
a6cdfc3
Merge commit '17edd8a' into pr/tahsintunan/24057
hmellor Oct 8, 2025
6d813f7
ruff
hmellor Oct 8, 2025
2b7421a
Merge commit 'd6953be' into pr/tahsintunan/24057
hmellor Oct 8, 2025
5d2b46a
Merge branch 'main' into pr/tahsintunan/24057
hmellor Oct 8, 2025
4bacf51
Don't use Pythia because it's max model len is too short
hmellor Oct 8, 2025
ad78423
Revert one test which doesn't pass to unblock the rest
hmellor Oct 8, 2025
200df3e
Merge branch 'main' into ci-tiny-models
tahsintunan Oct 14, 2025
93bc36d
fix failing tests
tahsintunan Oct 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions tests/basic_correctness/test_basic_correctness.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from ..utils import multi_gpu_test

MODELS = [
"google/gemma-2-2b-it",
"hmellor/tiny-random-Gemma2ForCausalLM",
"meta-llama/Llama-3.2-1B-Instruct",
]

Expand All @@ -29,7 +29,7 @@

def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted"""
llm = LLM("distilbert/distilgpt2")
llm = LLM("hmellor/tiny-random-LlamaForCausalLM")
weak_llm = weakref.ref(llm)
del llm
# If there's any circular reference to vllm, this fails
Expand Down Expand Up @@ -125,14 +125,14 @@ def test_models(
@pytest.mark.parametrize(
"model, distributed_executor_backend, attention_backend, test_suite, extra_env",
[
("distilbert/distilgpt2", "ray", "", "L4", {}),
("distilbert/distilgpt2", "mp", "", "L4", {}),
("distilbert/distilgpt2", "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
("distilbert/distilgpt2", "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
("facebook/opt-125m", "ray", "", "L4", {}),
("facebook/opt-125m", "mp", "", "L4", {}),
("facebook/opt-125m", "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
("facebook/opt-125m", "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
("distilbert/distilgpt2", "ray", "", "A100", {}),
("distilbert/distilgpt2", "mp", "", "A100", {}),
("facebook/opt-125m", "ray", "", "A100", {}),
("facebook/opt-125m", "mp", "", "A100", {}),
],
)
@pytest.mark.parametrize("enable_prompt_embeds", [True, False])
Expand Down
2 changes: 1 addition & 1 deletion tests/basic_correctness/test_cpu_offload.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@

def test_cpu_offload():
compare_two_settings(
"meta-llama/Llama-3.2-1B-Instruct", [], ["--cpu-offload-gb", "1"]
"hmellor/tiny-random-LlamaForCausalLM", [], ["--cpu-offload-gb", "1"]
)
4 changes: 2 additions & 2 deletions tests/basic_correctness/test_cumem.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def model(x):
"model",
[
# sleep mode with safetensors
"meta-llama/Llama-3.2-1B",
"hmellor/tiny-random-LlamaForCausalLM",
# sleep mode with pytorch checkpoint
"facebook/opt-125m",
],
Expand Down Expand Up @@ -174,7 +174,7 @@ def test_end_to_end(model: str):

@create_new_process_for_each_test()
def test_deep_sleep():
model = "Qwen/Qwen3-0.6B"
model = "hmellor/tiny-random-LlamaForCausalLM"
free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running
llm = LLM(model, enable_sleep_mode=True)
Expand Down
4 changes: 2 additions & 2 deletions tests/distributed/test_sequence_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,14 +269,14 @@ def _compare_sp(

SP_TEXT_GENERATION_MODELS = {
# [Decoder-only]
"meta-llama/Llama-3.2-1B-Instruct": SPTestSettings.fast(),
"hmellor/tiny-random-LlamaForCausalLM": SPTestSettings.fast(),
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8": SPTestSettings.fp8_quant(),
}

SP_TEST_MODELS = [
# TODO support other models
# [LANGUAGE GENERATION]
"meta-llama/Llama-3.2-1B-Instruct",
"hmellor/tiny-random-LlamaForCausalLM",
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
]

Expand Down
5 changes: 4 additions & 1 deletion tests/entrypoints/llm/test_collective_rpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
import torch

from vllm import LLM

Expand All @@ -12,6 +13,8 @@
@pytest.mark.parametrize("backend", ["mp", "ray"])
@create_new_process_for_each_test()
def test_collective_rpc(tp_size, backend, monkeypatch):
if torch.cuda.device_count() < tp_size:
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
if tp_size == 1 and backend == "ray":
pytest.skip("Skip duplicate test case")
if tp_size == 1:
Expand All @@ -24,7 +27,7 @@ def echo_rank(self):

monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
llm = LLM(
model="meta-llama/Llama-3.2-1B-Instruct",
model="hmellor/tiny-random-LlamaForCausalLM",
enforce_eager=True,
load_format="dummy",
tensor_parallel_size=tp_size,
Expand Down
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_run_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from vllm.entrypoints.openai.protocol import BatchRequestOutput

MODEL_NAME = "Qwen/Qwen3-0.6B"
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"

# ruff: noqa: E501
INPUT_BATCH = (
Expand Down
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_serving_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.lora.request import LoRARequest

MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
LORA_LOADING_SUCCESS_MESSAGE = "Success: LoRA adapter '{lora_name}' added successfully."
LORA_UNLOADING_SUCCESS_MESSAGE = (
Expand Down
104 changes: 80 additions & 24 deletions tests/entrypoints/openai/test_shutdown.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,93 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import signal
import subprocess
import sys
import time

import openai
import pytest

from ...utils import RemoteOpenAIServer
from ...utils import get_open_port

MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"


@pytest.mark.asyncio
async def test_shutdown_on_engine_failure():
# dtype, max-len etc set so that this can run in CI
args = [
"--dtype",
"bfloat16",
"--max-model-len",
"8192",
"--enforce-eager",
"--max-num-seqs",
"128",
]

with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
async with remote_server.get_async_client() as client:
with pytest.raises((openai.APIConnectionError, openai.InternalServerError)):
# Asking for lots of prompt logprobs will currently crash the
# engine. This may change in the future when that bug is fixed
prompt = "Hello " * 4000
await client.completions.create(
model=MODEL_NAME, prompt=prompt, extra_body={"prompt_logprobs": 10}
"""Verify that API returns connection error when server process is killed.

Starts a vLLM server, kills it to simulate a crash, then verifies that
subsequent API calls fail appropriately.
"""

port = get_open_port()

proc = subprocess.Popen(
[
# dtype, max-len etc set so that this can run in CI
sys.executable,
"-m",
"vllm.entrypoints.openai.api_server",
"--model",
MODEL_NAME,
"--dtype",
"bfloat16",
"--max-model-len",
"128",
"--enforce-eager",
"--port",
str(port),
"--gpu-memory-utilization",
"0.05",
"--max-num-seqs",
"2",
"--disable-frontend-multiprocessing",
],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN),
)

# Wait for server startup
start_time = time.time()
client = openai.AsyncOpenAI(
base_url=f"http://localhost:{port}/v1",
api_key="dummy",
max_retries=0,
timeout=10,
)

# Poll until server is ready
while time.time() - start_time < 30:
try:
await client.completions.create(
model=MODEL_NAME, prompt="Hello", max_tokens=1
)
break
except Exception:
time.sleep(0.5)
if proc.poll() is not None:
stdout, stderr = proc.communicate(timeout=1)
pytest.fail(
f"Server died during startup. stdout: {stdout}, stderr: {stderr}"
)
else:
proc.terminate()
proc.wait(timeout=5)
pytest.fail("Server failed to start in 30 seconds")

# Kill server to simulate crash
proc.terminate()
time.sleep(1)

# Verify API calls now fail
with pytest.raises((openai.APIConnectionError, openai.APIStatusError)):
await client.completions.create(
model=MODEL_NAME, prompt="This should fail", max_tokens=1
)

# Now the server should shut down
return_code = remote_server.proc.wait(timeout=8)
assert return_code is not None
return_code = proc.wait(timeout=5)
assert return_code is not None
1 change: 1 addition & 0 deletions tests/models/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,7 @@ def check_available_online(
"guard": "meta-llama/Llama-Guard-3-1B",
"hermes": "NousResearch/Hermes-3-Llama-3.1-8B",
"fp8": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
"tiny": "hmellor/tiny-random-LlamaForCausalLM",
},
),
"LLaMAForCausalLM": _HfExamplesInfo(
Expand Down
10 changes: 4 additions & 6 deletions tests/samplers/test_no_bad_words.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,13 @@ def _generate(


class TestOneTokenBadWord:
MODEL = "TheBloke/Llama-2-7B-fp16"
MODEL = "hmellor/tiny-random-LlamaForCausalLM"

PROMPT = "Hi! How are"
TARGET_TOKEN = "you"
PROMPT = "How old are "
TARGET_TOKEN = "mn"

def setup_method(self, method):
self.tokenizer = AutoTokenizer.from_pretrained(
self.MODEL, add_prefix_space=True
)
self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL)

self.num_prompt_tokens = len(self._encode(self.PROMPT))
self.target_token_id = self._encode(
Expand Down
2 changes: 1 addition & 1 deletion tests/v1/core/test_scheduler_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from vllm import LLM

MODEL = "meta-llama/Llama-3.2-1B"
MODEL = "hmellor/tiny-random-LlamaForCausalLM"
PROMPT = "Hello my name is Robert and I"


Expand Down
6 changes: 4 additions & 2 deletions tests/v1/engine/test_engine_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,11 @@
if not current_platform.is_cuda():
pytest.skip(reason="V1 currently only supported on CUDA.", allow_module_level=True)

MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
PROMPT = "Hello my name is Robert and I love quantization kernels"
# test_engine_core_concurrent_batches assumes exactly 12 tokens per prompt.
# Adjust prompt if changing model to maintain 12-token length.
PROMPT = "I am Gyoubu Masataka Oniwa"
PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids


Expand Down
2 changes: 1 addition & 1 deletion tests/v1/entrypoints/openai/test_multi_api_servers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from tests.utils import RemoteOpenAIServer
from tests.v1.utils import check_request_balancing

MODEL_NAME = "ibm-research/PowerMoE-3b"
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"

DP_SIZE = os.getenv("DP_SIZE", "1")

Expand Down
7 changes: 2 additions & 5 deletions tests/v1/sample/test_sampling_params_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,13 @@

from vllm import LLM, SamplingParams

MODEL = "meta-llama/Llama-3.2-1B"
MODEL = "hmellor/tiny-random-LlamaForCausalLM"
PROMPT = "Hello my name is Robert and I"


@pytest.fixture(scope="module")
def llm() -> LLM:
# Disable prefix caching so that we can test prompt logprobs.
# TODO remove this after https://github.com/vllm-project/vllm/pull/13949
# is merged
return LLM(MODEL, enforce_eager=True, enable_prefix_caching=False)
return LLM(MODEL, enforce_eager=True)


def test_n_gt_1(llm):
Expand Down
2 changes: 1 addition & 1 deletion tests/v1/shutdown/test_delete.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from vllm.utils import cuda_device_count_stateless
from vllm.v1.engine.async_llm import AsyncLLM

MODELS = ["meta-llama/Llama-3.2-1B"]
MODELS = ["hmellor/tiny-random-LlamaForCausalLM"]


@pytest.mark.asyncio
Expand Down
2 changes: 1 addition & 1 deletion tests/v1/shutdown/test_forward_error.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.exceptions import EngineDeadError

MODELS = ["meta-llama/Llama-3.2-1B"]
MODELS = ["hmellor/tiny-random-LlamaForCausalLM"]


def evil_forward(self, *args, **kwargs):
Expand Down
8 changes: 5 additions & 3 deletions tests/v1/shutdown/test_startup_error.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from vllm.utils import cuda_device_count_stateless
from vllm.v1.engine.async_llm import AsyncLLM

MODELS = ["meta-llama/Llama-3.2-1B"]
MODELS = ["hmellor/tiny-random-LlamaForCausalLM"]


def evil_method(self, *args, **kwargs):
Expand Down Expand Up @@ -76,8 +76,10 @@ def test_llm_startup_error(
Test profiling (forward()) and load weights failures.
TODO(andy) - LLM without multiprocessing.
"""
if model != "meta-llama/Llama-3.2-1B":
pytest.skip(reason="Only test meta-llama/Llama-3.2-1B")
# Skip non-Llama models since we monkeypatch LlamaForCausalLM specifically.
# If MODELS list grows, each architecture needs its own test variant.
if model != "JackFram/llama-68m":
pytest.skip(reason="Only test JackFram/llama-68m")
if cuda_device_count_stateless() < tensor_parallel_size:
pytest.skip(reason="Not enough CUDA devices")

Expand Down