3 | 3 | from __future__ import annotations |
4 | 4 |
5 | 5 | import random |
| 6 | +from dataclasses import dataclass |
6 | 7 | from typing import Any, Union |
7 | 8 |
8 | 9 | import pytest |
13 | 14 | from vllm.assets.base import VLLM_S3_BUCKET_URL |
14 | 15 | from vllm.assets.image import VLM_IMAGES_DIR |
15 | 16 | from vllm.distributed import cleanup_dist_env_and_memory |
| 17 | +from vllm.outputs import RequestOutput |
16 | 18 | from vllm.platforms import current_platform |
| 19 | +from vllm.v1.spec_decode.metrics import compute_acceptance_rate |
17 | 20 |
18 | 21 |
19 | 22 | def get_test_prompts(mm_enabled: bool): |
@@ -69,9 +72,17 @@ def get_test_prompts(mm_enabled: bool): |
69 | 72 |
70 | 73 | @pytest.fixture |
71 | 74 | def sampling_config(): |
| 75 | + return greedy_sampling() |
| 76 | + |
| 77 | + |
| 78 | +def greedy_sampling() -> SamplingParams: |
72 | 79 | return SamplingParams(temperature=0, max_tokens=10, ignore_eos=False) |
73 | 80 |
74 | 81 |
| 82 | +def stochastic_sampling() -> SamplingParams: |
| 83 | + return SamplingParams(temperature=1.0, max_tokens=10, ignore_eos=False) |
| 84 | + |
| 85 | + |
75 | 86 | @pytest.fixture |
76 | 87 | def model_name(): |
77 | 88 | return "meta-llama/Llama-3.1-8B-Instruct" |
@@ -230,3 +241,107 @@ def test_eagle_correctness( |
230 | 241 | del spec_llm |
231 | 242 | torch.cuda.empty_cache() |
232 | 243 | cleanup_dist_env_and_memory() |
| 244 | + |
| 245 | + |
| 246 | +@dataclass |
| 247 | +class ArgsTest: |
| 248 | + model: str |
| 249 | + draft_model: str |
| 250 | + sampling_config: SamplingParams |
| 251 | + expected_acceptance_rate: float |
| 252 | + expected_same_output_fraction: float |
| 253 | + # Defaults |
| 254 | + enforce_eager: bool = True |
| 255 | + max_model_len: int = 1024 |
| 256 | + gpu_memory_utilization: float = 0.5 |
| 257 | + |
| 258 | + |
| 259 | +cases = [ |
| 260 | + ArgsTest( |
| 261 | + model="meta-llama/Llama-3.2-1B-Instruct", |
| 262 | + draft_model="meta-llama/Llama-3.2-1B-Instruct", |
| 263 | + sampling_config=greedy_sampling(), |
| 264 | + expected_acceptance_rate=0.85, |
| 265 | + expected_same_output_fraction=0.5, |
| 266 | + ), |
| 267 | + ArgsTest( |
| 268 | + model="Qwen/Qwen3-1.7B", |
| 269 | + draft_model="Qwen/Qwen3-0.6B", |
| 270 | + sampling_config=stochastic_sampling(), |
| 271 | + expected_acceptance_rate=0.9, |
| 272 | + expected_same_output_fraction=0.9, |
| 273 | + ), |
| 274 | + ArgsTest( |
| 275 | + model="Qwen/Qwen3-1.7B", |
| 276 | + draft_model="Qwen/Qwen3-0.6B", |
| 277 | + sampling_config=greedy_sampling(), |
| 278 | + expected_acceptance_rate=1.0, |
| 279 | + expected_same_output_fraction=1.0, |
| 280 | + ), |
| 281 | +] |
| 282 | + |
| 283 | + |
| 284 | +@pytest.mark.parametrize("args", cases) |
| 285 | +def test_draft_model_correctness(args: ArgsTest, |
| 286 | + monkeypatch: pytest.MonkeyPatch): |
| 287 | +    """Compare outputs with and without speculative decoding. At least |
| 288 | +    `expected_same_output_fraction` of the outputs must match exactly.""" |
| 289 | + monkeypatch.setenv("VLLM_USE_V1", "1") |
| 290 | + test_prompts = get_test_prompts(mm_enabled=False) |
| 291 | + |
| 292 | + spec_llm = LLM( |
| 293 | + model=args.model, |
| 294 | + speculative_config={ |
| 295 | + "model": args.draft_model, |
| 296 | + "method": "draft_model", |
| 297 | + "num_speculative_tokens": 3, |
| 298 | + "max_model_len": args.max_model_len, |
| 299 | + "enforce_eager": args.enforce_eager, |
| 300 | + }, |
| 301 | + max_model_len=args.max_model_len, |
| 302 | + gpu_memory_utilization=args.gpu_memory_utilization, |
| 303 | + enforce_eager=args.enforce_eager, |
| 304 | +        disable_log_stats=False,  # enables get_metrics() |
| 305 | + ) |
| 306 | + spec_outputs = spec_llm.chat(test_prompts, args.sampling_config) |
| 307 | + acceptance_rate = compute_acceptance_rate(spec_llm.get_metrics()) |
| 308 | + del spec_llm # CLEANUP |
| 309 | + torch.cuda.empty_cache() |
| 310 | + cleanup_dist_env_and_memory() |
| 311 | + |
| 312 | + ref_llm = LLM( |
| 313 | + model=args.model, |
| 314 | + max_model_len=args.max_model_len, |
| 315 | + gpu_memory_utilization=args.gpu_memory_utilization, |
| 316 | + enforce_eager=args.enforce_eager, |
| 317 | + ) |
| 318 | + ref_outputs = ref_llm.chat(test_prompts, args.sampling_config) |
| 319 | + del ref_llm # CLEANUP |
| 320 | + torch.cuda.empty_cache() |
| 321 | + cleanup_dist_env_and_memory() |
| 322 | + |
| 323 | + assert len(ref_outputs) > 0 |
| 324 | + assert len(ref_outputs) == len(spec_outputs) |
| 325 | + |
| 326 | + assert_outputs_match(ref_outputs, spec_outputs, |
| 327 | + args.expected_same_output_fraction) |
| 328 | + |
| 329 | + assert acceptance_rate >= args.expected_acceptance_rate |
| 330 | + |
| 331 | + |
| 332 | +def assert_outputs_match(ref_outputs: list[RequestOutput], |
| 333 | + spec_outputs: list[RequestOutput], fraction: float): |
| 334 | +    """Assert that at least `fraction` of the outputs match exactly.""" |
| 335 | + matches = 0 |
| 336 | + misses = 0 |
| 337 | + for ref_output, spec_output in zip(ref_outputs, spec_outputs): |
| 338 | + if ref_output.outputs[0].text == spec_output.outputs[0].text: |
| 339 | + matches += 1 |
| 340 | + else: |
| 341 | + misses += 1 |
| 342 | + print(f"ref_output: {ref_output.outputs[0].text}") |
| 343 | + print(f"spec_output: {spec_output.outputs[0].text}") |
| 344 | + |
| 345 | +    # Heuristic: require at least this fraction of the outputs to match exactly. |
| 346 | +    # Upon failure, inspect the mismatched outputs printed above. |
| 347 | + assert matches >= int(fraction * len(ref_outputs)) |
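
For context on the new acceptance-rate check: `compute_acceptance_rate` consumes the engine metrics returned by `LLM.get_metrics()`, which is why the test sets `disable_log_stats=False`. A rough sketch of such a helper is shown below; it is an illustration only, and both the helper name and the counter names are assumptions rather than the PR's actual implementation.

# Illustrative sketch, not the PR's compute_acceptance_rate. Assumes the
# V1 engine reports cumulative spec-decode counters under the (assumed)
# names below; Counter metrics from LLM.get_metrics() expose .name and .value.
def acceptance_rate_sketch(metrics) -> float:
    drafted = sum(m.value for m in metrics
                  if m.name == "vllm:spec_decode_num_draft_tokens")
    accepted = sum(m.value for m in metrics
                   if m.name == "vllm:spec_decode_num_accepted_tokens")
    # Fraction of proposed draft tokens that the target model accepted.
    return accepted / drafted if drafted > 0 else 0.0

With num_speculative_tokens=3, an acceptance rate of 1.0 means every proposed draft token was accepted, which is what the third case (Qwen3 target and draft, greedy sampling) expects.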