Commit 33d81dd

Speculative Decoding with Draft Model
Signed-off-by: Tomas Ruiz <[email protected]>
1 parent 94866d7 commit 33d81dd

15 files changed (+331 lines, -87 lines)

.gitignore

Lines changed: 3 additions & 0 deletions

@@ -1,3 +1,6 @@
+# Scripts for development
+scripts/
+
 # version file generated by setuptools-scm
 /vllm/_version.py

pyproject.toml

Lines changed: 5 additions & 0 deletions

@@ -154,6 +154,11 @@ markers = [
     "skip_v1: do not run this test with v1",
     "optional: optional tests that are automatically skipped, include --optional to run them",
 ]
+# Show print statements and logs during test execution
+addopts = "-s --tb=short --log-cli-level=INFO"
+log_cli = true
+log_cli_format = "%(asctime)s [%(levelname)8s] %(name)s: %(message)s"
+log_cli_date_format = "%Y-%m-%d %H:%M:%S"
 
 [tool.ty.src]
 root = "./vllm"

tests/v1/e2e/test_spec_decode.py

Lines changed: 115 additions & 0 deletions

@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import random
+from dataclasses import dataclass
 from typing import Any, Union
 
 import pytest
@@ -13,7 +14,9 @@
 from vllm.assets.base import VLLM_S3_BUCKET_URL
 from vllm.assets.image import VLM_IMAGES_DIR
 from vllm.distributed import cleanup_dist_env_and_memory
+from vllm.outputs import RequestOutput
 from vllm.platforms import current_platform
+from vllm.v1.spec_decode.metrics import compute_acceptance_rate
 
 
 def get_test_prompts(mm_enabled: bool):
@@ -69,9 +72,17 @@ def get_test_prompts(mm_enabled: bool):
 
 @pytest.fixture
 def sampling_config():
+    return greedy_sampling()
+
+
+def greedy_sampling() -> SamplingParams:
     return SamplingParams(temperature=0, max_tokens=10, ignore_eos=False)
 
 
+def stochastic_sampling() -> SamplingParams:
+    return SamplingParams(temperature=1.0, max_tokens=10, ignore_eos=False)
+
+
 @pytest.fixture
 def model_name():
     return "meta-llama/Llama-3.1-8B-Instruct"
@@ -230,3 +241,107 @@ def test_eagle_correctness(
     del spec_llm
     torch.cuda.empty_cache()
     cleanup_dist_env_and_memory()
+
+
+@dataclass
+class ArgsTest:
+    model: str
+    draft_model: str
+    sampling_config: SamplingParams
+    expected_acceptance_rate: float
+    expected_same_output_fraction: float
+    # Defaults
+    enforce_eager: bool = True
+    max_model_len: int = 1024
+    gpu_memory_utilization: float = 0.5
+
+
+cases = [
+    ArgsTest(
+        model="meta-llama/Llama-3.2-1B-Instruct",
+        draft_model="meta-llama/Llama-3.2-1B-Instruct",
+        sampling_config=greedy_sampling(),
+        expected_acceptance_rate=0.85,
+        expected_same_output_fraction=0.5,
+    ),
+    ArgsTest(
+        model="Qwen/Qwen3-1.7B",
+        draft_model="Qwen/Qwen3-0.6B",
+        sampling_config=stochastic_sampling(),
+        expected_acceptance_rate=0.9,
+        expected_same_output_fraction=0.9,
+    ),
+    ArgsTest(
+        model="Qwen/Qwen3-1.7B",
+        draft_model="Qwen/Qwen3-0.6B",
+        sampling_config=greedy_sampling(),
+        expected_acceptance_rate=1.0,
+        expected_same_output_fraction=1.0,
+    ),
+]
+
+
+@pytest.mark.parametrize("args", cases)
+def test_draft_model_correctness(args: ArgsTest,
+                                 monkeypatch: pytest.MonkeyPatch):
+    """Compare the outputs using and not using speculative decoding.
+    In the greedy decoding case, the outputs must match EXACTLY."""
+    monkeypatch.setenv("VLLM_USE_V1", "1")
+    test_prompts = get_test_prompts(mm_enabled=False)
+
+    spec_llm = LLM(
+        model=args.model,
+        speculative_config={
+            "model": args.draft_model,
+            "method": "draft_model",
+            "num_speculative_tokens": 3,
+            "max_model_len": args.max_model_len,
+            "enforce_eager": args.enforce_eager,
+        },
+        max_model_len=args.max_model_len,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        enforce_eager=args.enforce_eager,
+        disable_log_stats=False  # enables get_metrics()
+    )
+    spec_outputs = spec_llm.chat(test_prompts, args.sampling_config)
+    acceptance_rate = compute_acceptance_rate(spec_llm.get_metrics())
+    del spec_llm  # CLEANUP
+    torch.cuda.empty_cache()
+    cleanup_dist_env_and_memory()
+
+    ref_llm = LLM(
+        model=args.model,
+        max_model_len=args.max_model_len,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        enforce_eager=args.enforce_eager,
+    )
+    ref_outputs = ref_llm.chat(test_prompts, args.sampling_config)
+    del ref_llm  # CLEANUP
+    torch.cuda.empty_cache()
+    cleanup_dist_env_and_memory()
+
+    assert len(ref_outputs) > 0
+    assert len(ref_outputs) == len(spec_outputs)
+
+    assert_outputs_match(ref_outputs, spec_outputs,
+                         args.expected_same_output_fraction)
+
+    assert acceptance_rate >= args.expected_acceptance_rate
+
+
+def assert_outputs_match(ref_outputs: list[RequestOutput],
+                         spec_outputs: list[RequestOutput], fraction: float):
+    """Assert that at least "fraction" of the prompts match exactly"""
+    matches = 0
+    misses = 0
+    for ref_output, spec_output in zip(ref_outputs, spec_outputs):
+        if ref_output.outputs[0].text == spec_output.outputs[0].text:
+            matches += 1
+        else:
+            misses += 1
+            print(f"ref_output: {ref_output.outputs[0].text}")
+            print(f"spec_output: {spec_output.outputs[0].text}")
+
+    # Heuristic: at least a certain fraction of the outputs must match exactly.
+    # Upon failure, inspect the outputs to check for inaccuracy.
+    assert matches >= int(fraction * len(ref_outputs))
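
For quick reference, here is a minimal standalone sketch of the same draft-model setup outside pytest. It reuses the model names and speculative_config keys from the test cases above; the prompt is illustrative and a GPU with enough memory for both models is assumed.

from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen3-1.7B",
    speculative_config={
        "model": "Qwen/Qwen3-0.6B",   # smaller draft model
        "method": "draft_model",
        "num_speculative_tokens": 3,  # draft tokens proposed per step
        "max_model_len": 1024,
    },
    max_model_len=1024,
    gpu_memory_utilization=0.5,
    enforce_eager=True,
    disable_log_stats=False,  # keep stats enabled so get_metrics() has data
)

outputs = llm.generate(["The capital of France is"],
                       SamplingParams(temperature=0, max_tokens=10))
print(outputs[0].outputs[0].text)

With greedy sampling the speculative and non-speculative outputs should match exactly, which is what the strictest test case above asserts.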

vllm/benchmarks/throughput.py

Lines changed: 35 additions & 4 deletions

@@ -31,14 +31,17 @@
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
 from vllm.utils import merge_async_iterators
+from vllm.v1.metrics.reader import Metric
+from vllm.v1.spec_decode.metrics import compute_acceptance_rate
 
 
 def run_vllm(
     requests: list[SampleRequest],
     n: int,
     engine_args: EngineArgs,
+    do_profile: bool,
     disable_detokenize: bool = False,
-) -> tuple[float, Optional[list[RequestOutput]]]:
+) -> "Results":
     from vllm import LLM, SamplingParams
     llm = LLM(**dataclasses.asdict(engine_args))
     assert all(
@@ -74,12 +77,16 @@ def run_vllm(
 
     outputs = None
     if not use_beam_search:
+        if do_profile:
+            llm.start_profile()
         start = time.perf_counter()
         outputs = llm.generate(prompts,
                                sampling_params,
                                lora_request=lora_requests,
                                use_tqdm=True)
         end = time.perf_counter()
+        if do_profile:
+            llm.stop_profile()
     else:
         assert lora_requests is None, "BeamSearch API does not support LoRA"
         prompts = [request.prompt for request in requests]
@@ -96,7 +103,8 @@ def run_vllm(
                 ignore_eos=True,
             ))
         end = time.perf_counter()
-    return end - start, outputs
+    runtime = end - start
+    return Results(runtime=runtime, metrics=llm.get_metrics(), outputs=outputs)
 
 
 def run_vllm_chat(
@@ -138,6 +146,13 @@ def run_vllm_chat(
     return end - start, outputs
 
 
+@dataclasses.dataclass
+class Results:
+    runtime: float
+    metrics: list[Metric]
+    outputs: list
+
+
 async def run_vllm_async(
     requests: list[SampleRequest],
     n: int,
@@ -496,6 +511,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
        type=str,
        default=None,
        help='Path to save the throughput results in JSON format.')
+    parser.add_argument(
+        "--print-acceptance-rate",
+        action="store_true",
+        default=False,
+        help="Print the acceptance rate of the speculative decoding model.",
+    )
     parser.add_argument("--async-engine",
                         action='store_true',
                         default=False,
@@ -543,6 +564,10 @@ def add_cli_args(parser: argparse.ArgumentParser):
                         type=str,
                         default=None,
                         help="Split of the HF dataset.")
+    parser.add_argument("--profile",
+                        action="store_true",
+                        default=False,
+                        help="Profile the model.")
 
     # prefix repetition dataset
     prefix_repetition_group = parser.add_argument_group(
@@ -604,9 +629,12 @@ def main(args: argparse.Namespace):
                     args.disable_detokenize,
                 ))
         else:
-            elapsed_time, request_outputs = run_vllm(
+            bresults = run_vllm(
                 requests, args.n, EngineArgs.from_cli_args(args),
-                args.disable_detokenize)
+                do_profile=args.profile,
+                disable_detokenize=args.disable_detokenize)
+            elapsed_time = bresults.runtime
+            request_outputs = bresults.outputs
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -651,6 +679,9 @@ def main(args: argparse.Namespace):
           f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
     print(f"Total num prompt tokens: {total_prompt_tokens}")
    print(f"Total num output tokens: {total_output_tokens}")
+    if args.print_acceptance_rate:
+        rate = compute_acceptance_rate(bresults.metrics)
+        print(f"Acceptance rate: {rate:.2f}")
 
     # Output JSON results if specified
     if args.output_json:
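
compute_acceptance_rate is imported from vllm.v1.spec_decode.metrics but is not defined in this diff. The sketch below shows one plausible way such a helper could aggregate llm.get_metrics() output; it is an assumption, not the actual implementation, and the counter names are guesses based on vLLM's spec-decode stats.

from vllm.v1.metrics.reader import Counter, Metric


def compute_acceptance_rate(metrics: list[Metric]) -> float:
    """Fraction of proposed draft tokens that the target model accepted."""
    accepted = 0
    drafted = 0
    for metric in metrics:
        if not isinstance(metric, Counter):
            continue
        # Assumed metric names; verify against the real spec-decode counters.
        if metric.name == "vllm:spec_decode_num_accepted_tokens":
            accepted += metric.value
        elif metric.name == "vllm:spec_decode_num_draft_tokens":
            drafted += metric.value
    return accepted / drafted if drafted > 0 else 0.0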

vllm/config/__init__.py

Lines changed: 4 additions & 5 deletions

@@ -2168,6 +2168,7 @@ def __post_init__(self):
                     code_revision=self.code_revision,
                     tokenizer_revision=self.target_model_config.
                     tokenizer_revision,
+                    max_model_len=self.max_model_len,
                     spec_target_max_model_len=self.target_model_config.
                     max_model_len,
                     quantization=self.quantization,
@@ -2209,11 +2210,6 @@ def __post_init__(self):
                     )
                 else:
                     self.method = "draft_model"
-                    raise NotImplementedError(
-                        "Speculative decoding with draft model is not "
-                        "supported yet. Please consider using other "
-                        "speculative decoding methods such as ngram, medusa, "
-                        "eagle, or deepseek_mtp.")
 
             # Replace hf_config for EAGLE draft_model
             if self.method in ("eagle", "eagle3"):
@@ -2424,6 +2420,9 @@ def num_lookahead_slots(self) -> int:
     def use_eagle(self) -> bool:
         return self.method in ("eagle", "eagle3", "deepseek_mtp", "ernie_mtp")
 
+    def uses_draft_model(self) -> bool:
+        return self.method == "draft_model"
+
     def __repr__(self) -> str:
         method = self.method
         model = None if method == "ngram" else self.draft_model_config.model

vllm/engine/arg_utils.py

Lines changed: 1 addition & 4 deletions

@@ -1474,10 +1474,7 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
         # V1 supports N-gram, Medusa, and Eagle speculative decoding.
         if (self.speculative_config is not None
                 and self.speculative_config.get("method") == "draft_model"):
-            raise NotImplementedError(
-                "Speculative decoding with draft model is not supported yet. "
-                "Please consider using other speculative decoding methods "
-                "such as ngram, medusa, eagle, or deepseek_mtp.")
+            return True
 
         V1_BACKENDS = [
             "FLASH_ATTN_VLLM_V1",

vllm/model_executor/model_loader/__init__.py

Lines changed: 4 additions & 2 deletions

@@ -111,12 +111,14 @@ def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
 
 def get_model(*,
               vllm_config: VllmConfig,
-              model_config: Optional[ModelConfig] = None) -> nn.Module:
+              model_config: Optional[ModelConfig] = None,
+              prefix: str = "") -> nn.Module:
     loader = get_model_loader(vllm_config.load_config)
     if model_config is None:
         model_config = vllm_config.model_config
     return loader.load_model(vllm_config=vllm_config,
-                             model_config=model_config)
+                             model_config=model_config,
+                             prefix=prefix)
 
 
 __all__ = [
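
The new prefix argument threads from get_model down to initialize_model so a second model's parameters can be created under a distinct name scope. Below is a hypothetical sketch of how a worker might use it to load the draft model; load_draft_model and the "draft_model." prefix are illustrative names, not part of this commit.

from torch import nn

from vllm.config import ModelConfig, VllmConfig
from vllm.model_executor.model_loader import get_model


def load_draft_model(vllm_config: VllmConfig,
                     draft_model_config: ModelConfig) -> nn.Module:
    # Namespacing the draft model's weights keeps them separate from the
    # target model's parameters in the same process.
    return get_model(vllm_config=vllm_config,
                     model_config=draft_model_config,
                     prefix="draft_model.")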

vllm/model_executor/model_loader/base_loader.py

Lines changed: 6 additions & 3 deletions

@@ -31,8 +31,10 @@ def load_weights(self, model: nn.Module,
         inplace weights loading for an already-initialized model"""
         raise NotImplementedError
 
-    def load_model(self, vllm_config: VllmConfig,
-                   model_config: ModelConfig) -> nn.Module:
+    def load_model(self,
+                   vllm_config: VllmConfig,
+                   model_config: ModelConfig,
+                   prefix: str = "") -> nn.Module:
         """Load a model with the given configurations."""
         device_config = vllm_config.device_config
         load_config = vllm_config.load_config
@@ -42,7 +44,8 @@ def load_model(self, vllm_config: VllmConfig,
         with set_default_torch_dtype(model_config.dtype):
             with target_device:
                 model = initialize_model(vllm_config=vllm_config,
-                                          model_config=model_config)
+                                          model_config=model_config,
+                                          prefix=prefix)
 
         logger.debug("Loading weights on %s ...", load_device)
         # Quantization does not happen in `load_weights` but after it

vllm/model_executor/model_loader/gguf_loader.py

Lines changed: 6 additions & 3 deletions

@@ -123,8 +123,10 @@ def load_weights(self, model: nn.Module,
         model.load_weights(
             self._get_weights_iterator(local_model_path, gguf_weights_map))
 
-    def load_model(self, vllm_config: VllmConfig,
-                   model_config: ModelConfig) -> nn.Module:
+    def load_model(self,
+                   vllm_config: VllmConfig,
+                   model_config: ModelConfig,
+                   prefix: str = "") -> nn.Module:
         device_config = vllm_config.device_config
         local_model_path = self._prepare_weights(model_config.model)
         gguf_weights_map = self._get_gguf_weights_map(model_config)
@@ -147,7 +149,8 @@ def load_model(self, vllm_config: VllmConfig,
         target_device = torch.device(device_config.device)
         with set_default_torch_dtype(model_config.dtype):
             with target_device:
-                model = initialize_model(vllm_config=vllm_config)
+                model = initialize_model(vllm_config=vllm_config,
+                                          prefix=prefix)
         self.load_weights(model, model_config)
 
         process_weights_after_loading(model, model_config, target_device)