10 changes: 6 additions & 4 deletions .buildkite/nightly-benchmarks/tests/serving-tests.json
@@ -63,10 +63,12 @@
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"disable_log_requests": "",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,
"speculative_draft_tensor_parallel_size": 1
"swap_space": 16,
"speculative_config": {
"model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,
"draft_tensor_parallel_size": 1
}
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
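For readability, a hedged reconstruction of the updated benchmark entry after this change, written as an equivalent Python dict; only the fields visible in the hunk above are included, so the real entry may contain more.

```python
# Reconstruction from the visible diff context; not an authoritative copy
# of the full serving-tests.json entry.
server_parameters = {
    "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
    "disable_log_requests": "",
    "tensor_parallel_size": 4,
    "swap_space": 16,
    "speculative_config": {
        "model": "turboderp/Qwama-0.5B-Instruct",
        "num_speculative_tokens": 4,
        "draft_tensor_parallel_size": 1,
    },
}
```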
2 changes: 1 addition & 1 deletion docs/source/features/spec_decode.md
@@ -52,7 +52,7 @@ python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model
```

:::{warning}
Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately will be deprecated in the next release.
Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model with `--speculative_model` and passing related parameters (e.g., `--num_speculative_tokens`) separately is now deprecated.
:::
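As a minimal sketch (not part of this diff), the consolidated flag takes a single JSON object; the draft model and token count below are placeholders borrowed from the benchmark entry above, and `shlex.quote` is used only to show how the value could be passed safely on a shell command line.

```python
import json
import shlex

# Hedged sketch: build the JSON value expected by --speculative_config.
spec_config = {
    "model": "turboderp/Qwama-0.5B-Instruct",
    "num_speculative_tokens": 4,
}
print("--speculative_config " + shlex.quote(json.dumps(spec_config)))
# --speculative_config '{"model": "turboderp/Qwama-0.5B-Instruct", "num_speculative_tokens": 4}'
```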

Then use a client:
10 changes: 6 additions & 4 deletions examples/offline_inference/eagle.py
@@ -69,10 +69,12 @@
max_model_len=max_model_len,
max_num_seqs=args.max_num_seqs,
gpu_memory_utilization=0.8,
speculative_model=eagle_dir,
num_speculative_tokens=args.num_spec_tokens,
speculative_draft_tensor_parallel_size=args.draft_tp,
speculative_max_model_len=max_model_len,
speculative_config={
"model": eagle_dir,
"num_speculative_tokens": args.num_spec_tokens,
"draft_tensor_parallel_size": args.draft_tp,
"max_model_len": max_model_len,
},
disable_log_stats=False,
)

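To complement the EAGLE example above, a minimal offline sketch of the consolidated API under stated assumptions: the target model name is a placeholder, `"./eagle-head"` stands in for a locally available EAGLE checkpoint directory, and no claim is made that this exact pairing is supported.

```python
from vllm import LLM, SamplingParams

# Hedged sketch of the nested speculative_config kwarg; values are placeholders.
llm = LLM(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",  # placeholder target model
    speculative_config={
        "model": "./eagle-head",          # placeholder draft/EAGLE checkpoint
        "num_speculative_tokens": 2,
        "draft_tensor_parallel_size": 1,
        "max_model_len": 2048,
    },
)
outputs = llm.generate(["The capital of France is"],
                       SamplingParams(temperature=0.0, max_tokens=32))
print(outputs[0].outputs[0].text)
```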
12 changes: 8 additions & 4 deletions tests/metrics/test_metrics.py
@@ -248,8 +248,10 @@ def test_metric_spec_decode(
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.4,
speculative_model=model,
num_speculative_tokens=k,
speculative_config={
"model": model,
"num_speculative_tokens": k,
},
) as vllm_model:

# Force log interval to be 0 to catch all metrics.
@@ -300,8 +302,10 @@ def test_metric_spec_decode_interval(
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.4,
speculative_model=model,
num_speculative_tokens=k,
speculative_config={
"model": model,
"num_speculative_tokens": k,
},
enforce_eager=True,
)

6 changes: 4 additions & 2 deletions tests/models/test_initialization.py
@@ -54,8 +54,10 @@ def _initalize_kv_caches_v1(self, vllm_config):
model_info.default,
tokenizer=model_info.tokenizer,
tokenizer_mode=model_info.tokenizer_mode,
speculative_model=model_info.speculative_model,
num_speculative_tokens=1 if model_info.speculative_model else None,
speculative_config={
"model": model_info.speculative_model,
"num_speculative_tokens": 1,
} if model_info.speculative_model else None,
trust_remote_code=model_info.trust_remote_code,
load_format="dummy",
hf_overrides=hf_overrides,
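The test above gates the entire dict (rather than individual keys) on whether the registry entry declares a speculative model, so models without one still receive `speculative_config=None`. A small sketch of the same pattern with a hypothetical helper name:

```python
from typing import Optional

def build_spec_config(draft_model: Optional[str]) -> Optional[dict]:
    # Hypothetical helper, not part of the diff: return None when no draft
    # model is registered, mirroring the inline conditional in the test.
    if draft_model is None:
        return None
    return {"model": draft_model, "num_speculative_tokens": 1}

assert build_spec_config(None) is None
assert build_spec_config("JackFram/llama-68m") == {
    "model": "JackFram/llama-68m",
    "num_speculative_tokens": 1,
}
```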
80 changes: 70 additions & 10 deletions tests/spec_decode/e2e/test_integration_dist_tp2.py
@@ -3,6 +3,7 @@
tensor parallelism.
"""

import json
from typing import Optional

import pytest
@@ -28,14 +29,14 @@
@pytest.mark.parametrize("test_llm_kwargs", [
[
"--speculative_config",
str({
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
}),
],
[
"--speculative_config",
str({
json.dumps({
"model": "ngram",
"num_speculative_tokens": 5,
"prompt_lookup_max": 3,
@@ -88,15 +89,15 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
"model, test_llm_kwargs",
[("JackFram/llama-68m", [
"--speculative_config",
str({
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1,
}),
]),
("ibm-granite/granite-3b-code-instruct", [
"--speculative_config",
str({
json.dumps({
"model": "ibm-granite/granite-3b-code-instruct",
"num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1,
@@ -147,20 +148,20 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
@pytest.mark.parametrize("model, test_llm_kwargs",
[("JackFram/llama-68m", [
"--speculative_config",
str({
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
}),
]),
("JackFram/llama-68m", [
"--speculative_config",
str({
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"draft_tensor_parallel_size": 1,
}),
])])
@pytest.mark.parametrize("logprobs", [None, 2])
@pytest.mark.parametrize("logprobs", [None])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
@@ -171,9 +172,68 @@ def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
"""Verify spec decode works well with same and different TP size for
the draft model with chunked prefill.
"""
if logprobs:
test_llm_kwargs.extend(
["--disable_logprobs_during_spec_decoding", "False"])
run_equality_correctness_test_tp(model,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
max_output_len=32,
seed=seed,
temperature=0.0,
logprobs=logprobs)


@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
"common_llm_kwargs",
[[
# Skip cuda graph recording for fast test.
"--enforce-eager",
"--tensor_parallel_size",
"2",

# precision
"--dtype",
"bfloat16",
]])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
[["--enable-chunked-prefill", "False"],
[
"--enable-chunked-prefill", "True", "--max-num-batched-tokens", "4",
"--max-num-seqs", "4"
]])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize("model, test_llm_kwargs",
[("JackFram/llama-68m", [
"--speculative_config",
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"disable_logprobs": False,
}),
]),
("JackFram/llama-68m", [
"--speculative_config",
json.dumps({
"model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
"draft_tensor_parallel_size": 1,
"disable_logprobs": False,
}),
])])
@pytest.mark.parametrize("logprobs", [2])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_spec_decode_chunked_prefill_tp2_with_logprobs(
model, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, logprobs: Optional[int],
batch_size: int, seed: int):
"""Verify spec decode works well with same and different TP size for
the draft model with chunked prefill.
"""
run_equality_correctness_test_tp(model,
common_llm_kwargs,
per_test_common_llm_kwargs,
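A brief note on why these tests switch from `str({...})` to `json.dumps({...})` when building the `--speculative_config` argument (my reading, not stated in the diff): `str()` on a Python dict yields single-quoted keys and values, which is not valid JSON for the flag's parser, while `json.dumps()` round-trips cleanly.

```python
import json

cfg = {"model": "JackFram/llama-68m", "num_speculative_tokens": 3}

print(str(cfg))         # {'model': 'JackFram/llama-68m', 'num_speculative_tokens': 3}
print(json.dumps(cfg))  # {"model": "JackFram/llama-68m", "num_speculative_tokens": 3}

json.loads(json.dumps(cfg))   # parses fine
# json.loads(str(cfg))        # would raise json.JSONDecodeError
```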
6 changes: 4 additions & 2 deletions tests/spec_decode/e2e/test_integration_dist_tp4.py
@@ -3,6 +3,8 @@
tensor parallelism.
"""

import json

import openai
import pytest
import torch
@@ -33,7 +35,7 @@
#TODO(wooyeon): add spec_draft_dp=2 case
[
"--speculative_config",
str({
json.dumps({
"model": f"{SPEC_MODEL}",
"num_speculative_tokens": 5,
"draft_tensor_parallel_size": 1,
@@ -80,7 +82,7 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
# Artificially limit the draft model max model len; this forces vLLM
# to skip speculation once the sequences grow beyond 32 tokens.
"--speculative_config",
str({
json.dumps({
"model": f"{SPEC_MODEL}",
"num_speculative_tokens": 5,
"max_model_len": 32,
4 changes: 3 additions & 1 deletion tests/v1/test_oracle.py
@@ -49,7 +49,9 @@ def test_unsupported_configs(monkeypatch):
with pytest.raises(NotImplementedError):
AsyncEngineArgs(
model=MODEL,
speculative_model=MODEL,
speculative_config={
"model": MODEL,
},
).create_engine_config()

with pytest.raises(NotImplementedError):
19 changes: 9 additions & 10 deletions vllm/config.py
@@ -2028,14 +2028,13 @@ def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:

def __post_init__(self):

# Note: After next release, the method parameter will be used to
# specify the speculative method, which helps to extend the
# configuration of non-model-based proposers, and the model parameter
# will be used when the draft model or head is needed.
# If users do not specify the method, the speculative method will
# be detected automatically if possible. If the speculative method can
# not be detected, it will be considered as the draft-model-based
# method by default.
# Note: "method" is a new parameter that helps to extend the
# configuration of non-model-based proposers, and the "model" parameter
# will be used to set the draft model, eagle head, or additional weight
# when needed. If users do not specify "method", the speculative method
# will be detected automatically if possible. If the speculative method
# can not be detected, it will be considered as the "draft_model" by
# default.

if self.model is None and self.num_speculative_tokens is not None:
# TODO(Shangming): Refactor mtp configuration logic when supporting
@@ -2050,8 +2049,8 @@ def __post_init__(self):
raise ValueError("num_speculative_tokens was provided without "
"speculative model.")

# Automatically configure the ngram method during configuration
# refactoring to ensure a smooth transition.
# Automatically configure the method for ngram when "model" is used
# instead of "method"
if self.method is None and (self.model is not None
and self.model in ("ngram", "[ngram]")):
self.method = "ngram"
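To make the comment in `__post_init__` concrete, a hedged sketch (not taken from the diff) of the two configuration styles it distinguishes, using keys that appear in the tests earlier in this PR:

```python
# Ngram proposer written the "old" way: only "model" is set, and the
# __post_init__ logic above rewrites it into method="ngram".
ngram_spec = {
    "model": "ngram",
    "num_speculative_tokens": 5,
    "prompt_lookup_max": 3,
}

# Draft-model-based proposer: "model" points at real draft weights; with no
# "method" given and nothing else detected, it is treated as "draft_model".
draft_spec = {
    "model": "JackFram/llama-68m",
    "num_speculative_tokens": 3,
    "draft_tensor_parallel_size": 1,
}
```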