Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
121 commits
Select commit Hold shift + click to select a range
c92e4b8
[CI Failure] Fix Gemma3 RoPE configuration for sliding attention laye…
hl475 Nov 21, 2025
d183dcb
fix typo error
Nov 21, 2025
293e3ae
fix return values in ngram gpu
Nov 21, 2025
4534c88
python3.13 pre-commit check
Nov 24, 2025
07e6b8a
fix pre-commit and sign-off
Nov 24, 2025
2f08629
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Nov 24, 2025
e70b060
fix ngram gpu kernel compile issue
Nov 25, 2025
cde94b2
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Nov 25, 2025
33c4437
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Nov 25, 2025
25d36b1
fix docs bug
Nov 26, 2025
71b0dca
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Nov 26, 2025
183556e
v.01
Nov 29, 2025
f6f871f
test
Nov 30, 2025
1fbf296
fix large batch performance.
Dec 2, 2025
b5243ec
refactor ngram gpu
Dec 2, 2025
0081487
modify nvtx
Dec 2, 2025
bcf454f
change copy to async
Dec 2, 2025
0d2638b
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Dec 2, 2025
34cc523
remove irrelevant files
Dec 2, 2025
c9f2724
use discard_request_mask in ngram
Dec 2, 2025
16eb87c
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Dec 2, 2025
82ff639
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Dec 2, 2025
3abd884
remove irrelevant computations
Dec 4, 2025
cd9ecc9
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Dec 4, 2025
38cf7fd
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Dec 4, 2025
b518ef2
remove irrelevant comments
Dec 4, 2025
d07f4a7
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Dec 4, 2025
3d28827
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Dec 4, 2025
8920a59
move token ids tensor gpu init inline
Dec 4, 2025
6967bb2
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Dec 4, 2025
25d6b1f
remove unused status check
Dec 4, 2025
3a6df84
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Dec 4, 2025
65260a4
detailed comments in ngram gpu
Dec 18, 2025
37b1bb2
remove irrelevant input params for _dummy_run
Dec 18, 2025
14243fb
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Dec 18, 2025
430fc13
move the preprocess of token_ids_gpu_tensor and mask tensor into Ngra…
Dec 18, 2025
4e0eca7
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Dec 18, 2025
ddf24aa
merge conflicts fixed
Dec 19, 2025
63180be
change the CompileConfig to match latest vllm config
Dec 19, 2025
30b463a
fix documents
Dec 19, 2025
e1a44d1
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Dec 19, 2025
9e7b089
fix vllm config in ngram gpu
Dec 22, 2025
3d0510e
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Dec 22, 2025
8b7865c
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Dec 22, 2025
2769039
enable ngram gpu in sync mode
Dec 23, 2025
6a3a26c
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Dec 23, 2025
588bb65
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Dec 23, 2025
b28ffd3
merge conflicts fixed
Dec 30, 2025
732ce0c
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 1, 2026
8ef01b7
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 3, 2026
4538bea
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 3, 2026
f505d97
merge conflicts fixed
Jan 8, 2026
3cff47f
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Jan 8, 2026
cc3700a
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 8, 2026
f141cc1
merge conflicts fixed
Jan 9, 2026
1d94b70
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 9, 2026
064707c
modify ngram gpu process
Jan 9, 2026
93375ff
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Jan 9, 2026
eac7085
remove irrelevant codes
Jan 9, 2026
6b46372
merge conflicts
Jan 12, 2026
2a14605
vllm async conf check
Jan 12, 2026
c683c56
merge conflicts fixed
Jan 14, 2026
ab2b0d5
change sync data access to async
Jan 14, 2026
30824ed
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 14, 2026
a87fe7d
pre-commit fixed
Jan 14, 2026
879c488
merge conflicts fixed
Jan 20, 2026
4b9511b
Merge branch 'vllm-project:main' into patchy/async_ngram
PatchouliTIS Jan 20, 2026
ff33c28
comments resolved, redundant codes removed.
Jan 21, 2026
1ee4cce
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 21, 2026
3faf03e
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 21, 2026
baf359c
Merge branch 'main' of github.com:vllm-project/vllm into patchy/async…
Jan 22, 2026
4451866
remove overcomments and disorganized codes
Jan 22, 2026
e9d510f
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 23, 2026
7c16ba0
typo error fixed in gpu_model_runner
Jan 23, 2026
7050336
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Jan 23, 2026
1dd58e6
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 23, 2026
3764a0f
merge conflicts fixed
Jan 26, 2026
64737f3
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 28, 2026
2f8ab86
Merge branch 'main' of github.com:vllm-project/vllm into patchy/async…
Jan 29, 2026
b0104b2
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 29, 2026
c345829
merge conflicts fixed
Jan 29, 2026
4f467f5
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Jan 29, 2026
9a9b35e
pre-commits error fixed
Jan 29, 2026
c188d8b
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 30, 2026
2d5edf9
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 31, 2026
85de9af
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Feb 2, 2026
2db87e8
merge conflicts fixed
Feb 4, 2026
234ff7b
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Feb 4, 2026
3221bb0
merge conflicts
Feb 6, 2026
e42ecd3
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Feb 11, 2026
c68efc7
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Feb 14, 2026
a356135
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Feb 15, 2026
eb20909
merge conflicts fixed
Feb 26, 2026
18e462d
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Feb 26, 2026
3dc6545
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Feb 27, 2026
3388786
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Feb 27, 2026
67394d7
fixed bugs during preemption
Feb 27, 2026
a34c5be
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Feb 27, 2026
c8b8d71
fix bugs in preemption and add GSM8k Tests
Feb 28, 2026
0eab1f9
Merge branch 'main' of github.com:vllm-project/vllm into patchy/async…
Feb 28, 2026
8ae962b
fix merge conflicts in gpu_model_runner
Feb 28, 2026
1e12b12
fix merge conflicts in gpu_model_runner
Feb 28, 2026
3bf97f2
fix bugs
Feb 28, 2026
a2d216f
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Feb 28, 2026
33a8aab
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Mar 2, 2026
7e7ecac
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Mar 2, 2026
5fdb7bc
merge conflicts
Mar 3, 2026
cb4fa70
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Mar 3, 2026
6748677
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Mar 3, 2026
21e26fc
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Mar 4, 2026
0f1046c
Merge branch 'main' of github.com:vllm-project/vllm into patchy/async…
Mar 5, 2026
afd7933
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Mar 5, 2026
942c8ae
remove cuda api call
Mar 5, 2026
b9e9c8a
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Mar 5, 2026
9da1866
Merge branch 'main' of github.com:vllm-project/vllm into patchy/async…
Mar 5, 2026
a5e7bb3
pre-commits fixed
Mar 5, 2026
07fa301
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Mar 6, 2026
e48c64f
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Mar 6, 2026
cb508b6
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Mar 6, 2026
bc71da2
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Mar 7, 2026
fc64156
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Mar 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 40 additions & 3 deletions tests/v1/e2e/test_async_scheduling.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def test_without_spec_decoding(

@single_gpu_only
@large_gpu_mark(min_gb=16)
def test_with_spec_decoding(sample_json_schema, monkeypatch: pytest.MonkeyPatch):
def test_with_eagle3_spec_decoding(sample_json_schema, monkeypatch: pytest.MonkeyPatch):
"""Test consistency and acceptance rates with some different combos of
preemption, executor, async scheduling, prefill chunking,
spec decoding model length.
Expand Down Expand Up @@ -154,6 +154,42 @@ def test_with_spec_decoding(sample_json_schema, monkeypatch: pytest.MonkeyPatch)
)


@single_gpu_only
@large_gpu_mark(min_gb=16)
def test_with_ngram_gpu_spec_decoding(monkeypatch: pytest.MonkeyPatch):
    """Test ngram_gpu speculative decoding end-to-end consistency.

    Runs one baseline configuration without speculative decoding, then the
    same ngram_gpu configuration (3 speculative tokens, prompt lookup
    window [2, 3]) across combinations of:
    - preemption on/off
    - executor backend ("mp" and "uni")
    - async scheduling on/off
    - prefill chunking on/off
    """

    ngram_gpu_config = {
        "method": "ngram_gpu",
        "num_speculative_tokens": 3,
        "prompt_lookup_max": 3,
        "prompt_lookup_min": 2,
    }

    # Each tuple is (test_preemption, executor, async_scheduling,
    # spec_config, test_prefill_chunking).
    test_configs = [
        (False, "mp", False, None, False),  # baseline: no spec decoding
        (False, "mp", False, ngram_gpu_config, False),
        (True, "mp", False, ngram_gpu_config, True),
        (False, "mp", True, ngram_gpu_config, False),
        (True, "mp", True, ngram_gpu_config, False),
        (True, "uni", True, ngram_gpu_config, False),
        (True, "mp", True, ngram_gpu_config, True),
    ]

    # Use MODEL (Qwen) for ngram_gpu tests as it's lighter weight
    # and ngram_gpu doesn't require a specific draft model.
    run_tests(monkeypatch, MODEL, test_configs, [{}])


@dynamo_config.patch(cache_size_limit=16)
def run_tests(
monkeypatch: pytest.MonkeyPatch,
Expand Down Expand Up @@ -282,19 +318,20 @@ def run_test(
else dict(gpu_memory_utilization=0.9)
)
spec_mml = (spec_config or {}).get("max_model_len")
spec_method = (spec_config or {}).get("method", "none")
test_config = (
f"executor={executor}, preemption={test_preemption}, "
f"async_sched={async_scheduling}, "
f"chunk_prefill={test_prefill_chunking}, "
f"spec_decoding={spec_decoding}, spec_mml={spec_mml}"
f"spec_decoding={spec_decoding}, spec_method={spec_method}, spec_mml={spec_mml}"
)
print("-" * 80)
print(f"---- TESTING {test_str}: {test_config}")
print("-" * 80)

with VllmRunner(
model,
max_model_len=512,
max_model_len=4096,
enable_chunked_prefill=test_prefill_chunking,
# Force prefill chunking
max_num_batched_tokens=48 if test_prefill_chunking else None,
Expand Down
28 changes: 28 additions & 0 deletions tests/v1/e2e/test_spec_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,34 @@ def test_ngram_and_suffix_correctness(
cleanup_dist_env_and_memory()


@pytest.mark.parametrize("async_scheduling", [True], ids=["async"])
@single_gpu_only
@large_gpu_mark(min_gb=20)
def test_ngram_gpu_default_with_async_scheduling(
    async_scheduling: bool,
):
    """
    Test ngram_gpu speculative decoding correctness with async scheduling
    enabled, validated via GSM8K accuracy.

    Config: prompt lookup window [2, 3], 2 speculative tokens.
    Uses Qwen/Qwen3-8B (ref GSM8K accuracy: 87%-92%).
    """
    qwen3_model = "Qwen/Qwen3-8B"
    spec_llm = LLM(
        model=qwen3_model,
        speculative_config={
            "method": "ngram_gpu",
            "prompt_lookup_max": 3,
            "prompt_lookup_min": 2,
            "num_speculative_tokens": 2,
        },
        max_model_len=4096,
        async_scheduling=async_scheduling,
    )
    # Threshold is below the 87%-92% reference range to tolerate run noise
    # while still catching gross correctness regressions.
    evaluate_llm_for_gsm8k(spec_llm, expected_accuracy_threshold=0.8)
    del spec_llm
    cleanup_dist_env_and_memory()


@single_gpu_only
@large_gpu_mark(min_gb=20)
def test_suffix_decoding_acceptance(
Expand Down
7 changes: 7 additions & 0 deletions vllm/compilation/backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -907,6 +907,13 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any:
# Honors opt-outs such as CompilationMode.NONE or VLLM_DISABLE_COMPILE_CACHE.
disable_cache = not is_compile_cache_enabled(self.inductor_config)

# TODO(patchy): ngram gpu kernel will cause vllm torch compile cache errors.
is_ngram_gpu_enabled = (
vllm_config.speculative_config is not None
and vllm_config.speculative_config.use_ngram_gpu()
)
disable_cache = disable_cache or is_ngram_gpu_enabled

if disable_cache:
logger.info_once("vLLM's torch.compile cache is disabled.", scope="local")
else:
Expand Down
10 changes: 9 additions & 1 deletion vllm/config/speculative.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,15 @@
"step3p5_mtp",
]
EagleModelTypes = Literal["eagle", "eagle3", "extract_hidden_states", MTPModelTypes]
NgramGPUTypes = Literal["ngram_gpu"]
SpeculativeMethod = Literal[
"ngram",
"medusa",
"mlp_speculator",
"draft_model",
"suffix",
EagleModelTypes,
NgramGPUTypes,
]


Expand Down Expand Up @@ -364,6 +366,8 @@ def __post_init__(self):
self.quantization = self.target_model_config.quantization
elif self.method in ("ngram", "[ngram]"):
self.model = "ngram"
elif self.method == "ngram_gpu":
self.model = "ngram_gpu"
elif self.method == "suffix":
self.model = "suffix"
elif self.method == "extract_hidden_states":
Expand All @@ -374,8 +378,9 @@ def __post_init__(self):
)

if self.method in ("ngram", "[ngram]"):
# Unified to "ngram" internally
self.method = "ngram"

if self.method in ("ngram", "ngram_gpu"):
# Set default values if not provided
if self.prompt_lookup_min is None and self.prompt_lookup_max is None:
# TODO(woosuk): Tune these values. They are arbitrarily chosen.
Expand Down Expand Up @@ -832,6 +837,9 @@ def uses_draft_model(self) -> bool:
def uses_extract_hidden_states(self) -> bool:
return self.method == "extract_hidden_states"

def use_ngram_gpu(self) -> bool:
    """Return True if the configured speculative method is "ngram_gpu"."""
    is_ngram_gpu = self.method == "ngram_gpu"
    return is_ngram_gpu

def __repr__(self) -> str:
method = self.method
model = (
Expand Down
7 changes: 5 additions & 2 deletions vllm/config/vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
from .parallel import ParallelConfig
from .profiler import ProfilerConfig
from .scheduler import SchedulerConfig
from .speculative import EagleModelTypes, SpeculativeConfig
from .speculative import EagleModelTypes, NgramGPUTypes, SpeculativeConfig
from .structured_outputs import StructuredOutputsConfig
from .utils import SupportsHash, config, replace
from .weight_transfer import WeightTransferConfig
Expand Down Expand Up @@ -698,11 +698,13 @@ def __post_init__(self):
if self.speculative_config is not None:
if (
self.speculative_config.method not in get_args(EagleModelTypes)
and self.speculative_config.method not in get_args(NgramGPUTypes)
and self.speculative_config.method != "draft_model"
):
raise ValueError(
"Currently, async scheduling is only supported "
"with EAGLE/MTP/Draft Model kind of speculative decoding."
"with EAGLE/MTP/Draft Model/NGram GPU kind of "
"speculative decoding"
)
if self.speculative_config.disable_padded_drafter_batch:
raise ValueError(
Expand All @@ -720,6 +722,7 @@ def __post_init__(self):
if (
self.speculative_config is not None
and self.speculative_config.method not in get_args(EagleModelTypes)
and self.speculative_config.method not in get_args(NgramGPUTypes)
):
logger.warning_once(
"Async scheduling not supported with %s-based "
Expand Down
2 changes: 2 additions & 0 deletions vllm/tool_parsers/hermes_tool_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,7 @@ def extract_tool_calls_streaming(
prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
"arguments"
)
assert current_tool_call is not None
cur_arguments = current_tool_call.get("arguments")

logger.debug("diffing old arguments: %s", prev_arguments)
Expand Down Expand Up @@ -489,6 +490,7 @@ def extract_tool_calls_streaming(

# handle saving the state for the current tool into
# the "prev" list for use in diffing for the next iteration
assert isinstance(current_tool_call, dict)
if self.current_tool_id == len(self.prev_tool_call_arr) - 1:
self.prev_tool_call_arr[self.current_tool_id] = current_tool_call
else:
Expand Down
Loading