Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
121 commits
Select commit Hold shift + click to select a range
c92e4b8
[CI Failure] Fix Gemma3 RoPE configuration for sliding attention laye…
hl475 Nov 21, 2025
d183dcb
fix typo error
Nov 21, 2025
293e3ae
fix return values in ngram gpu
Nov 21, 2025
4534c88
python3.13 pre-commit check
Nov 24, 2025
07e6b8a
fix pre-commit and sign-off
Nov 24, 2025
2f08629
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Nov 24, 2025
e70b060
fix ngram gpu kernel compile issue
Nov 25, 2025
cde94b2
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Nov 25, 2025
33c4437
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Nov 25, 2025
25d36b1
fix docs bug
Nov 26, 2025
71b0dca
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Nov 26, 2025
183556e
v.01
Nov 29, 2025
f6f871f
test
Nov 30, 2025
1fbf296
fix large batch performance.
Dec 2, 2025
b5243ec
refactor ngram gpu
Dec 2, 2025
0081487
modify nvtx
Dec 2, 2025
bcf454f
change copy to async
Dec 2, 2025
0d2638b
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Dec 2, 2025
34cc523
remove irrelevant files
Dec 2, 2025
c9f2724
use discard_request_mask in ngram
Dec 2, 2025
16eb87c
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Dec 2, 2025
82ff639
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Dec 2, 2025
3abd884
remove irrelevant computations
Dec 4, 2025
cd9ecc9
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Dec 4, 2025
38cf7fd
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Dec 4, 2025
b518ef2
remove irrelevant comments
Dec 4, 2025
d07f4a7
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Dec 4, 2025
3d28827
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Dec 4, 2025
8920a59
move token ids tensor gpu init inline
Dec 4, 2025
6967bb2
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Dec 4, 2025
25d6b1f
remove unused status check
Dec 4, 2025
3a6df84
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Dec 4, 2025
65260a4
detailed comments in ngram gpu
Dec 18, 2025
37b1bb2
remove irrelevant input params for _dummy_run
Dec 18, 2025
14243fb
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Dec 18, 2025
430fc13
move the preprocess of token_ids_gpu_tensor and mask tensor into Ngra…
Dec 18, 2025
4e0eca7
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Dec 18, 2025
ddf24aa
merge conflicts fixed
Dec 19, 2025
63180be
change the CompileConfig to match latest vllm config
Dec 19, 2025
30b463a
fix documents
Dec 19, 2025
e1a44d1
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Dec 19, 2025
9e7b089
fix vllm config in ngram gpu
Dec 22, 2025
3d0510e
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Dec 22, 2025
8b7865c
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Dec 22, 2025
2769039
enable ngram gpu in sync mode
Dec 23, 2025
6a3a26c
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Dec 23, 2025
588bb65
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Dec 23, 2025
b28ffd3
merge conflicts fixed
Dec 30, 2025
732ce0c
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 1, 2026
8ef01b7
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 3, 2026
4538bea
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 3, 2026
f505d97
merge conflicts fixed
Jan 8, 2026
3cff47f
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Jan 8, 2026
cc3700a
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 8, 2026
f141cc1
merge conflicts fixed
Jan 9, 2026
1d94b70
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 9, 2026
064707c
modify ngram gpu process
Jan 9, 2026
93375ff
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Jan 9, 2026
eac7085
remove irrelevant codes
Jan 9, 2026
6b46372
merge conflicts
Jan 12, 2026
2a14605
vllm async conf check
Jan 12, 2026
c683c56
merge conflicts fixed
Jan 14, 2026
ab2b0d5
change sync data access to async
Jan 14, 2026
30824ed
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 14, 2026
a87fe7d
pre-commit fixed
Jan 14, 2026
879c488
merge conflicts fixed
Jan 20, 2026
4b9511b
Merge branch 'vllm-project:main' into patchy/async_ngram
PatchouliTIS Jan 20, 2026
ff33c28
comments resolved, redundant codes removed.
Jan 21, 2026
1ee4cce
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 21, 2026
3faf03e
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 21, 2026
baf359c
Merge branch 'main' of github.com:vllm-project/vllm into patchy/async…
Jan 22, 2026
4451866
remove overcomments and disorganized codes
Jan 22, 2026
e9d510f
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 23, 2026
7c16ba0
typo error fixed in gpu_model_runner
Jan 23, 2026
7050336
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Jan 23, 2026
1dd58e6
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 23, 2026
3764a0f
merge conflicts fixed
Jan 26, 2026
64737f3
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 28, 2026
2f8ab86
Merge branch 'main' of github.com:vllm-project/vllm into patchy/async…
Jan 29, 2026
b0104b2
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 29, 2026
c345829
merge conflicts fixed
Jan 29, 2026
4f467f5
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Jan 29, 2026
9a9b35e
pre-commits error fixed
Jan 29, 2026
c188d8b
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 30, 2026
2d5edf9
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Jan 31, 2026
85de9af
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Feb 2, 2026
2db87e8
merge conflicts fixed
Feb 4, 2026
234ff7b
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Feb 4, 2026
3221bb0
merge conflicts
Feb 6, 2026
e42ecd3
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Feb 11, 2026
c68efc7
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Feb 14, 2026
a356135
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Feb 15, 2026
eb20909
merge conflicts fixed
Feb 26, 2026
18e462d
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Feb 26, 2026
3dc6545
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Feb 27, 2026
3388786
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Feb 27, 2026
67394d7
fixed bugs during preemption
Feb 27, 2026
a34c5be
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Feb 27, 2026
c8b8d71
fix bugs in preemption and add GSM8k Tests
Feb 28, 2026
0eab1f9
Merge branch 'main' of github.com:vllm-project/vllm into patchy/async…
Feb 28, 2026
8ae962b
fix merge conflicts in gpu_model_runner
Feb 28, 2026
1e12b12
fix merge conflicts in gpu_model_runner
Feb 28, 2026
3bf97f2
fix bugs
Feb 28, 2026
a2d216f
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Feb 28, 2026
33a8aab
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Mar 2, 2026
7e7ecac
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Mar 2, 2026
5fdb7bc
merge conflicts
Mar 3, 2026
cb4fa70
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Mar 3, 2026
6748677
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Mar 3, 2026
21e26fc
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Mar 4, 2026
0f1046c
Merge branch 'main' of github.com:vllm-project/vllm into patchy/async…
Mar 5, 2026
afd7933
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Mar 5, 2026
942c8ae
remove cuda api call
Mar 5, 2026
b9e9c8a
Merge branch 'patchy/async_ngram' of https://github.com/PatchouliTIS/…
Mar 5, 2026
9da1866
Merge branch 'main' of github.com:vllm-project/vllm into patchy/async…
Mar 5, 2026
a5e7bb3
pre-commits fixed
Mar 5, 2026
07fa301
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Mar 6, 2026
e48c64f
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Mar 6, 2026
cb508b6
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Mar 6, 2026
bc71da2
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Mar 7, 2026
fc64156
Merge branch 'main' into patchy/async_ngram
PatchouliTIS Mar 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 40 additions & 3 deletions tests/v1/e2e/test_async_scheduling.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def test_without_spec_decoding(

@single_gpu_only
@large_gpu_mark(min_gb=16)
def test_with_spec_decoding(sample_json_schema, monkeypatch: pytest.MonkeyPatch):
def test_with_eagle3_spec_decoding(sample_json_schema, monkeypatch: pytest.MonkeyPatch):
"""Test consistency and acceptance rates with some different combos of
preemption, executor, async scheduling, prefill chunking,
spec decoding model length.
Expand Down Expand Up @@ -154,6 +154,42 @@ def test_with_spec_decoding(sample_json_schema, monkeypatch: pytest.MonkeyPatch)
)


@single_gpu_only
@large_gpu_mark(min_gb=16)
def test_with_ngram_gpu_spec_decoding(monkeypatch: pytest.MonkeyPatch):
    """Test ngram_gpu speculative decoding end-to-end consistency.

    Runs one baseline configuration without speculative decoding, then the
    same ngram_gpu configuration (3 speculative tokens, prompt lookup
    window [2, 3]) across combinations of:
    - preemption on/off
    - executor backend ("mp" and "uni")
    - async scheduling on/off
    - prefill chunking on/off
    """

    ngram_gpu_config = {
        "method": "ngram_gpu",
        "num_speculative_tokens": 3,
        "prompt_lookup_max": 3,
        "prompt_lookup_min": 2,
    }

    # Each tuple is (test_preemption, executor, async_scheduling,
    # spec_config, test_prefill_chunking).
    test_configs = [
        (False, "mp", False, None, False),  # baseline: no spec decoding
        (False, "mp", False, ngram_gpu_config, False),
        (True, "mp", False, ngram_gpu_config, True),
        (False, "mp", True, ngram_gpu_config, False),
        (True, "mp", True, ngram_gpu_config, False),
        (True, "uni", True, ngram_gpu_config, False),
        (True, "mp", True, ngram_gpu_config, True),
    ]

    # Use MODEL (Qwen) for ngram_gpu tests as it's lighter weight
    # and ngram_gpu doesn't require a specific draft model.
    run_tests(monkeypatch, MODEL, test_configs, [{}])


@dynamo_config.patch(cache_size_limit=16)
def run_tests(
monkeypatch: pytest.MonkeyPatch,
Expand Down Expand Up @@ -282,19 +318,20 @@ def run_test(
else dict(gpu_memory_utilization=0.9)
)
spec_mml = (spec_config or {}).get("max_model_len")
spec_method = (spec_config or {}).get("method", "none")
test_config = (
f"executor={executor}, preemption={test_preemption}, "
f"async_sched={async_scheduling}, "
f"chunk_prefill={test_prefill_chunking}, "
f"spec_decoding={spec_decoding}, spec_mml={spec_mml}"
f"spec_decoding={spec_decoding}, spec_method={spec_method}, spec_mml={spec_mml}"
)
print("-" * 80)
print(f"---- TESTING {test_str}: {test_config}")
print("-" * 80)

with VllmRunner(
model,
max_model_len=512,
max_model_len=4096,
enable_chunked_prefill=test_prefill_chunking,
# Force prefill chunking
max_num_batched_tokens=48 if test_prefill_chunking else None,
Expand Down
28 changes: 28 additions & 0 deletions tests/v1/e2e/test_spec_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,34 @@ def test_ngram_and_suffix_correctness(
cleanup_dist_env_and_memory()


@pytest.mark.parametrize("async_scheduling", [True], ids=["async"])
@single_gpu_only
@large_gpu_mark(min_gb=20)
def test_ngram_gpu_default_with_async_scheduling(
    async_scheduling: bool,
):
    """
    Test ngram_gpu speculative decoding correctness with async scheduling
    enabled, validated via GSM8K accuracy.

    Config: prompt lookup window [2, 3], 2 speculative tokens.
    Uses Qwen/Qwen3-8B (ref GSM8K accuracy: 87%-92%).
    """
    qwen3_model = "Qwen/Qwen3-8B"
    spec_llm = LLM(
        model=qwen3_model,
        speculative_config={
            "method": "ngram_gpu",
            "prompt_lookup_max": 3,
            "prompt_lookup_min": 2,
            "num_speculative_tokens": 2,
        },
        max_model_len=4096,
        async_scheduling=async_scheduling,
    )
    # Threshold is below the 87%-92% reference range to tolerate run noise
    # while still catching gross correctness regressions.
    evaluate_llm_for_gsm8k(spec_llm, expected_accuracy_threshold=0.8)
    del spec_llm
    cleanup_dist_env_and_memory()


@single_gpu_only
@large_gpu_mark(min_gb=20)
def test_suffix_decoding_acceptance(
Expand Down
7 changes: 7 additions & 0 deletions vllm/compilation/backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -907,6 +907,13 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any:
# Honors opt-outs such as CompilationMode.NONE or VLLM_DISABLE_COMPILE_CACHE.
disable_cache = not is_compile_cache_enabled(self.inductor_config)

# TODO(patchy): ngram gpu kernel will cause vllm torch compile cache errors.
is_ngram_gpu_enabled = (
vllm_config.speculative_config is not None
and vllm_config.speculative_config.use_ngram_gpu()
)
disable_cache = disable_cache or is_ngram_gpu_enabled

if disable_cache:
logger.info_once("vLLM's torch.compile cache is disabled.", scope="local")
else:
Expand Down
10 changes: 9 additions & 1 deletion vllm/config/speculative.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,15 @@
"step3p5_mtp",
]
EagleModelTypes = Literal["eagle", "eagle3", "extract_hidden_states", MTPModelTypes]
NgramGPUTypes = Literal["ngram_gpu"]
SpeculativeMethod = Literal[
"ngram",
"medusa",
"mlp_speculator",
"draft_model",
"suffix",
EagleModelTypes,
NgramGPUTypes,
]


Expand Down Expand Up @@ -364,6 +366,8 @@ def __post_init__(self):
self.quantization = self.target_model_config.quantization
elif self.method in ("ngram", "[ngram]"):
self.model = "ngram"
elif self.method == "ngram_gpu":
self.model = "ngram_gpu"
elif self.method == "suffix":
self.model = "suffix"
elif self.method == "extract_hidden_states":
Expand All @@ -374,8 +378,9 @@ def __post_init__(self):
)

if self.method in ("ngram", "[ngram]"):
# Unified to "ngram" internally
self.method = "ngram"

if self.method in ("ngram", "ngram_gpu"):
# Set default values if not provided
if self.prompt_lookup_min is None and self.prompt_lookup_max is None:
# TODO(woosuk): Tune these values. They are arbitrarily chosen.
Expand Down Expand Up @@ -832,6 +837,9 @@ def uses_draft_model(self) -> bool:
def uses_extract_hidden_states(self) -> bool:
return self.method == "extract_hidden_states"

def use_ngram_gpu(self) -> bool:
    """Return True if the configured speculative method is "ngram_gpu"."""
    is_ngram_gpu = self.method == "ngram_gpu"
    return is_ngram_gpu

def __repr__(self) -> str:
method = self.method
model = (
Expand Down
7 changes: 5 additions & 2 deletions vllm/config/vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
from .parallel import ParallelConfig
from .profiler import ProfilerConfig
from .scheduler import SchedulerConfig
from .speculative import EagleModelTypes, SpeculativeConfig
from .speculative import EagleModelTypes, NgramGPUTypes, SpeculativeConfig
from .structured_outputs import StructuredOutputsConfig
from .utils import SupportsHash, config, replace
from .weight_transfer import WeightTransferConfig
Expand Down Expand Up @@ -698,11 +698,13 @@ def __post_init__(self):
if self.speculative_config is not None:
if (
self.speculative_config.method not in get_args(EagleModelTypes)
and self.speculative_config.method not in get_args(NgramGPUTypes)
and self.speculative_config.method != "draft_model"
):
raise ValueError(
"Currently, async scheduling is only supported "
"with EAGLE/MTP/Draft Model kind of speculative decoding."
"with EAGLE/MTP/Draft Model/NGram GPU kind of "
"speculative decoding"
)
if self.speculative_config.disable_padded_drafter_batch:
raise ValueError(
Expand All @@ -720,6 +722,7 @@ def __post_init__(self):
if (
self.speculative_config is not None
and self.speculative_config.method not in get_args(EagleModelTypes)
and self.speculative_config.method not in get_args(NgramGPUTypes)
):
logger.warning_once(
"Async scheduling not supported with %s-based "
Expand Down
2 changes: 2 additions & 0 deletions vllm/tool_parsers/hermes_tool_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,7 @@ def extract_tool_calls_streaming(
prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
"arguments"
)
assert current_tool_call is not None
cur_arguments = current_tool_call.get("arguments")

logger.debug("diffing old arguments: %s", prev_arguments)
Expand Down Expand Up @@ -489,6 +490,7 @@ def extract_tool_calls_streaming(

# handle saving the state for the current tool into
# the "prev" list for use in diffing for the next iteration
assert isinstance(current_tool_call, dict)
if self.current_tool_id == len(self.prev_tool_call_arr) - 1:
self.prev_tool_call_arr[self.current_tool_id] = current_tool_call
else:
Expand Down
Loading