-
-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[Bugfix] Fix NemotronH MTP + Chunked Prefill #35447
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
tdoublep
merged 16 commits into
vllm-project:main
from
CentML:nemotron-h-mtp-chunkedprefill-bugfix
Mar 17, 2026
Merged
Changes from all commits
Commits
Show all changes
16 commits
Select commit
Hold shift + click to select a range
4c47382
fix chunked prefill for mamba2 MTP
benchislett 7fca59d
reproducer
benchislett 6ec6aba
update test case
benchislett 7b5f9a7
fix assert
benchislett 2501932
revert layernorm change
benchislett 9fbf1e6
gpu-compatible mamba fix
benchislett b14fd43
add coverage to test for align-mode case
benchislett 58de553
Merge branch 'main' into nemotron-h-mtp-chunkedprefill-bugfix
benchislett a29219b
slight refactor
benchislett 364ce6e
use a smaller model for test
benchislett a77b706
Merge branch 'main' into nemotron-h-mtp-chunkedprefill-bugfix
benchislett 938f9b8
Merge branch 'main' into nemotron-h-mtp-chunkedprefill-bugfix
benchislett 7a1a98a
Merge branch 'main' into nemotron-h-mtp-chunkedprefill-bugfix
benchislett a4f6afe
Merge branch 'main' into nemotron-h-mtp-chunkedprefill-bugfix
benchislett 1cd7c5c
add large gpu marks for e2e test
benchislett 094597f
remove unused assert
benchislett File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,104 @@ | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||
|
|
||
| import pytest | ||
|
|
||
| from vllm import SamplingParams | ||
| from vllm.platforms import current_platform | ||
|
|
||
| from ...utils import large_gpu_mark, multi_gpu_marks | ||
|
|
||
# A trivial request with a short prompt to ensure we run a mixed batch.
# Single-turn chat message in the OpenAI role/content format.
SMALL_MESSAGE = [
    dict(
        role="user",
        content="The secret beta value is 64. What is the secret beta?",
    )
]
|
|
||
# Sample prompt with a bunch of filler in between the critical fact and the request.
# Both parts need to be processed properly for the model to generate the correct answer.
# The content is assembled from the same sentence fragments as the original literal,
# so the resulting string is byte-identical.
MESSAGES = [
    dict(
        role="user",
        content="".join(
            (
                "Important: The secret number is 42. ",
                "The sky is green in this hypothetical world. ",
                "Apples grow on trees in the forest. ",
                "Rivers flow through the valleys and mountains. ",
                "Birds sing songs in the early morning light. ",
                "The weather today is sunny with clear skies ahead. ",
                "Flowers bloom in the garden during spring season. ",
                "Now answer with ONLY the number and nothing else: ",
                "What is the secret number plus one?",
            )
        ),
    )
]
|
|
||
|
|
||
@pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA not available")
@pytest.mark.parametrize(
    "model_name",
    [
        pytest.param("Qwen/Qwen3.5-4B", marks=[large_gpu_mark(min_gb=40)]),
        pytest.param(
            "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8",
            marks=[large_gpu_mark(min_gb=80)] + multi_gpu_marks(num_gpus=2),
        ),
    ],
)
@pytest.mark.parametrize("enable_prefix_caching", [False, True])
def test_mtp_speculative_mixed_batch_short_prefill(
    vllm_runner, model_name, enable_prefix_caching
):
    """Regression test: MTP speculative decoding must correctly handle short
    prefill chunks that fall below the reorder_batch_threshold.

    Runs two chat requests greedily through a vLLM engine configured with
    chunked prefill and a very large number of speculative tokens, then checks
    that both responses recall facts from their prompts.
    """
    # Draft-token count is set so large that both prefills will be classified
    # as decodes in a mixed batch. Note: with prefix caching we require
    # chunk_size >= mamba_block_size, hence the much larger chunk size there.
    if enable_prefix_caching:
        chunk_size = 16384
        cache_mode = "align"
    else:
        chunk_size = 256
        cache_mode = "none"
    num_draft_tokens = 100

    # NOTE(review): tensor_parallel_size=2 is fixed for both model params, but
    # only the Nemotron param carries multi_gpu_marks(num_gpus=2) — confirm the
    # Qwen case is intended to run with TP=2 on a single large GPU host.
    with vllm_runner(
        model_name,
        speculative_config={
            "method": "mtp",
            "num_speculative_tokens": num_draft_tokens,
        },
        max_num_batched_tokens=chunk_size,
        max_model_len=512,
        enforce_eager=True,
        tensor_parallel_size=2,
        trust_remote_code=True,
        enable_chunked_prefill=True,
        enable_prefix_caching=enable_prefix_caching,
        mamba_cache_mode=cache_mode,
    ) as llm:
        # Greedy sampling so the expected substrings are deterministic.
        greedy_params = SamplingParams(
            temperature=0.0,
            max_tokens=128,
        )

        # First small message gets prefilled first, under normal conditions since the
        # batch is not yet mixed. Then the second prefill arrives as a mixed batch, but
        # is shorter than num_speculative_tokens, so it gets misclassified as a decode
        # and processed with the wrong state management logic, causing the critical
        # fact from the first chunk to be lost and the model to generate nonsense.
        outputs = llm.get_llm().chat(
            [SMALL_MESSAGE, MESSAGES],
            greedy_params,
            chat_template_kwargs={"enable_thinking": False},
        )

        responses = []
        for out in outputs:
            text = out.outputs[0].text
            print(f"Generated text: {text!r}")
            responses.append(text)

        assert "64" in responses[0], (
            "The first response should contain the correct value of 64."
        )
        assert "43" in responses[1], (
            "The second response should contain the correct value of 42+1=43."
        )
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.