diff --git a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh index 5a6834757..ef0a0a8ad 100755 --- a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh +++ b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh @@ -2,7 +2,7 @@ # DeepSeek-V4-Pro H200 vLLM MTP variant of the recipe at # https://vllm.ai/blog/deepseek-v4. Mirrors dsv4_fp8_h200.sh but adds -# --speculative-config '{"method":"mtp","num_speculative_tokens":1}' and +# --speculative-config '{"method":"mtp","num_speculative_tokens":2}' and # routes prompts through chat-formatted encoding via --dsv4 (required for # meaningful MTP acceptance numbers per AGENTS.md). @@ -65,7 +65,7 @@ $MAX_MODEL_LEN_ARG \ --max-num-batched-tokens 512 \ --no-enable-flashinfer-autotune \ --compilation-config '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' \ ---speculative-config '{"method":"mtp","num_speculative_tokens":1}' \ +--speculative-config '{"method":"mtp","num_speculative_tokens":2}' \ --tokenizer-mode deepseek_v4 \ --tool-call-parser deepseek_v4 \ --enable-auto-tool-choice \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 98fa4e8b3..b04ae1947 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2207,3 +2207,10 @@ - "run_benchmark_serving uses --dsv4 (chat-formatted prompts) per the AGENTS.md MTP rule, since EAGLE-style speculative decoding regresses acceptance on raw random tokens" - "Search space mirrors the non-MTP H200 entry: TP=8, EP=8, DP-attn=true, CONC 4-64 for both 1k1k and 8k1k, with spec-decoding: mtp" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1222 + +- config-keys: + - dsv4-fp8-h200-vllm-mtp + description: + - "Bump --speculative-config num_speculative_tokens from 1 to 2 (`{\"method\":\"mtp\",\"num_speculative_tokens\":2}`)" + - "Re-test whether H200 MTP kernels accept 2 draft tokens — Blackwell MTP runs at 2 (per @wzhao18's vLLM Blackwell MTP submission); checking if H200 has parity now" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1279