From dbc4457a5fbfa4b02818dfd4874511dfa6e53924 Mon Sep 17 00:00:00 2001 From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Date: Tue, 9 Sep 2025 19:56:08 +0000 Subject: [PATCH 1/6] add e2e test for examples/spec_decode.py and monitor AL Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 1 + examples/offline_inference/spec_decode.py | 34 +++++++++++++++++++++-- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b0d4c4456d33..5486874037e5 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -316,6 +316,7 @@ steps: - python3 offline_inference/basic/embed.py - python3 offline_inference/basic/score.py - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill - label: Platform Tests (CUDA) # 4min timeout_in_minutes: 15 diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index 5af232cb6af6..53945065be82 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -49,6 +49,7 @@ def get_custom_mm_prompts(num_prompts): def parse_args(): parser = FlexibleArgumentParser() add_dataset_parser(parser) + parser.add_argument("--test", action="store_true") parser.add_argument( "--method", type=str, @@ -72,8 +73,7 @@ def parse_args(): return parser.parse_args() -def main(): - args = parse_args() +def main(args): args.endpoint_type = "openai-chat" model_dir = args.model_dir @@ -194,6 +194,34 @@ def main(): acceptance_rate = acceptance_counts[i] / num_drafts if num_drafts > 0 else 0 print(f"acceptance at token {i}: {acceptance_rate:.2f}") + return acceptance_length + if __name__ == "__main__": - main() + args = parse_args() + acceptance_length = main(args) + + if args.test: + # takes ~30s to run on 1xH100 + assert args.method == "eagle" + assert args.tp == 1 + assert args.num_spec_tokens == 3 + assert args.dataset_path == "philschmid/mt-bench" + assert args.num_prompts == 80 + assert args.temp == 0 + assert args.top_p == 1.0 + assert args.top_k == -1 + assert args.enable_chunked_prefill + + # check acceptance length is within 1% of expected value + rtol = 0.01 + expected_acceptance_length = 2.29 + assert ( + acceptance_length <= (1 + rtol) * expected_acceptance_length + and acceptance_length >= (1 - rtol) * expected_acceptance_length + ), ( + f"acceptance_length {acceptance_length} is not \ + within {rtol * 100}% of {expected_acceptance_length}" + ) + + print("Test passed!") From cfb88fda5b26506e3eeffc94d04e231b091e859a Mon Sep 17 00:00:00 2001 From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Date: Tue, 9 Sep 2025 20:03:43 +0000 Subject: [PATCH 2/6] lint Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> --- examples/offline_inference/spec_decode.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index 53945065be82..0ec39920e0bb 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -206,6 +206,7 @@ def main(args): assert args.method == "eagle" assert args.tp == 1 assert args.num_spec_tokens == 3 + assert args.dataset_name == "hf" assert args.dataset_path == "philschmid/mt-bench" assert args.num_prompts == 80 assert args.temp == 0 @@ -220,8 +221,8 @@ def main(args): acceptance_length <= (1 + rtol) * expected_acceptance_length and acceptance_length >= (1 - rtol) * expected_acceptance_length ), ( - f"acceptance_length {acceptance_length} is not \ - within {rtol * 100}% of {expected_acceptance_length}" + f"acceptance_length {acceptance_length} is not " + f"within {rtol * 100}% of {expected_acceptance_length}" ) print("Test passed!") From a7b75fe5d26e9d63d91196b78f254cb23a41708e Mon Sep 17 00:00:00 2001 From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Date: Thu, 18 Sep 2025 14:56:45 +0000 Subject: [PATCH 3/6] add eagle3 Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 1 + examples/offline_inference/spec_decode.py | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index eb204febee89..7396efca865f 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -320,6 +320,7 @@ steps: - python3 offline_inference/basic/score.py - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill - label: Platform Tests (CUDA) # 4min timeout_in_minutes: 15 diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index 0ec39920e0bb..d83b48bad2e5 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -203,7 +203,7 @@ def main(args): if args.test: # takes ~30s to run on 1xH100 - assert args.method == "eagle" + assert args.method in ["eagle", "eagle3"] assert args.tp == 1 assert args.num_spec_tokens == 3 assert args.dataset_name == "hf" @@ -216,7 +216,11 @@ def main(args): # check acceptance length is within 1% of expected value rtol = 0.01 - expected_acceptance_length = 2.29 + if args.method == "eagle": + expected_acceptance_length = 2.29 + else: + expected_acceptance_length = 2.783 + assert ( acceptance_length <= (1 + rtol) * expected_acceptance_length and acceptance_length >= (1 - rtol) * expected_acceptance_length @@ -225,4 +229,4 @@ def main(args): f"within {rtol * 100}% of {expected_acceptance_length}" ) - print("Test passed!") + print(f"Test passed! Expected AL: {expected_acceptance_length}, got {acceptance_length}") From ea726fdafb4a5c1729a04a8ed1ececa07c92f6bb Mon Sep 17 00:00:00 2001 From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Date: Thu, 18 Sep 2025 16:39:00 +0000 Subject: [PATCH 4/6] lint Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> --- examples/offline_inference/spec_decode.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index d83b48bad2e5..a8b4408e210c 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -216,11 +216,8 @@ def main(args): # check acceptance length is within 1% of expected value rtol = 0.01 - if args.method == "eagle": - expected_acceptance_length = 2.29 - else: - expected_acceptance_length = 2.783 - + expected_acceptance_length = 2.29 if args.method == "eagle" else 2.783 + assert ( acceptance_length <= (1 + rtol) * expected_acceptance_length and acceptance_length >= (1 - rtol) * expected_acceptance_length @@ -229,4 +226,7 @@ def main(args): f"within {rtol * 100}% of {expected_acceptance_length}" ) - print(f"Test passed! Expected AL: {expected_acceptance_length}, got {acceptance_length}") + print( + f"Test passed! Expected AL: " + f"{expected_acceptance_length}, got {acceptance_length}" + ) From 08355b60fe7e945bcc5e2d01409c354df18b86be Mon Sep 17 00:00:00 2001 From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Date: Mon, 22 Sep 2025 15:49:09 +0000 Subject: [PATCH 5/6] max model len OOM Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 4 ++-- examples/offline_inference/spec_decode.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 7396efca865f..54dbcbc7edfe 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -319,8 +319,8 @@ steps: - python3 offline_inference/basic/embed.py - python3 offline_inference/basic/score.py - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - label: Platform Tests (CUDA) # 4min timeout_in_minutes: 15 diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index a8b4408e210c..6b115519fd08 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -62,6 +62,7 @@ def parse_args(): parser.add_argument("--tp", type=int, default=1) parser.add_argument("--enforce-eager", action="store_true") parser.add_argument("--enable-chunked-prefill", action="store_true") + parser.add_argument("--max-model-len", type=int, default=16384) parser.add_argument("--temp", type=float, default=0) parser.add_argument("--top-p", type=float, default=1.0) parser.add_argument("--top-k", type=int, default=-1) @@ -130,7 +131,7 @@ def main(args): gpu_memory_utilization=0.8, speculative_config=speculative_config, disable_log_stats=False, - max_model_len=16384, + max_model_len=args.max_model_len, limit_mm_per_prompt={"image": 5}, disable_chunked_mm_input=True, ) From 7745c2cf3b676cd6745a855a630bd36da7e229a7 Mon Sep 17 00:00:00 2001 From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Date: Tue, 23 Sep 2025 15:16:55 +0000 Subject: [PATCH 6/6] update AL Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> --- examples/offline_inference/spec_decode.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index 26103fd00eb8..ce078bce0b75 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -219,9 +219,9 @@ def main(args): assert args.top_k == -1 assert args.enable_chunked_prefill - # check acceptance length is within 1% of expected value - rtol = 0.01 - expected_acceptance_length = 2.29 if args.method == "eagle" else 2.783 + # check acceptance length is within 2% of expected value + rtol = 0.02 + expected_acceptance_length = 2.296 if args.method == "eagle" else 2.811 assert ( acceptance_length <= (1 + rtol) * expected_acceptance_length