From dbc4457a5fbfa4b02818dfd4874511dfa6e53924 Mon Sep 17 00:00:00 2001
From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Date: Tue, 9 Sep 2025 19:56:08 +0000
Subject: [PATCH 1/6] add e2e test for examples/offline_inference/spec_decode.py and monitor acceptance length (AL)

Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
---
 .buildkite/test-pipeline.yaml             |  1 +
 examples/offline_inference/spec_decode.py | 34 +++++++++++++++++++++--
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index b0d4c4456d33..5486874037e5 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -316,6 +316,7 @@ steps:
     - python3 offline_inference/basic/embed.py
     - python3 offline_inference/basic/score.py
     - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
+    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill
 
 - label: Platform Tests (CUDA) # 4min
   timeout_in_minutes: 15
diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py
index 5af232cb6af6..53945065be82 100644
--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@@ -49,6 +49,7 @@ def get_custom_mm_prompts(num_prompts):
 def parse_args():
     parser = FlexibleArgumentParser()
     add_dataset_parser(parser)
+    parser.add_argument("--test", action="store_true")
     parser.add_argument(
         "--method",
         type=str,
@@ -72,8 +73,7 @@ def parse_args():
     return parser.parse_args()
 
 
-def main():
-    args = parse_args()
+def main(args):
     args.endpoint_type = "openai-chat"
 
     model_dir = args.model_dir
@@ -194,6 +194,34 @@ def main():
         acceptance_rate = acceptance_counts[i] / num_drafts if num_drafts > 0 else 0
         print(f"acceptance at token {i}: {acceptance_rate:.2f}")
 
+    return acceptance_length
+
 
 if __name__ == "__main__":
-    main()
+    args = parse_args()
+    acceptance_length = main(args)
+
+    if args.test:
+        # takes ~30s to run on 1xH100
+        assert args.method == "eagle"
+        assert args.tp == 1
+        assert args.num_spec_tokens == 3
+        assert args.dataset_path == "philschmid/mt-bench"
+        assert args.num_prompts == 80
+        assert args.temp == 0
+        assert args.top_p == 1.0
+        assert args.top_k == -1
+        assert args.enable_chunked_prefill
+
+        # check acceptance length is within 1% of expected value
+        rtol = 0.01
+        expected_acceptance_length = 2.29
+        assert (
+            acceptance_length <= (1 + rtol) * expected_acceptance_length
+            and acceptance_length >= (1 - rtol) * expected_acceptance_length
+        ), (
+            f"acceptance_length {acceptance_length} is not \
+            within {rtol * 100}% of {expected_acceptance_length}"
+        )
+
+        print("Test passed!")

From cfb88fda5b26506e3eeffc94d04e231b091e859a Mon Sep 17 00:00:00 2001
From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Date: Tue, 9 Sep 2025 20:03:43 +0000
Subject: [PATCH 2/6] lint

Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
---
 examples/offline_inference/spec_decode.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py
index 53945065be82..0ec39920e0bb 100644
--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@@ -206,6 +206,7 @@ def main(args):
         assert args.method == "eagle"
         assert args.tp == 1
         assert args.num_spec_tokens == 3
+        assert args.dataset_name == "hf"
         assert args.dataset_path == "philschmid/mt-bench"
         assert args.num_prompts == 80
         assert args.temp == 0
@@ -220,8 +221,8 @@ def main(args):
             acceptance_length <= (1 + rtol) * expected_acceptance_length
             and acceptance_length >= (1 - rtol) * expected_acceptance_length
         ), (
-            f"acceptance_length {acceptance_length} is not \
-            within {rtol * 100}% of {expected_acceptance_length}"
+            f"acceptance_length {acceptance_length} is not "
+            f"within {rtol * 100}% of {expected_acceptance_length}"
         )
 
         print("Test passed!")

From a7b75fe5d26e9d63d91196b78f254cb23a41708e Mon Sep 17 00:00:00 2001
From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Date: Thu, 18 Sep 2025 14:56:45 +0000
Subject: [PATCH 3/6] add eagle3

Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
---
 .buildkite/test-pipeline.yaml             |  1 +
 examples/offline_inference/spec_decode.py | 10 +++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index eb204febee89..7396efca865f 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -320,6 +320,7 @@ steps:
     - python3 offline_inference/basic/score.py
     - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
     - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill
+    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill
 
 - label: Platform Tests (CUDA) # 4min
   timeout_in_minutes: 15
diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py
index 0ec39920e0bb..d83b48bad2e5 100644
--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@@ -203,7 +203,7 @@ def main(args):
 
     if args.test:
         # takes ~30s to run on 1xH100
-        assert args.method == "eagle"
+        assert args.method in ["eagle", "eagle3"]
         assert args.tp == 1
         assert args.num_spec_tokens == 3
         assert args.dataset_name == "hf"
@@ -216,7 +216,11 @@ def main(args):
 
         # check acceptance length is within 1% of expected value
         rtol = 0.01
-        expected_acceptance_length = 2.29
+        if args.method == "eagle":
+            expected_acceptance_length = 2.29
+        else:
+            expected_acceptance_length = 2.783
+        
         assert (
             acceptance_length <= (1 + rtol) * expected_acceptance_length
             and acceptance_length >= (1 - rtol) * expected_acceptance_length
@@ -225,4 +229,4 @@ def main(args):
             f"within {rtol * 100}% of {expected_acceptance_length}"
         )
 
-        print("Test passed!")
+        print(f"Test passed! Expected AL: {expected_acceptance_length}, got {acceptance_length}")

From ea726fdafb4a5c1729a04a8ed1ececa07c92f6bb Mon Sep 17 00:00:00 2001
From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Date: Thu, 18 Sep 2025 16:39:00 +0000
Subject: [PATCH 4/6] lint

Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
---
 examples/offline_inference/spec_decode.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py
index d83b48bad2e5..a8b4408e210c 100644
--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@@ -216,11 +216,8 @@ def main(args):
 
         # check acceptance length is within 1% of expected value
         rtol = 0.01
-        if args.method == "eagle":
-            expected_acceptance_length = 2.29
-        else:
-            expected_acceptance_length = 2.783
-        
+        expected_acceptance_length = 2.29 if args.method == "eagle" else 2.783
+
         assert (
             acceptance_length <= (1 + rtol) * expected_acceptance_length
             and acceptance_length >= (1 - rtol) * expected_acceptance_length
@@ -229,4 +226,7 @@ def main(args):
             f"within {rtol * 100}% of {expected_acceptance_length}"
         )
 
-        print(f"Test passed! Expected AL: {expected_acceptance_length}, got {acceptance_length}")
+        print(
+            f"Test passed! Expected AL: "
+            f"{expected_acceptance_length}, got {acceptance_length}"
+        )

From 08355b60fe7e945bcc5e2d01409c354df18b86be Mon Sep 17 00:00:00 2001
From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Date: Mon, 22 Sep 2025 15:49:09 +0000
Subject: [PATCH 5/6] add --max-model-len flag and use 2048 in CI to avoid OOM

Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
---
 .buildkite/test-pipeline.yaml             | 4 ++--
 examples/offline_inference/spec_decode.py | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 7396efca865f..54dbcbc7edfe 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -319,8 +319,8 @@ steps:
     - python3 offline_inference/basic/embed.py
     - python3 offline_inference/basic/score.py
     - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
-    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill
-    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill
+    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
 
 - label: Platform Tests (CUDA) # 4min
   timeout_in_minutes: 15
diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py
index a8b4408e210c..6b115519fd08 100644
--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@@ -62,6 +62,7 @@ def parse_args():
     parser.add_argument("--tp", type=int, default=1)
     parser.add_argument("--enforce-eager", action="store_true")
     parser.add_argument("--enable-chunked-prefill", action="store_true")
+    parser.add_argument("--max-model-len", type=int, default=16384)
     parser.add_argument("--temp", type=float, default=0)
     parser.add_argument("--top-p", type=float, default=1.0)
     parser.add_argument("--top-k", type=int, default=-1)
@@ -130,7 +131,7 @@ def main(args):
         gpu_memory_utilization=0.8,
         speculative_config=speculative_config,
         disable_log_stats=False,
-        max_model_len=16384,
+        max_model_len=args.max_model_len,
         limit_mm_per_prompt={"image": 5},
         disable_chunked_mm_input=True,
     )

From 7745c2cf3b676cd6745a855a630bd36da7e229a7 Mon Sep 17 00:00:00 2001
From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Date: Tue, 23 Sep 2025 15:16:55 +0000
Subject: [PATCH 6/6] update expected AL values and relax tolerance to 2%

Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
---
 examples/offline_inference/spec_decode.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py
index 26103fd00eb8..ce078bce0b75 100644
--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@@ -219,9 +219,9 @@ def main(args):
         assert args.top_k == -1
         assert args.enable_chunked_prefill
 
-        # check acceptance length is within 1% of expected value
-        rtol = 0.01
-        expected_acceptance_length = 2.29 if args.method == "eagle" else 2.783
+        # check acceptance length is within 2% of expected value
+        rtol = 0.02
+        expected_acceptance_length = 2.296 if args.method == "eagle" else 2.811
 
         assert (
             acceptance_length <= (1 + rtol) * expected_acceptance_length