From a246c8ddb577b66a10f01229a585be8821ae49e3 Mon Sep 17 00:00:00 2001
From: Youlei Yang <youlei.yang@intel.com>
Date: Thu, 14 May 2026 01:13:57 +0000
Subject: [PATCH 1/3] Fix IndexError in ctx filter when ctx_range is empty
 (file-based bucketing)

When VLLM_BUCKETING_FROM_FILE is used, ctx_range is passed as an empty
list to generate_buckets(). The num_ctx_tokens_less_or_equal_batched_max_model_len
filter accessed ctx_range[0] unconditionally, causing IndexError.

Fix: use safe access with fallback to 0 when ctx_range is empty.

Signed-off-by: Youlei Yang <youlei.yang@intel.com>
---
 tests/unit_tests/test_bucketing.py       | 36 ++++++++++++++++++++++++
 vllm_gaudi/extension/bucketing/common.py |  5 ++--
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/tests/unit_tests/test_bucketing.py b/tests/unit_tests/test_bucketing.py
index 3145c8d3c5..5113314415 100644
--- a/tests/unit_tests/test_bucketing.py
+++ b/tests/unit_tests/test_bucketing.py
@@ -727,3 +727,39 @@ def test_decode_buckets_satisfy_ctx_filter(monkeypatch, use_contiguous_pa, max_m
                             f"ctx <= ceil(max_model_len/block_size) * bs "
                             f"(max_blocks_per_seq={max_blocks_per_seq}):\n" +
                             "\n".join(f"  bs={bs}, query={query}, ctx={ctx}" for bs, query, ctx in violations[:20]))
+
+
+def test_file_buckets_with_empty_ctx_range_no_crash(monkeypatch):
+    """When VLLM_BUCKETING_FROM_FILE is used, ctx_range is empty.
+
+    The ctx filter must not crash with IndexError on ctx_range[0].
+    Reproduces server.log issue: GraniteMoeHybrid model with file-based
+    bucketing caused IndexError in num_ctx_tokens_less_or_equal_batched_max_model_len.
+    """
+    monkeypatch.setenv("VLLM_CONTIGUOUS_PA", "false")
+    clear_config()
+    get_config()
+
+    max_model_len = 131072
+    block_size = 528
+    max_num_seqs = 32
+    max_blocks = 2424
+
+    file_buckets = [(1, 1, 256), (1, 1, 512), (2, 1, 256), (2, 1, 512), (32, 1, 2424)]
+
+    # ctx_range is empty when using file-based bucketing
+    buckets = generate_buckets(
+        bs_range=[],
+        query_range=[],
+        ctx_range=[],
+        is_prompt=False,
+        max_model_len=max_model_len,
+        max_num_seqs=max_num_seqs,
+        max_num_prefill_seqs=1,
+        max_num_batched_tokens=8192,
+        block_size=block_size,
+        max_blocks=max_blocks,
+        file_buckets=file_buckets,
+    )
+
+    assert len(buckets) > 0, "Should produce buckets from file_buckets"
diff --git a/vllm_gaudi/extension/bucketing/common.py b/vllm_gaudi/extension/bucketing/common.py
index 2fd85dbdae..6fc172cc65 100644
--- a/vllm_gaudi/extension/bucketing/common.py
+++ b/vllm_gaudi/extension/bucketing/common.py
@@ -448,10 +448,11 @@ def batch_size_smaller_than_blocks(bs, query, ctx):
         return bs <= ctx
 
     def num_ctx_tokens_less_or_equal_batched_max_model_len(bs, query, ctx):
-        is_valid = ctx <= math.ceil(max_model_len / block_size) * bs if ctx > ctx_range[0] else True
+        ctx_min = ctx_range[0] if ctx_range else 0
+        is_valid = ctx <= math.ceil(max_model_len / block_size) * bs if ctx > ctx_min else True
         if not is_valid:
             omitted_buckets.add(
-                ("condition: ctx <= math.ceil(max_model_len / block_size) * bs if ctx > ctx_range[0] else True",
+                ("condition: ctx <= math.ceil(max_model_len / block_size) * bs if ctx > ctx_min else True",
                  "-> bs, query, ctx: ", bs, query, ctx))
         return is_valid
 

From 15835274f718a984c02c6c65c9b1daf54fbf52a7 Mon Sep 17 00:00:00 2001
From: Youlei Yang <youlei.yang@intel.com>
Date: Thu, 14 May 2026 01:15:21 +0000
Subject: [PATCH 2/3] Remove ctx filter from contiguous PA decode buckets

For contiguous PA, the block range is already bounded by max_blocks in
the bucketing strategies, so the num_ctx_tokens_less_or_equal_batched_max_model_len
filter is unnecessary and incorrectly drops valid buckets.

Example: with max_model_len=2048, block_size=256, max_num_seqs=256,
bucket (256, 1, 2112) was filtered because 2112 > ceil(2048/256)*256=2048,
but 2112 is a valid user-configured VLLM_DECODE_BLOCK_BUCKET_MAX.

Signed-off-by: Youlei Yang <youlei.yang@intel.com>
---
 tests/unit_tests/test_bucketing.py       | 62 ++++++++++++++++++++----
 vllm_gaudi/extension/bucketing/common.py |  2 +-
 2 files changed, 53 insertions(+), 11 deletions(-)

diff --git a/tests/unit_tests/test_bucketing.py b/tests/unit_tests/test_bucketing.py
index 5113314415..c1b25b7d7a 100644
--- a/tests/unit_tests/test_bucketing.py
+++ b/tests/unit_tests/test_bucketing.py
@@ -530,9 +530,9 @@ def test_real_scenario_fallback_ctx_7408_not_truncated():
 def test_exponential_decode_block_limit_uncapped(monkeypatch):
     """Verify that decode block limit is computed from log2(max_decode_blocks).
 
-    With the new approach, excessive warmup buckets are controlled by
-    filters in generate_buckets() (num_ctx_tokens_less_or_equal_batched_max_model_len)
-    rather than by capping the block limit in get_decode_cfgs().
+    For contiguous PA, max_decode_blocks = min(max_blocks, ceil(max_model_len/block_size)*max_num_seqs).
+    The block range is already bounded by max_blocks, so no additional
+    ctx filter is applied to contiguous PA decode buckets.
     """
     monkeypatch.setenv("VLLM_EXPONENTIAL_BUCKETING", "true")
     monkeypatch.setenv("VLLM_CONTIGUOUS_PA", "true")
@@ -669,7 +669,6 @@ def test_padding_aware_decode_cfgs_contiguous_pa_clamps_block_range(mock_get_con
 # --- Tests that num_ctx_tokens_less_or_equal_batched_max_model_len filter is applied ---
 
 
-@pytest.mark.parametrize("use_contiguous_pa", [True, False], ids=["contiguous_pa", "non_contiguous_pa"])
 @pytest.mark.parametrize(
     ("max_model_len", "block_size", "max_num_seqs", "max_blocks", "max_num_batched_tokens"),
     [
@@ -679,13 +678,15 @@ def test_padding_aware_decode_cfgs_contiguous_pa_clamps_block_range(mock_get_con
     ],
     ids=["qwen3_32b", "small_model", "long_ctx"],
 )
-def test_decode_buckets_satisfy_ctx_filter(monkeypatch, use_contiguous_pa, max_model_len, block_size, max_num_seqs,
-                                           max_blocks, max_num_batched_tokens):
-    """Every decode bucket returned by generate_buckets must satisfy
-    num_ctx_tokens_less_or_equal_batched_max_model_len:
-        ctx <= ceil(max_model_len / block_size) * bs   (when ctx > ctx_range[0])
+def test_decode_buckets_satisfy_ctx_filter_non_contiguous_pa(monkeypatch, max_model_len, block_size, max_num_seqs,
+                                                             max_blocks, max_num_batched_tokens):
+    """For non-contiguous PA, every decode bucket returned by generate_buckets
+    must satisfy ctx <= ceil(max_model_len / block_size) * bs (when ctx > ctx_range[0]).
+
+    The filter is only applied to non-contiguous PA; contiguous PA decode
+    buckets are not filtered since their block range is already bounded by max_blocks.
     """
-    monkeypatch.setenv("VLLM_CONTIGUOUS_PA", str(use_contiguous_pa).lower())
+    monkeypatch.setenv("VLLM_CONTIGUOUS_PA", "false")
     clear_config()
     get_config()
 
@@ -729,6 +730,47 @@ def test_decode_buckets_satisfy_ctx_filter(monkeypatch, use_contiguous_pa, max_m
                             "\n".join(f"  bs={bs}, query={query}, ctx={ctx}" for bs, query, ctx in violations[:20]))
 
 
+def test_contiguous_pa_decode_buckets_not_filtered_by_ctx(monkeypatch):
+    """For contiguous PA, the ctx filter must NOT be applied to decode buckets.
+
+    Reproduces std_out.txt issue: with max_model_len=2048, block_size=256,
+    max_num_seqs=256, the bucket (256, 1, 2112) was incorrectly filtered
+    because 2112 > ceil(2048/256)*256 = 2048.
+    """
+    monkeypatch.setenv("VLLM_CONTIGUOUS_PA", "true")
+    clear_config()
+    get_config()
+
+    max_model_len = 2048
+    block_size = 256
+    max_num_seqs = 256
+    max_blocks = 2113
+    max_num_batched_tokens = 1048832
+
+    bs_range = [256]
+    query_range = [1]
+    ctx_range = list(range(1280, 2113, 64))  # 1280, 1344, ..., 2048, 2112
+    ctx_range.append(max_blocks)  # append num_hpu_blocks as done in generate_decode_buckets
+
+    buckets = generate_buckets(
+        bs_range=bs_range,
+        query_range=query_range,
+        ctx_range=ctx_range,
+        is_prompt=False,
+        max_model_len=max_model_len,
+        max_num_seqs=max_num_seqs,
+        max_num_prefill_seqs=1,
+        max_num_batched_tokens=max_num_batched_tokens,
+        block_size=block_size,
+        max_blocks=max_blocks,
+    )
+
+    bucket_ctxs = [ctx for _, _, ctx in buckets]
+    assert 2112 in bucket_ctxs, (f"Bucket ctx=2112 was incorrectly filtered out. "
+                                 f"Max ctx in buckets: {max(bucket_ctxs)}")
+    assert max_blocks in bucket_ctxs, (f"Bucket ctx={max_blocks} (num_hpu_blocks) was incorrectly filtered out.")
+
+
 def test_file_buckets_with_empty_ctx_range_no_crash(monkeypatch):
     """When VLLM_BUCKETING_FROM_FILE is used, ctx_range is empty.
 
diff --git a/vllm_gaudi/extension/bucketing/common.py b/vllm_gaudi/extension/bucketing/common.py
index 6fc172cc65..7543ed2bf7 100644
--- a/vllm_gaudi/extension/bucketing/common.py
+++ b/vllm_gaudi/extension/bucketing/common.py
@@ -464,7 +464,7 @@ def num_ctx_tokens_less_or_equal_batched_max_model_len(bs, query, ctx):
         },
         "decode": {
             # depends only on contiguous PA
-            True: [num_ctx_tokens_less_or_equal_batched_max_model_len],
+            True: [],
             False: [batch_size_smaller_than_blocks, num_ctx_tokens_less_or_equal_batched_max_model_len],
         }
     }

From 77b74a90ce8a513a16004e2999e2b9f92694fdb6 Mon Sep 17 00:00:00 2001
From: Youlei Yang <youlei.yang@intel.com>
Date: Mon, 18 May 2026 02:41:40 +0000
Subject: [PATCH 3/3] Skip filters for file-based bucketing and refactor tests

- Set filters to empty list when file_buckets is provided in
  generate_buckets(), since file-based bucketing should pass all
  user-specified buckets through without filtering.
- Rename test_file_buckets_with_empty_ctx_range_no_crash to
  test_file_buckets_bypass_filters to reflect the actual behavior.
- Add bucket (512,1,256) that would be rejected by
  batch_size_smaller_than_blocks filter to prove filters are skipped.
- Note: restricting test_decode_buckets_satisfy_ctx_filter to
  non-contiguous PA (contiguous PA decode buckets are not filtered) and
  the test_exponential_decode_block_limit_uncapped docstring update are
  made in [PATCH 2/3] of this series, not in this commit.

Signed-off-by: Youlei Yang <youlei.yang@intel.com>
---
 tests/unit_tests/test_bucketing.py       | 25 +++++++++++++-----------
 vllm_gaudi/extension/bucketing/common.py |  2 +-
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/tests/unit_tests/test_bucketing.py b/tests/unit_tests/test_bucketing.py
index c1b25b7d7a..bc2fa92e91 100644
--- a/tests/unit_tests/test_bucketing.py
+++ b/tests/unit_tests/test_bucketing.py
@@ -771,25 +771,27 @@ def test_contiguous_pa_decode_buckets_not_filtered_by_ctx(monkeypatch):
     assert max_blocks in bucket_ctxs, (f"Bucket ctx={max_blocks} (num_hpu_blocks) was incorrectly filtered out.")
 
 
-def test_file_buckets_with_empty_ctx_range_no_crash(monkeypatch):
-    """When VLLM_BUCKETING_FROM_FILE is used, ctx_range is empty.
+def test_file_buckets_bypass_filters(monkeypatch):
+    """File-based bucketing (VLLM_BUCKETING_FROM_FILE) skips all filters.
 
-    The ctx filter must not crash with IndexError on ctx_range[0].
-    Reproduces server.log issue: GraniteMoeHybrid model with file-based
-    bucketing caused IndexError in num_ctx_tokens_less_or_equal_batched_max_model_len.
+    (512,1,256) fails batch_size_smaller_than_blocks (bs > ctx) and
+    (32,1,2424) exceeds the ctx bound ceil(2048/256) * 32 = 256.
+    Since file buckets bypass filters entirely, all provided buckets
+    must appear in the output unchanged.
     """
-    monkeypatch.setenv("VLLM_CONTIGUOUS_PA", "false")
+    monkeypatch.setenv("VLLM_CONTIGUOUS_PA", "true")
     clear_config()
     get_config()
 
-    max_model_len = 131072
-    block_size = 528
+    max_model_len = 2048
+    block_size = 256
     max_num_seqs = 32
     max_blocks = 2424
 
-    file_buckets = [(1, 1, 256), (1, 1, 512), (2, 1, 256), (2, 1, 512), (32, 1, 2424)]
+    # (512,1,256) would be rejected by batch_size_smaller_than_blocks (bs > ctx)
+    # All buckets pass through because file_buckets bypass filters entirely
+    file_buckets = [(1, 1, 256), (1, 1, 512), (2, 1, 256), (512, 1, 256), (32, 1, 2424)]
 
-    # ctx_range is empty when using file-based bucketing
     buckets = generate_buckets(
         bs_range=[],
         query_range=[],
@@ -804,4 +806,5 @@ def test_file_buckets_with_empty_ctx_range_no_crash(monkeypatch):
         file_buckets=file_buckets,
     )
 
-    assert len(buckets) > 0, "Should produce buckets from file_buckets"
+    assert set(buckets) == set(file_buckets), (f"All file buckets should pass through unfiltered.\n"
+                                               f"Expected: {sorted(file_buckets)}\nGot: {sorted(buckets)}")
diff --git a/vllm_gaudi/extension/bucketing/common.py b/vllm_gaudi/extension/bucketing/common.py
index 7543ed2bf7..870cdb0ac3 100644
--- a/vllm_gaudi/extension/bucketing/common.py
+++ b/vllm_gaudi/extension/bucketing/common.py
@@ -491,7 +491,7 @@ def is_ctx_allowed(ctx):
     buckets = set()
     buckets_2d = set()
     omitted_buckets = set()
-    filters = get_filters(is_prompt, use_merged_prefill, use_contiguous_pa)
+    filters = [] if file_buckets else get_filters(is_prompt, use_merged_prefill, use_contiguous_pa)
     corrector = get_corrector(is_prompt, use_contiguous_pa)
 
     if file_buckets: