From a246c8ddb577b66a10f01229a585be8821ae49e3 Mon Sep 17 00:00:00 2001 From: Youlei Yang Date: Thu, 14 May 2026 01:13:57 +0000 Subject: [PATCH 1/3] Fix IndexError in ctx filter when ctx_range is empty (file-based bucketing) When VLLM_BUCKETING_FROM_FILE is used, ctx_range is passed as an empty list to generate_buckets(). The num_ctx_tokens_less_or_equal_batched_max_model_len filter accessed ctx_range[0] unconditionally, causing IndexError. Fix: use safe access with fallback to 0 when ctx_range is empty. Signed-off-by: Youlei Yang --- tests/unit_tests/test_bucketing.py | 36 ++++++++++++++++++++++++ vllm_gaudi/extension/bucketing/common.py | 5 ++-- 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests/test_bucketing.py b/tests/unit_tests/test_bucketing.py index 3145c8d3c5..5113314415 100644 --- a/tests/unit_tests/test_bucketing.py +++ b/tests/unit_tests/test_bucketing.py @@ -727,3 +727,39 @@ def test_decode_buckets_satisfy_ctx_filter(monkeypatch, use_contiguous_pa, max_m f"ctx <= ceil(max_model_len/block_size) * bs " f"(max_blocks_per_seq={max_blocks_per_seq}):\n" + "\n".join(f" bs={bs}, query={query}, ctx={ctx}" for bs, query, ctx in violations[:20])) + + +def test_file_buckets_with_empty_ctx_range_no_crash(monkeypatch): + """When VLLM_BUCKETING_FROM_FILE is used, ctx_range is empty. + + The ctx filter must not crash with IndexError on ctx_range[0]. + Reproduces server.log issue: GraniteMoeHybrid model with file-based + bucketing caused IndexError in num_ctx_tokens_less_or_equal_batched_max_model_len. 
+ """ + monkeypatch.setenv("VLLM_CONTIGUOUS_PA", "false") + clear_config() + get_config() + + max_model_len = 131072 + block_size = 528 + max_num_seqs = 32 + max_blocks = 2424 + + file_buckets = [(1, 1, 256), (1, 1, 512), (2, 1, 256), (2, 1, 512), (32, 1, 2424)] + + # ctx_range is empty when using file-based bucketing + buckets = generate_buckets( + bs_range=[], + query_range=[], + ctx_range=[], + is_prompt=False, + max_model_len=max_model_len, + max_num_seqs=max_num_seqs, + max_num_prefill_seqs=1, + max_num_batched_tokens=8192, + block_size=block_size, + max_blocks=max_blocks, + file_buckets=file_buckets, + ) + + assert len(buckets) > 0, "Should produce buckets from file_buckets" diff --git a/vllm_gaudi/extension/bucketing/common.py b/vllm_gaudi/extension/bucketing/common.py index 2fd85dbdae..6fc172cc65 100644 --- a/vllm_gaudi/extension/bucketing/common.py +++ b/vllm_gaudi/extension/bucketing/common.py @@ -448,10 +448,11 @@ def batch_size_smaller_than_blocks(bs, query, ctx): return bs <= ctx def num_ctx_tokens_less_or_equal_batched_max_model_len(bs, query, ctx): - is_valid = ctx <= math.ceil(max_model_len / block_size) * bs if ctx > ctx_range[0] else True + ctx_min = ctx_range[0] if ctx_range else 0 + is_valid = ctx <= math.ceil(max_model_len / block_size) * bs if ctx > ctx_min else True if not is_valid: omitted_buckets.add( - ("condition: ctx <= math.ceil(max_model_len / block_size) * bs if ctx > ctx_range[0] else True", + ("condition: ctx <= math.ceil(max_model_len / block_size) * bs if ctx > ctx_min else True", "-> bs, query, ctx: ", bs, query, ctx)) return is_valid From 15835274f718a984c02c6c65c9b1daf54fbf52a7 Mon Sep 17 00:00:00 2001 From: Youlei Yang Date: Thu, 14 May 2026 01:15:21 +0000 Subject: [PATCH 2/3] Remove ctx filter from contiguous PA decode buckets For contiguous PA, the block range is already bounded by max_blocks in the bucketing strategies, so the num_ctx_tokens_less_or_equal_batched_max_model_len filter is unnecessary and incorrectly drops 
valid buckets. Example: with max_model_len=2048, block_size=256, max_num_seqs=256, bucket (256, 1, 2112) was filtered because 2112 > ceil(2048/256)*256=2048, but 2112 is a valid user-configured VLLM_DECODE_BLOCK_BUCKET_MAX. Signed-off-by: Youlei Yang --- tests/unit_tests/test_bucketing.py | 62 ++++++++++++++++++++---- vllm_gaudi/extension/bucketing/common.py | 2 +- 2 files changed, 53 insertions(+), 11 deletions(-) diff --git a/tests/unit_tests/test_bucketing.py b/tests/unit_tests/test_bucketing.py index 5113314415..c1b25b7d7a 100644 --- a/tests/unit_tests/test_bucketing.py +++ b/tests/unit_tests/test_bucketing.py @@ -530,9 +530,9 @@ def test_real_scenario_fallback_ctx_7408_not_truncated(): def test_exponential_decode_block_limit_uncapped(monkeypatch): """Verify that decode block limit is computed from log2(max_decode_blocks). - With the new approach, excessive warmup buckets are controlled by - filters in generate_buckets() (num_ctx_tokens_less_or_equal_batched_max_model_len) - rather than by capping the block limit in get_decode_cfgs(). + For contiguous PA, max_decode_blocks = min(max_blocks, ceil(max_model_len/block_size)*max_num_seqs). + The block range is already bounded by max_blocks, so no additional + ctx filter is applied to contiguous PA decode buckets. 
""" monkeypatch.setenv("VLLM_EXPONENTIAL_BUCKETING", "true") monkeypatch.setenv("VLLM_CONTIGUOUS_PA", "true") @@ -669,7 +669,6 @@ def test_padding_aware_decode_cfgs_contiguous_pa_clamps_block_range(mock_get_con # --- Tests that num_ctx_tokens_less_or_equal_batched_max_model_len filter is applied --- -@pytest.mark.parametrize("use_contiguous_pa", [True, False], ids=["contiguous_pa", "non_contiguous_pa"]) @pytest.mark.parametrize( ("max_model_len", "block_size", "max_num_seqs", "max_blocks", "max_num_batched_tokens"), [ @@ -679,13 +678,15 @@ def test_padding_aware_decode_cfgs_contiguous_pa_clamps_block_range(mock_get_con ], ids=["qwen3_32b", "small_model", "long_ctx"], ) -def test_decode_buckets_satisfy_ctx_filter(monkeypatch, use_contiguous_pa, max_model_len, block_size, max_num_seqs, - max_blocks, max_num_batched_tokens): - """Every decode bucket returned by generate_buckets must satisfy - num_ctx_tokens_less_or_equal_batched_max_model_len: - ctx <= ceil(max_model_len / block_size) * bs (when ctx > ctx_range[0]) +def test_decode_buckets_satisfy_ctx_filter_non_contiguous_pa(monkeypatch, max_model_len, block_size, max_num_seqs, + max_blocks, max_num_batched_tokens): + """For non-contiguous PA, every decode bucket returned by generate_buckets + must satisfy ctx <= ceil(max_model_len / block_size) * bs (when ctx > ctx_range[0]). + + The filter is only applied to non-contiguous PA; contiguous PA decode + buckets are not filtered since their block range is already bounded by max_blocks. 
""" - monkeypatch.setenv("VLLM_CONTIGUOUS_PA", str(use_contiguous_pa).lower()) + monkeypatch.setenv("VLLM_CONTIGUOUS_PA", "false") clear_config() get_config() @@ -729,6 +730,47 @@ def test_decode_buckets_satisfy_ctx_filter(monkeypatch, use_contiguous_pa, max_m "\n".join(f" bs={bs}, query={query}, ctx={ctx}" for bs, query, ctx in violations[:20])) +def test_contiguous_pa_decode_buckets_not_filtered_by_ctx(monkeypatch): + """For contiguous PA, the ctx filter must NOT be applied to decode buckets. + + Reproduces std_out.txt issue: with max_model_len=2048, block_size=256, + max_num_seqs=256, the bucket (256, 1, 2112) was incorrectly filtered + because 2112 > ceil(2048/256)*256 = 2048. + """ + monkeypatch.setenv("VLLM_CONTIGUOUS_PA", "true") + clear_config() + get_config() + + max_model_len = 2048 + block_size = 256 + max_num_seqs = 256 + max_blocks = 2113 + max_num_batched_tokens = 1048832 + + bs_range = [256] + query_range = [1] + ctx_range = list(range(1280, 2113, 64)) # 1280, 1344, ..., 2048, 2112 + ctx_range.append(max_blocks) # append num_hpu_blocks as done in generate_decode_buckets + + buckets = generate_buckets( + bs_range=bs_range, + query_range=query_range, + ctx_range=ctx_range, + is_prompt=False, + max_model_len=max_model_len, + max_num_seqs=max_num_seqs, + max_num_prefill_seqs=1, + max_num_batched_tokens=max_num_batched_tokens, + block_size=block_size, + max_blocks=max_blocks, + ) + + bucket_ctxs = [ctx for _, _, ctx in buckets] + assert 2112 in bucket_ctxs, (f"Bucket ctx=2112 was incorrectly filtered out. " + f"Max ctx in buckets: {max(bucket_ctxs)}") + assert max_blocks in bucket_ctxs, (f"Bucket ctx={max_blocks} (num_hpu_blocks) was incorrectly filtered out.") + + def test_file_buckets_with_empty_ctx_range_no_crash(monkeypatch): """When VLLM_BUCKETING_FROM_FILE is used, ctx_range is empty. 
diff --git a/vllm_gaudi/extension/bucketing/common.py b/vllm_gaudi/extension/bucketing/common.py index 6fc172cc65..7543ed2bf7 100644 --- a/vllm_gaudi/extension/bucketing/common.py +++ b/vllm_gaudi/extension/bucketing/common.py @@ -464,7 +464,7 @@ def num_ctx_tokens_less_or_equal_batched_max_model_len(bs, query, ctx): }, "decode": { # depends only on contiguous PA - True: [num_ctx_tokens_less_or_equal_batched_max_model_len], + True: [], False: [batch_size_smaller_than_blocks, num_ctx_tokens_less_or_equal_batched_max_model_len], } } From 77b74a90ce8a513a16004e2999e2b9f92694fdb6 Mon Sep 17 00:00:00 2001 From: Youlei Yang Date: Mon, 18 May 2026 02:41:40 +0000 Subject: [PATCH 3/3] Skip filters for file-based bucketing and refactor tests - Set filters to empty list when file_buckets is provided in generate_buckets(), since file-based bucketing should pass all user-specified buckets through without filtering. - Rename test_file_buckets_with_empty_ctx_range_no_crash to test_file_buckets_bypass_filters to reflect the actual behavior. - Add bucket (512,1,256) that would be rejected by batch_size_smaller_than_blocks filter to prove filters are skipped. - Update test_decode_buckets_satisfy_ctx_filter to only run for non-contiguous PA since contiguous PA decode buckets are not filtered. - Update docstring for test_exponential_decode_block_limit_uncapped. 
Signed-off-by: Youlei Yang --- tests/unit_tests/test_bucketing.py | 25 +++++++++++++----------- vllm_gaudi/extension/bucketing/common.py | 2 +- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/tests/unit_tests/test_bucketing.py b/tests/unit_tests/test_bucketing.py index c1b25b7d7a..bc2fa92e91 100644 --- a/tests/unit_tests/test_bucketing.py +++ b/tests/unit_tests/test_bucketing.py @@ -771,25 +771,27 @@ def test_contiguous_pa_decode_buckets_not_filtered_by_ctx(monkeypatch): assert max_blocks in bucket_ctxs, (f"Bucket ctx={max_blocks} (num_hpu_blocks) was incorrectly filtered out.") -def test_file_buckets_with_empty_ctx_range_no_crash(monkeypatch): - """When VLLM_BUCKETING_FROM_FILE is used, ctx_range is empty. +def test_file_buckets_bypass_filters(monkeypatch): + """File-based bucketing (VLLM_BUCKETING_FROM_FILE) skips all filters. - The ctx filter must not crash with IndexError on ctx_range[0]. - Reproduces server.log issue: GraniteMoeHybrid model with file-based - bucketing caused IndexError in num_ctx_tokens_less_or_equal_batched_max_model_len. + Buckets (512,1,256) and (1,1,256) would normally be rejected by the + batch_size_smaller_than_blocks and ctx filters, respectively, in non-file mode. + Since file buckets bypass filters entirely, all provided buckets + must appear in the output unchanged. 
""" - monkeypatch.setenv("VLLM_CONTIGUOUS_PA", "false") + monkeypatch.setenv("VLLM_CONTIGUOUS_PA", "true") clear_config() get_config() - max_model_len = 131072 - block_size = 528 + max_model_len = 2048 + block_size = 256 max_num_seqs = 32 max_blocks = 2424 - file_buckets = [(1, 1, 256), (1, 1, 512), (2, 1, 256), (2, 1, 512), (32, 1, 2424)] + # (512,1,256) would be rejected by batch_size_smaller_than_blocks (bs > ctx) + # All buckets pass through because file_buckets bypass filters entirely + file_buckets = [(1, 1, 256), (1, 1, 512), (2, 1, 256), (512, 1, 256), (32, 1, 2424)] - # ctx_range is empty when using file-based bucketing buckets = generate_buckets( bs_range=[], query_range=[], @@ -804,4 +806,5 @@ def test_file_buckets_with_empty_ctx_range_no_crash(monkeypatch): file_buckets=file_buckets, ) - assert len(buckets) > 0, "Should produce buckets from file_buckets" + assert set(buckets) == set(file_buckets), (f"All file buckets should pass through unfiltered.\n" + f"Expected: {sorted(file_buckets)}\nGot: {sorted(buckets)}") diff --git a/vllm_gaudi/extension/bucketing/common.py b/vllm_gaudi/extension/bucketing/common.py index 7543ed2bf7..870cdb0ac3 100644 --- a/vllm_gaudi/extension/bucketing/common.py +++ b/vllm_gaudi/extension/bucketing/common.py @@ -491,7 +491,7 @@ def is_ctx_allowed(ctx): buckets = set() buckets_2d = set() omitted_buckets = set() - filters = get_filters(is_prompt, use_merged_prefill, use_contiguous_pa) + filters = [] if file_buckets else get_filters(is_prompt, use_merged_prefill, use_contiguous_pa) corrector = get_corrector(is_prompt, use_contiguous_pa) if file_buckets: