From 8016494bb4b1d358db92143717688aa9e264a95d Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Tue, 29 Apr 2025 02:30:22 -0700
Subject: [PATCH 1/2] Fix some spec decode tests with tl.dot

Signed-off-by: Huy Do
---
 tests/spec_decode/e2e/test_multistep_correctness.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py
index bb45be791fa8..2ab2c14d9a16 100644
--- a/tests/spec_decode/e2e/test_multistep_correctness.py
+++ b/tests/spec_decode/e2e/test_multistep_correctness.py
@@ -526,11 +526,8 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
 @pytest.mark.parametrize(
     "per_test_common_llm_kwargs",
     [
-        # As of this writing, vLLM only compiles with these 3 block sizes by
-        # default.
-        {
-            "block_size": 8,
-        },
+        # https://github.com/triton-lang/triton/issues/2266 tl.dot
+        # doesn't support embedding < 16
         {
             "block_size": 16,
         },

From 8b47659a82e6263f8d6cd7c9b2a7bfc4aa86b0db Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Tue, 29 Apr 2025 02:32:24 -0700
Subject: [PATCH 2/2] Fix another spec decode test with tl.dot

Signed-off-by: Huy Do
---
NOTE(review): the unchanged context lines in this hunk still derive
"num_gpu_blocks_override" and "max_model_len" (and the comment above them)
from a block size of 8; please confirm these values are still intended now
that "block_size" is 16.

 tests/spec_decode/e2e/test_multistep_correctness.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py
index 2ab2c14d9a16..e187b6bc1434 100644
--- a/tests/spec_decode/e2e/test_multistep_correctness.py
+++ b/tests/spec_decode/e2e/test_multistep_correctness.py
@@ -456,7 +456,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
-        "block_size": 8,
+        "block_size": 16,
         # 2 for small prompt, 256//8 for generated.
         "num_gpu_blocks_override": 2 + 256 // 8,
         "max_model_len": (2 + 256 // 8) * 8,