diff --git a/tests/v1/distributed/test_eagle_dp.py b/tests/v1/distributed/test_eagle_dp.py index e20893b63632..7b6731788ef3 100644 --- a/tests/v1/distributed/test_eagle_dp.py +++ b/tests/v1/distributed/test_eagle_dp.py @@ -69,9 +69,7 @@ async def test_run_eagle_dp(monkeypatch: pytest.MonkeyPatch, attn_backend: str): ) prompt = "This is a test of data parallel with eagle" - # This test might be flaky, see - # https://github.com/vllm-project/vllm/issues/31913 - num_expected_tokens = 20 + num_expected_tokens = 100 sampling_params = SamplingParams( max_tokens=num_expected_tokens, ignore_eos=True, diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 245995be2642..f36863376dec 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -369,8 +369,11 @@ def build( slot_mapping = common_attn_metadata.slot_mapping causal = common_attn_metadata.causal - # the overhead of the aot schedule is not worth it for spec-decode - aot_schedule = self.aot_schedule and not fast_build + # Disable AOT schedule for spec-decode proposer (not worth the overhead) + # and for batch invariance (schedule varies with max_seqlen_q/k). + aot_schedule = ( + self.aot_schedule and not fast_build and not envs.VLLM_BATCH_INVARIANT + ) if self.aot_sliding_window is None: self.aot_sliding_window = (-1, -1)