Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
58aa0c5
test: add hybrid cache and message normalization tests
Thump604 Mar 16, 2026
5754d2e
fix: _make_batch_cache handles all cache types for hybrid models
Thump604 Mar 16, 2026
e27b438
fix: relax MLLM type guard to support hybrid model caches
Thump604 Mar 16, 2026
8ae8f7b
fix: MLLMBatch.extend() works with all cache types
Thump604 Mar 16, 2026
dd86024
fix: normalize consecutive same-role messages before chat template
Thump604 Mar 16, 2026
ced02c0
style: black formatting
Thump604 Mar 16, 2026
a5fffa4
fix: map developer role to system in _normalize_messages()
Thump604 Mar 16, 2026
287f27e
fix: add missing _generation_lock to MLLM stream_chat path
Thump604 Mar 16, 2026
a1e0da8
fix: remove unused CacheList import in test
Thump604 Mar 16, 2026
52e151a
fix: check ALL cache layers in prefix cache _can_trim_cache
Thump604 Mar 16, 2026
c9a3dd1
ci: retrigger CI
Thump604 Mar 16, 2026
4f8b522
fix: prefix cache handles hybrid model cache layers (issues #142, #136)
Thump604 Mar 16, 2026
101cac2
feat: store and reconstruct hybrid model cache layers
Thump604 Mar 16, 2026
d8e687b
fix: fetch guard rejects partial prefix for hybrid models + scheduler…
Thump604 Mar 16, 2026
9993252
style: black formatting for hybrid prefix cache changes
Thump604 Mar 16, 2026
c405431
fix: fork_cache propagates has_non_kv flag + O(1) layer lookup
Thump604 Mar 16, 2026
cace3f3
fix: chunked prefill compat with mlx-lm >= 0.31.0 (issue #155)
Thump604 Mar 16, 2026
d781743
fix: remove unused mlx.core import (ruff F401)
Thump604 Mar 16, 2026
f92b886
style: black formatting for dict comprehension
Thump604 Mar 17, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 156 additions & 0 deletions tests/test_chunked_prefill_compat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
# SPDX-License-Identifier: Apache-2.0
"""
Tests for chunked prefill compatibility with mlx-lm tuple format changes.

Regression test for issue #155: mlx-lm >= 0.31.0 added prompt_checkpoints
as a 7th element to BatchGenerator.unprocessed_prompts tuples. The chunked
prefill code in scheduler.py hardcoded a 6-element unpacking which crashed
with ValueError on the new format.
"""

from unittest.mock import MagicMock

import pytest

# mlx-lm is an optional dependency: probe for it at import time instead of
# letting the module fail to import on machines without MLX.
try:
    from mlx_lm.models import cache
    from mlx_lm.generate import BatchGenerator

    HAS_MLX = True
except ImportError:
    HAS_MLX = False

# Module-wide marker: every test below is skipped when MLX is unavailable.
pytestmark = pytest.mark.skipif(not HAS_MLX, reason="MLX not available")


@pytest.fixture
def batch_gen():
    """Provide a BatchGenerator wired to a minimal throwaway mock model.

    make_prompt_cache probes hasattr(model, 'make_cache'), and a MagicMock
    answers True for every attribute lookup — so 'make_cache' is deleted
    here to force the default KVCache-per-layer code path.
    """
    mock_model = MagicMock()
    mock_model.layers = [MagicMock() for _ in range(2)]
    del mock_model.make_cache

    return BatchGenerator(
        model=mock_model,
        max_tokens=100,
        prefill_batch_size=1,
        completion_batch_size=1,
    )


class TestChunkedPrefillTupleCompat:
"""Verify chunked prefill handles varying tuple sizes from mlx-lm."""

def test_7_element_tuples_unpack_correctly(self):
"""Issue #155: 7-element tuples from mlx-lm >= 0.31.0 must not crash."""
mock_cache = MagicMock()
mock_cache.empty.return_value = True
batch_prompts = [
(0, [1, 2, 3, 4, 5], 100, [mock_cache], None, [], -1),
(1, [10, 20, 30], 50, [mock_cache], None, [], -1),
]

(
uids,
inputs_raw,
max_tokens_list,
caches,
samplers,
logits_processors,
*_extra,
) = zip(*batch_prompts)

assert uids == (0, 1)
assert inputs_raw == ([1, 2, 3, 4, 5], [10, 20, 30])
assert max_tokens_list == (100, 50)
assert len(_extra) == 1 # prompt_checkpoints
assert _extra[0] == (-1, -1)

def test_6_element_tuples_still_work(self):
"""Backward compat: old mlx-lm without prompt_checkpoints."""
mock_cache = MagicMock()
mock_cache.empty.return_value = True
batch_prompts = [
(0, [1, 2, 3], 100, [mock_cache], None, []),
]

(
uids,
inputs_raw,
max_tokens_list,
caches,
samplers,
logits_processors,
*_extra,
) = zip(*batch_prompts)

assert uids == (0,)
assert len(_extra) == 0

def test_8_element_tuples_forward_compat(self):
"""Future-proofing: if mlx-lm adds more fields, still works."""
mock_cache = MagicMock()
mock_cache.empty.return_value = True
batch_prompts = [
(0, [1, 2, 3], 100, [mock_cache], None, [], -1, "future"),
]

(
uids,
inputs_raw,
max_tokens_list,
caches,
samplers,
logits_processors,
*_extra,
) = zip(*batch_prompts)

assert uids == (0,)
assert len(_extra) == 2

def test_batch_generator_insert_creates_7_element_tuples(self, batch_gen):
"""Verify mlx-lm 0.31.x BatchGenerator.insert creates 7-element tuples."""
prompt_cache = cache.make_prompt_cache(batch_gen.model)

batch_gen.insert([[1, 2, 3, 4, 5]], max_tokens=[50], caches=[prompt_cache])

assert len(batch_gen.unprocessed_prompts) == 1
prompt_tuple = batch_gen.unprocessed_prompts[0]
assert len(prompt_tuple) >= 7, (
f"Expected >= 7 elements in prompt tuple, got {len(prompt_tuple)}. "
f"mlx-lm may have changed tuple format again."
)

def test_chunked_prefill_with_7_element_tuples(self, batch_gen):
"""Integration: _install_chunked_prefill works with 7-element tuples."""
from vllm_mlx.scheduler import _install_chunked_prefill

prompt_cache = cache.make_prompt_cache(batch_gen.model)

batch_gen.insert(
[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]],
max_tokens=[50],
caches=[prompt_cache],
)

_install_chunked_prefill(batch_gen, budget=4)

# Must NOT crash with "too many values to unpack".
# Later errors (AttributeError, etc.) are expected since the mock
# model doesn't do real inference — only the unpacking matters.
try:
batch_gen._next()
except ValueError as e:
if "unpack" in str(e).lower():
pytest.fail(
f"Issue #155 regression: chunked prefill crashed on "
f"7-element tuple unpacking: {e}"
)
except Exception:
pass # Expected: mock model can't do real forward pass
Loading
Loading