From d8601d796371108c19d894636d4b792a6db2bafa Mon Sep 17 00:00:00 2001
From: Christopher Albert
Date: Thu, 26 Mar 2026 00:57:30 +0100
Subject: [PATCH] fix: bump mlx-lm minimum to 0.31.0 for hybrid model batching

ArraysCache gained native batching support (extract, merge, filter,
prepare) in mlx-lm 0.31.0. Older versions crash with
"ArraysCache.__init__() missing 1 required positional argument: 'size'"
when continuous batching encounters hybrid models like Qwen3.5 that
mix KVCache and ArraysCache layers.

Fixes computor-org/vllm-mlx#11
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index fa92e960..88e878c1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,7 +29,7 @@ classifiers = [
 
 dependencies = [
     "mlx>=0.29.0",
-    "mlx-lm>=0.30.5", # GLM-4 models require 0.30.5+ (new attention layers)
+    "mlx-lm>=0.31.0", # 0.31+ required for ArraysCache native batching (hybrid models)
     "mlx-vlm>=0.1.0", # VLM support
     "transformers>=5.0.0", # mlx-lm 0.30.5+ requires transformers 5.0 (rc3 bug fixed in stable)
     "tokenizers>=0.19.0",