From eae87cc22580e69597297cfc393444d4587bae5d Mon Sep 17 00:00:00 2001
From: David Malson <dmalson@ocr.3db>
Date: Sat, 28 Feb 2026 15:01:45 -0500
Subject: [PATCH] Fix cross-request data leakage from base64 image cache
 collision
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`save_base64_image()` keyed its temp-file cache on
`md5(base64_string[:1000])` — only the first 1000 characters of the
base64 string. For JPEG images rendered by the same PDF converter or
scanner, the first ~730 bytes of decoded image data (SOI marker, EXIF
headers, quantization tables) are often identical. Different images
therefore produced the same cache key, and the function returned a
previous request's temp file.

As a result, the model received an entirely wrong image — one from a
prior request — and generated its output from it: a cross-request data
leak.

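Fix: key the cache on SHA-256 of the full base64 string instead of MD5
of a truncated prefix. In `mllm.py` the hashing line is already in that
form at the base revision, so this patch only expands the comment that
explains why. Also change `compute_image_hash()` in
`vision_embedding_cache.py` to hash the full file content instead of
only the first 64 KB.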
---
 vllm_mlx/models/mllm.py            | 6 ++++--
 vllm_mlx/vision_embedding_cache.py | 5 +++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/vllm_mlx/models/mllm.py b/vllm_mlx/models/mllm.py
index 5a3551eb1..ec761c3f2 100644
--- a/vllm_mlx/models/mllm.py
+++ b/vllm_mlx/models/mllm.py
@@ -465,8 +465,10 @@ def save_base64_image(base64_string: str) -> str:
     """Save base64 image to temp file and return path. Caches identical images."""
     import hashlib
 
-    # Hash the full base64 string to prevent collisions between images
-    # with identical headers (e.g. JPEG images sharing first 1000 chars)
+    # Hash the FULL base64 string — not just a prefix.
+    # Using only the first 1000 chars caused cache collisions between
+    # different images with identical JPEG headers (e.g. invoices from
+    # the same PDF renderer), returning a previous request's image.
     image_hash = hashlib.sha256(base64_string.encode()).hexdigest()
 
     # Return cached path if available and file still exists
diff --git a/vllm_mlx/vision_embedding_cache.py b/vllm_mlx/vision_embedding_cache.py
index 09749aaa9..106a729fa 100644
--- a/vllm_mlx/vision_embedding_cache.py
+++ b/vllm_mlx/vision_embedding_cache.py
@@ -106,9 +106,10 @@ def compute_image_hash(image_path: str) -> str:
     try:
         path = Path(image_path)
         if path.exists() and path.is_file():
-            # Hash file content (first 64KB for speed)
+            # Hash full file content (not truncated — truncation can
+            # cause collisions for images with identical headers)
             with open(path, "rb") as f:
-                content = f.read(65536)
+                content = f.read()
             return hashlib.sha256(content).hexdigest()[:16]
         else:
             # Hash the string (URL or base64)