From 5402ff9e3b3a49d529774816e50536a63065d460 Mon Sep 17 00:00:00 2001
From: hallerite <git@hallerite.com>
Date: Mon, 9 Mar 2026 21:21:31 +0000
Subject: [PATCH] [Bugfix] Fix tokenizer "Already borrowed" race in multimodal
 processing

Deep-copy the tokenizer before passing it to the multimodal processor
so that the processor gets its own Rust tokenizer backend. Without
this, concurrent access from AsyncMicrobatchTokenizer (executor
thread) and call_hf_processor (main thread) raises "RuntimeError:
Already borrowed" from the Rust RefCell borrow checker, triggering the
0.5s retry loop and degrading VLM throughput by ~17x under concurrent
load.

Signed-off-by: hallerite <hallerite@users.noreply.github.com>
Signed-off-by: hallerite <git@hallerite.com>
---
 vllm/renderers/base.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/vllm/renderers/base.py b/vllm/renderers/base.py
index a82646688f45..853a48945eac 100644
--- a/vllm/renderers/base.py
+++ b/vllm/renderers/base.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
+import copy
 import time
 from abc import ABC, abstractmethod
 from collections.abc import Mapping, Sequence
@@ -90,10 +91,17 @@ def __init__(self, config: "VllmConfig", tokenizer: _T | None) -> None:
 
             mm_processor_cache = mm_registry.processor_cache_from_config(config)
 
+            # Deep-copy the tokenizer so the multimodal processor gets its
+            # own Rust tokenizer backend.  Without this, concurrent access
+            # from AsyncMicrobatchTokenizer and call_hf_processor causes
+            # "RuntimeError: Already borrowed" from the Rust RefCell.
+            # See: https://github.com/huggingface/tokenizers/issues/537
+            mm_tokenizer = copy.deepcopy(tokenizer)
+
             with set_default_torch_num_threads():
                 self.mm_processor = mm_registry.create_processor(
                     config.model_config,
-                    tokenizer=tokenizer,
+                    tokenizer=mm_tokenizer,
                     cache=mm_processor_cache,
                 )