From 5402ff9e3b3a49d529774816e50536a63065d460 Mon Sep 17 00:00:00 2001 From: hallerite Date: Mon, 9 Mar 2026 21:21:31 +0000 Subject: [PATCH] [Bugfix] Fix tokenizer "Already borrowed" race in multimodal processing Deep-copy the tokenizer before passing it to the multimodal processor so it gets its own Rust tokenizer backend. Without this, concurrent access from AsyncMicrobatchTokenizer (executor thread) and call_hf_processor (main thread) causes RuntimeError from the Rust RefCell borrow checker, triggering the 0.5s retry loop and degrading VLM throughput by ~17x under concurrent load. Signed-off-by: hallerite Signed-off-by: hallerite --- vllm/renderers/base.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/vllm/renderers/base.py b/vllm/renderers/base.py index a82646688f45..853a48945eac 100644 --- a/vllm/renderers/base.py +++ b/vllm/renderers/base.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio +import copy import time from abc import ABC, abstractmethod from collections.abc import Mapping, Sequence @@ -90,10 +91,17 @@ def __init__(self, config: "VllmConfig", tokenizer: _T | None) -> None: mm_processor_cache = mm_registry.processor_cache_from_config(config) + # Deep-copy the tokenizer so the multimodal processor gets its + # own Rust tokenizer backend. Without this, concurrent access + # from AsyncMicrobatchTokenizer and call_hf_processor causes + # "RuntimeError: Already borrowed" from the Rust RefCell. + # See: https://github.com/huggingface/tokenizers/issues/537 + mm_tokenizer = copy.deepcopy(tokenizer) + with set_default_torch_num_threads(): self.mm_processor = mm_registry.create_processor( config.model_config, - tokenizer=tokenizer, + tokenizer=mm_tokenizer, cache=mm_processor_cache, )