From 7364670ded3157d40d417083b25e5dfa8b59b175 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BE=BD=E7=99=AB?= Date: Mon, 30 Mar 2026 11:45:48 +0800 Subject: [PATCH 1/2] Fix shared memory race condition in ShmPointerMMData broadcast for multi-GPU VLM serving --- python/sglang/srt/managers/scheduler.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 4351634515aa..bff2cf3fc082 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -1514,6 +1514,22 @@ def recv_requests( # so that ShmPointerMMData metadata (not full tensor data) is what # gets serialized during broadcast_pyobj. if recv_reqs: + # Barrier for the non-DP-attention path only: there is a single + # broadcast_pyobj on tp_cpu_group where the source rank returns + # the original objects immediately while other ranks are still in + # pickle.loads (-> __setstate__ -> shm_open). Without a barrier + # the source can call materialize() / shm_unlink before others + # open the segment. recv_reqs is consistent across all ranks + # here (same broadcast), so the guard is deadlock-free. + # + # Under DP-attention no barrier is needed: the control_reqs + # broadcast on tp_cpu_group (step 3) is a collective that forces + # every rank to complete the earlier attn_tp / attn_cp work_reqs + # deserializations (steps 1-2, which call shm_open) before any + # rank returns from step 3. POSIX guarantees shm_unlink only + # removes the name; already-open handles stay valid. 
+ if not self.server_args.enable_dp_attention and self.tp_size > 1: + barrier(group=self.tp_cpu_group) for req in recv_reqs: unwrap_shm_features(req) From 5967633e9cee29b2b74c8aa5c20d1346af1bad44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BE=BD=E7=99=AB?= Date: Mon, 30 Mar 2026 18:27:00 +0800 Subject: [PATCH 2/2] Skip the barrier unless requests actually carry ShmPointerMMData --- python/sglang/srt/managers/mm_utils.py | 13 +++++++++++++ python/sglang/srt/managers/scheduler.py | 13 +++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/managers/mm_utils.py b/python/sglang/srt/managers/mm_utils.py index 77f26c959cec..561ec5291a42 100644 --- a/python/sglang/srt/managers/mm_utils.py +++ b/python/sglang/srt/managers/mm_utils.py @@ -1723,6 +1723,19 @@ def wrap_shm_features(obj): return obj +def has_shm_features(recv_reqs): + """Return True if any request in the list contains ShmPointerMMData.""" + for req in recv_reqs: + if hasattr(req, "batch"): + if has_shm_features(req.batch): + return True + elif hasattr(req, "mm_inputs") and req.mm_inputs: + for item in req.mm_inputs.get("mm_items", []): + if isinstance(item.feature, ShmPointerMMData): + return True + return False + + def unwrap_shm_features(obj): """ Restore ShmPointerMMData wrappers back into standard torch.Tensors. 
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index bff2cf3fc082..69311ec3a282 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -142,7 +142,11 @@ UpdateWeightsFromIPCReqInput, UpdateWeightsFromTensorReqInput, ) -from sglang.srt.managers.mm_utils import init_mm_embedding_cache, unwrap_shm_features +from sglang.srt.managers.mm_utils import ( + has_shm_features, + init_mm_embedding_cache, + unwrap_shm_features, +) from sglang.srt.managers.multimodal_processor import get_mm_processor, import_processors from sglang.srt.managers.overlap_utils import FutureMap from sglang.srt.managers.prefill_delayer import ( @@ -1528,7 +1532,12 @@ def recv_requests( # deserializations (steps 1-2, which call shm_open) before any # rank returns from step 3. POSIX guarantees shm_unlink only # removes the name; already-open handles stay valid. - if not self.server_args.enable_dp_attention and self.tp_size > 1: + if ( + not self.server_args.enable_dp_attention + and self.tp_size > 1 + and self.model_config.is_multimodal + and has_shm_features(recv_reqs) + ): barrier(group=self.tp_cpu_group) for req in recv_reqs: unwrap_shm_features(req)