diff --git a/python/sglang/srt/environ.py b/python/sglang/srt/environ.py
index 947191e3b7d6..15fd3f174ed4 100644
--- a/python/sglang/srt/environ.py
+++ b/python/sglang/srt/environ.py
@@ -315,6 +315,7 @@ class Envs:
     SGLANG_VLM_CACHE_SIZE_MB = EnvInt(100)
     SGLANG_IMAGE_MAX_PIXELS = EnvInt(16384 * 28 * 28)
     SGLANG_RESIZE_RESAMPLE = EnvStr("")
+    SGLANG_MM_BUFFER_SIZE_MB = EnvInt(0)
 
     # Release & Resume Memory
     SGLANG_MEMORY_SAVER_CUDA_GRAPH = EnvBool(False)
diff --git a/python/sglang/srt/managers/mm_utils.py b/python/sglang/srt/managers/mm_utils.py
index e9347e8112be..75c646a63ea3 100644
--- a/python/sglang/srt/managers/mm_utils.py
+++ b/python/sglang/srt/managers/mm_utils.py
@@ -12,6 +12,7 @@
 from torch import nn
 
 from sglang.srt.distributed.parallel_state import get_tp_group
+from sglang.srt.environ import envs
 from sglang.srt.layers.multimodal import gpu_tensor_hash
 from sglang.srt.managers.schedule_batch import (
     CudaIpcTensorTransportProxy,
@@ -37,6 +38,59 @@
 TensorTransportMode = Literal["cuda_ipc", "auto", "default"]
 
 
+_GPU_FEATURE_BUFFER: Optional[torch.Tensor] = None
+_BUFFER_OFFSET = 0
+
+
+def init_feature_buffer(device):
+    global _GPU_FEATURE_BUFFER, _BUFFER_OFFSET
+    if (
+        device == "cpu"
+        or envs.SGLANG_MM_BUFFER_SIZE_MB.get() == 0
+        or _GPU_FEATURE_BUFFER is not None
+    ):
+        return
+    try:
+        size_mb = envs.SGLANG_MM_BUFFER_SIZE_MB.get()
+        num_elements = int(size_mb * 1024 * 1024 / 4)
+        _GPU_FEATURE_BUFFER = torch.empty(
+            num_elements, dtype=torch.float32, device=device
+        )
+        logger.info(f"Preallocated {size_mb}MB GPU buffer")
+    except RuntimeError as e:
+        _GPU_FEATURE_BUFFER = None
+
+
+def reset_buffer_offset():
+    global _BUFFER_OFFSET
+    _BUFFER_OFFSET = 0
+
+
+def is_feature_buffer_initialized():
+    global _GPU_FEATURE_BUFFER
+    if _GPU_FEATURE_BUFFER is None:
+        return False
+    return True
+
+
+def try_add_to_buffer(tensor: torch.Tensor) -> Optional[torch.Tensor]:
+    global _BUFFER_OFFSET
+
+    if _GPU_FEATURE_BUFFER is None:
+        return tensor
+
+    tensor_size = tensor.numel()
+
+    if _BUFFER_OFFSET + tensor_size <= _GPU_FEATURE_BUFFER.numel():
+        buffer_view = _GPU_FEATURE_BUFFER[_BUFFER_OFFSET : _BUFFER_OFFSET + tensor_size]
+        buffer_view.copy_(tensor.flatten(), non_blocking=True)
+        result = buffer_view.view(tensor.shape)
+        _BUFFER_OFFSET += tensor_size
+        return result
+    else:
+        return tensor
+
+
 class TransportProxyTensor(torch.Tensor):
     """
     A convenient torch.Tensor subclass that carries extra metadata and supports
diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index c4c5a9ebbad2..bf1f13d5831b 100644
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -325,9 +325,32 @@ def from_dict(obj: dict):
         assert isinstance(ret.mm_items, list)
         ret.mm_items = [item for item in ret.mm_items if item.is_valid()]
+
+        if envs.SGLANG_MM_BUFFER_SIZE_MB.get() > 0:
+            from sglang.srt.managers.mm_utils import (
+                init_feature_buffer,
+                is_feature_buffer_initialized,
+                reset_buffer_offset,
+                try_add_to_buffer,
+            )
+
+            device = torch.cuda.current_device() if torch.cuda.is_available() else "cpu"
+            if not is_feature_buffer_initialized():
+                init_feature_buffer(device)
+            reset_buffer_offset()
+            for item in ret.mm_items:
+                if item.feature is not None:
+                    if isinstance(item.feature, torch.Tensor):
+                        item.feature = try_add_to_buffer(item.feature)
+
 
         for item in ret.mm_items:
             item.set_pad_value()
 
+        if envs.SGLANG_MM_BUFFER_SIZE_MB.get() > 0:
+            for item in ret.mm_items:
+                if item.feature is not None:
+                    item.feature = item.feature.to("cpu", non_blocking=True)
+
         optional_args = [
             "mrope_positions",
             "mrope_position_delta",