From 80917b356619f7bfeafa4879966c4103f3d4cca2 Mon Sep 17 00:00:00 2001 From: liusy58 Date: Mon, 8 Dec 2025 23:49:33 +0800 Subject: [PATCH] add doc --- docs/references/environment_variables.md | 1 + python/sglang/srt/managers/schedule_batch.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/docs/references/environment_variables.md b/docs/references/environment_variables.md index 67e3f92955a3..f2051f8f5044 100644 --- a/docs/references/environment_variables.md +++ b/docs/references/environment_variables.md @@ -36,6 +36,7 @@ SGLang supports various environment variables that can be used to configure its | `SGLANG_SCHEDULER_RECV_SKIPPER_WEIGHT_DECODE` | Weight increment for decode forward mode in scheduler recv skipper. Works with `--scheduler-recv-interval` to control polling frequency during decode phase. | `1` | | `SGLANG_SCHEDULER_RECV_SKIPPER_WEIGHT_VERIFY` | Weight increment for target verify forward mode in scheduler recv skipper. Works with `--scheduler-recv-interval` to control polling frequency during verification phase. | `1` | | `SGLANG_SCHEDULER_RECV_SKIPPER_WEIGHT_NONE` | Weight increment when forward mode is None in scheduler recv skipper. Works with `--scheduler-recv-interval` to control polling frequency when no specific forward mode is active. | `1` | +| `SGLANG_MM_BUFFER_SIZE_MB` | Size of preallocated GPU buffer (in MB) for multi-modal feature hashing optimization. When set to a positive value, temporarily moves features to GPU for faster hash computation, then moves them back to CPU to save GPU memory. Larger features benefit more from GPU hashing. Set to `0` to disable. | `0` | ## DeepGEMM Configuration (Advanced Optimization) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index f712fe0164e4..a1ab8b23b0ca 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -327,6 +327,9 @@ def from_dict(obj: dict): ret.mm_items = [item for item in ret.mm_items if item.is_valid()] if envs.SGLANG_MM_BUFFER_SIZE_MB.get() > 0: + # Multi-modal feature hashing optimization: + # When SGLANG_MM_BUFFER_SIZE_MB > 0, we temporarily move feature tensors to GPU + # for faster hash computation, while avoiding OOM issues. from sglang.srt.managers.mm_utils import ( init_feature_buffer, is_feature_buffer_initialized,