Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions python/sglang/srt/environ.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,7 @@ class Envs:
SGLANG_VLM_CACHE_SIZE_MB = EnvInt(100)
SGLANG_IMAGE_MAX_PIXELS = EnvInt(16384 * 28 * 28)
SGLANG_RESIZE_RESAMPLE = EnvStr("")
SGLANG_USE_OPENCV_VIDEO_BACKEND = EnvBool(False)
SGLANG_MM_BUFFER_SIZE_MB = EnvInt(0)
SGLANG_MM_PRECOMPUTE_HASH = EnvBool(False)
SGLANG_VIT_ENABLE_CUDA_GRAPH = EnvBool(False)
Expand Down
16 changes: 14 additions & 2 deletions python/sglang/srt/multimodal/processors/qwen_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
BaseMultimodalProcessor as SGLangBaseProcessor,
)
from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
from sglang.srt.utils import read_video_frames_opencv
from sglang.utils import logger

IMAGE_FACTOR = 28
Expand Down Expand Up @@ -150,13 +151,24 @@ async def preprocess_video(
) -> torch.Tensor:
entry_time = time.perf_counter()

total_frames, video_fps = len(vr), vr.get_avg_fps()
if envs.SGLANG_USE_OPENCV_VIDEO_BACKEND.get():
import cv2

total_frames, video_fps = int(vr.get(cv2.CAP_PROP_FRAME_COUNT)), vr.get(
cv2.CAP_PROP_FPS
)
else:
total_frames, video_fps = len(vr), vr.get_avg_fps()

nframes = smart_nframes(
video_config, total_frames=total_frames, video_fps=video_fps
)
idx = np.linspace(0, total_frames - 1, num=nframes, dtype=np.int64)
idx = np.unique(idx)
video_np = vr.get_batch(idx).asnumpy()
if envs.SGLANG_USE_OPENCV_VIDEO_BACKEND.get():
video_np = read_video_frames_opencv(vr, idx)
else:
video_np = vr.get_batch(idx).asnumpy()
video = torch.from_numpy(video_np).pin_memory()
video = video.permute(0, 3, 1, 2) # Convert to TCHW format

Expand Down
93 changes: 93 additions & 0 deletions python/sglang/srt/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -952,6 +952,13 @@ def get_image_bytes(image_file: Union[str, bytes]):


def load_video(video_file: Union[str, bytes], use_gpu: bool = True):
    """Open a video and return a backend-specific reader handle.

    Dispatches to the OpenCV backend when the SGLANG_USE_OPENCV_VIDEO_BACKEND
    environment flag is set; otherwise falls back to the default decord
    backend (which honors ``use_gpu``).
    """
    if envs.SGLANG_USE_OPENCV_VIDEO_BACKEND.get():
        return get_video_opencv_handler(video_file)
    return get_video_decord_handler(video_file, use_gpu)


def get_video_decord_handler(video_file: Union[str, bytes], use_gpu: bool = True):
# We import decord here to avoid a strange Segmentation fault (core dumped) issue.
from decord import VideoReader, cpu, gpu

Expand Down Expand Up @@ -1010,6 +1017,92 @@ def load_video(video_file: Union[str, bytes], use_gpu: bool = True):
os.unlink(tmp_file.name)


def get_video_opencv_handler(video_file: Union[str, bytes]):
    """Open *video_file* with OpenCV and return a ``cv2.VideoCapture``.

    Mirrors the input handling of the decord handler:
      - raw ``bytes`` of an encoded video,
      - an http(s) URL (downloaded with REQUEST_TIMEOUT, default 10s),
      - a ``data:`` URI with a base64 payload,
      - a local / ``file://`` path,
      - a bare base64-encoded string.

    Non-path inputs are spooled to a temporary file, which is unlinked in the
    ``finally`` block once ``VideoCapture`` has opened it (the capture keeps
    its own OS handle on POSIX — NOTE(review): this may fail on Windows,
    same as the pre-existing decord path; confirm if Windows is supported).

    Raises:
        ValueError: if *video_file* is neither ``str`` nor ``bytes``.
    """
    import cv2

    tmp_file = None
    vc = None
    try:
        if isinstance(video_file, bytes):
            tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
            tmp_file.write(video_file)
            tmp_file.close()
            vc = cv2.VideoCapture(tmp_file.name)
        elif isinstance(video_file, str):
            if video_file.startswith(("http://", "https://")):
                timeout = int(os.getenv("REQUEST_TIMEOUT", "10"))
                response = requests.get(video_file, stream=True, timeout=timeout)
                response.raise_for_status()
                tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
                for chunk in response.iter_content(chunk_size=8192):
                    tmp_file.write(chunk)
                tmp_file.close()
                vc = cv2.VideoCapture(tmp_file.name)
            elif video_file.startswith("data:"):
                _, encoded = video_file.split(",", 1)
                video_bytes = pybase64.b64decode(encoded, validate=True)
                tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
                tmp_file.write(video_bytes)
                tmp_file.close()
                vc = cv2.VideoCapture(tmp_file.name)
            # `urlparse` supports file:// paths, and so does VideoCapture
            elif os.path.isfile(urlparse(video_file).path):
                # Bug fix: open the actual file path. `tmp_file` is None in
                # this branch, so the old `cv2.VideoCapture(tmp_file.name)`
                # raised AttributeError for every local-path input.
                vc = cv2.VideoCapture(urlparse(video_file).path)
            else:
                video_bytes = pybase64.b64decode(video_file, validate=True)
                tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
                tmp_file.write(video_bytes)
                tmp_file.close()
                vc = cv2.VideoCapture(tmp_file.name)
        else:
            raise ValueError(f"Unsupported video input type: {type(video_file)}")

        return vc

    finally:
        if tmp_file and os.path.exists(tmp_file.name):
            os.unlink(tmp_file.name)


def read_video_frames_opencv(vc, frame_idx: List[int]):
    """Decode the frames listed in *frame_idx* from an OpenCV capture.

    Sequentially grabs frames up to the largest requested index, decoding
    (via ``retrieve``) only the requested ones — cheaper than seeking or
    decoding every frame. Frames are converted BGR -> RGB and returned as a
    uint8 array of shape (n, H, W, 3); unreadable frames are skipped with a
    warning, so fewer than ``len(frame_idx)`` rows may be returned. The
    capture is always released.
    """
    import cv2

    try:
        width = int(vc.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(vc.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_frames = int(vc.get(cv2.CAP_PROP_FRAME_COUNT))
        n_frames = len(frame_idx)

        video_np = np.empty((n_frames, height, width, 3), dtype=np.uint8)
        wanted = set(frame_idx)
        # Never grab past the reported frame count or the last wanted index.
        stop = min(total_frames, max(frame_idx) + 1)
        filled = 0
        for pos in range(stop):
            if not vc.grab():
                if pos in wanted:
                    logger.warning(
                        f"Failed to read frame {pos}, skipped. The video may be corrupted."
                    )
                continue
            if pos not in wanted:
                continue
            ok, frame = vc.retrieve()
            if not ok:
                logger.warning(
                    f"Failed to retrieve frame {pos}, skipped. The video may be corrupted."
                )
                continue
            video_np[filled] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            filled += 1
        if filled != n_frames:
            logger.warning(
                f"Expected {n_frames} frames, but only got {filled}. The video may be corrupted."
            )
        return video_np[:filled]
    finally:
        vc.release()


def sample_video_frames(
video: "VideoReader", *, desired_fps: int, max_frames: int
) -> list[int]:
Expand Down
Loading