diff --git a/python/pyproject.toml b/python/pyproject.toml index 1e89bbd1613a..fa9aa9567e23 100755 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -63,6 +63,8 @@ dependencies = [ "timm==1.0.16", "torch_memory_saver==0.0.9", "torch==2.8.0", + "torchcodec==0.7.0 ; sys_platform != 'linux' or (sys_platform == 'linux' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')", # torchcodec does not exist in those systems. If not provided, transformers will use torchvision instead by default. + "av ; sys_platform == 'linux' and (platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'armv7l')", "torchaudio==2.8.0", "torchvision", "torchao==0.9.0", @@ -89,7 +91,6 @@ diffusion = [ "moviepy>=2.0.0", "cloudpickle", "remote-pdb", - "torchcodec==0.5.0", "st_attn ==0.0.7", "vsa==0.0.4", ] diff --git a/python/pyproject_xpu.toml b/python/pyproject_xpu.toml index 23d4b6121bf2..b8b1830b6d0c 100644 --- a/python/pyproject_xpu.toml +++ b/python/pyproject_xpu.toml @@ -16,6 +16,8 @@ classifiers = [ dependencies = [ "torch==2.9.0", + "torchcodec==0.8.0 ; sys_platform != 'linux' or (sys_platform == 'linux' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')", # torchcodec does not exist in those systems. If not provided, transformers will use torchvision instead by default.
+ "av ; sys_platform == 'linux' and (platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'armv7l')", "torchaudio==2.9.0", "torchvision", "sgl-kernel @ git+https://github.com/sgl-project/sgl-kernel-xpu.git", diff --git a/python/sglang/srt/multimodal/processors/glm4v.py b/python/sglang/srt/multimodal/processors/glm4v.py index d09d2d1e7b0f..80d717a7ad76 100644 --- a/python/sglang/srt/multimodal/processors/glm4v.py +++ b/python/sglang/srt/multimodal/processors/glm4v.py @@ -1,7 +1,5 @@ from typing import List, Union -from decord import VideoReader - from sglang.srt.layers.rotary_embedding import MRotaryEmbedding from sglang.srt.models.glm4v import Glm4vForConditionalGeneration from sglang.srt.models.glm4v_moe import Glm4vMoeForConditionalGeneration @@ -46,36 +44,6 @@ def __init__(self, hf_config, server_args, _processor, *args, **kwargs): video_token_id=self.IM_TOKEN_ID, ).build(_processor) - # adapted from https://github.com/huggingface/transformers/blob/369c99d0cea403b77bd0aef818527106453fd9fc/src/transformers/video_utils.py#L312 - async def preprocess_video(self, vr: VideoReader): - """ - Preprocess video using VideoReader from Decord backend. 
- - Args: - vr (VideoReader): VideoReader object from decord - - Returns: - tuple: A tuple containing processed frames and metadata - """ - video_fps = vr.get_avg_fps() - total_num_frames = len(vr) - duration = total_num_frames / video_fps if video_fps else 0 - - # Extract all frames - indices = list(range(total_num_frames)) - frames = vr.get_batch(indices).asnumpy() - - # Return metadata as dict so transformers can properly create VideoMetadata objects - metadata = { - "total_num_frames": int(total_num_frames), - "fps": float(video_fps), - "duration": float(duration), - "video_backend": "decord", - "frames_indices": indices, - } - - return frames, metadata - async def process_mm_data_async( self, image_data: List[Union[str, bytes]], @@ -91,19 +59,10 @@ async def process_mm_data_async( multimodal_tokens=self.mm_tokens, ) - video_metadata = None - if base_output.videos: - videos_processed = [ - await self.preprocess_video(video) for video in base_output.videos - ] - base_output.videos, video_metadata = map(list, zip(*videos_processed)) - # transformer requires the video inputs to be under this format - base_output.videos = [base_output.videos] - video_metadata = [video_metadata] - + base_output.videos = request_obj.video_data mm_items, input_ids, ret = self.process_and_combine_mm_data( - base_output, self.mm_tokens, video_metadata=video_metadata + base_output, self.mm_tokens ) input_ids = input_ids.flatten()