diff --git a/python/pyproject.toml b/python/pyproject.toml index 1e89bbd1613a..fa9aa9567e23 100755 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -63,6 +63,8 @@ dependencies = [ "timm==1.0.16", "torch_memory_saver==0.0.9", "torch==2.8.0", + "torchcodec==0.7.0 ; sys_platform != 'linux' or (sys_platform == 'linux' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')", # torchcodec does not exist in those systems. If not provided, transformers will use torchvision instead by default. + "av ; sys_platform == 'linux' and (platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'armv7l')", "torchaudio==2.8.0", "torchvision", "torchao==0.9.0", @@ -89,7 +91,6 @@ diffusion = [ "moviepy>=2.0.0", "cloudpickle", "remote-pdb", - "torchcodec==0.5.0", "st_attn ==0.0.7", "vsa==0.0.4", ] diff --git a/python/pyproject_xpu.toml b/python/pyproject_xpu.toml index 23d4b6121bf2..b8b1830b6d0c 100644 --- a/python/pyproject_xpu.toml +++ b/python/pyproject_xpu.toml @@ -16,6 +16,8 @@ classifiers = [ dependencies = [ "torch==2.9.0", + "torchcodec==0.8.0 ; sys_platform != 'linux' or (sys_platform == 'linux' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')", # torchcodec does not exist in those systems. If not provided, transformers will use torchvision instead by default.
+ "av ; sys_platform == 'linux' and (platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'armv7l')", "torchaudio==2.9.0", "torchvision", "sgl-kernel @ git+https://github.com/sgl-project/sgl-kernel-xpu.git", diff --git a/python/sglang/srt/multimodal/processors/glm4v.py b/python/sglang/srt/multimodal/processors/glm4v.py index d09d2d1e7b0f..80d717a7ad76 100644 --- a/python/sglang/srt/multimodal/processors/glm4v.py +++ b/python/sglang/srt/multimodal/processors/glm4v.py @@ -1,7 +1,5 @@ from typing import List, Union -from decord import VideoReader - from sglang.srt.layers.rotary_embedding import MRotaryEmbedding from sglang.srt.models.glm4v import Glm4vForConditionalGeneration from sglang.srt.models.glm4v_moe import Glm4vMoeForConditionalGeneration @@ -46,36 +44,6 @@ def __init__(self, hf_config, server_args, _processor, *args, **kwargs): video_token_id=self.IM_TOKEN_ID, ).build(_processor) - # adapted from https://github.com/huggingface/transformers/blob/369c99d0cea403b77bd0aef818527106453fd9fc/src/transformers/video_utils.py#L312 - async def preprocess_video(self, vr: VideoReader): - """ - Preprocess video using VideoReader from Decord backend. 
- - Args: - vr (VideoReader): VideoReader object from decord - - Returns: - tuple: A tuple containing processed frames and metadata - """ - video_fps = vr.get_avg_fps() - total_num_frames = len(vr) - duration = total_num_frames / video_fps if video_fps else 0 - - # Extract all frames - indices = list(range(total_num_frames)) - frames = vr.get_batch(indices).asnumpy() - - # Return metadata as dict so transformers can properly create VideoMetadata objects - metadata = { - "total_num_frames": int(total_num_frames), - "fps": float(video_fps), - "duration": float(duration), - "video_backend": "decord", - "frames_indices": indices, - } - - return frames, metadata - async def process_mm_data_async( self, image_data: List[Union[str, bytes]], @@ -91,19 +59,10 @@ async def process_mm_data_async( multimodal_tokens=self.mm_tokens, ) - video_metadata = None - if base_output.videos: - videos_processed = [ - await self.preprocess_video(video) for video in base_output.videos - ] - base_output.videos, video_metadata = map(list, zip(*videos_processed)) - # transformer requires the video inputs to be under this format - base_output.videos = [base_output.videos] - video_metadata = [video_metadata] - + base_output.videos = request_obj.video_data mm_items, input_ids, ret = self.process_and_combine_mm_data( - base_output, self.mm_tokens, video_metadata=video_metadata + base_output, self.mm_tokens ) input_ids = input_ids.flatten()