From 31850f2dc53d02829b400077db8dc9bbfad552fe Mon Sep 17 00:00:00 2001
From: Xinyuan Tong <justinning0323@outlook.com>
Date: Sun, 30 Mar 2025 10:26:22 +0000
Subject: [PATCH 1/8] [Fix] Take Image as input, also refactor hashing and
 loading logic for better type handling

Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
---
 .../managers/multimodal_processors/base_processor.py  | 11 ++++++++++-
 .../srt/managers/multimodal_processors/qwen_vl.py     |  4 ++++
 python/sglang/srt/utils.py                            |  8 +++++---
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/python/sglang/srt/managers/multimodal_processors/base_processor.py b/python/sglang/srt/managers/multimodal_processors/base_processor.py
index a784abb70d8..bf164905d88 100644
--- a/python/sglang/srt/managers/multimodal_processors/base_processor.py
+++ b/python/sglang/srt/managers/multimodal_processors/base_processor.py
@@ -230,7 +230,16 @@ def load_mm_data(
                             continue
 
                     image_sizes += frames[0].size * len(frames)
-                    hashes += [hash(image_file)] * len(frames)
+                    
+                    # Generate a hashable value for the image file
+                    if isinstance(image_file, Image.Image):
+                        # For PIL.Image objects, use the ID as a hashable value
+                        hash_value = hash(id(image_file))
+                    else:
+                        # For other types (strings, etc.), use the regular hash
+                        hash_value = hash(image_file)
+                    
+                    hashes += [hash_value] * len(frames)
                     images += frames
                     image_index += 1
                     if frames_to_process != 0:
diff --git a/python/sglang/srt/managers/multimodal_processors/qwen_vl.py b/python/sglang/srt/managers/multimodal_processors/qwen_vl.py
index d978e49f1fe..0d1130f59cf 100644
--- a/python/sglang/srt/managers/multimodal_processors/qwen_vl.py
+++ b/python/sglang/srt/managers/multimodal_processors/qwen_vl.py
@@ -53,6 +53,7 @@ def _process_images_task(images, input_text, _hf_config):
     async def _process_single_image(self, images, input_text) -> dict:
         if self.executor is not None:
             loop = asyncio.get_event_loop()
+            # FIXME
             return await loop.run_in_executor(
                 self.executor,
                 Qwen2_5VLImageProcessor._process_images_task,
@@ -149,6 +150,9 @@ def floor_by_factor(number: int, factor: int) -> int:
         ret = await self._process_single_image(
             images=images, input_text=base_output.input_text
         )
+        
+        #debug
+        print(f"!!!!!! ret: {type(ret)}, {ret["image_grid_thw"]}")
 
         image_grid_thws = torch.concat([ret["image_grid_thw"]])
         video_grid_thws = None
diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
index 4d7a2e532a6..21ca4c60160 100644
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -537,10 +537,12 @@ def load_audio(audio_file: str, sr: int = 16000, mono: bool = True) -> np.ndarra
     return audio
 
 
-def load_image(image_file: Union[str, bytes]) -> tuple[Image, tuple[int, int]]:
+def load_image(image_file: Union[Image.Image, str, bytes]) -> tuple[Image.Image, tuple[int, int]]:
     image = image_size = None
-
-    if isinstance(image_file, bytes):
+    if isinstance(image_file, Image.Image):
+        image = image_file
+        image_size = (image.width, image.height)
+    elif isinstance(image_file, bytes):
         image = Image.open(BytesIO(image_file))
     elif image_file.startswith("http://") or image_file.startswith("https://"):
         timeout = int(os.getenv("REQUEST_TIMEOUT", "3"))

From 7c3c466c73403a56e08b2d72e7ae0b103aaae397 Mon Sep 17 00:00:00 2001
From: GeLee <leege233@gmail.com>
Date: Mon, 31 Mar 2025 00:55:37 +0800
Subject: [PATCH 2/8] fix the index error in sglang rollout

---
 python/sglang/srt/entrypoints/engine.py       |  5 ++--
 .../managers/multimodal_processors/qwen_vl.py | 25 ++++++++++---------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py
index afc46c50f3b..6732d52bdd9 100644
--- a/python/sglang/srt/entrypoints/engine.py
+++ b/python/sglang/srt/entrypoints/engine.py
@@ -152,8 +152,9 @@ def generate(
         Please refer to `GenerateReqInput` for the documentation.
         """
         modalities_list = []
-        if image_data is not None:
-            modalities_list.append("image")
+        for item_image_data in image_data:
+            if item_image_data is not None:
+                modalities_list.append("image")
 
         obj = GenerateReqInput(
             text=prompt,
diff --git a/python/sglang/srt/managers/multimodal_processors/qwen_vl.py b/python/sglang/srt/managers/multimodal_processors/qwen_vl.py
index 0d1130f59cf..f2bd5a7567f 100644
--- a/python/sglang/srt/managers/multimodal_processors/qwen_vl.py
+++ b/python/sglang/srt/managers/multimodal_processors/qwen_vl.py
@@ -51,18 +51,19 @@ def _process_images_task(images, input_text, _hf_config):
         }
 
     async def _process_single_image(self, images, input_text) -> dict:
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            # FIXME
-            return await loop.run_in_executor(
-                self.executor,
-                Qwen2_5VLImageProcessor._process_images_task,
-                images,
-                input_text,
-                self.hf_config,
-            )
-        else:
-            return self._process_images_task(images, input_text, self.hf_config)
+        # if self.executor is not None:
+        #     loop = asyncio.get_event_loop()
+        #     # FIXME
+        #     return await loop.run_in_executor(
+        #         self.executor,
+        #         Qwen2_5VLImageProcessor._process_images_task,
+        #         images,
+        #         input_text,
+        #         self.hf_config,
+        #     )
+        # else:
+        #     return self._process_images_task(images, input_text, self.hf_config)
+        return self._process_images_task(images, input_text, self.hf_config)
 
     async def process_mm_data_async(
         self,

From b09b293960ad9d19d77e50fb15c48f91385ecccb Mon Sep 17 00:00:00 2001
From: Xinyuan Tong <justinning0323@outlook.com>
Date: Sun, 30 Mar 2025 23:01:16 +0000
Subject: [PATCH 3/8] minor: clean up unused imports and debug print statement

Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
---
 .../sglang/srt/managers/multimodal_processors/qwen_vl.py   | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/python/sglang/srt/managers/multimodal_processors/qwen_vl.py b/python/sglang/srt/managers/multimodal_processors/qwen_vl.py
index f2bd5a7567f..a2d5542e0b8 100644
--- a/python/sglang/srt/managers/multimodal_processors/qwen_vl.py
+++ b/python/sglang/srt/managers/multimodal_processors/qwen_vl.py
@@ -6,10 +6,8 @@
 import torch
 from PIL import Image
 
-from sglang.srt.managers.multimodal_processor import (
-    BaseMultimodalProcessor as SGLangBaseProcessor,
-)
 from sglang.srt.managers.multimodal_processors.base_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
     MultimodalSpecialTokens,
     get_global_processor,
 )
@@ -151,9 +149,6 @@ def floor_by_factor(number: int, factor: int) -> int:
         ret = await self._process_single_image(
             images=images, input_text=base_output.input_text
         )
-        
-        #debug
-        print(f"!!!!!! ret: {type(ret)}, {ret["image_grid_thw"]}")
 
         image_grid_thws = torch.concat([ret["image_grid_thw"]])
         video_grid_thws = None

From 530b6e949d71ad44602fa2baa411bdd2e471650f Mon Sep 17 00:00:00 2001
From: Xinyuan Tong <justinning0323@outlook.com>
Date: Mon, 31 Mar 2025 06:53:15 +0000
Subject: [PATCH 4/8] format

Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
---
 .../srt/managers/multimodal_processors/base_processor.py      | 4 ++--
 python/sglang/srt/managers/multimodal_processors/qwen_vl.py   | 2 ++
 python/sglang/srt/utils.py                                    | 4 +++-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/python/sglang/srt/managers/multimodal_processors/base_processor.py b/python/sglang/srt/managers/multimodal_processors/base_processor.py
index bf164905d88..15f057adfe7 100644
--- a/python/sglang/srt/managers/multimodal_processors/base_processor.py
+++ b/python/sglang/srt/managers/multimodal_processors/base_processor.py
@@ -230,7 +230,7 @@ def load_mm_data(
                             continue
 
                     image_sizes += frames[0].size * len(frames)
-                    
+
                     # Generate a hashable value for the image file
                     if isinstance(image_file, Image.Image):
                         # For PIL.Image objects, use the ID as a hashable value
@@ -238,7 +238,7 @@ def load_mm_data(
                     else:
                         # For other types (strings, etc.), use the regular hash
                         hash_value = hash(image_file)
-                    
+
                     hashes += [hash_value] * len(frames)
                     images += frames
                     image_index += 1
diff --git a/python/sglang/srt/managers/multimodal_processors/qwen_vl.py b/python/sglang/srt/managers/multimodal_processors/qwen_vl.py
index a2d5542e0b8..e98bf5ba8de 100644
--- a/python/sglang/srt/managers/multimodal_processors/qwen_vl.py
+++ b/python/sglang/srt/managers/multimodal_processors/qwen_vl.py
@@ -8,6 +8,8 @@
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.managers.multimodal_processors.base_processor import (
     MultimodalSpecialTokens,
     get_global_processor,
 )
diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
index 21ca4c60160..29f98d0001f 100644
--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -537,7 +537,9 @@ def load_audio(audio_file: str, sr: int = 16000, mono: bool = True) -> np.ndarra
     return audio
 
 
-def load_image(image_file: Union[Image.Image, str, bytes]) -> tuple[Image.Image, tuple[int, int]]:
+def load_image(
+    image_file: Union[Image.Image, str, bytes]
+) -> tuple[Image.Image, tuple[int, int]]:
     image = image_size = None
     if isinstance(image_file, Image.Image):
         image = image_file

From 9dbd9ea0059ac9efbfbb06d208967232211a7eb0 Mon Sep 17 00:00:00 2001
From: Xinyuan Tong <justinning0323@outlook.com>
Date: Tue, 1 Apr 2025 05:01:39 +0000
Subject: [PATCH 5/8] fix: remove unused code due to auto merge

Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
---
 .../managers/multimodal_processors/qwen_vl.py | 29 -------------------
 1 file changed, 29 deletions(-)

diff --git a/python/sglang/srt/managers/multimodal_processors/qwen_vl.py b/python/sglang/srt/managers/multimodal_processors/qwen_vl.py
index 757a6ffe8dc..9d0d6d8437f 100644
--- a/python/sglang/srt/managers/multimodal_processors/qwen_vl.py
+++ b/python/sglang/srt/managers/multimodal_processors/qwen_vl.py
@@ -33,35 +33,6 @@ def __init__(self, hf_config, server_args, _processor):
         self.MAX_PIXELS = 16384 * 28 * 28
         self.MAX_RATIO = 200
 
-    @staticmethod
-    def _process_images_task(images, input_text, _hf_config):
-        if isinstance(images, list) and len(images) == 0:
-            images = None
-        result = get_global_processor().__call__(
-            text=[input_text], images=images, padding=True, return_tensors="pt"
-        )
-
-        return {
-            "input_ids": result.input_ids,
-            "pixel_values": getattr(result, "pixel_values", None),
-            "image_grid_thw": getattr(result, "image_grid_thw", None),
-            "second_per_grid_ts": getattr(result, "second_per_grid_ts", None),
-            "video_grid_thws": getattr(result, "video_grid_thws", None),
-        }
-
-    async def _process_single_image(self, images, input_text) -> dict:
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            return await loop.run_in_executor(
-                self.executor,
-                Qwen2_5VLImageProcessor._process_images_task,
-                images,
-                input_text,
-                self.hf_config,
-            )
-        else:
-            return self._process_images_task(images, input_text, self.hf_config)
-
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],

From 32ef9dcaf4f2958ebb0ac931277c3612fd4ee734 Mon Sep 17 00:00:00 2001
From: Xinyuan Tong <justinning0323@outlook.com>
Date: Tue, 1 Apr 2025 05:02:22 +0000
Subject: [PATCH 6/8] fix: remove redundant assert

Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
---
 .../sglang/srt/managers/multimodal_processors/base_processor.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/sglang/srt/managers/multimodal_processors/base_processor.py b/python/sglang/srt/managers/multimodal_processors/base_processor.py
index 77b73992705..c976f24f728 100644
--- a/python/sglang/srt/managers/multimodal_processors/base_processor.py
+++ b/python/sglang/srt/managers/multimodal_processors/base_processor.py
@@ -139,8 +139,6 @@ def load_mm_data(
         else:
             multimodal_tokens.image_token = multimodal_tokens.image_token
 
-        assert isinstance(prompt, str)
-
         if isinstance(prompt, list) and return_text:
             assert len(prompt) and isinstance(prompt[0], int)
             prompt = self._processor.tokenizer.decode(prompt)

From 40135e63b0e018175ff79f3ef86b35ae59c56155 Mon Sep 17 00:00:00 2001
From: Xinyuan Tong <justinning0323@outlook.com>
Date: Tue, 1 Apr 2025 01:48:57 -0700
Subject: [PATCH 7/8] fix: revert modality

Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
---
 python/sglang/srt/entrypoints/engine.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py
index 6732d52bdd9..afc46c50f3b 100644
--- a/python/sglang/srt/entrypoints/engine.py
+++ b/python/sglang/srt/entrypoints/engine.py
@@ -152,9 +152,8 @@ def generate(
         Please refer to `GenerateReqInput` for the documentation.
         """
         modalities_list = []
-        for item_image_data in image_data:
-            if item_image_data is not None:
-                modalities_list.append("image")
+        if image_data is not None:
+            modalities_list.append("image")
 
         obj = GenerateReqInput(
             text=prompt,

From b995cfbcfc1db48331ec71e52b6bcd081882e378 Mon Sep 17 00:00:00 2001
From: Xinyuan Tong <justinning0323@outlook.com>
Date: Tue, 1 Apr 2025 09:19:34 +0000
Subject: [PATCH 8/8] fix: remove unused modalities list

Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
---
 python/sglang/srt/entrypoints/engine.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py
index afc46c50f3b..b92c6ecdbb5 100644
--- a/python/sglang/srt/entrypoints/engine.py
+++ b/python/sglang/srt/entrypoints/engine.py
@@ -151,10 +151,6 @@ def generate(
         The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
         Please refer to `GenerateReqInput` for the documentation.
         """
-        modalities_list = []
-        if image_data is not None:
-            modalities_list.append("image")
-
         obj = GenerateReqInput(
             text=prompt,
             input_ids=input_ids,
@@ -165,7 +161,6 @@ def generate(
             top_logprobs_num=top_logprobs_num,
             token_ids_logprob=token_ids_logprob,
             lora_path=lora_path,
-            modalities=modalities_list,
             custom_logit_processor=custom_logit_processor,
             return_hidden_states=return_hidden_states,
             stream=stream,