From 31850f2dc53d02829b400077db8dc9bbfad552fe Mon Sep 17 00:00:00 2001 From: Xinyuan Tong Date: Sun, 30 Mar 2025 10:26:22 +0000 Subject: [PATCH 1/8] [Fix] Take Image as input, also refactor hashing and loading logic for better type handling Signed-off-by: Xinyuan Tong --- .../managers/multimodal_processors/base_processor.py | 11 ++++++++++- .../srt/managers/multimodal_processors/qwen_vl.py | 4 ++++ python/sglang/srt/utils.py | 8 +++++--- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/python/sglang/srt/managers/multimodal_processors/base_processor.py b/python/sglang/srt/managers/multimodal_processors/base_processor.py index a784abb70d8..bf164905d88 100644 --- a/python/sglang/srt/managers/multimodal_processors/base_processor.py +++ b/python/sglang/srt/managers/multimodal_processors/base_processor.py @@ -230,7 +230,16 @@ def load_mm_data( continue image_sizes += frames[0].size * len(frames) - hashes += [hash(image_file)] * len(frames) + + # Generate a hashable value for the image file + if isinstance(image_file, Image.Image): + # For PIL.Image objects, use the ID as a hashable value + hash_value = hash(id(image_file)) + else: + # For other types (strings, etc.), use the regular hash + hash_value = hash(image_file) + + hashes += [hash_value] * len(frames) images += frames image_index += 1 if frames_to_process != 0: diff --git a/python/sglang/srt/managers/multimodal_processors/qwen_vl.py b/python/sglang/srt/managers/multimodal_processors/qwen_vl.py index d978e49f1fe..0d1130f59cf 100644 --- a/python/sglang/srt/managers/multimodal_processors/qwen_vl.py +++ b/python/sglang/srt/managers/multimodal_processors/qwen_vl.py @@ -53,6 +53,7 @@ def _process_images_task(images, input_text, _hf_config): async def _process_single_image(self, images, input_text) -> dict: if self.executor is not None: loop = asyncio.get_event_loop() + # FIXME return await loop.run_in_executor( self.executor, Qwen2_5VLImageProcessor._process_images_task, @@ -149,6 +150,9 @@ def floor_by_factor(number: int, factor: int) -> int: ret = await self._process_single_image( images=images, input_text=base_output.input_text ) + + #debug + print(f"!!!!!! ret: {type(ret)}, {ret["image_grid_thw"]}") image_grid_thws = torch.concat([ret["image_grid_thw"]]) video_grid_thws = None diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 4d7a2e532a6..21ca4c60160 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -537,10 +537,12 @@ def load_audio(audio_file: str, sr: int = 16000, mono: bool = True) -> np.ndarra return audio -def load_image(image_file: Union[str, bytes]) -> tuple[Image, tuple[int, int]]: +def load_image(image_file: Union[Image.Image, str, bytes]) -> tuple[Image.Image, tuple[int, int]]: image = image_size = None - - if isinstance(image_file, bytes): + if isinstance(image_file, Image.Image): + image = image_file + image_size = (image.width, image.height) + elif isinstance(image_file, bytes): image = Image.open(BytesIO(image_file)) elif image_file.startswith("http://") or image_file.startswith("https://"): timeout = int(os.getenv("REQUEST_TIMEOUT", "3")) From 7c3c466c73403a56e08b2d72e7ae0b103aaae397 Mon Sep 17 00:00:00 2001 From: GeLee Date: Mon, 31 Mar 2025 00:55:37 +0800 Subject: [PATCH 2/8] fix the index error in sglang rollout --- python/sglang/srt/entrypoints/engine.py | 5 ++-- .../managers/multimodal_processors/qwen_vl.py | 25 ++++++++++--------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py index afc46c50f3b..6732d52bdd9 100644 --- a/python/sglang/srt/entrypoints/engine.py +++ b/python/sglang/srt/entrypoints/engine.py @@ -152,8 +152,9 @@ def generate( Please refer to `GenerateReqInput` for the documentation. """ modalities_list = [] - if image_data is not None: - modalities_list.append("image") + for item_image_data in image_data: + if item_image_data is not None: + modalities_list.append("image") obj = GenerateReqInput( text=prompt, diff --git a/python/sglang/srt/managers/multimodal_processors/qwen_vl.py b/python/sglang/srt/managers/multimodal_processors/qwen_vl.py index 0d1130f59cf..f2bd5a7567f 100644 --- a/python/sglang/srt/managers/multimodal_processors/qwen_vl.py +++ b/python/sglang/srt/managers/multimodal_processors/qwen_vl.py @@ -51,18 +51,19 @@ def _process_images_task(images, input_text, _hf_config): } async def _process_single_image(self, images, input_text) -> dict: - if self.executor is not None: - loop = asyncio.get_event_loop() - # FIXME - return await loop.run_in_executor( - self.executor, - Qwen2_5VLImageProcessor._process_images_task, - images, - input_text, - self.hf_config, - ) - else: - return self._process_images_task(images, input_text, self.hf_config) + # if self.executor is not None: + # loop = asyncio.get_event_loop() + # # FIXME + # return await loop.run_in_executor( + # self.executor, + # Qwen2_5VLImageProcessor._process_images_task, + # images, + # input_text, + # self.hf_config, + # ) + # else: + # return self._process_images_task(images, input_text, self.hf_config) + return self._process_images_task(images, input_text, self.hf_config) async def process_mm_data_async( self, From b09b293960ad9d19d77e50fb15c48f91385ecccb Mon Sep 17 00:00:00 2001 From: Xinyuan Tong Date: Sun, 30 Mar 2025 23:01:16 +0000 Subject: [PATCH 3/8] minor: clean up unused imports and debug print statement Signed-off-by: Xinyuan Tong --- .../sglang/srt/managers/multimodal_processors/qwen_vl.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/python/sglang/srt/managers/multimodal_processors/qwen_vl.py b/python/sglang/srt/managers/multimodal_processors/qwen_vl.py index f2bd5a7567f..a2d5542e0b8 100644 --- a/python/sglang/srt/managers/multimodal_processors/qwen_vl.py +++ b/python/sglang/srt/managers/multimodal_processors/qwen_vl.py @@ -6,10 +6,8 @@ import torch from PIL import Image -from sglang.srt.managers.multimodal_processor import ( - BaseMultimodalProcessor as SGLangBaseProcessor, -) from sglang.srt.managers.multimodal_processors.base_processor import ( + BaseMultimodalProcessor as SGLangBaseProcessor, MultimodalSpecialTokens, get_global_processor, ) @@ -151,9 +149,6 @@ def floor_by_factor(number: int, factor: int) -> int: ret = await self._process_single_image( images=images, input_text=base_output.input_text ) - - #debug - print(f"!!!!!! ret: {type(ret)}, {ret["image_grid_thw"]}") image_grid_thws = torch.concat([ret["image_grid_thw"]]) video_grid_thws = None From 530b6e949d71ad44602fa2baa411bdd2e471650f Mon Sep 17 00:00:00 2001 From: Xinyuan Tong Date: Mon, 31 Mar 2025 06:53:15 +0000 Subject: [PATCH 4/8] format Signed-off-by: Xinyuan Tong --- .../srt/managers/multimodal_processors/base_processor.py | 4 ++-- python/sglang/srt/managers/multimodal_processors/qwen_vl.py | 2 ++ python/sglang/srt/utils.py | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/managers/multimodal_processors/base_processor.py b/python/sglang/srt/managers/multimodal_processors/base_processor.py index bf164905d88..15f057adfe7 100644 --- a/python/sglang/srt/managers/multimodal_processors/base_processor.py +++ b/python/sglang/srt/managers/multimodal_processors/base_processor.py @@ -230,7 +230,7 @@ def load_mm_data( continue image_sizes += frames[0].size * len(frames) - + # Generate a hashable value for the image file if isinstance(image_file, Image.Image): # For PIL.Image objects, use the ID as a hashable value @@ -238,7 +238,7 @@ def load_mm_data( else: # For other types (strings, etc.), use the regular hash hash_value = hash(image_file) - + hashes += [hash_value] * len(frames) images += frames image_index += 1 diff --git a/python/sglang/srt/managers/multimodal_processors/qwen_vl.py b/python/sglang/srt/managers/multimodal_processors/qwen_vl.py index a2d5542e0b8..e98bf5ba8de 100644 --- a/python/sglang/srt/managers/multimodal_processors/qwen_vl.py +++ b/python/sglang/srt/managers/multimodal_processors/qwen_vl.py @@ -8,6 +8,8 @@ from sglang.srt.managers.multimodal_processors.base_processor import ( BaseMultimodalProcessor as SGLangBaseProcessor, +) +from sglang.srt.managers.multimodal_processors.base_processor import ( MultimodalSpecialTokens, get_global_processor, ) diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 21ca4c60160..29f98d0001f 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -537,7 +537,9 @@ def load_audio(audio_file: str, sr: int = 16000, mono: bool = True) -> np.ndarra return audio -def load_image(image_file: Union[Image.Image, str, bytes]) -> tuple[Image.Image, tuple[int, int]]: +def load_image( + image_file: Union[Image.Image, str, bytes] +) -> tuple[Image.Image, tuple[int, int]]: image = image_size = None if isinstance(image_file, Image.Image): image = image_file From 9dbd9ea0059ac9efbfbb06d208967232211a7eb0 Mon Sep 17 00:00:00 2001 From: Xinyuan Tong Date: Tue, 1 Apr 2025 05:01:39 +0000 Subject: [PATCH 5/8] fix: remove unused code due to auto merge Signed-off-by: Xinyuan Tong --- .../managers/multimodal_processors/qwen_vl.py | 29 ------------------- 1 file changed, 29 deletions(-) diff --git a/python/sglang/srt/managers/multimodal_processors/qwen_vl.py b/python/sglang/srt/managers/multimodal_processors/qwen_vl.py index 757a6ffe8dc..9d0d6d8437f 100644 --- a/python/sglang/srt/managers/multimodal_processors/qwen_vl.py +++ b/python/sglang/srt/managers/multimodal_processors/qwen_vl.py @@ -33,35 +33,6 @@ def __init__(self, hf_config, server_args, _processor): self.MAX_PIXELS = 16384 * 28 * 28 self.MAX_RATIO = 200 - @staticmethod - def _process_images_task(images, input_text, _hf_config): - if isinstance(images, list) and len(images) == 0: - images = None - result = get_global_processor().__call__( - text=[input_text], images=images, padding=True, return_tensors="pt" - ) - - return { - "input_ids": result.input_ids, - "pixel_values": getattr(result, "pixel_values", None), - "image_grid_thw": getattr(result, "image_grid_thw", None), - "second_per_grid_ts": getattr(result, "second_per_grid_ts", None), - "video_grid_thws": getattr(result, "video_grid_thws", None), - } - - async def _process_single_image(self, images, input_text) -> dict: - if self.executor is not None: - loop = asyncio.get_event_loop() - return await loop.run_in_executor( - self.executor, - Qwen2_5VLImageProcessor._process_images_task, - images, - input_text, - self.hf_config, - ) - else: - return self._process_images_task(images, input_text, self.hf_config) - async def process_mm_data_async( self, image_data: List[Union[str, bytes]], From 32ef9dcaf4f2958ebb0ac931277c3612fd4ee734 Mon Sep 17 00:00:00 2001 From: Xinyuan Tong Date: Tue, 1 Apr 2025 05:02:22 +0000 Subject: [PATCH 6/8] fix: remove redundant assert Signed-off-by: Xinyuan Tong --- .../sglang/srt/managers/multimodal_processors/base_processor.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/sglang/srt/managers/multimodal_processors/base_processor.py b/python/sglang/srt/managers/multimodal_processors/base_processor.py index 77b73992705..c976f24f728 100644 --- a/python/sglang/srt/managers/multimodal_processors/base_processor.py +++ b/python/sglang/srt/managers/multimodal_processors/base_processor.py @@ -139,8 +139,6 @@ def load_mm_data( else: multimodal_tokens.image_token = multimodal_tokens.image_token - assert isinstance(prompt, str) - if isinstance(prompt, list) and return_text: assert len(prompt) and isinstance(prompt[0], int) prompt = self._processor.tokenizer.decode(prompt) From 40135e63b0e018175ff79f3ef86b35ae59c56155 Mon Sep 17 00:00:00 2001 From: Xinyuan Tong Date: Tue, 1 Apr 2025 01:48:57 -0700 Subject: [PATCH 7/8] fix: revert modality Signed-off-by: Xinyuan Tong --- python/sglang/srt/entrypoints/engine.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py index 6732d52bdd9..afc46c50f3b 100644 --- a/python/sglang/srt/entrypoints/engine.py +++ b/python/sglang/srt/entrypoints/engine.py @@ -152,9 +152,8 @@ def generate( Please refer to `GenerateReqInput` for the documentation. """ modalities_list = [] - for item_image_data in image_data: - if item_image_data is not None: - modalities_list.append("image") + if image_data is not None: + modalities_list.append("image") obj = GenerateReqInput( text=prompt, From b995cfbcfc1db48331ec71e52b6bcd081882e378 Mon Sep 17 00:00:00 2001 From: Xinyuan Tong Date: Tue, 1 Apr 2025 09:19:34 +0000 Subject: [PATCH 8/8] fix: remove unused modalities list Signed-off-by: Xinyuan Tong --- python/sglang/srt/entrypoints/engine.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py index afc46c50f3b..b92c6ecdbb5 100644 --- a/python/sglang/srt/entrypoints/engine.py +++ b/python/sglang/srt/entrypoints/engine.py @@ -151,10 +151,6 @@ def generate( The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`. Please refer to `GenerateReqInput` for the documentation. """ - modalities_list = [] - if image_data is not None: - modalities_list.append("image") - obj = GenerateReqInput( text=prompt, input_ids=input_ids, @@ -165,7 +161,6 @@ def generate( top_logprobs_num=top_logprobs_num, token_ids_logprob=token_ids_logprob, lora_path=lora_path, - modalities=modalities_list, custom_logit_processor=custom_logit_processor, return_hidden_states=return_hidden_states, stream=stream,