huggingface · qgallouedec · Mar 9, 2026 · Mar 5, 2026 · Mar 5, 2026 · Mar 5, 2026
diff --git a/tests/test_vllm_client_server.py b/tests/test_vllm_client_server.py
@@ -18,7 +18,7 @@
 
 import pytest
 from packaging.version import Version
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
 from transformers.testing_utils import torch_device
 
 from trl.generation.vllm_client import VLLMClient
@@ -31,6 +31,7 @@
     kill_process,
     require_3_accelerators,
     require_torch_multi_accelerator,
+    require_vision,
     require_vllm,
 )
 
@@ -874,3 +875,106 @@ def teardown_class(cls):
         # vLLM x pytest (or Popen) seems not to handle process termination well. To avoid zombie processes, we need to
         # kill the server process and its children explicitly.
         kill_process(cls.server_process)
+
+
+@pytest.mark.slow
+@require_vllm
+@require_vision
+class TestVLLMClientServerVLM(TrlTestCase):
+    """Generation-only client/server tests sending images together with pre-tokenized prompts."""
+
+    model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
+
+    @classmethod
+    def setup_class(cls):
+        # Start the server process
+        cls.server_process = subprocess.Popen(
+            ["trl", "vllm-serve", "--model", cls.model_id], stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+
+        # pytest skips teardown_class when setup_class raises, so the server must be killed here if the client
+        # cannot be created (e.g. connection timeout); otherwise the server process would be leaked.
+        try:
+            # Initialize the client (no communicator needed for generation-only tests)
+            cls.client = VLLMClient(connection_timeout=240, host="localhost")
+        except Exception:
+            kill_process(cls.server_process)
+            raise
+
+    @staticmethod
+    def _assert_token_id_batch(outputs, batch_size):
+        """Assert `outputs` holds `batch_size` prompt and completion sequences of `int` token IDs."""
+        for key in ("prompt_ids", "completion_ids"):
+            sequences = outputs[key]
+            assert len(sequences) == batch_size
+            for sequence in sequences:
+                assert all(isinstance(tok, int) for tok in sequence)
+
+    def test_generate_with_token_ids_and_image(self):
+        """Test a batch where the prompts carry two images and one image respectively."""
+        from PIL import Image
+
+        processor = AutoProcessor.from_pretrained(self.model_id)
+        image1 = Image.new("RGB", (64, 64), color="red")
+        image2 = Image.new("RGB", (64, 64), color="blue")
+        image3 = Image.new("RGB", (64, 64), color="green")
+        messages = [
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "image": image1},
+                        {"type": "image", "image": image2},
+                        {"type": "text", "text": "What are the differences between these two images?"},
+                    ],
+                }
+            ],
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "image": image3},
+                        {"type": "text", "text": "What is the color of this image?"},
+                    ],
+                }
+            ],
+        ]
+        prompt_token_ids = processor.apply_chat_template(
+            conversation=messages, tokenize=True, add_generation_prompt=True
+        )
+        outputs = self.client.generate(prompt_token_ids, images=[[image1, image2], [image3]], max_tokens=64)
+
+        # Every sequence of the batch is checked, not only the first one.
+        self._assert_token_id_batch(outputs, batch_size=2)
+
+    def test_generate_with_token_ids_mixed_images(self):
+        """Test a batch where one prompt has an image and the other does not."""
+        from PIL import Image
+
+        processor = AutoProcessor.from_pretrained(self.model_id)
+        image = Image.new("RGB", (64, 64), color="red")
+        messages = [
+            [
+                {
+                    "role": "user",
+                    "content": [{"type": "image", "image": image}, {"type": "text", "text": "Describe this image."}],
+                }
+            ],
+            [
+                {
+                    "role": "user",
+                    "content": [{"type": "text", "text": "What is 1+1?"}],
+                }
+            ],
+        ]
+        prompt_token_ids = processor.apply_chat_template(
+            conversation=messages, tokenize=True, add_generation_prompt=True
+        )
+        outputs = self.client.generate(prompt_token_ids, images=[[image], None], max_tokens=64)
+
+        self._assert_token_id_batch(outputs, batch_size=2)
+
+    @classmethod
+    def teardown_class(cls):
+        # Kill the server process and its children explicitly to avoid leaving zombie processes behind.
+        kill_process(cls.server_process)
diff --git a/trl/experimental/online_dpo/online_dpo_trainer.py b/trl/experimental/online_dpo/online_dpo_trainer.py
@@ -750,7 +750,9 @@ def _generate_vllm_server(self, prompts, images=None):
             # prompt individually.
             ordered_set_of_prompts = all_prompts[:: self.num_generations]
             if has_images:
-                ordered_set_of_images = all_images[:: self.num_generations]
+                ordered_set_of_images = [
+                    [img] if img is not None else None for img in all_images[:: self.num_generations]
+                ]
             else:
                 ordered_set_of_images = None
             completion_ids = self.vllm_client.generate(

diff --git a/trl/generation/vllm_client.py b/trl/generation/vllm_client.py
@@ -220,8 +220,9 @@ def generate(
         Args:
             prompts (`list[str]` or `list[list[int]]`):
                 List of text prompts or list of token ID lists for which the model will generate completions.
-            images (`list[PIL.Image]`, *optional*):
-                List of PIL Images to send along with the prompts. Only valid when `prompts` is a list of strings.
+            images (`list[list[PIL.Image] | None]`, *optional*):
+                List of image lists for VLM support. Each element is a list of PIL images for the corresponding prompt,
+                or `None` if no images for that prompt.
             n (`int`, *optional*, defaults to `1`):
                 Number of completions to generate for each prompt.
             repetition_penalty (`float`, *optional*, defaults to `1.0`):
@@ -260,8 +261,12 @@ def generate(
         """
         url = f"{self.base_url}/generate/"
 
-        # Convert PIL images to base64 strings
-        images = [pil_to_base64(img) for img in images] if images else None
+        # Convert PIL images to base64 strings. Each element is a list of images for the corresponding prompt,
+        # or None if no images for that prompt.
+        if images:
+            images = [
+                [pil_to_base64(img) for img in img_list] if img_list is not None else None for img_list in images
+            ]
 
         response = self.session.post(
             url,

diff --git a/trl/scripts/vllm_serve.py b/trl/scripts/vllm_serve.py
@@ -491,7 +491,7 @@ async def get_world_size():
 
     class GenerateRequest(BaseModel):
         prompts: list[str] | list[list[int]]
-        images: list[str] | None = None
+        images: list[list[str] | None] | None = None
         n: int = 1
         repetition_penalty: float = 1.0
         temperature: float = 1.0
@@ -518,8 +518,8 @@ async def generate(request: GenerateRequest):
             request (`GenerateRequest`):
                 - `prompts` (list of `str` or list of list of `int`): A list of prompts. It accepts either text strings
-                  or pre-tokenized token ID lists. When text strings are provided, `images` can optionally be included.
+                  or pre-tokenized token ID lists. In both cases, `images` can optionally be included.
-                - `images` (list of `str`, *optional*, default to `None`): A list of base64 encoded images to process
-                  along with prompts.
+                - `images` (list of list of `str` or `None`, *optional*): A list of image lists. Each element is a list
+                  of base64-encoded images for the corresponding prompt, or `None` if no images for that prompt.
                 - `n` (`int`, *optional*, defaults to `1`): Number of completions to generate for each prompt.
                 - `repetition_penalty` (`float`, *optional*, defaults to `1.0`): Repetition penalty to apply during
                   generation.
@@ -571,19 +571,15 @@ async def generate(request: GenerateRequest):
         ```
         """
         # Build vLLM-compatible prompt inputs
-        if request.prompts and isinstance(request.prompts[0], list):
-            # Token IDs path: wrap each list of token IDs as a TokensPrompt dict for vLLM
-            prompts = [{"prompt_token_ids": ids} for ids in request.prompts]
-        else:
-            # Text prompts path: build prompt dicts with optional images
-            request.images = request.images or [None] * len(request.prompts)
-
-            prompts = []
-            for prompt, image in zip(request.prompts, request.images, strict=True):
-                row = {"prompt": prompt}
-                if image is not None:
-                    row["multi_modal_data"] = {"image": Image.open(BytesIO(base64.b64decode(image)))}
-                prompts.append(row)
+        is_token_ids = request.prompts and isinstance(request.prompts[0], list)
+        request.images = request.images or [None] * len(request.prompts)
+
+        prompts = []
+        for prompt, image_list in zip(request.prompts, request.images, strict=True):
+            row = {"prompt_token_ids": prompt} if is_token_ids else {"prompt": prompt}
+            if image_list is not None:
+                row["multi_modal_data"] = {"image": [Image.open(BytesIO(base64.b64decode(img))) for img in image_list]}
+            prompts.append(row)
 
         generation_kwargs = {
             "n": request.n,