huggingface · qgallouedec · Mar 10, 2026 · Mar 5, 2026 · Mar 5, 2026 · Mar 5, 2026
diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py
@@ -162,17 +162,44 @@ def test_compute_entropy_all_masked(self):
 class TestGRPORolloutDispatch:
     def _make_trainer(self):
         trainer = object.__new__(GRPOTrainer)
-        trainer.accelerator = SimpleNamespace(device=torch.device("cpu"), is_main_process=True)
+        trainer.accelerator = SimpleNamespace(
+            device=torch.device("cpu"),
+            is_main_process=True,
+            gather=lambda t: t,
+        )
         trainer.args = SimpleNamespace(report_to=[])
         trainer.model = SimpleNamespace(training=True)
-        trainer.state = SimpleNamespace(global_step=2)
+        trainer.state = SimpleNamespace(global_step=2, num_input_tokens_seen=0)
         trainer._last_loaded_step = 1
         trainer.use_vllm = False
         trainer.use_transformers_paged = False
         trainer.vllm_generation = SimpleNamespace(sync_weights=MagicMock())
+        trainer.processing_class = SimpleNamespace(
+            batch_decode=MagicMock(return_value=["decoded"]),
+        )
+        trainer.tools = None
+        trainer.eos_token_id = 2
+        trainer.pad_token_id = 0
+        trainer._metrics = {
+            "train": {
+                "num_tokens": [],
+                **{
+                    k: []
+                    for k in [
+                        "completions/mean_length",
+                        "completions/min_length",
+                        "completions/max_length",
+                        "completions/clipped_ratio",
+                        "completions/mean_terminated_length",
+                        "completions/min_terminated_length",
+                        "completions/max_terminated_length",
+                    ]
+                },
+            }
+        }
         return trainer
 
-    def test_generate_single_turn_prefers_rollout_func(self):
+    def test_generate_prefers_rollout_func(self):
         trainer = self._make_trainer()
         trainer.rollout_func = MagicMock(
             return_value={
@@ -183,33 +210,32 @@ def test_generate_single_turn_prefers_rollout_func(self):
             }
         )
 
-        prompt_ids, completion_ids, logprobs, extra_fields = trainer._generate_single_turn(["prompt"])
+        result = trainer._generate(["prompt"])
 
-        assert prompt_ids == [[1]]
-        assert completion_ids == [[2]]
-        assert logprobs == [[-0.1]]
-        assert extra_fields == {"env_mask": [[1]]}
+        assert result[0] == [[1]]  # prompt_ids
+        assert result[1] == [[2]]  # completion_ids
+        assert result[2] == [[1]]  # tool_mask (from env_mask)
         trainer.rollout_func.assert_called_once_with(["prompt"], trainer)
 
-    def test_generate_single_turn_rollout_func_syncs_vllm_weights_when_needed(self):
+    def test_generate_rollout_func_syncs_vllm_weights_when_needed(self):
         trainer = self._make_trainer()
         trainer.use_vllm = True
         trainer.rollout_func = MagicMock(
             return_value={"prompt_ids": [[1]], "completion_ids": [[2]], "logprobs": [[0.0]]}
         )
 
-        trainer._generate_single_turn(["prompt"])
+        trainer._generate(["prompt"])
 
         trainer.vllm_generation.sync_weights.assert_called_once()
         assert trainer._last_loaded_step == trainer.state.global_step
         trainer.rollout_func.assert_called_once_with(["prompt"], trainer)
 
-    def test_generate_single_turn_rollout_func_raises_when_required_keys_are_missing(self):
+    def test_generate_rollout_func_raises_when_required_keys_are_missing(self):
         trainer = self._make_trainer()
         trainer.rollout_func = MagicMock(return_value={"prompt_ids": [[1]], "completion_ids": [[2]]})
 
         with pytest.raises(ValueError, match="rollout_func must return keys"):
-            trainer._generate_single_turn(["prompt"])
+            trainer._generate(["prompt"])
 
 
 class TestGRPOTrainer(TrlTestCase):

diff --git a/tests/test_vllm_client_server.py b/tests/test_vllm_client_server.py
@@ -18,7 +18,7 @@
 
 import pytest
 from packaging.version import Version
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
 from transformers.testing_utils import torch_device
 
 from trl.generation.vllm_client import VLLMClient
@@ -31,6 +31,7 @@
     kill_process,
     require_3_accelerators,
     require_torch_multi_accelerator,
+    require_vision,
     require_vllm,
 )
 
@@ -207,6 +208,31 @@ def multiply(a: int, b: int) -> int:
         decoded_prompt = tokenizer.decode(outputs["prompt_ids"][0])
         assert "Multiplies two integers." in decoded_prompt
 
+    def test_generate_with_token_ids(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.model_id)
+        prompts = ["Hello, AI!", "Tell me a joke"]
+        prompt_token_ids = tokenizer(prompts)["input_ids"]
+        outputs = self.client.generate(prompt_token_ids)
+        prompt_ids = outputs["prompt_ids"]
+        completion_ids = outputs["completion_ids"]
+
+        # Check that the outputs are lists
+        assert isinstance(prompt_ids, list)
+        assert isinstance(completion_ids, list)
+
+        # Check that the number of sequences is equal to the number of prompts
+        assert len(prompt_ids) == len(prompts)
+        assert len(completion_ids) == len(prompts)
+
+        # Check that prompt_ids match the input token IDs
+        assert prompt_ids == prompt_token_ids
+
+        # Check that the sequences are lists of integers
+        for seq in prompt_ids:
+            assert all(isinstance(tok, int) for tok in seq)
+        for seq in completion_ids:
+            assert all(isinstance(tok, int) for tok in seq)
+
     def test_generate_with_params(self):
         prompts = ["Hello, AI!", "Tell me a joke"]
         completion_ids = self.client.generate(prompts, n=2, repetition_penalty=0.9, temperature=0.8, max_tokens=32)[
@@ -411,6 +437,31 @@ def multiply(a: int, b: int) -> int:
         decoded_prompt = tokenizer.decode(outputs["prompt_ids"][0])
         assert "Multiplies two integers." in decoded_prompt
 
+    def test_generate_with_token_ids(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.model_id)
+        prompts = ["Hello, AI!", "Tell me a joke"]
+        prompt_token_ids = tokenizer(prompts)["input_ids"]
+        outputs = self.client.generate(prompt_token_ids)
+        prompt_ids = outputs["prompt_ids"]
+        completion_ids = outputs["completion_ids"]
+
+        # Check that the outputs are lists
+        assert isinstance(prompt_ids, list)
+        assert isinstance(completion_ids, list)
+
+        # Check that the number of sequences is equal to the number of prompts
+        assert len(prompt_ids) == len(prompts)
+        assert len(completion_ids) == len(prompts)
+
+        # Check that prompt_ids match the input token IDs
+        assert prompt_ids == prompt_token_ids
+
+        # Check that the sequences are lists of integers
+        for seq in prompt_ids:
+            assert all(isinstance(tok, int) for tok in seq)
+        for seq in completion_ids:
+            assert all(isinstance(tok, int) for tok in seq)
+
     def test_generate_with_params(self):
         prompts = ["Hello, AI!", "Tell me a joke"]
         completion_ids = self.client.generate(prompts, n=2, repetition_penalty=0.9, temperature=0.8, max_tokens=32)[
@@ -536,6 +587,31 @@ def multiply(a: int, b: int) -> int:
         decoded_prompt = tokenizer.decode(outputs["prompt_ids"][0])
         assert "Multiplies two integers." in decoded_prompt
 
+    def test_generate_with_token_ids(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.model_id)
+        prompts = ["Hello, AI!", "Tell me a joke"]
+        prompt_token_ids = tokenizer(prompts)["input_ids"]
+        outputs = self.client.generate(prompt_token_ids)
+        prompt_ids = outputs["prompt_ids"]
+        completion_ids = outputs["completion_ids"]
+
+        # Check that the outputs are lists
+        assert isinstance(prompt_ids, list)
+        assert isinstance(completion_ids, list)
+
+        # Check that the number of sequences is equal to the number of prompts
+        assert len(prompt_ids) == len(prompts)
+        assert len(completion_ids) == len(prompts)
+
+        # Check that prompt_ids match the input token IDs
+        assert prompt_ids == prompt_token_ids
+
+        # Check that the sequences are lists of integers
+        for seq in prompt_ids:
+            assert all(isinstance(tok, int) for tok in seq)
+        for seq in completion_ids:
+            assert all(isinstance(tok, int) for tok in seq)
+
     def test_generate_with_params(self):
         prompts = ["Hello, AI!", "Tell me a joke"]
         completion_ids = self.client.generate(prompts, n=2, repetition_penalty=0.9, temperature=0.8, max_tokens=32)[
@@ -665,6 +741,31 @@ def multiply(a: int, b: int) -> int:
         decoded_prompt = tokenizer.decode(outputs["prompt_ids"][0])
         assert "Multiplies two integers." in decoded_prompt
 
+    def test_generate_with_token_ids(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.model_id)
+        prompts = ["Hello, AI!", "Tell me a joke"]
+        prompt_token_ids = tokenizer(prompts)["input_ids"]
+        outputs = self.client.generate(prompt_token_ids)
+        prompt_ids = outputs["prompt_ids"]
+        completion_ids = outputs["completion_ids"]
+
+        # Check that the outputs are lists
+        assert isinstance(prompt_ids, list)
+        assert isinstance(completion_ids, list)
+
+        # Check that the number of sequences is equal to the number of prompts
+        assert len(prompt_ids) == len(prompts)
+        assert len(completion_ids) == len(prompts)
+
+        # Check that prompt_ids match the input token IDs
+        assert prompt_ids == prompt_token_ids
+
+        # Check that the sequences are lists of integers
+        for seq in prompt_ids:
+            assert all(isinstance(tok, int) for tok in seq)
+        for seq in completion_ids:
+            assert all(isinstance(tok, int) for tok in seq)
+
     def test_generate_with_params(self):
         prompts = ["Hello, AI!", "Tell me a joke"]
         completion_ids = self.client.generate(prompts, n=2, repetition_penalty=0.9, temperature=0.8, max_tokens=32)[
@@ -774,3 +875,98 @@ def teardown_class(cls):
         # vLLM x pytest (or Popen) seems not to handle process termination well. To avoid zombie processes, we need to
         # kill the server process and its children explicitly.
         kill_process(cls.server_process)
+
+
+@pytest.mark.slow
+@require_vllm
+@require_vision
+class TestVLLMClientServerVLM(TrlTestCase):
+    model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
+
+    @classmethod
+    def setup_class(cls):
+        # Start the server process
+        cls.server_process = subprocess.Popen(
+            ["trl", "vllm-serve", "--model", cls.model_id], stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+
+        # Initialize the client (no communicator needed for generation-only tests)
+        cls.client = VLLMClient(connection_timeout=240, host="localhost")
+
+    def test_generate_with_token_ids_and_image(self):
+        from PIL import Image
+
+        processor = AutoProcessor.from_pretrained(self.model_id)
+        image1 = Image.new("RGB", (64, 64), color="red")
+        image2 = Image.new("RGB", (64, 64), color="blue")
+        image3 = Image.new("RGB", (64, 64), color="green")
+        messages = [
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "image": image1},
+                        {"type": "image", "image": image2},
+                        {"type": "text", "text": "What are the differences between these two images?"},
+                    ],
+                }
+            ],
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "image": image3},
+                        {"type": "text", "text": "What is the color of this image?"},
+                    ],
+                }
+            ],
+        ]
+        prompt_token_ids = processor.apply_chat_template(
+            conversation=messages, tokenize=True, add_generation_prompt=True
+        )
+        outputs = self.client.generate(prompt_token_ids, images=[[image1, image2], [image3]], max_tokens=64)
+        prompt_ids = outputs["prompt_ids"]
+        completion_ids = outputs["completion_ids"]
+
+        assert len(prompt_ids) == 2
+        assert len(completion_ids) == 2
+        assert all(isinstance(tok, int) for tok in prompt_ids[0])
+        assert all(isinstance(tok, int) for tok in completion_ids[0])
+
+    def test_generate_with_token_ids_mixed_images(self):
+        """Test a batch where one prompt has an image and the other does not."""
+        from PIL import Image
+
+        processor = AutoProcessor.from_pretrained(self.model_id)
+        image = Image.new("RGB", (64, 64), color="red")
+        messages = [
+            [
+                {
+                    "role": "user",
+                    "content": [{"type": "image", "image": image}, {"type": "text", "text": "Describe this image."}],
+                }
+            ],
+            [
+                {
+                    "role": "user",
+                    "content": [{"type": "text", "text": "What is 1+1?"}],
+                }
+            ],
+        ]
+        prompt_token_ids = processor.apply_chat_template(
+            conversation=messages, tokenize=True, add_generation_prompt=True
+        )
+        outputs = self.client.generate(prompt_token_ids, images=[[image], None], max_tokens=64)
+        prompt_ids = outputs["prompt_ids"]
+        completion_ids = outputs["completion_ids"]
+
+        assert len(prompt_ids) == 2
+        assert len(completion_ids) == 2
+        assert all(isinstance(tok, int) for tok in prompt_ids[0])
+        assert all(isinstance(tok, int) for tok in prompt_ids[1])
+        assert all(isinstance(tok, int) for tok in completion_ids[0])
+        assert all(isinstance(tok, int) for tok in completion_ids[1])
+
+    @classmethod
+    def teardown_class(cls):
+        kill_process(cls.server_process)
diff --git a/trl/experimental/online_dpo/online_dpo_trainer.py b/trl/experimental/online_dpo/online_dpo_trainer.py
@@ -750,7 +750,9 @@ def _generate_vllm_server(self, prompts, images=None):
             # prompt individually.
             ordered_set_of_prompts = all_prompts[:: self.num_generations]
             if has_images:
-                ordered_set_of_images = all_images[:: self.num_generations]
+                ordered_set_of_images = [
+                    [img] if img is not None else None for img in all_images[:: self.num_generations]
+                ]
             else:
                 ordered_set_of_images = None
             completion_ids = self.vllm_client.generate(

diff --git a/trl/generation/vllm_client.py b/trl/generation/vllm_client.py
@@ -201,7 +201,7 @@ def check_server(self, total_timeout: float = 0.0, retry_interval: float = 2.0):
 
     def generate(
         self,
-        prompts: list[str],
+        prompts: list[str] | list[list[int]],
         images: list | None = None,
         n: int = 1,
         repetition_penalty: float = 1.0,
@@ -219,10 +219,11 @@ def generate(
         Generates model completions for the provided prompts.
 
         Args:
-            prompts (`list[str]`):
-                List of text prompts for which the model will generate completions.
-            images (`list[PIL.Image]`, *optional*):
-                List of PIL Images to send along with the prompts.
+            prompts (`list[str]` or `list[list[int]]`):
+                List of text prompts or list of token ID lists for which the model will generate completions.
+            images (`list[list[PIL.Image] | None]`, *optional*):
+                List of image lists for VLM support. Each element is a list of PIL images for the corresponding prompt,
+                or `None` if that prompt has no images.
             n (`int`, *optional*, defaults to `1`):
                 Number of completions to generate for each prompt.
             repetition_penalty (`float`, *optional*, defaults to `1.0`):
@@ -265,8 +266,12 @@ def generate(
         """
         url = f"{self.base_url}/generate/"
 
-        # Convert PIL images to base64 strings
-        images = [pil_to_base64(img) for img in images] if images else None
+        # Convert PIL images to base64 strings. Each element is a list of images for the corresponding prompt,
+        # or None if that prompt has no images.
+        if images:
+            images = [
+                [pil_to_base64(img) for img in img_list] if img_list is not None else None for img_list in images
+            ]
 
         response = self.session.post(
             url,

diff --git a/trl/generation/vllm_generation.py b/trl/generation/vllm_generation.py
@@ -627,7 +627,8 @@ def generate(self, prompts: list, num_generations: int, profiler: ProfilingConte
                             chat_template=chat_template,
                         )
                     else:
-                        output = self.vllm_client.generate(prompts=ordered_set_of_prompts, **sampling_params)
+                        ordered_set_of_prompt_ids = self.processing_class(text=ordered_set_of_prompts)["input_ids"]
+                        output = self.vllm_client.generate(prompts=ordered_set_of_prompt_ids, **sampling_params)
                     # Extract required fields and collect any extra fields for reward functions
                     required_keys = {"prompt_ids", "completion_ids", "logprobs", "logprob_token_ids"}
                     extra_fields = {k: v for k, v in output.items() if k not in required_keys}