Merged
Changes from 7 commits

Commits (21)
08ab78e
update of beam search function
FerdinandZhong Oct 15, 2024
cac55e1
update of testing
FerdinandZhong Oct 16, 2024
358f89c
Merge remote-tracking branch 'upstream/main' into beam_search_multi_m…
FerdinandZhong Oct 16, 2024
2dde695
fix error in implementation
FerdinandZhong Oct 16, 2024
eb92b7d
add checking for logprobs and add more test cases
FerdinandZhong Oct 16, 2024
014d753
formatting
FerdinandZhong Oct 16, 2024
eae5b9b
Merge remote-tracking branch 'upstream/main' into beam_search_multi_m…
FerdinandZhong Oct 17, 2024
5f0e1cd
update BeamSequence, prompt preprocess and adding stop_reason
FerdinandZhong Oct 17, 2024
6e29318
Merge branch 'beam_search_multi_modality' of https://github.com/Ferdi…
FerdinandZhong Oct 17, 2024
5a256cb
fix the wrong declaration
FerdinandZhong Oct 17, 2024
b01a615
formatting
FerdinandZhong Oct 17, 2024
bc74931
Merge branch 'main' of https://github.com/vllm-project/vllm into beam…
FerdinandZhong Oct 18, 2024
8291a80
remove checking for logprobs
FerdinandZhong Oct 18, 2024
a682b63
format
FerdinandZhong Oct 18, 2024
8940743
Merge branch 'main' of https://github.com/vllm-project/vllm into beam…
FerdinandZhong Oct 18, 2024
bb53cbd
output beam's logprobs to Output's logprobs
FerdinandZhong Oct 18, 2024
c275ae3
Merge branch 'main' of https://github.com/vllm-project/vllm into beam…
FerdinandZhong Oct 19, 2024
3b7ab92
update calling of beam_search from serving_completion based on latest…
FerdinandZhong Oct 19, 2024
f96fa9a
Merge branch 'main' of github.com:vllm-project/vllm into beam_search_…
FerdinandZhong Oct 22, 2024
314a31e
Merge branch 'main' of https://github.com/vllm-project/vllm into beam…
FerdinandZhong Oct 28, 2024
8705266
Merge branch 'main' of https://github.com/vllm-project/vllm into beam…
FerdinandZhong Oct 29, 2024
82 changes: 82 additions & 0 deletions tests/entrypoints/openai/test_vision.py
@@ -105,6 +105,53 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
assert message.content is not None and len(message.content) >= 0


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI,
model_name: str,
image_url: str):
messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
{
"type": "text",
"text": "What's in this image?"
},
],
}]

chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
n=2,
max_tokens=10,
extra_body=dict(use_beam_search=True))
assert len(chat_completion.choices) == 2
assert chat_completion.choices[
0].message.content != chat_completion.choices[1].message.content

with pytest.raises(openai.BadRequestError) as exc_info:
await client.chat.completions.create(
model=model_name,
messages=messages,
n=2,
max_tokens=10,
logprobs=True,
top_logprobs=5,
extra_body=dict(use_beam_search=True))

# Assert that the exception message is correct
assert "Only the `cumulative_logprob` " in str(exc_info.value)


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@@ -160,6 +207,41 @@ async def test_single_chat_session_image_base64encoded(
assert message.content is not None and len(message.content) >= 0


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_base64encoded_beamsearch(
client: openai.AsyncOpenAI, model_name: str, image_url: str,
base64_encoded_image: Dict[str, str]):

messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url":
f"data:image/jpeg;base64,{base64_encoded_image[image_url]}"
}
},
{
"type": "text",
"text": "What's in this image?"
},
],
}]
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
n=2,
max_tokens=10,
extra_body=dict(use_beam_search=True))
assert len(chat_completion.choices) == 2
assert chat_completion.choices[
0].message.content != chat_completion.choices[1].message.content


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
1 change: 1 addition & 0 deletions vllm/beam_search.py
@@ -13,6 +13,7 @@ class BeamSearchSequence:
tokens: List[int]
cum_logprob: float = 0.0
text: Optional[str] = None
finish_reason: Optional[str] = None


@dataclass
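Note: the new finish_reason field lets a completed beam record why it stopped; the engine code in vllm/engine/protocol.py below sets it to "stop" when a beam hits EOS. A minimal illustrative sketch of constructing such a sequence — the token ids, log-probability, and text are made-up values, not outputs of this PR:

from vllm.beam_search import BeamSearchSequence

# Illustrative only: a beam that terminated on an EOS token.
finished_beam = BeamSearchSequence(
    tokens=[1, 887, 526, 263, 2],  # made-up token ids, ending in an assumed EOS id
    cum_logprob=-4.2,              # made-up cumulative log-probability
    text="You are a",              # beam_search() decodes and fills this in later
    finish_reason="stop",          # new field added by this change
)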
72 changes: 53 additions & 19 deletions vllm/engine/protocol.py
@@ -1,5 +1,6 @@
import asyncio
from abc import ABC, abstractmethod
from copy import deepcopy
from typing import AsyncGenerator, List, Mapping, Optional, Union

from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
@@ -69,24 +70,50 @@ async def beam_search(
ignore_eos = params.ignore_eos
temperature = params.temperature
length_penalty = params.length_penalty
include_stop_str_in_output = params.include_stop_str_in_output

tokenizer = await self.get_tokenizer(lora_request=None)
tokenizedPrompt = prompt if isinstance(
prompt, list) else tokenizer.encode(prompt)
tokenizedLength = len(tokenizedPrompt)

if isinstance(prompt, dict):
if "prompt" in prompt:
tokenized_prompt = tokenizer.encode(prompt.get("prompt"))
multi_modal_data = prompt.get("multi_modal_data")
mm_processor_kwargs = prompt.get("mm_processor_kwargs")
elif "prompt_token_ids" in prompt:
tokenized_prompt = prompt.get("prompt_token_ids")
multi_modal_data = prompt.get("multi_modal_data")
mm_processor_kwargs = prompt.get("mm_processor_kwargs")
else:
raise TypeError(
"Dictionary input must be a TextPrompt or TokensPrompt")
else:
tokenized_prompt = prompt if isinstance(
prompt, list) else tokenizer.encode(prompt)
multi_modal_data = None
mm_processor_kwargs = None

tokenized_length = len(tokenized_prompt)

sort_beams_key = create_sort_beams_key_function(
tokenizer.eos_token_id, length_penalty)

beam_search_params = SamplingParams(logprobs=2 * beam_width,
max_tokens=1,
temperature=temperature)
all_beams = [BeamSearchSequence(tokens=tokenizedPrompt, cum_logprob=0)]
beam_search_params = SamplingParams(
logprobs=2 * beam_width,
max_tokens=1,
temperature=temperature,
)
all_beams = [
BeamSearchSequence(tokens=tokenized_prompt, cum_logprob=0)
]
completed = []

for _ in range(max_tokens):
prompts_batch = [
TokensPrompt(prompt_token_ids=beam.tokens)
TokensPrompt(
prompt_token_ids=beam.tokens,
multi_modal_data=deepcopy(
multi_modal_data), # always the values from inputs
mm_processor_kwargs=deepcopy(mm_processor_kwargs))
for beam in all_beams
]

@@ -112,16 +139,23 @@ async def beam_search(
if result.outputs[0].logprobs is not None:
logprobs = result.outputs[0].logprobs[0]
for token_id, logprob_obj in logprobs.items():
new_beam = BeamSearchSequence(
tokens=current_beam.tokens + [token_id],
cum_logprob=current_beam.cum_logprob +
logprob_obj.logprob)

if token_id == tokenizer.eos_token_id and \
not ignore_eos:
completed.append(new_beam)
completed.append(
BeamSearchSequence(
tokens=current_beam.tokens +
[token_id] if include_stop_str_in_output
else current_beam.tokens, #
cum_logprob=current_beam.cum_logprob +
logprob_obj.logprob,
finish_reason="stop"))
else:
new_beams.append(new_beam)
new_beams.append(
BeamSearchSequence(
tokens=current_beam.tokens + [token_id], #
cum_logprob=current_beam.cum_logprob +
logprob_obj.logprob,
))

sorted_beams = sorted(new_beams, key=sort_beams_key, reverse=True)
all_beams = sorted_beams[:beam_width]
@@ -131,22 +165,22 @@ async def beam_search(
best_beams = sorted_completed[:beam_width]

for beam in best_beams:
beam.text = tokenizer.decode(beam.tokens[tokenizedLength:])
beam.text = tokenizer.decode(beam.tokens[tokenized_length:])

beam_search_output = RequestOutput(
request_id=request_id,
prompt=prompt,
prompt=tokenizer.decode(tokenized_prompt),
outputs=[
CompletionOutput(
text=beam.text,
cumulative_logprob=beam.cum_logprob,
token_ids=beam.tokens,
token_ids=beam.tokens[tokenized_length:],
index=i,
logprobs=beam.cum_logprob,
) for (i, beam) in enumerate(best_beams)
],
finished=True,
prompt_token_ids=tokenizedPrompt,
prompt_token_ids=tokenized_prompt,
prompt_logprobs=None)

yield beam_search_output
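Note: with the reworked prompt preprocessing, beam_search() accepts the same TextPrompt/TokensPrompt dict that the OpenAI serving layers build, and the multi-modal data is copied into every per-beam TokensPrompt. A rough sketch of the call shape under those assumptions — engine construction, image loading, and the exact chat template are elided, and the request id is arbitrary:

from PIL import Image

from vllm.sampling_params import BeamSearchParams


async def demo(engine_client):
    # engine_client is assumed to be any EngineClient implementation
    # (e.g. the async engine behind the OpenAI server); construction elided.
    prompt = {
        "prompt": "USER: <image>\nWhat's in this image? ASSISTANT:",  # assumed template
        "multi_modal_data": {"image": Image.open("example.jpg")},
    }
    params = BeamSearchParams(beam_width=2, max_tokens=10)
    async for request_output in engine_client.beam_search(prompt, "demo-request", params):
        for beam in request_output.outputs:
            print(beam.index, beam.cumulative_logprob, beam.text)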
8 changes: 6 additions & 2 deletions vllm/entrypoints/openai/protocol.py
@@ -302,7 +302,7 @@ def to_beam_search_params(self,
ignore_eos=self.ignore_eos,
temperature=temperature,
length_penalty=self.length_penalty,
)
include_stop_str_in_output=self.include_stop_str_in_output)

def to_sampling_params(self, default_max_tokens: int) -> SamplingParams:
max_tokens = self.max_tokens
@@ -400,6 +400,10 @@ def check_logprobs(cls, data):
raise ValueError(
"when using `top_logprobs`, `logprobs` must be set to true."
)
if data.get("logprobs") and data.get("use_beam_search"):
raise ValueError(
"Only the `cumulative_logprob` of each output will be returned."
)

return data

@@ -594,7 +598,7 @@ def to_beam_search_params(self,
ignore_eos=self.ignore_eos,
temperature=temperature,
length_penalty=self.length_penalty,
)
include_stop_str_in_output=self.include_stop_str_in_output)

def to_sampling_params(self, default_max_tokens: int) -> SamplingParams:
max_tokens = self.max_tokens
2 changes: 1 addition & 1 deletion vllm/entrypoints/openai/serving_chat.py
@@ -236,7 +236,7 @@ async def create_chat_completion(

if isinstance(sampling_params, BeamSearchParams):
result_generator = self.engine_client.beam_search(
engine_inputs['prompt_token_ids'],
engine_inputs,
request_id,
sampling_params,
)
2 changes: 1 addition & 1 deletion vllm/entrypoints/openai/serving_completion.py
@@ -150,7 +150,7 @@ async def create_completion(

if isinstance(sampling_params, BeamSearchParams):
generator = self.engine_client.beam_search(
prompt_inputs["prompt_token_ids"],
prompt_inputs,
request_id_item,
sampling_params,
)
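Note: because the completions path now forwards the full prompt_inputs dict as well, beam search keeps working through the OpenAI-compatible /v1/completions endpoint. A minimal client-side sketch, assuming a locally running vLLM server; the base URL and model name are placeholders:

import openai

client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="llava-hf/llava-1.5-7b-hf",  # placeholder; use whatever model the server serves
    prompt="The capital of France is",
    n=2,
    max_tokens=10,
    extra_body=dict(use_beam_search=True),  # routed to engine_client.beam_search()
)
print([choice.text for choice in completion.choices])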
1 change: 1 addition & 0 deletions vllm/sampling_params.py
@@ -489,3 +489,4 @@ class BeamSearchParams(
ignore_eos: bool = False
temperature: float = 0.0
length_penalty: float = 1.0
include_stop_str_in_output: bool = False
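Note: a minimal sketch of constructing the extended params object directly; beam_width and max_tokens are assumed to be the existing required fields above this hunk:

from vllm.sampling_params import BeamSearchParams

params = BeamSearchParams(
    beam_width=2,
    max_tokens=10,
    include_stop_str_in_output=True,  # new field: keep the stop token in the returned beams
)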