diff --git a/requirements/common.txt b/requirements/common.txt index 5c5df38f8e06..365e30ed4e44 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -31,7 +31,7 @@ partial-json-parser # used for parsing partial JSON outputs pyzmq >= 25.0.0 msgspec gguf >= 0.17.0 -mistral_common[image] >= 1.8.8 +mistral_common[image] >= 1.9.0 opencv-python-headless >= 4.13.0 # required for video IO pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 @@ -52,4 +52,4 @@ anthropic >= 0.71.0 model-hosting-container-standards >= 0.1.13, < 1.0.0 mcp grpcio -grpcio-reflection \ No newline at end of file +grpcio-reflection diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index 589067cb88b7..8dcbe2a719ac 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -23,7 +23,7 @@ jiwer # required for audio tests timm # required for internvl test transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test -mistral_common[image,audio] >= 1.8.8 # required for voxtral test +mistral_common[image,audio] >= 1.9.0 # required for voxtral test num2words # required for smolvlm test opencv-python-headless >= 4.13.0 # required for video test datamodel_code_generator # required for minicpm3 test diff --git a/requirements/test.in b/requirements/test.in index 73da100a6d10..3f2110f82d8c 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -30,7 +30,7 @@ torchaudio==2.9.1 torchvision==0.24.1 transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test -mistral_common[image,audio] >= 1.8.8 # required for voxtral test +mistral_common[image,audio] >= 1.9.0 # required for voxtral test num2words # required for smolvlm test open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py opencv-python-headless >= 4.13.0 # required for video test 
diff --git a/requirements/test.txt b/requirements/test.txt index 3dcb9918e651..d32cdc5df736 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -491,7 +491,7 @@ mbstrdecoder==1.1.3 # typepy mdurl==0.1.2 # via markdown-it-py -mistral-common==1.8.8 +mistral-common==1.9.0 # via -r requirements/test.in mlflow==2.22.0 # via terratorch diff --git a/tests/models/multimodal/generation/test_voxtral_streaming.py b/tests/models/multimodal/generation/test_voxtral_streaming.py index 5cdf6f171e08..41b9a683017b 100644 --- a/tests/models/multimodal/generation/test_voxtral_streaming.py +++ b/tests/models/multimodal/generation/test_voxtral_streaming.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import asyncio from dataclasses import asdict import pytest @@ -9,33 +10,71 @@ StreamingMode, TranscriptionRequest, ) +from mistral_common.tokens.tokenizers.audio import AudioConfig from mistral_common.tokens.tokenizers.mistral import MistralTokenizer from vllm import LLM, EngineArgs, SamplingParams from vllm.assets.audio import AudioAsset +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.inputs.data import TokensPrompt +from vllm.v1.engine.async_llm import AsyncLLM, StreamingInput +MODEL_NAME = "mistralai/Voxtral-Mini-3B-Realtime-2602" +ENGINE_CONFIG = dict( + model=MODEL_NAME, + max_model_len=8192, + max_num_seqs=4, + limit_mm_per_prompt={"audio": 1}, + config_format="mistral", + load_format="mistral", + tokenizer_mode="mistral", + enforce_eager=True, + gpu_memory_utilization=0.4, +) -def _get_engine(path: str) -> LLM: - engine_args = EngineArgs( - model=path, - max_model_len=8192, - max_num_seqs=1, - limit_mm_per_prompt={"audio": 1}, - config_format="mistral", - load_format="mistral", - tokenizer_mode="mistral", - enforce_eager=True, - gpu_memory_utilization=0.4, - ) + +EXPECTED_TEXT = [ + ( + " First words I spoke in the original phonograph. " + "A little piece of practical poetry. 
Mary had a little lamb," + " it sleeps with quite a snow, and everywhere that Mary went, " + "the lamb was sure to go." + ), + ( + " And the 0-1 pitch on the way to Edgar Martinez. Swung on" + " the line. Down the left field line for OBS. Here comes Joy. " + "Here is Junior to third base. They're going to wave him in. " + "The throw to the plate will be late. The Mariners are going" + " to play. For the American League Championship, " + "I don't believe it. It just continues. My, oh, my." + ), +] + + +@pytest.fixture +def audio_assets() -> list[AudioAsset]: + return [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] + + +@pytest.fixture +def tokenizer() -> MistralTokenizer: + return MistralTokenizer.from_hf_hub(MODEL_NAME) + + +@pytest.fixture +def engine() -> LLM: + engine_args = EngineArgs(**ENGINE_CONFIG) return LLM(**asdict(engine_args)) -@pytest.mark.skip(reason="Voxtral streaming is not yet public") -def test_voxtral_streaming_forward(): - audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] +@pytest.fixture +def async_engine() -> AsyncLLM: + engine_args = AsyncEngineArgs(**ENGINE_CONFIG) + return AsyncLLM.from_engine_args(engine_args) - model_name = "mistralai/Voxtral-Mini-3B-Realtime-2602" - tokenizer = MistralTokenizer.from_hf_hub(model_name) + +@pytest.mark.skip(reason="Voxtral streaming is not yet public") +def test_voxtral_streaming_forward(audio_assets, tokenizer, engine): audio_config = tokenizer.instruct_tokenizer.tokenizer.audio def from_file(file_path: str): @@ -58,11 +97,7 @@ def from_file(file_path: str): for tokens, audio_array in tokenized_list: num_samples = audio_array.shape[0] - max_tokens = ( - audio_config.num_audio_tokens(num_samples) - - audio_config.num_delay_tokens - - 1 - ) + max_tokens = audio_config.num_audio_tokens(num_samples) - len(tokens) - 1 sampling_params.append(SamplingParams(temperature=0.0, max_tokens=max_tokens)) input_dict = { @@ -71,27 +106,153 @@ def from_file(file_path: str): } 
inputs.append(input_dict) - llm = _get_engine(model_name) - outputs = llm.generate( + outputs = engine.generate( inputs, sampling_params=sampling_params, ) texts = [out.outputs[0].text for out in outputs] - expected = [ - ( - " First words I spoke in the original phonograph. " - "A little piece of practical poetry. Mary had a little lamb," - " it sleeps with quite a snow, and everywhere that Mary went, " - "the lamb was sure to go." - ), - ( - " And the 0-1 pitch on the way to Edgar Martinez. Swung on" - " the line. Down the left field line for OBS. Here comes Joy. " - "Here is Junior to third base. They're going to wave him in. " - "The throw to the plate will be late. The Mariners are going" - " to play. For the American League Championship, " - "I don't believe it. It just continues. My oh, my." - ), - ] - assert texts == expected + assert texts == EXPECTED_TEXT + + +class RealTimeAudioInput: + """ + This class is used to stream an audio file just as + if it would be streamed in real-time. 
+ """ + + def __init__(self, tokenizer: MistralTokenizer) -> None: + self._tokenizer = tokenizer + self._config: AudioConfig = ( + self._tokenizer.instruct_tokenizer.audio_encoder.audio_config + ) + + self._look_ahead_in_ms = self._config.streaming_look_ahead_ms + self._look_back_in_ms = self._config.streaming_look_back_ms + + self._sampling_rate = self._config.sampling_rate + + self._audio: Audio | None = None + + # mutable objects + self._start = 0 + + n_left_pad_samples = ( + self._config.raw_audio_length_per_tok * self._config.n_left_pad_tokens + ) + self._end = self.streaming_delay + n_left_pad_samples + self.streaming_size + self._queue: asyncio.Queue[StreamingInput | None] = asyncio.Queue() + + @classmethod + async def create(cls, audio: Audio, tokenizer: MistralTokenizer): + self = cls(tokenizer) + + # we're doing "OFFLINE" encoding here to right & left pad the audio since + # we have access to the whole audio + # if we'd do an actual online realtime streaming application we + # should instead pass `StreamingMode.ONLINE` + req = TranscriptionRequest( + streaming=StreamingMode.OFFLINE, + audio=RawAudio.from_audio(audio), + language=None, + ) + audio_enc = self._tokenizer.encode_transcription(req) + self._audio = audio_enc.audios[0] + + # add first request + await self.add_tokens(audio_enc.tokens) + + return self + + @property + def look_ahead(self) -> int: + return self._get_len_in_samples(self._look_ahead_in_ms) + + @property + def look_back(self) -> int: + return self._get_len_in_samples(self._look_back_in_ms) + + @property + def streaming_delay(self) -> int: + return self._get_len_in_samples(self._config.transcription_delay_ms) + + @property + def streaming_size(self) -> int: + stream_size_in_ms = 1000 / self._config.frame_rate + return self._get_len_in_samples(stream_size_in_ms) + + def _get_len_in_samples(self, len_in_ms: float) -> int: + _len_in_s = self._sampling_rate * len_in_ms / 1000 + assert _len_in_s.is_integer(), _len_in_s + len_in_s = 
int(_len_in_s) + + return len_in_s + + async def add_tokens(self, tokens: list[int]) -> None: + assert self._audio is not None + if self._start >= len(self._audio.audio_array): + self.stop() + return + + _end = self._end + self.look_ahead + _start = max(0, self._start - self.look_back) + + multi_modal_data = {"audio": (self._audio.audio_array[_start:_end], None)} + + prompt = TokensPrompt( + prompt_token_ids=tokens, multi_modal_data=multi_modal_data + ) + + await self._queue.put(StreamingInput(prompt)) + + # increase + self._start = self._end + self._end = self._end + self.streaming_size + + def stop(self): + self._queue.put_nowait(None) + + async def generator(self): + while (item := await self._queue.get()) is not None: + yield item + + +@pytest.mark.asyncio +@pytest.mark.skip(reason="Voxtral streaming is not yet public") +async def test_voxtral_streaming_generator(audio_assets, tokenizer, async_engine): + sampling_params = SamplingParams(temperature=0.0, max_tokens=1) + + output_tokens_list = [] + for i, audio_asset in enumerate(audio_assets): + output_tokens = [] + audio = Audio.from_file(audio_asset.get_local_path(), strict=False) + streaming_input = await RealTimeAudioInput.create( + audio=audio, tokenizer=tokenizer + ) + + request_id = f"session-{i}" + + async for resp in async_engine.generate( + prompt=streaming_input.generator(), + sampling_params=sampling_params, + request_id=request_id, + ): + tokens = resp.outputs[0].token_ids[-1:] + + output_tokens.extend(tokens) + await streaming_input.add_tokens(tokens) + + output_tokens_list.append(output_tokens) + + texts = [tokenizer.decode(output_tokens) for output_tokens in output_tokens_list] + + # 'true' streaming and 'offline' streaming differ a bit because log-mels are + # differently normalized + texts[0] = ( + texts[0] + .replace("He has f", "F") + .replace("its fleece was quite a slow", "it sleeps with quite a snow") + ) + texts[1] = texts[1].replace("a base hit", "OBS").replace("oh my", "oh, my") + + 
assert texts == EXPECTED_TEXT