Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions requirements/common.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ partial-json-parser # used for parsing partial JSON outputs
pyzmq >= 25.0.0
msgspec
gguf >= 0.17.0
mistral_common[image] >= 1.8.8
mistral_common[image] >= 1.9.0
opencv-python-headless >= 4.13.0 # required for video IO
pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
Expand All @@ -52,4 +52,4 @@ anthropic >= 0.71.0
model-hosting-container-standards >= 0.1.13, < 1.0.0
mcp
grpcio
grpcio-reflection
grpcio-reflection
2 changes: 1 addition & 1 deletion requirements/nightly_torch_test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jiwer # required for audio tests
timm # required for internvl test
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.8.8 # required for voxtral test
mistral_common[image,audio] >= 1.9.0 # required for voxtral test
num2words # required for smolvlm test
opencv-python-headless >= 4.13.0 # required for video test
datamodel_code_generator # required for minicpm3 test
Expand Down
2 changes: 1 addition & 1 deletion requirements/test.in
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ torchaudio==2.9.1
torchvision==0.24.1
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.8.8 # required for voxtral test
mistral_common[image,audio] >= 1.9.0 # required for voxtral test
num2words # required for smolvlm test
open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py
opencv-python-headless >= 4.13.0 # required for video test
Expand Down
2 changes: 1 addition & 1 deletion requirements/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,7 @@ mbstrdecoder==1.1.3
# typepy
mdurl==0.1.2
# via markdown-it-py
mistral-common==1.8.8
mistral-common==1.9.0
# via -r requirements/test.in
mlflow==2.22.0
# via terratorch
Expand Down
243 changes: 202 additions & 41 deletions tests/models/multimodal/generation/test_voxtral_streaming.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
from dataclasses import asdict

import pytest
Expand All @@ -9,33 +10,71 @@
StreamingMode,
TranscriptionRequest,
)
from mistral_common.tokens.tokenizers.audio import AudioConfig
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.inputs.data import TokensPrompt
from vllm.v1.engine.async_llm import AsyncLLM, StreamingInput

MODEL_NAME = "mistralai/Voxtral-Mini-3B-Realtime-2602"
# Shared engine configuration, consumed by both the sync (`engine`) and the
# async (`async_engine`) fixtures below via EngineArgs/AsyncEngineArgs.
ENGINE_CONFIG = dict(
    model=MODEL_NAME,
    max_model_len=8192,
    max_num_seqs=4,
    limit_mm_per_prompt={"audio": 1},  # each request carries a single audio clip
    config_format="mistral",
    load_format="mistral",
    tokenizer_mode="mistral",
    enforce_eager=True,
    gpu_memory_utilization=0.4,
)

def _get_engine(path: str) -> LLM:
engine_args = EngineArgs(
model=path,
max_model_len=8192,
max_num_seqs=1,
limit_mm_per_prompt={"audio": 1},
config_format="mistral",
load_format="mistral",
tokenizer_mode="mistral",
enforce_eager=True,
gpu_memory_utilization=0.4,
)

# Reference transcriptions for the two `audio_assets` clips, in fixture order
# (mary_had_lamb, winning_call). Shared by both the offline-forward test and
# the streaming-generator test.
EXPECTED_TEXT = [
    (
        " First words I spoke in the original phonograph. "
        "A little piece of practical poetry. Mary had a little lamb,"
        " it sleeps with quite a snow, and everywhere that Mary went, "
        "the lamb was sure to go."
    ),
    (
        " And the 0-1 pitch on the way to Edgar Martinez. Swung on"
        " the line. Down the left field line for OBS. Here comes Joy. "
        "Here is Junior to third base. They're going to wave him in. "
        "The throw to the plate will be late. The Mariners are going"
        " to play. For the American League Championship, "
        "I don't believe it. It just continues. My, oh, my."
    ),
]


@pytest.fixture
def audio_assets() -> list[AudioAsset]:
    """Audio clips exercised by the transcription tests, in expected order."""
    asset_names = ("mary_had_lamb", "winning_call")
    return [AudioAsset(name) for name in asset_names]


@pytest.fixture
def tokenizer() -> MistralTokenizer:
    """Mistral tokenizer for MODEL_NAME, fetched from the HF hub."""
    mistral_tokenizer = MistralTokenizer.from_hf_hub(MODEL_NAME)
    return mistral_tokenizer


@pytest.fixture
def engine() -> LLM:
    """Synchronous LLM engine built from the shared ENGINE_CONFIG."""
    engine_kwargs = asdict(EngineArgs(**ENGINE_CONFIG))
    return LLM(**engine_kwargs)


@pytest.mark.skip(reason="Voxtral streaming is not yet public")
def test_voxtral_streaming_forward():
audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
@pytest.fixture
def async_engine() -> AsyncLLM:
    """Asynchronous engine built from the shared ENGINE_CONFIG."""
    args = AsyncEngineArgs(**ENGINE_CONFIG)
    return AsyncLLM.from_engine_args(args)

model_name = "mistralai/Voxtral-Mini-3B-Realtime-2602"
tokenizer = MistralTokenizer.from_hf_hub(model_name)

@pytest.mark.skip(reason="Voxtral streaming is not yet public")
def test_voxtral_streaming_forward(audio_assets, tokenizer, engine):
audio_config = tokenizer.instruct_tokenizer.tokenizer.audio

def from_file(file_path: str):
Expand All @@ -58,11 +97,7 @@ def from_file(file_path: str):

for tokens, audio_array in tokenized_list:
num_samples = audio_array.shape[0]
max_tokens = (
audio_config.num_audio_tokens(num_samples)
- audio_config.num_delay_tokens
- 1
)
max_tokens = audio_config.num_audio_tokens(num_samples) - len(tokens) - 1
sampling_params.append(SamplingParams(temperature=0.0, max_tokens=max_tokens))

input_dict = {
Expand All @@ -71,27 +106,153 @@ def from_file(file_path: str):
}
inputs.append(input_dict)

llm = _get_engine(model_name)
outputs = llm.generate(
outputs = engine.generate(
inputs,
sampling_params=sampling_params,
)

texts = [out.outputs[0].text for out in outputs]
expected = [
(
" First words I spoke in the original phonograph. "
"A little piece of practical poetry. Mary had a little lamb,"
" it sleeps with quite a snow, and everywhere that Mary went, "
"the lamb was sure to go."
),
(
" And the 0-1 pitch on the way to Edgar Martinez. Swung on"
" the line. Down the left field line for OBS. Here comes Joy. "
"Here is Junior to third base. They're going to wave him in. "
"The throw to the plate will be late. The Mariners are going"
" to play. For the American League Championship, "
"I don't believe it. It just continues. My oh, my."
),
]
assert texts == expected
assert texts == EXPECTED_TEXT


class RealTimeAudioInput:
    """
    This class is used to stream an audio file just as
    if it would be streamed in real-time.

    The audio is cut into fixed-size chunks (one per decoded token) and fed
    to the engine through an ``asyncio.Queue``; ``generator()`` yields those
    chunks until ``stop()`` enqueues a ``None`` sentinel.
    """

    def __init__(self, tokenizer: MistralTokenizer) -> None:
        # The tokenizer's audio config determines chunk sizing and the
        # look-ahead / look-back context windows.
        self._tokenizer = tokenizer
        self._config: AudioConfig = (
            self._tokenizer.instruct_tokenizer.audio_encoder.audio_config
        )

        # Window sizes are configured in milliseconds; they are converted to
        # sample counts lazily via the `look_ahead` / `look_back` properties.
        self._look_ahead_in_ms = self._config.streaming_look_ahead_ms
        self._look_back_in_ms = self._config.streaming_look_back_ms

        self._sampling_rate = self._config.sampling_rate

        # Full (padded) audio clip; populated by the `create()` factory.
        self._audio: Audio | None = None

        # mutable objects
        self._start = 0

        # The first chunk additionally covers the transcription delay plus
        # the left-padding prepended by the tokenizer.
        n_left_pad_samples = (
            self._config.raw_audio_length_per_tok * self._config.n_left_pad_tokens
        )
        self._end = self.streaming_delay + n_left_pad_samples + self.streaming_size
        self._queue: asyncio.Queue[StreamingInput | None] = asyncio.Queue()

    @classmethod
    async def create(cls, audio: Audio, tokenizer: MistralTokenizer):
        """Async factory: encode `audio` offline and enqueue the first chunk."""
        self = cls(tokenizer)

        # we're doing "OFFLINE" encoding here to right & left pad the audio since
        # we have access to the whole audio
        # if we'd do an actual online realtime streaming application we
        # should instead pass `StreamingMode.ONLINE`
        req = TranscriptionRequest(
            streaming=StreamingMode.OFFLINE,
            audio=RawAudio.from_audio(audio),
            language=None,
        )
        audio_enc = self._tokenizer.encode_transcription(req)
        self._audio = audio_enc.audios[0]

        # add first request
        await self.add_tokens(audio_enc.tokens)

        return self

    @property
    def look_ahead(self) -> int:
        # Number of future samples appended after the chunk end.
        return self._get_len_in_samples(self._look_ahead_in_ms)

    @property
    def look_back(self) -> int:
        # Number of past samples re-sent before the chunk start.
        return self._get_len_in_samples(self._look_back_in_ms)

    @property
    def streaming_delay(self) -> int:
        # Initial delay (in samples) before the first transcribed token.
        return self._get_len_in_samples(self._config.transcription_delay_ms)

    @property
    def streaming_size(self) -> int:
        # Samples per streamed chunk: one frame, i.e. 1000 ms / frame_rate.
        stream_size_in_ms = 1000 / self._config.frame_rate
        return self._get_len_in_samples(stream_size_in_ms)

    def _get_len_in_samples(self, len_in_ms: float) -> int:
        """Convert a duration in milliseconds to a whole number of samples.

        NOTE(review): despite the `_in_s` local names, the value is a sample
        count, not seconds.
        """
        _len_in_s = self._sampling_rate * len_in_ms / 1000
        # The config is expected to yield integral sample counts.
        assert _len_in_s.is_integer(), _len_in_s
        len_in_s = int(_len_in_s)

        return len_in_s

    async def add_tokens(self, tokens: list[int]) -> None:
        """Enqueue the next audio chunk along with all tokens decoded so far.

        Called once per generated token; terminates the stream via `stop()`
        once the audio array is exhausted.
        """
        assert self._audio is not None
        if self._start >= len(self._audio.audio_array):
            self.stop()
            return

        # Widen the raw [_start, _end) window by the context margins.
        _end = self._end + self.look_ahead
        _start = max(0, self._start - self.look_back)

        multi_modal_data = {"audio": (self._audio.audio_array[_start:_end], None)}

        prompt = TokensPrompt(
            prompt_token_ids=tokens, multi_modal_data=multi_modal_data
        )

        await self._queue.put(StreamingInput(prompt))

        # increase
        self._start = self._end
        self._end = self._end + self.streaming_size

    def stop(self):
        # `None` is the sentinel that ends `generator()`.
        self._queue.put_nowait(None)

    async def generator(self):
        """Yield queued `StreamingInput`s until the `None` sentinel arrives."""
        while (item := await self._queue.get()) is not None:
            yield item


@pytest.mark.asyncio
@pytest.mark.skip(reason="Voxtral streaming is not yet public")
async def test_voxtral_streaming_generator(audio_assets, tokenizer, async_engine):
    """End-to-end streaming transcription: audio is fed chunk-by-chunk through
    `RealTimeAudioInput`, decoding exactly one token per chunk."""
    # One token per engine step; each decoded token schedules the next chunk.
    sampling_params = SamplingParams(temperature=0.0, max_tokens=1)

    output_tokens_list = []
    for i, audio_asset in enumerate(audio_assets):
        output_tokens = []
        audio = Audio.from_file(audio_asset.get_local_path(), strict=False)
        streaming_input = await RealTimeAudioInput.create(
            audio=audio, tokenizer=tokenizer
        )

        request_id = f"session-{i}"

        async for resp in async_engine.generate(
            prompt=streaming_input.generator(),
            sampling_params=sampling_params,
            request_id=request_id,
        ):
            # Keep only the newest token and feed it back so the input source
            # enqueues the next audio chunk.
            tokens = resp.outputs[0].token_ids[-1:]

            output_tokens.extend(tokens)
            await streaming_input.add_tokens(tokens)

        output_tokens_list.append(output_tokens)

    texts = [tokenizer.decode(output_tokens) for output_tokens in output_tokens_list]

    # 'true' streaming and 'offline' streaming differ a bit because log-mels are
    # differently normalized
    texts[0] = (
        texts[0]
        .replace("He has f", "F")
        .replace("its fleece was quite a slow", "it sleeps with quite a snow")
    )
    texts[1] = texts[1].replace("a base hit", "OBS").replace("oh my", "oh, my")

    assert texts == EXPECTED_TEXT