diff --git a/requirements/common.txt b/requirements/common.txt index 5c5df38f8e06..365e30ed4e44 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -31,7 +31,7 @@ partial-json-parser # used for parsing partial JSON outputs pyzmq >= 25.0.0 msgspec gguf >= 0.17.0 -mistral_common[image] >= 1.8.8 +mistral_common[image] >= 1.9.0 opencv-python-headless >= 4.13.0 # required for video IO pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 @@ -52,4 +52,4 @@ anthropic >= 0.71.0 model-hosting-container-standards >= 0.1.13, < 1.0.0 mcp grpcio -grpcio-reflection \ No newline at end of file +grpcio-reflection diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index 589067cb88b7..8dcbe2a719ac 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -23,7 +23,7 @@ jiwer # required for audio tests timm # required for internvl test transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test -mistral_common[image,audio] >= 1.8.8 # required for voxtral test +mistral_common[image,audio] >= 1.9.0 # required for voxtral test num2words # required for smolvlm test opencv-python-headless >= 4.13.0 # required for video test datamodel_code_generator # required for minicpm3 test diff --git a/requirements/test.in b/requirements/test.in index 73da100a6d10..3f2110f82d8c 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -30,7 +30,7 @@ torchaudio==2.9.1 torchvision==0.24.1 transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test -mistral_common[image,audio] >= 1.8.8 # required for voxtral test +mistral_common[image,audio] >= 1.9.0 # required for voxtral test num2words # required for smolvlm test open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py opencv-python-headless >= 4.13.0 # required for video test 
diff --git a/requirements/test.txt b/requirements/test.txt index 3dcb9918e651..d32cdc5df736 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -491,7 +491,7 @@ mbstrdecoder==1.1.3 # typepy mdurl==0.1.2 # via markdown-it-py -mistral-common==1.8.8 +mistral-common==1.9.0 # via -r requirements/test.in mlflow==2.22.0 # via terratorch diff --git a/tests/models/multimodal/generation/test_voxtral_streaming.py b/tests/models/multimodal/generation/test_voxtral_streaming.py index 5cdf6f171e08..41b9a683017b 100644 --- a/tests/models/multimodal/generation/test_voxtral_streaming.py +++ b/tests/models/multimodal/generation/test_voxtral_streaming.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import asyncio from dataclasses import asdict import pytest @@ -9,33 +10,71 @@ StreamingMode, TranscriptionRequest, ) +from mistral_common.tokens.tokenizers.audio import AudioConfig from mistral_common.tokens.tokenizers.mistral import MistralTokenizer from vllm import LLM, EngineArgs, SamplingParams from vllm.assets.audio import AudioAsset +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.inputs.data import TokensPrompt +from vllm.v1.engine.async_llm import AsyncLLM, StreamingInput +MODEL_NAME = "mistralai/Voxtral-Mini-3B-Realtime-2602" +ENGINE_CONFIG = dict( + model=MODEL_NAME, + max_model_len=8192, + max_num_seqs=4, + limit_mm_per_prompt={"audio": 1}, + config_format="mistral", + load_format="mistral", + tokenizer_mode="mistral", + enforce_eager=True, + gpu_memory_utilization=0.4, +) -def _get_engine(path: str) -> LLM: - engine_args = EngineArgs( - model=path, - max_model_len=8192, - max_num_seqs=1, - limit_mm_per_prompt={"audio": 1}, - config_format="mistral", - load_format="mistral", - tokenizer_mode="mistral", - enforce_eager=True, - gpu_memory_utilization=0.4, - ) + +EXPECTED_TEXT = [ + ( + " First words I spoke in the original phonograph. " + "A little piece of practical poetry. 
Mary had a little lamb," + " it sleeps with quite a snow, and everywhere that Mary went, " + "the lamb was sure to go." + ), + ( + " And the 0-1 pitch on the way to Edgar Martinez. Swung on" + " the line. Down the left field line for OBS. Here comes Joy. " + "Here is Junior to third base. They're going to wave him in. " + "The throw to the plate will be late. The Mariners are going" + " to play. For the American League Championship, " + "I don't believe it. It just continues. My, oh, my." + ), +] + + +@pytest.fixture +def audio_assets() -> list[AudioAsset]: + return [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] + + +@pytest.fixture +def tokenizer() -> MistralTokenizer: + return MistralTokenizer.from_hf_hub(MODEL_NAME) + + +@pytest.fixture +def engine() -> LLM: + engine_args = EngineArgs(**ENGINE_CONFIG) return LLM(**asdict(engine_args)) -@pytest.mark.skip(reason="Voxtral streaming is not yet public") -def test_voxtral_streaming_forward(): - audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] +@pytest.fixture +def async_engine() -> AsyncLLM: + engine_args = AsyncEngineArgs(**ENGINE_CONFIG) + return AsyncLLM.from_engine_args(engine_args) - model_name = "mistralai/Voxtral-Mini-3B-Realtime-2602" - tokenizer = MistralTokenizer.from_hf_hub(model_name) + +@pytest.mark.skip(reason="Voxtral streaming is not yet public") +def test_voxtral_streaming_forward(audio_assets, tokenizer, engine): audio_config = tokenizer.instruct_tokenizer.tokenizer.audio def from_file(file_path: str): @@ -58,11 +97,7 @@ def from_file(file_path: str): for tokens, audio_array in tokenized_list: num_samples = audio_array.shape[0] - max_tokens = ( - audio_config.num_audio_tokens(num_samples) - - audio_config.num_delay_tokens - - 1 - ) + max_tokens = audio_config.num_audio_tokens(num_samples) - len(tokens) - 1 sampling_params.append(SamplingParams(temperature=0.0, max_tokens=max_tokens)) input_dict = { @@ -71,27 +106,153 @@ def from_file(file_path: str): } 
inputs.append(input_dict) - llm = _get_engine(model_name) - outputs = llm.generate( + outputs = engine.generate( inputs, sampling_params=sampling_params, ) texts = [out.outputs[0].text for out in outputs] - expected = [ - ( - " First words I spoke in the original phonograph. " - "A little piece of practical poetry. Mary had a little lamb," - " it sleeps with quite a snow, and everywhere that Mary went, " - "the lamb was sure to go." - ), - ( - " And the 0-1 pitch on the way to Edgar Martinez. Swung on" - " the line. Down the left field line for OBS. Here comes Joy. " - "Here is Junior to third base. They're going to wave him in. " - "The throw to the plate will be late. The Mariners are going" - " to play. For the American League Championship, " - "I don't believe it. It just continues. My oh, my." - ), - ] - assert texts == expected + assert texts == EXPECTED_TEXT + + +class RealTimeAudioInput: + """ + This class is used to stream an audio file just as + if it would be streamed in real-time. 
+ """ + + def __init__(self, tokenizer: MistralTokenizer) -> None: + self._tokenizer = tokenizer + self._config: AudioConfig = ( + self._tokenizer.instruct_tokenizer.audio_encoder.audio_config + ) + + self._look_ahead_in_ms = self._config.streaming_look_ahead_ms + self._look_back_in_ms = self._config.streaming_look_back_ms + + self._sampling_rate = self._config.sampling_rate + + self._audio: Audio | None = None + + # mutable objects + self._start = 0 + + n_left_pad_samples = ( + self._config.raw_audio_length_per_tok * self._config.n_left_pad_tokens + ) + self._end = self.streaming_delay + n_left_pad_samples + self.streaming_size + self._queue: asyncio.Queue[StreamingInput | None] = asyncio.Queue() + + @classmethod + async def create(cls, audio: Audio, tokenizer: MistralTokenizer): + self = cls(tokenizer) + + # we're doing "OFFLINE" encoding here to right & left pad the audio since + # we have access to the whole audio + # if we'd do an actual online realtime streaming application we + # should instead pass `StreamingMode.ONLINE` + req = TranscriptionRequest( + streaming=StreamingMode.OFFLINE, + audio=RawAudio.from_audio(audio), + language=None, + ) + audio_enc = self._tokenizer.encode_transcription(req) + self._audio = audio_enc.audios[0] + + # add first request + await self.add_tokens(audio_enc.tokens) + + return self + + @property + def look_ahead(self) -> int: + return self._get_len_in_samples(self._look_ahead_in_ms) + + @property + def look_back(self) -> int: + return self._get_len_in_samples(self._look_back_in_ms) + + @property + def streaming_delay(self) -> int: + return self._get_len_in_samples(self._config.transcription_delay_ms) + + @property + def streaming_size(self) -> int: + stream_size_in_ms = 1000 / self._config.frame_rate + return self._get_len_in_samples(stream_size_in_ms) + + def _get_len_in_samples(self, len_in_ms: float) -> int: + _len_in_s = self._sampling_rate * len_in_ms / 1000 + assert _len_in_s.is_integer(), _len_in_s + len_in_s = 
int(_len_in_s) + + return len_in_s + + async def add_tokens(self, tokens: list[int]) -> None: + assert self._audio is not None + if self._start >= len(self._audio.audio_array): + self.stop() + return + + _end = self._end + self.look_ahead + _start = max(0, self._start - self.look_back) + + multi_modal_data = {"audio": (self._audio.audio_array[_start:_end], None)} + + prompt = TokensPrompt( + prompt_token_ids=tokens, multi_modal_data=multi_modal_data + ) + + await self._queue.put(StreamingInput(prompt)) + + # increase + self._start = self._end + self._end = self._end + self.streaming_size + + def stop(self): + self._queue.put_nowait(None) + + async def generator(self): + while (item := await self._queue.get()) is not None: + yield item + + +@pytest.mark.asyncio +@pytest.mark.skip(reason="Voxtral streaming is not yet public") +async def test_voxtral_streaming_generator(audio_assets, tokenizer, async_engine): + sampling_params = SamplingParams(temperature=0.0, max_tokens=1) + + output_tokens_list = [] + for i, audio_asset in enumerate(audio_assets): + output_tokens = [] + audio = Audio.from_file(audio_asset.get_local_path(), strict=False) + streaming_input = await RealTimeAudioInput.create( + audio=audio, tokenizer=tokenizer + ) + + request_id = f"session-{i}" + + async for resp in async_engine.generate( + prompt=streaming_input.generator(), + sampling_params=sampling_params, + request_id=request_id, + ): + tokens = resp.outputs[0].token_ids[-1:] + + output_tokens.extend(tokens) + await streaming_input.add_tokens(tokens) + + output_tokens_list.append(output_tokens) + + texts = [tokenizer.decode(output_tokens) for output_tokens in output_tokens_list] + + # 'true' streaming and 'offline' streaming differ a bit because log-mels are + # differently normalized + texts[0] = ( + texts[0] + .replace("He has f", "F") + .replace("its fleece was quite a slow", "it sleeps with quite a snow") + ) + texts[1] = texts[1].replace("a base hit", "OBS").replace("oh my", "oh, my") + + 
assert texts == EXPECTED_TEXT