From e7b07213f143c3666929350c773b369e8f40aa06 Mon Sep 17 00:00:00 2001 From: SoluMilken Date: Mon, 25 May 2026 23:59:36 +0800 Subject: [PATCH 1/4] Enable Gemma4 audio transcription endpoint Co-authored-by: OpenAI Codex Signed-off-by: SoluMilken --- docs/models/supported_models.md | 1 + .../test_gemma4_transcription.py | 47 +++++++++++++ vllm/model_executor/models/gemma4_mm.py | 69 ++++++++++++++++++- 3 files changed, 115 insertions(+), 2 deletions(-) create mode 100644 tests/model_executor/test_gemma4_transcription.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 599da3c6b5e8..63c419d23947 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -698,6 +698,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition. | `FireRedLIDForConditionalGeneration` | FireRedLID | `PatchyTisa/FireRedLID-vllm`, etc. | | | | `FunASRForConditionalGeneration` | FunASR | `allendou/Fun-ASR-Nano-2512-vllm`, etc. | | | | `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | +| `Gemma4ForConditionalGeneration` | Gemma 4 | `google/gemma-4-E2B-it`, `google/gemma-4-E4B-it`, etc. | | ✅︎ | | `GlmAsrForConditionalGeneration` | GLM-ASR | `zai-org/GLM-ASR-Nano-2512` | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-4.0-1b-speech`, `ibm-granite/granite-speech-3.3-2b`, etc. | ✅︎ | ✅︎ | | `Qwen3ASRForConditionalGeneration` | Qwen3-ASR | `Qwen/Qwen3-ASR-1.7B`, etc. 
| ✅︎ | ✅︎ | diff --git a/tests/model_executor/test_gemma4_transcription.py b/tests/model_executor/test_gemma4_transcription.py new file mode 100644 index 000000000000..32349d7c7a6e --- /dev/null +++ b/tests/model_executor/test_gemma4_transcription.py @@ -0,0 +1,47 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import cast + +import numpy as np + +from vllm.config.model import ModelConfig +from vllm.config.speech_to_text import SpeechToTextConfig, SpeechToTextParams +from vllm.model_executor.models.gemma4_mm import Gemma4ForConditionalGeneration + + +def _make_stt_params( + *, + language: str | None = "en", + task_type: str = "transcribe", + to_language: str | None = None, +) -> SpeechToTextParams: + return SpeechToTextParams( + audio=np.zeros(1600, dtype=np.float32), + stt_config=SpeechToTextConfig(sample_rate=16000), + model_config=cast(ModelConfig, object()), + language=language, + task_type=task_type, + to_language=to_language, + ) + + +def test_gemma4_transcription_prompt_uses_audio_token(): + prompt = Gemma4ForConditionalGeneration.get_generation_prompt(_make_stt_params()) + + assert prompt["prompt"] == ( + "<|turn>user\n" + "Transcribe this audio into English: <|audio|>\n" + "<|turn>model\n" + ) + assert prompt["multi_modal_data"]["audio"][1] == 16000 + + +def test_gemma4_translation_prompt_includes_source_and_target_language(): + prompt = Gemma4ForConditionalGeneration.get_generation_prompt( + _make_stt_params(task_type="translate", language="it", to_language="en") + ) + + assert ( + "Translate this audio from Italian into English: <|audio|>" in prompt["prompt"] + ) diff --git a/vllm/model_executor/models/gemma4_mm.py b/vllm/model_executor/models/gemma4_mm.py index b546040b7414..e121c39fb508 100644 --- a/vllm/model_executor/models/gemma4_mm.py +++ b/vllm/model_executor/models/gemma4_mm.py @@ -33,14 +33,16 @@ Gemma4TextConfig, ) -from vllm.config import VllmConfig +from 
vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions -from vllm.inputs import MultiModalDataDict +from vllm.config.speech_to_text import SpeechToTextParams +from vllm.inputs import MultiModalDataDict, PromptType, TextPrompt from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.models.gemma4 import Gemma4ForCausalLM from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.model_executor.models.whisper import ISO639_1_SUPPORTED_LANGS from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import ( MultiModalFieldConfig, @@ -63,6 +65,7 @@ ) from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.processor import cached_processor_from_config from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import ( @@ -71,6 +74,7 @@ SupportsLoRA, SupportsMultiModal, SupportsPP, + SupportsTranscription, ) from .utils import ( AutoWeightsLoader, @@ -920,7 +924,10 @@ class Gemma4ForConditionalGeneration( SupportsPP, SupportsLoRA, SupportsEagle3, + SupportsTranscription, ): + supported_languages = ISO639_1_SUPPORTED_LANGS + packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -1599,3 +1606,61 @@ def get_placeholder_str(cls, modality: str, i: int) -> str | None: if modality == "video": return "<|video|>" raise ValueError(f"Unsupported modality: {modality}") + + @classmethod + def get_generation_prompt(cls, stt_params: SpeechToTextParams) -> PromptType: + audio = stt_params.audio + stt_config = stt_params.stt_config + language = stt_params.language + task_type = stt_params.task_type + to_language = stt_params.to_language + + prompt = "<|turn>user\n" + prompt += "Transcribe" if task_type == "transcribe" else "Translate" + prompt += " this audio" + + 
full_lang_name = cls.supported_languages.get(language, "") + full_lang_name_to = cls.supported_languages.get(to_language, "") + + if task_type == "transcribe" and full_lang_name: + prompt += f" into {full_lang_name}" + elif task_type == "translate": + if full_lang_name: + prompt += f" from {full_lang_name}" + if full_lang_name_to: + prompt += f" into {full_lang_name_to}" + + prompt += ": <|audio|>\n<|turn>model\n" + + return TextPrompt( + prompt=prompt, + multi_modal_data={"audio": (audio, stt_config.sample_rate)}, + ) + + @classmethod + def get_speech_to_text_config( + cls, model_config: ModelConfig, task_type: str + ) -> SpeechToTextConfig: + processor = cached_processor_from_config(model_config) + feature_extractor = processor.feature_extractor + max_audio_clip_s = math.floor( + processor.audio_seq_length * processor.audio_ms_per_token / 1000 + ) + return SpeechToTextConfig( + max_audio_clip_s=max_audio_clip_s, + sample_rate=feature_extractor.sampling_rate, + min_energy_split_window_size=None, + ) + + @classmethod + def get_num_audio_tokens( + cls, + audio_duration_s: float, + stt_config: SpeechToTextConfig, + model_config: ModelConfig, + ) -> int | None: + processor = cached_processor_from_config(model_config) + num_audio_tokens = math.ceil( + audio_duration_s * 1000 / processor.audio_ms_per_token + ) + return min(num_audio_tokens, processor.audio_seq_length) + 2 From 801f880ceef8fb240137724cbfee7dd2ebff7827 Mon Sep 17 00:00:00 2001 From: SoluMilken Date: Tue, 26 May 2026 23:31:32 +0800 Subject: [PATCH 2/4] Add end-to-end integration test for Gemma4 Signed-off-by: SoluMilken --- .../transcription/test_transcription_validation.py | 3 ++- .../translation/test_translation_validation.py | 11 ++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/entrypoints/speech_to_text/transcription/test_transcription_validation.py b/tests/entrypoints/speech_to_text/transcription/test_transcription_validation.py index 5ea218406b98..45cef3b6a89c 100644 
--- a/tests/entrypoints/speech_to_text/transcription/test_transcription_validation.py +++ b/tests/entrypoints/speech_to_text/transcription/test_transcription_validation.py @@ -131,7 +131,8 @@ async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention): @pytest.mark.asyncio @pytest.mark.parametrize( - "model_name", ["google/gemma-3n-E2B-it", "Qwen/Qwen3-ASR-0.6B"] + "model_name", + ["google/gemma-3n-E2B-it", "google/gemma-4-E2B-it", "Qwen/Qwen3-ASR-0.6B"], ) async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name): # Gemma accuracy on some of the audio samples we use is particularly bad, diff --git a/tests/entrypoints/speech_to_text/translation/test_translation_validation.py b/tests/entrypoints/speech_to_text/translation/test_translation_validation.py index ed3cff5f1c22..99bbe88b0f96 100644 --- a/tests/entrypoints/speech_to_text/translation/test_translation_validation.py +++ b/tests/entrypoints/speech_to_text/translation/test_translation_validation.py @@ -59,7 +59,12 @@ def _get_server_args(attention_config): @pytest.fixture( - scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"] + scope="module", + params=[ + "openai/whisper-small", + "google/gemma-3n-E2B-it", + "google/gemma-4-E2B-it", + ], ) def server(request): # Parametrize over model name @@ -261,8 +266,8 @@ async def test_stream_options(foscolo, server): @pytest.mark.asyncio async def test_long_audio_request(foscolo, client_and_model): client, model_name = client_and_model - if model_name == "google/gemma-3n-E2B-it": - pytest.skip("Gemma3n does not support long audio requests") + if model_name in ("google/gemma-3n-E2B-it", "google/gemma-4-E2B-it"): + pytest.skip(f"{model_name} does not support audio chunking in vLLM yet") foscolo.seek(0) audio, sr = load_audio(foscolo) repeated_audio = np.tile(audio, 2) From f81edc44590ee61ae7c36ddb603df0d4caf9baf1 Mon Sep 17 00:00:00 2001 From: SoluMilken Date: Wed, 27 May 2026 00:19:04 +0800 Subject: [PATCH 3/4] 
Merge unit tests into existing test file Signed-off-by: SoluMilken --- .../test_gemma4_transcription.py | 47 ------------------- .../multimodal/processing/test_gemma4.py | 44 +++++++++++++++++ 2 files changed, 44 insertions(+), 47 deletions(-) delete mode 100644 tests/model_executor/test_gemma4_transcription.py diff --git a/tests/model_executor/test_gemma4_transcription.py b/tests/model_executor/test_gemma4_transcription.py deleted file mode 100644 index 32349d7c7a6e..000000000000 --- a/tests/model_executor/test_gemma4_transcription.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import cast - -import numpy as np - -from vllm.config.model import ModelConfig -from vllm.config.speech_to_text import SpeechToTextConfig, SpeechToTextParams -from vllm.model_executor.models.gemma4_mm import Gemma4ForConditionalGeneration - - -def _make_stt_params( - *, - language: str | None = "en", - task_type: str = "transcribe", - to_language: str | None = None, -) -> SpeechToTextParams: - return SpeechToTextParams( - audio=np.zeros(1600, dtype=np.float32), - stt_config=SpeechToTextConfig(sample_rate=16000), - model_config=cast(ModelConfig, object()), - language=language, - task_type=task_type, - to_language=to_language, - ) - - -def test_gemma4_transcription_prompt_uses_audio_token(): - prompt = Gemma4ForConditionalGeneration.get_generation_prompt(_make_stt_params()) - - assert prompt["prompt"] == ( - "<|turn>user\n" - "Transcribe this audio into English: <|audio|>\n" - "<|turn>model\n" - ) - assert prompt["multi_modal_data"]["audio"][1] == 16000 - - -def test_gemma4_translation_prompt_includes_source_and_target_language(): - prompt = Gemma4ForConditionalGeneration.get_generation_prompt( - _make_stt_params(task_type="translate", language="it", to_language="en") - ) - - assert ( - "Translate this audio from Italian into English: <|audio|>" in prompt["prompt"] - ) diff --git
a/tests/models/multimodal/processing/test_gemma4.py b/tests/models/multimodal/processing/test_gemma4.py index a355501fdd80..5188b4da9a6b 100644 --- a/tests/models/multimodal/processing/test_gemma4.py +++ b/tests/models/multimodal/processing/test_gemma4.py @@ -2,11 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping +from typing import cast +import numpy as np import pytest import torch from PIL import Image as PILImage +from vllm.config.model import ModelConfig +from vllm.config.speech_to_text import SpeechToTextConfig, SpeechToTextParams from vllm.model_executor.models.gemma4_mm import ( Gemma4ForConditionalGeneration, Gemma4ImagePixelInputs, @@ -285,3 +289,43 @@ def test_encoder_chunk_no_free_memory_falls_back_to_one(): ) == 1 ) + + +# --- STT prompt generation --- + + +def _make_stt_params( + *, + language: str | None = "en", + task_type: str = "transcribe", + to_language: str | None = None, +) -> SpeechToTextParams: + return SpeechToTextParams( + audio=np.zeros(1600, dtype=np.float32), + stt_config=SpeechToTextConfig(sample_rate=16000), + model_config=cast(ModelConfig, object()), + language=language, + task_type=task_type, + to_language=to_language, + ) + + +def test_gemma4_transcription_prompt_uses_audio_token(): + prompt = Gemma4ForConditionalGeneration.get_generation_prompt(_make_stt_params()) + + assert prompt["prompt"] == ( + "<|turn>user\n" + "Transcribe this audio into English: <|audio|>\n" + "<|turn>model\n" + ) + assert prompt["multi_modal_data"]["audio"][1] == 16000 + + +def test_gemma4_translation_prompt_includes_source_and_target_language(): + prompt = Gemma4ForConditionalGeneration.get_generation_prompt( + _make_stt_params(task_type="translate", language="it", to_language="en") + ) + + assert ( + "Translate this audio from Italian into English: <|audio|>" in prompt["prompt"] + ) From 24652b44feb55d0287953db5a43c8382871d1862 Mon Sep 17 00:00:00 2001 From: SoluMilken Date: Wed, 27 May 2026 
11:47:18 +0800 Subject: [PATCH 4/4] Relax Foscolo transcription assertion for Gemma4 Signed-off-by: SoluMilken --- .../transcription/test_transcription_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/entrypoints/speech_to_text/transcription/test_transcription_validation.py b/tests/entrypoints/speech_to_text/transcription/test_transcription_validation.py index 45cef3b6a89c..f0897d6e8d83 100644 --- a/tests/entrypoints/speech_to_text/transcription/test_transcription_validation.py +++ b/tests/entrypoints/speech_to_text/transcription/test_transcription_validation.py @@ -153,5 +153,5 @@ async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name) model_name, foscolo, language="it", - expected_text="ove il mio corpo fanciulletto", + expected_text="ove il mio corpo", )