From 6136ec9c4e8191342a8c7e6a4ba147cda5b1c132 Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Thu, 5 Feb 2026 04:36:38 -0800 Subject: [PATCH 01/11] update vllm miltimodal for api calls convenience Signed-off-by: George Zelenfroind --- .../inference/model/vllm_multimodal.py | 219 ++++++++++++------ 1 file changed, 148 insertions(+), 71 deletions(-) diff --git a/nemo_skills/inference/model/vllm_multimodal.py b/nemo_skills/inference/model/vllm_multimodal.py index c2285b0ccc..482e638cdc 100644 --- a/nemo_skills/inference/model/vllm_multimodal.py +++ b/nemo_skills/inference/model/vllm_multimodal.py @@ -17,6 +17,7 @@ This module provides a multimodal model class that handles: - Audio INPUT: encoding audio files to base64, chunking long audio - Audio OUTPUT: saving audio responses from the server to disk +- External API support: NVIDIA Inference API, OpenAI, and other OpenAI-compatible APIs """ import base64 @@ -32,6 +33,7 @@ audio_file_to_base64, chunk_audio, load_audio_file, + make_audio_content_block, save_audio_chunk_to_base64, ) from .vllm import VLLMModel @@ -43,10 +45,10 @@ class VLLMMultimodalModel(VLLMModel): - """VLLMModel with support for audio input and output. + """VLLMModel with support for audio input/output and external APIs. Audio INPUT capabilities: - 1. Converts audio file paths to base64-encoded audio_url format + 1. Converts audio file paths to base64-encoded input_audio format 2. Chunks long audio files for models with duration limits 3. Aggregates results from chunked audio processing @@ -54,29 +56,55 @@ class VLLMMultimodalModel(VLLMModel): 1. Saves audio responses from the server to disk / output_dir/audio/ 2. Replaces the base64 data with the file path in the result + External API support: + - Automatically detects external APIs (any non-localhost URL) + - Smart API key handling from environment variables + - Skips vLLM-specific params for external APIs + + Example usage: + # Local vLLM server + model = VLLMMultimodalModel(model="Qwen/Qwen2-Audio-7B") + + # NVIDIA Inference API + model = VLLMMultimodalModel( + model="gcp/google/gemini-2.5-pro", + base_url="https://inference-api.nvidia.com/v1" + ) """ def __init__( self, + model: str | None = None, + base_url: str | None = None, enable_audio_chunking: bool = True, audio_chunk_task_types: list[str] | None = None, chunk_audio_threshold_sec: int = 30, + audio_format: str = "input_audio", **kwargs, ): - """Initialize VLLMMultimodalModel with audio I/O support. + """Initialize VLLMMultimodalModel with audio I/O and external API support. Args: + model: Model name (e.g., "Qwen/Qwen2-Audio-7B" for local, "gcp/google/gemini-2.5-pro" for NVIDIA API). + base_url: API base URL. If None, defaults to local server. For external APIs, provide the full URL. enable_audio_chunking: Master switch for audio chunking. audio_chunk_task_types: If None, chunk all task types; if specified, only chunk these. chunk_audio_threshold_sec: Audio duration threshold for chunking (in seconds). + audio_format: Format for audio content (must be "input_audio"). **kwargs: Other parameters passed to VLLMModel/BaseModel. """ - super().__init__(**kwargs) + # Determine if this is an external API (non-local URL) + self._external_api_mode = not self._is_local_url(base_url) + + super().__init__(model=model, base_url=base_url, **kwargs) # Audio INPUT config self.enable_audio_chunking = enable_audio_chunking self.audio_chunk_task_types = audio_chunk_task_types self.chunk_audio_threshold_sec = chunk_audio_threshold_sec + if audio_format != "input_audio": + raise ValueError(f"Unsupported audio_format '{audio_format}'. Only 'input_audio' is supported.") + self.audio_format = audio_format # Audio OUTPUT config self.output_audio_dir = None @@ -85,6 +113,92 @@ def __init__( os.makedirs(self.output_audio_dir, exist_ok=True) LOG.info(f"Audio responses will be saved to: {self.output_audio_dir}") + def _is_local_url(self, base_url: str | None) -> bool: + """Check if the base_url points to a local server. + + Args: + base_url: API base URL. + + Returns: + True if local server, False otherwise. + """ + if not base_url: + return True # No URL means local server (will use default host:port) + local_patterns = ["127.0.0.1", "localhost", "0.0.0.0"] + return any(pattern in base_url for pattern in local_patterns) + + def _get_api_key(self, api_key: str | None, api_key_env_var: str | None, base_url: str) -> str | None: + """Get API key with smart detection for external APIs. + + Checks for API keys in the following order: + 1. Explicit api_key argument + 2. Environment variable specified by api_key_env_var + 3. Auto-detect based on base_url (NVIDIA_API_KEY, OPENAI_API_KEY, etc.) + + Args: + api_key: Explicit API key. + api_key_env_var: Environment variable name for API key. + base_url: API base URL for auto-detection. + + Returns: + API key string or None. + """ + # First, try parent class logic (explicit key or env var) + api_key = super()._get_api_key(api_key, api_key_env_var, base_url) + + if api_key is not None: + return api_key + + # Auto-detect API key based on base_url + if base_url: + if "api.nvidia.com" in base_url or "inference-api.nvidia.com" in base_url: + api_key = os.getenv("NVIDIA_API_KEY") + if not api_key: + raise ValueError( + "NVIDIA_API_KEY is required for NVIDIA APIs and could not be found. " + "Set NVIDIA_API_KEY environment variable or pass api_key explicitly." + ) + return api_key + + if "api.openai.com" in base_url: + api_key = os.getenv("OPENAI_API_KEY") + if not api_key: + raise ValueError( + "OPENAI_API_KEY is required for OpenAI APIs and could not be found. " + "Set OPENAI_API_KEY environment variable or pass api_key explicitly." + ) + return api_key + + if "generativelanguage.googleapis.com" in base_url: + api_key = os.getenv("GOOGLE_API_KEY") + if not api_key: + raise ValueError( + "GOOGLE_API_KEY is required for Google APIs and could not be found. " + "Set GOOGLE_API_KEY environment variable or pass api_key explicitly." + ) + return api_key + + return api_key + + def _build_request_body(self, top_k, min_p, repetition_penalty, extra_body: dict = None): + """Build request body, skipping vLLM-specific params for external APIs. + + Args: + top_k: Top-k sampling parameter. + min_p: Min-p sampling parameter. + repetition_penalty: Repetition penalty parameter. + extra_body: Additional parameters to include. + + Returns: + Dictionary of extra body parameters for the request. + """ + # For external APIs, skip vLLM-specific parameters + if self._external_api_mode: + return extra_body or {} + + # For local vLLM server, use full parameter set + return super()._build_request_body(top_k, min_p, repetition_penalty, extra_body=extra_body) + def _parse_chat_completion_response(self, response, include_response: bool = False, **kwargs) -> dict: """Parse chat completion response and save any audio to disk.""" result = super()._parse_chat_completion_response(response, include_response=include_response, **kwargs) @@ -158,62 +272,46 @@ def content_text_to_list(self, message: dict) -> dict: """Convert message content with audio to proper list format. Handles 'audio' or 'audios' keys in messages and converts them to - base64-encoded audio_url content items. + base64-encoded input_audio content items. - CRITICAL: Audio must come BEFORE text for Qwen models to transcribe correctly. + CRITICAL: Audio must come BEFORE text for models to process correctly. Args: message: Message dict that may contain 'audio' or 'audios' fields. Returns: - Message dict with content converted to list format including audio. + New message dict with content converted to list format including audio. """ if "audio" not in message and "audios" not in message: return message - content = message.get("content", "") + result = copy.deepcopy(message) + + if "content" not in result: + raise KeyError("Missing required 'content' in message") + content = result["content"] if isinstance(content, str): - message["content"] = [{"type": "text", "text": content}] - elif isinstance(content, list): - message["content"] = content - else: + result["content"] = [{"type": "text", "text": content}] + elif not isinstance(content, list): raise TypeError(f"Unexpected content type: {type(content)}") audio_items = [] - if "audio" in message: - audio = message["audio"] + if "audio" in result: + audio = result.pop("audio") audio_path = os.path.join(self.data_dir, audio["path"]) base64_audio = audio_file_to_base64(audio_path) - audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}} - audio_items.append(audio_message) - del message["audio"] # Remove original audio field after conversion - elif "audios" in message: - for audio in message["audios"]: + audio_items.append(make_audio_content_block(base64_audio, self.audio_format)) + elif "audios" in result: + for audio in result.pop("audios"): audio_path = os.path.join(self.data_dir, audio["path"]) base64_audio = audio_file_to_base64(audio_path) - audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}} - audio_items.append(audio_message) - del message["audios"] # Remove original audios field after conversion + audio_items.append(make_audio_content_block(base64_audio, self.audio_format)) - # Insert audio items at the BEGINNING of content list (before text) if audio_items: - message["content"] = audio_items + message["content"] - - return message - - def _preprocess_messages_for_model(self, messages: list[dict]) -> list[dict]: - """Preprocess messages - creates copies to avoid mutation. + result["content"] = audio_items + result["content"] - Note: /no_think suffix is passed through unchanged (handled by the model). - - Args: - messages: List of message dicts. - - Returns: - Copy of message dicts. - """ - return [copy.deepcopy(msg) for msg in messages] + return result def _needs_audio_chunking(self, messages: list[dict], task_type: str = None) -> tuple[bool, str, float]: """Check if audio in messages needs chunking. @@ -235,11 +333,14 @@ def _needs_audio_chunking(self, messages: list[dict], task_type: str = None) -> # Find audio in messages for msg in messages: - if msg.get("role") == "user": - audio_info = msg.get("audio") - if not audio_info: - audios = msg.get("audios", []) + if msg["role"] == "user": + if "audio" in msg: + audio_info = msg["audio"] + elif "audios" in msg: + audios = msg["audios"] audio_info = audios[0] if audios else {} + else: + continue if audio_info and "path" in audio_info: audio_path = os.path.join(self.data_dir, audio_info["path"]) @@ -300,16 +401,16 @@ async def _generate_with_chunking( if msg_copy["role"] == "user" and ("audio" in msg_copy or "audios" in msg_copy): chunk_base64 = save_audio_chunk_to_base64(audio_chunk, sampling_rate) - content = msg_copy.get("content", "") + if "content" not in msg_copy: + raise KeyError("Missing required 'content' in message") + content = msg_copy["content"] if isinstance(content, str): text_content = [{"type": "text", "text": content}] else: text_content = content # Add audio chunk at the beginning (before text) - msg_copy["content"] = [ - {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{chunk_base64}"}} - ] + text_content + msg_copy["content"] = [make_audio_content_block(chunk_base64, self.audio_format)] + text_content # Remove original audio fields msg_copy.pop("audio", None) @@ -317,9 +418,6 @@ async def _generate_with_chunking( chunk_messages.append(msg_copy) - # Preprocess messages (strip /no_think for Qwen) - chunk_messages = self._preprocess_messages_for_model(chunk_messages) - # Generate for this chunk using parent's generate_async result = await super().generate_async( prompt=chunk_messages, tokens_to_generate=tokens_to_generate, **kwargs @@ -379,28 +477,7 @@ async def generate_async( return await self._generate_with_chunking(messages, audio_path, duration, tokens_to_generate, **kwargs) # No chunking needed - convert audio fields to base64 format - messages = [self.content_text_to_list(copy.deepcopy(msg)) for msg in messages] - messages = self._preprocess_messages_for_model(messages) - prompt = messages + prompt = [self.content_text_to_list(msg) for msg in messages] # Call parent's generate_async (which handles audio OUTPUT via _parse_chat_completion_response) return await super().generate_async(prompt=prompt, tokens_to_generate=tokens_to_generate, **kwargs) - - def _build_chat_request_params( - self, - messages: list[dict], - **kwargs, - ) -> dict: - """Build chat request parameters with audio preprocessing. - - Args: - messages: List of message dicts. - **kwargs: Additional parameters for the request. - - Returns: - Request parameters dict. - """ - # content_text_to_list THEN preprocess - messages = [self.content_text_to_list(copy.deepcopy(msg)) for msg in messages] - messages = self._preprocess_messages_for_model(messages) - return super()._build_chat_request_params(messages=messages, **kwargs) From 7695ab39ae2bc3c0176e0349ab215231ec10fc2c Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Thu, 5 Feb 2026 05:20:36 -0800 Subject: [PATCH 02/11] Update to contribution, tests for input audio format Signed-off-by: George Zelenfroind --- nemo_skills/inference/generate.py | 2 +- nemo_skills/inference/model/audio_utils.py | 4 +- .../inference/model/vllm_multimodal.py | 49 +++++++------------ tests/test_vllm_audio.py | 38 ++++++++++++-- 4 files changed, 57 insertions(+), 36 deletions(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index e05a532793..94c610a22d 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -207,7 +207,7 @@ class GenerationTaskConfig: enable_litellm_cache: bool = False # List of content types to drop from messages (e.g., base64 audio) to keep output files smaller - drop_content_types: list[str] = field(default_factory=lambda: ["audio_url"]) + drop_content_types: list[str] = field(default_factory=lambda: ["audio_url", "input_audio"]) # Audio configuration - set by benchmarks that need audio processing (mmau-pro, audiobench, etc.) enable_audio: bool = False # Enable audio preprocessing (set by benchmark configs) diff --git a/nemo_skills/inference/model/audio_utils.py b/nemo_skills/inference/model/audio_utils.py index 02c8eaf459..d4b4c44dbd 100644 --- a/nemo_skills/inference/model/audio_utils.py +++ b/nemo_skills/inference/model/audio_utils.py @@ -145,6 +145,8 @@ def make_audio_content_block(base64_audio: str, audio_format: str = "audio_url") if audio_format == "input_audio": # OpenAI native format (works with NVIDIA API / Gemini / Azure) return {"type": "input_audio", "input_audio": {"data": base64_audio, "format": "wav"}} - else: + elif audio_format == "audio_url": # Data URI format (works with vLLM / Qwen) return {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}} + else: + raise ValueError(f"Unsupported audio_format '{audio_format}'. Use 'audio_url' or 'input_audio'.") diff --git a/nemo_skills/inference/model/vllm_multimodal.py b/nemo_skills/inference/model/vllm_multimodal.py index 482e638cdc..478ab83908 100644 --- a/nemo_skills/inference/model/vllm_multimodal.py +++ b/nemo_skills/inference/model/vllm_multimodal.py @@ -56,10 +56,7 @@ class VLLMMultimodalModel(VLLMModel): 1. Saves audio responses from the server to disk / output_dir/audio/ 2. Replaces the base64 data with the file path in the result - External API support: - - Automatically detects external APIs (any non-localhost URL) - - Smart API key handling from environment variables - - Skips vLLM-specific params for external APIs + Also supports external APIs (NVIDIA, OpenAI) via base_url parameter. Example usage: # Local vLLM server @@ -90,20 +87,19 @@ def __init__( enable_audio_chunking: Master switch for audio chunking. audio_chunk_task_types: If None, chunk all task types; if specified, only chunk these. chunk_audio_threshold_sec: Audio duration threshold for chunking (in seconds). - audio_format: Format for audio content (must be "input_audio"). + audio_format: Format for audio content ("audio_url" or "input_audio"). **kwargs: Other parameters passed to VLLMModel/BaseModel. """ - # Determine if this is an external API (non-local URL) - self._external_api_mode = not self._is_local_url(base_url) - super().__init__(model=model, base_url=base_url, **kwargs) + # Determine if this is an external API (non-local URL) + self._external_api_mode = not self._is_local_url(self.base_url) + # Audio INPUT config self.enable_audio_chunking = enable_audio_chunking self.audio_chunk_task_types = audio_chunk_task_types self.chunk_audio_threshold_sec = chunk_audio_threshold_sec - if audio_format != "input_audio": - raise ValueError(f"Unsupported audio_format '{audio_format}'. Only 'input_audio' is supported.") + self.audio_format = audio_format # Audio OUTPUT config @@ -207,12 +203,9 @@ def _parse_chat_completion_response(self, response, include_response: bool = Fal if "generation" in result and result["generation"]: match = DEBUG_INFO_PATTERN.search(result["generation"]) if match: - try: - result["debug_info"] = json.loads(match.group(1)) - # Strip debug_info from generation - result["generation"] = DEBUG_INFO_PATTERN.sub("", result["generation"]) - except json.JSONDecodeError: - LOG.warning("Failed to parse debug_info JSON from content") + result["debug_info"] = json.loads(match.group(1)) + # Strip debug_info from generation + result["generation"] = DEBUG_INFO_PATTERN.sub("", result["generation"]) choice = response.choices[0] if hasattr(choice.message, "audio") and choice.message.audio: @@ -245,20 +238,16 @@ def _process_audio_response(self, audio_data, response_id: str) -> dict: return audio_info if self.output_audio_dir: - try: - audio_bytes = base64.b64decode(audio_base64) - filename = f"{response_id}.wav" - filepath = os.path.join(self.output_audio_dir, filename) - - with open(filepath, "wb") as f: - f.write(audio_bytes) - - audio_info["path"] = filepath - audio_info["size_bytes"] = len(audio_bytes) - LOG.info(f"Saved audio: {filepath} ({len(audio_bytes)} bytes)") - except Exception as e: - LOG.warning(f"Failed to save audio: {e}") - audio_info["data"] = audio_base64 + audio_bytes = base64.b64decode(audio_base64) + filename = f"{response_id}.wav" + filepath = os.path.join(self.output_audio_dir, filename) + + with open(filepath, "wb") as f: + f.write(audio_bytes) + + audio_info["path"] = filepath + audio_info["size_bytes"] = len(audio_bytes) + LOG.info(f"Saved audio: {filepath} ({len(audio_bytes)} bytes)") else: audio_info["data"] = audio_base64 diff --git a/tests/test_vllm_audio.py b/tests/test_vllm_audio.py index 0c8ca1b89a..ca7fba0331 100644 --- a/tests/test_vllm_audio.py +++ b/tests/test_vllm_audio.py @@ -42,6 +42,15 @@ def test_audio_file_to_base64(): os.unlink(temp_path) +def _is_valid_audio_content(content_item: dict) -> bool: + """Check if content item is a valid audio block (either format).""" + if content_item.get("type") == "audio_url": + return content_item.get("audio_url", {}).get("url", "").startswith("data:audio/wav;base64,") + elif content_item.get("type") == "input_audio": + return "data" in content_item.get("input_audio", {}) + return False + + @pytest.fixture def mock_vllm_multimodal_model(tmp_path): """Create a mock VLLMMultimodalModel for testing audio preprocessing.""" @@ -53,6 +62,7 @@ def mock_vllm_multimodal_model(tmp_path): model.enable_audio_chunking = True model.audio_chunk_task_types = None model.chunk_audio_threshold_sec = 30 + model.audio_format = "audio_url" # Test audio_url format (for vLLM/Qwen) model._tunnel = None return model @@ -72,8 +82,28 @@ def test_content_text_to_list_with_audio(mock_vllm_multimodal_model, tmp_path): assert isinstance(result["content"], list) assert len(result["content"]) == 2 - assert result["content"][0]["type"] == "audio_url" - assert result["content"][0]["audio_url"]["url"].startswith("data:audio/wav;base64,") + assert _is_valid_audio_content(result["content"][0]) + assert result["content"][1]["type"] == "text" + + +def test_content_text_to_list_with_input_audio_format(mock_vllm_multimodal_model, tmp_path): + """Test audio conversion with input_audio format (OpenAI native).""" + audio_path = tmp_path / "test.wav" + with open(audio_path, "wb") as f: + f.write(b"RIFF" + b"\x00" * 100) + + # Switch to input_audio format + mock_vllm_multimodal_model.audio_format = "input_audio" + + message = {"role": "user", "content": "Describe this audio", "audio": {"path": "test.wav"}} + result = mock_vllm_multimodal_model.content_text_to_list(message) + + assert isinstance(result["content"], list) + assert len(result["content"]) == 2 + # Verify input_audio format structure + assert result["content"][0]["type"] == "input_audio" + assert "data" in result["content"][0]["input_audio"] + assert result["content"][0]["input_audio"]["format"] == "wav" assert result["content"][1]["type"] == "text" @@ -100,8 +130,8 @@ def test_content_text_to_list_with_multiple_audios(mock_vllm_multimodal_model, t assert isinstance(result["content"], list) assert len(result["content"]) == 3 # Audio MUST come before text for Qwen Audio - assert result["content"][0]["type"] == "audio_url" - assert result["content"][1]["type"] == "audio_url" + assert _is_valid_audio_content(result["content"][0]) + assert _is_valid_audio_content(result["content"][1]) assert result["content"][2]["type"] == "text" From 8a3fb5dbd2f0f237ce0867f0607237308af757fa Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Thu, 5 Feb 2026 05:24:13 -0800 Subject: [PATCH 03/11] handling of topk top p params Signed-off-by: George Zelenfroind --- .../inference/model/vllm_multimodal.py | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/nemo_skills/inference/model/vllm_multimodal.py b/nemo_skills/inference/model/vllm_multimodal.py index 478ab83908..01997df7a7 100644 --- a/nemo_skills/inference/model/vllm_multimodal.py +++ b/nemo_skills/inference/model/vllm_multimodal.py @@ -180,16 +180,32 @@ def _build_request_body(self, top_k, min_p, repetition_penalty, extra_body: dict """Build request body, skipping vLLM-specific params for external APIs. Args: - top_k: Top-k sampling parameter. - min_p: Min-p sampling parameter. - repetition_penalty: Repetition penalty parameter. + top_k: Top-k sampling parameter (vLLM, default -1). + min_p: Min-p sampling parameter (vLLM, default 0.0). + repetition_penalty: Repetition penalty parameter (vLLM, default 1.0). extra_body: Additional parameters to include. Returns: Dictionary of extra body parameters for the request. + + Raises: + ValueError: If vLLM-specific params are set to non-default values in external API mode. """ - # For external APIs, skip vLLM-specific parameters + # For external APIs, fail if user explicitly set vLLM-specific parameters if self._external_api_mode: + non_default_params = [] + if top_k != -1: + non_default_params.append(f"top_k={top_k}") + if min_p != 0.0: + non_default_params.append(f"min_p={min_p}") + if repetition_penalty != 1.0: + non_default_params.append(f"repetition_penalty={repetition_penalty}") + + if non_default_params: + raise ValueError( + f"vLLM-specific parameters are not supported for external APIs: {', '.join(non_default_params)}. " + "These parameters only work with local vLLM servers." + ) return extra_body or {} # For local vLLM server, use full parameter set From 7d4a5ccfc6155f872a2f1ac363f07f8be3e02bad Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Thu, 5 Feb 2026 05:32:57 -0800 Subject: [PATCH 04/11] adress comments Signed-off-by: George Zelenfroind --- .../inference/model/vllm_multimodal.py | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/nemo_skills/inference/model/vllm_multimodal.py b/nemo_skills/inference/model/vllm_multimodal.py index 01997df7a7..1f57f92fa1 100644 --- a/nemo_skills/inference/model/vllm_multimodal.py +++ b/nemo_skills/inference/model/vllm_multimodal.py @@ -100,6 +100,8 @@ def __init__( self.audio_chunk_task_types = audio_chunk_task_types self.chunk_audio_threshold_sec = chunk_audio_threshold_sec + if audio_format not in ("audio_url", "input_audio"): + raise ValueError(f"Unsupported audio_format '{audio_format}'. Use 'audio_url' or 'input_audio'.") self.audio_format = audio_format # Audio OUTPUT config @@ -273,6 +275,19 @@ def _process_audio_response(self, audio_data, response_id: str) -> dict: # Audio INPUT methods # ===================== + def _preprocess_messages_for_model(self, messages: list[dict]) -> list[dict]: + """Preprocess messages - creates copies to avoid mutation. + + Note: /no_think suffix is passed through unchanged (handled by the model). + + Args: + messages: List of message dicts. + + Returns: + Copy of message dicts. + """ + return [copy.deepcopy(msg) for msg in messages] + def content_text_to_list(self, message: dict) -> dict: """Convert message content with audio to proper list format. @@ -423,6 +438,9 @@ async def _generate_with_chunking( chunk_messages.append(msg_copy) + # Preprocess messages before sending to model + chunk_messages = self._preprocess_messages_for_model(chunk_messages) + # Generate for this chunk using parent's generate_async result = await super().generate_async( prompt=chunk_messages, tokens_to_generate=tokens_to_generate, **kwargs @@ -482,7 +500,9 @@ async def generate_async( return await self._generate_with_chunking(messages, audio_path, duration, tokens_to_generate, **kwargs) # No chunking needed - convert audio fields to base64 format - prompt = [self.content_text_to_list(msg) for msg in messages] + messages = [self.content_text_to_list(copy.deepcopy(msg)) for msg in messages] + messages = self._preprocess_messages_for_model(messages) + prompt = messages # Call parent's generate_async (which handles audio OUTPUT via _parse_chat_completion_response) return await super().generate_async(prompt=prompt, tokens_to_generate=tokens_to_generate, **kwargs) From e7d0d92d49bac7e2dbddbf4f764fe1489da504b6 Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Thu, 5 Feb 2026 05:54:07 -0800 Subject: [PATCH 05/11] add multimodal to allowerd Signed-off-by: George Zelenfroind --- .gitignore | 3 +++ nemo_skills/pipeline/utils/server.py | 7 +++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 1f602ec71f..8f3a307e62 100644 --- a/.gitignore +++ b/.gitignore @@ -48,3 +48,6 @@ CLAUDE.md .codex .idea + +#scripts at root level +/*.sh diff --git a/nemo_skills/pipeline/utils/server.py b/nemo_skills/pipeline/utils/server.py index 87abca4a99..9fb84a680f 100644 --- a/nemo_skills/pipeline/utils/server.py +++ b/nemo_skills/pipeline/utils/server.py @@ -24,6 +24,7 @@ class SupportedServersSelfHosted(str, Enum): trtllm = "trtllm" vllm = "vllm" + vllm_multimodal = "vllm_multimodal" sglang = "sglang" megatron = "megatron" generic = "generic" @@ -32,6 +33,7 @@ class SupportedServersSelfHosted(str, Enum): class SupportedServers(str, Enum): trtllm = "trtllm" vllm = "vllm" + vllm_multimodal = "vllm_multimodal" sglang = "sglang" megatron = "megatron" openai = "openai" @@ -125,7 +127,7 @@ def get_server_command( # check if the model path is mounted if not vllm, sglang, or trtllm; # vllm, sglang, trtllm can also pass model name as "model_path" so we need special processing - if server_type not in ["vllm", "sglang", "trtllm", "generic"]: + if server_type not in ["vllm", "vllm_multimodal", "sglang", "trtllm", "generic"]: check_if_mounted(cluster_config, model_path) # the model path will be mounted, so generally it will start with / @@ -158,7 +160,8 @@ def get_server_command( f" --micro-batch-size 1 " # that's a training argument, ignored here, but required to specify.. f" {server_args} " ) - elif server_type == "vllm": + elif server_type in ["vllm", "vllm_multimodal"]: + # vllm_multimodal uses the same vLLM server; multimodal handling is on the client side server_entrypoint = server_entrypoint or "-m nemo_skills.inference.server.serve_vllm" start_vllm_cmd = ( f"python3 {server_entrypoint} " From 0363027a7863c1eec86e4398248314663e94cc10 Mon Sep 17 00:00:00 2001 From: mmkrtchyan Date: Thu, 19 Feb 2026 15:43:11 +0400 Subject: [PATCH 06/11] adding nvidia inference api test Signed-off-by: mmkrtchyan --- tests/test_nvidia_inference_api.py | 134 +++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 tests/test_nvidia_inference_api.py diff --git a/tests/test_nvidia_inference_api.py b/tests/test_nvidia_inference_api.py new file mode 100644 index 0000000000..86befccda1 --- /dev/null +++ b/tests/test_nvidia_inference_api.py @@ -0,0 +1,134 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Integration tests for NVIDIA Inference API with VLLMMultimodalModel (audio input).""" + +import asyncio +import os +from pathlib import Path + +import pytest + +from nemo_skills.inference.model.vllm_multimodal import VLLMMultimodalModel + +NVIDIA_BASE_URL = "https://inference-api.nvidia.com/v1" +MODEL = "gcp/google/gemini-2.5-flash-lite" + +TEST_AUDIO_DIR = Path(__file__).parent / "slurm-tests" / "asr_nim" / "wavs" +TEST_AUDIO_T2 = TEST_AUDIO_DIR / "t2_16.wav" # "sample 2 this is a test of text to speech synthesis" +TEST_AUDIO_T3 = TEST_AUDIO_DIR / "t3_16.wav" # "sample 3 hello how are you today" + +requires_nvidia_api_key = pytest.mark.skipif( + not os.getenv("NVIDIA_API_KEY"), + reason="NVIDIA_API_KEY environment variable not set", +) + +requires_test_audio = pytest.mark.skipif( + not TEST_AUDIO_T2.exists() or not TEST_AUDIO_T3.exists(), + reason="Test audio files not found at tests/slurm-tests/asr_nim/wavs/", +) + + +@requires_nvidia_api_key +def test_nvidia_api_text_only(): + """Smoke test: text-only chat completion via NVIDIA Inference API.""" + model = VLLMMultimodalModel( + model=MODEL, + base_url=NVIDIA_BASE_URL, + audio_format="input_audio", + ) + + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello! Can you help me?"}, + ] + + result = asyncio.run( + model.generate_async( + prompt=messages, + tokens_to_generate=1024, + temperature=0.7, + ) + ) + + assert "generation" in result + assert len(result["generation"]) > 0 + print(f"[text-only] generation: {result['generation'][:200]}") + + +@requires_nvidia_api_key +@requires_test_audio +def test_nvidia_api_audio_input(): + model = VLLMMultimodalModel( + model=MODEL, + base_url=NVIDIA_BASE_URL, + audio_format="input_audio", + data_dir=str(TEST_AUDIO_DIR), + ) + + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": "What do you hear in this audio? Describe it briefly.", + "audio": {"path": "t2_16.wav"}, + }, + ] + + result = asyncio.run( + model.generate_async( + prompt=messages, + tokens_to_generate=1024, + temperature=0.7, + ) + ) + + assert "generation" in result + assert len(result["generation"]) > 0 + print(f"[audio-input] generation: {result['generation'][:300]}") + + +@requires_nvidia_api_key +@requires_test_audio +def test_nvidia_api_audio_with_transcription_prompt(): + """Integration test: ask the model to transcribe audio content.""" + model = VLLMMultimodalModel( + model=MODEL, + base_url=NVIDIA_BASE_URL, + audio_format="input_audio", + data_dir=str(TEST_AUDIO_DIR), + ) + + messages = [ + {"role": "system", "content": "You are a helpful assistant that can listen to audio."}, + { + "role": "user", + "content": "Please listen to this audio and tell me what you hear.", + "audio": {"path": "t3_16.wav"}, + }, + ] + + result = asyncio.run( + model.generate_async( + prompt=messages, + tokens_to_generate=1024, + temperature=0.7, + ) + ) + + assert "generation" in result + assert len(result["generation"]) > 0 + assert result.get("num_generated_tokens", 0) > 0 + print(f"[transcription] generation: {result['generation'][:300]}") + print(f"[transcription] tokens: {result.get('num_generated_tokens')}") From 12a76c947f1d1a82b233224103e220714a0c055d Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Thu, 19 Feb 2026 04:16:35 -0800 Subject: [PATCH 07/11] update tests and adress comments Signed-off-by: George Zelenfroind --- .../inference/model/vllm_multimodal.py | 39 ++++++++++++------- tests/test_vllm_audio.py | 23 ++++++++--- 2 files changed, 42 insertions(+), 20 deletions(-) diff --git a/nemo_skills/inference/model/vllm_multimodal.py b/nemo_skills/inference/model/vllm_multimodal.py index 1f57f92fa1..80a27bc6fb 100644 --- a/nemo_skills/inference/model/vllm_multimodal.py +++ b/nemo_skills/inference/model/vllm_multimodal.py @@ -76,7 +76,7 @@ def __init__( enable_audio_chunking: bool = True, audio_chunk_task_types: list[str] | None = None, chunk_audio_threshold_sec: int = 30, - audio_format: str = "input_audio", + audio_format: str | None = None, **kwargs, ): """Initialize VLLMMultimodalModel with audio I/O and external API support. @@ -87,7 +87,7 @@ def __init__( enable_audio_chunking: Master switch for audio chunking. audio_chunk_task_types: If None, chunk all task types; if specified, only chunk these. chunk_audio_threshold_sec: Audio duration threshold for chunking (in seconds). - audio_format: Format for audio content ("audio_url" or "input_audio"). + audio_format: Format for audio content ("audio_url" or "input_audio"). If None, select by mode. **kwargs: Other parameters passed to VLLMModel/BaseModel. """ super().__init__(model=model, base_url=base_url, **kwargs) @@ -100,6 +100,8 @@ def __init__( self.audio_chunk_task_types = audio_chunk_task_types self.chunk_audio_threshold_sec = chunk_audio_threshold_sec + if audio_format is None: + audio_format = "input_audio" if self._external_api_mode else "audio_url" if audio_format not in ("audio_url", "input_audio"): raise ValueError(f"Unsupported audio_format '{audio_format}'. Use 'audio_url' or 'input_audio'.") self.audio_format = audio_format @@ -221,9 +223,12 @@ def _parse_chat_completion_response(self, response, include_response: bool = Fal if "generation" in result and result["generation"]: match = DEBUG_INFO_PATTERN.search(result["generation"]) if match: - result["debug_info"] = json.loads(match.group(1)) - # Strip debug_info from generation - result["generation"] = DEBUG_INFO_PATTERN.sub("", result["generation"]) + try: + result["debug_info"] = json.loads(match.group(1)) + # Strip debug_info from generation + result["generation"] = DEBUG_INFO_PATTERN.sub("", result["generation"]) + except json.JSONDecodeError: + LOG.warning("Failed to parse debug_info JSON from content") choice = response.choices[0] if hasattr(choice.message, "audio") and choice.message.audio: @@ -256,16 +261,20 @@ def _process_audio_response(self, audio_data, response_id: str) -> dict: return audio_info if self.output_audio_dir: - audio_bytes = base64.b64decode(audio_base64) - filename = f"{response_id}.wav" - filepath = os.path.join(self.output_audio_dir, filename) - - with open(filepath, "wb") as f: - f.write(audio_bytes) - - audio_info["path"] = filepath - audio_info["size_bytes"] = len(audio_bytes) - LOG.info(f"Saved audio: {filepath} ({len(audio_bytes)} bytes)") + try: + audio_bytes = base64.b64decode(audio_base64) + filename = f"{response_id}.wav" + filepath = os.path.join(self.output_audio_dir, filename) + + with open(filepath, "wb") as f: + f.write(audio_bytes) + + audio_info["path"] = filepath + audio_info["size_bytes"] = len(audio_bytes) + LOG.info(f"Saved audio: {filepath} ({len(audio_bytes)} bytes)") + except Exception as e: + LOG.warning(f"Failed to save audio: {e}") + audio_info["data"] = audio_base64 else: audio_info["data"] = audio_base64 diff --git a/tests/test_vllm_audio.py b/tests/test_vllm_audio.py index ca7fba0331..a919ab33cb 100644 --- a/tests/test_vllm_audio.py +++ b/tests/test_vllm_audio.py @@ -67,6 +67,22 @@ def mock_vllm_multimodal_model(tmp_path): return model +@pytest.fixture +def mock_vllm_multimodal_model_input_audio(tmp_path): + """Create a mock VLLMMultimodalModel configured for input_audio.""" + with patch.object(VLLMMultimodalModel, "__init__", lambda self, **kwargs: None): + model = VLLMMultimodalModel() + model.data_dir = str(tmp_path) + model.output_dir = None + model.output_audio_dir = None + model.enable_audio_chunking = True + model.audio_chunk_task_types = None + model.chunk_audio_threshold_sec = 30 + model.audio_format = "input_audio" + model._tunnel = None + return model + + def test_content_text_to_list_with_audio(mock_vllm_multimodal_model, tmp_path): """Test converting string content with audio to list format. @@ -86,17 +102,14 @@ def test_content_text_to_list_with_audio(mock_vllm_multimodal_model, tmp_path): assert result["content"][1]["type"] == "text" -def test_content_text_to_list_with_input_audio_format(mock_vllm_multimodal_model, tmp_path): +def test_content_text_to_list_with_input_audio_format(mock_vllm_multimodal_model_input_audio, tmp_path): """Test audio conversion with input_audio format (OpenAI native).""" audio_path = tmp_path / "test.wav" with open(audio_path, "wb") as f: f.write(b"RIFF" + b"\x00" * 100) - # Switch to input_audio format - mock_vllm_multimodal_model.audio_format = "input_audio" - message = {"role": "user", "content": "Describe this audio", "audio": {"path": "test.wav"}} - result = mock_vllm_multimodal_model.content_text_to_list(message) + result = mock_vllm_multimodal_model_input_audio.content_text_to_list(message) assert isinstance(result["content"], list) assert len(result["content"]) == 2 From f5b57dcc0edd279faa932b975b77bd39160cb1f3 Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Thu, 19 Feb 2026 04:37:02 -0800 Subject: [PATCH 08/11] address review nits Signed-off-by: George Zelenfroind --- nemo_skills/inference/generate.py | 2 +- tests/test_nvidia_inference_api.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index ef6986f83e..21b559e10f 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -632,7 +632,7 @@ def drop_fields_from_messages(self, output): # Filter out content types specified in drop_content_types config message["content"] = [ - content for content in message["content"] if content.get("type") not in self.cfg.drop_content_types + content for content in message["content"] if content["type"] not in self.cfg.drop_content_types ] async def postprocess_single_output(self, output, original_data_point): diff --git a/tests/test_nvidia_inference_api.py b/tests/test_nvidia_inference_api.py index 86befccda1..bc766c09d8 100644 --- a/tests/test_nvidia_inference_api.py +++ b/tests/test_nvidia_inference_api.py @@ -70,6 +70,7 @@ def test_nvidia_api_text_only(): @requires_nvidia_api_key @requires_test_audio def test_nvidia_api_audio_input(): + """Integration test: audio-input generation using a local test audio file.""" model = VLLMMultimodalModel( model=MODEL, base_url=NVIDIA_BASE_URL, @@ -129,6 +130,6 @@ def test_nvidia_api_audio_with_transcription_prompt(): assert "generation" in result assert len(result["generation"]) > 0 - assert result.get("num_generated_tokens", 0) > 0 + assert result["num_generated_tokens"] > 0 print(f"[transcription] generation: {result['generation'][:300]}") - print(f"[transcription] tokens: {result.get('num_generated_tokens')}") + print(f"[transcription] tokens: {result['num_generated_tokens']}") From 28f44b8a05986d56e2d88b3c3453420437ab32ce Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Thu, 19 Feb 2026 04:52:50 -0800 Subject: [PATCH 09/11] avoid extra deepcopy after audio injection Signed-off-by: George Zelenfroind --- .../inference/model/vllm_multimodal.py | 27 +++++-------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/nemo_skills/inference/model/vllm_multimodal.py b/nemo_skills/inference/model/vllm_multimodal.py index 80a27bc6fb..da2d13eb6a 100644 --- a/nemo_skills/inference/model/vllm_multimodal.py +++ b/nemo_skills/inference/model/vllm_multimodal.py @@ -180,7 +180,7 @@ def _get_api_key(self, api_key: str | None, api_key_env_var: str | None, base_ur return api_key - def _build_request_body(self, top_k, min_p, repetition_penalty, extra_body: dict = None): + def _build_request_body(self, top_k, min_p, repetition_penalty, extra_body: dict | None = None): """Build request body, skipping vLLM-specific params for external APIs. Args: @@ -312,7 +312,7 @@ def content_text_to_list(self, message: dict) -> dict: New message dict with content converted to list format including audio. """ if "audio" not in message and "audios" not in message: - return message + return copy.deepcopy(message) result = copy.deepcopy(message) @@ -417,9 +417,7 @@ async def _generate_with_chunking( result = None # Track cumulative statistics across chunks - total_input_tokens = 0 - total_generated_tokens = 0 - total_time = 0.0 + total_num_generated_tokens = 0 for chunk_idx, audio_chunk in enumerate(chunks): chunk_messages = [] @@ -447,18 +445,13 @@ async def _generate_with_chunking( chunk_messages.append(msg_copy) - # Preprocess messages before sending to model - chunk_messages = self._preprocess_messages_for_model(chunk_messages) - # Generate for this chunk using parent's generate_async result = await super().generate_async( prompt=chunk_messages, tokens_to_generate=tokens_to_generate, **kwargs ) # Sum statistics from each chunk - total_input_tokens += result.get("input_tokens", 0) - total_generated_tokens += result.get("generated_tokens", 0) - total_time += result.get("time_elapsed", 0.0) + total_num_generated_tokens += result["num_generated_tokens"] generation = result["generation"] chunk_results.append(generation.strip()) @@ -466,17 +459,12 @@ async def _generate_with_chunking( # Aggregate results aggregated_text = " ".join(chunk_results) - if not result: - raise RuntimeError("Audio chunk generation returned no result") - final_result = result.copy() final_result["generation"] = aggregated_text final_result["num_audio_chunks"] = len(chunks) final_result["audio_duration"] = duration # Update with summed statistics - final_result["input_tokens"] = total_input_tokens - final_result["generated_tokens"] = total_generated_tokens - final_result["time_elapsed"] = total_time + final_result["num_generated_tokens"] = total_num_generated_tokens return final_result @@ -484,7 +472,7 @@ async def generate_async( self, prompt: str | list[dict] | None = None, tokens_to_generate: int | None = None, - task_type: str = None, + task_type: str | None = None, **kwargs, ) -> dict: """Generate with automatic audio chunking for long audio files. @@ -509,8 +497,7 @@ async def generate_async( return await self._generate_with_chunking(messages, audio_path, duration, tokens_to_generate, **kwargs) # No chunking needed - convert audio fields to base64 format - messages = [self.content_text_to_list(copy.deepcopy(msg)) for msg in messages] - messages = self._preprocess_messages_for_model(messages) + messages = [self.content_text_to_list(msg) for msg in messages] prompt = messages # Call parent's generate_async (which handles audio OUTPUT via _parse_chat_completion_response) From ac97b44a29d730c4e919714f5010b040bf705599 Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Thu, 19 Feb 2026 05:20:19 -0800 Subject: [PATCH 10/11] use nv inference api key and add soundfile Signed-off-by: George Zelenfroind --- nemo_skills/inference/model/vllm_multimodal.py | 6 +++--- requirements/common-tests.txt | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/nemo_skills/inference/model/vllm_multimodal.py b/nemo_skills/inference/model/vllm_multimodal.py index da2d13eb6a..4b8006abe5 100644 --- a/nemo_skills/inference/model/vllm_multimodal.py +++ b/nemo_skills/inference/model/vllm_multimodal.py @@ -152,11 +152,11 @@ def _get_api_key(self, api_key: str | None, api_key_env_var: str | None, base_ur # Auto-detect API key based on base_url if base_url: if "api.nvidia.com" in base_url or "inference-api.nvidia.com" in base_url: - api_key = os.getenv("NVIDIA_API_KEY") + api_key = os.getenv("NV_INFERENCE_API_KEY") or os.getenv("NVIDIA_API_KEY") if not api_key: raise ValueError( - "NVIDIA_API_KEY is required for NVIDIA APIs and could not be found. " - "Set NVIDIA_API_KEY environment variable or pass api_key explicitly." + "NV_INFERENCE_API_KEY or NVIDIA_API_KEY is required for NVIDIA APIs and could not be found. " + "Set NV_INFERENCE_API_KEY/NVIDIA_API_KEY environment variable or pass api_key explicitly." ) return api_key diff --git a/requirements/common-tests.txt b/requirements/common-tests.txt index 4eca96f92a..8b2fbcce4d 100644 --- a/requirements/common-tests.txt +++ b/requirements/common-tests.txt @@ -16,3 +16,4 @@ pytest pytest-asyncio pytest-cov pytest-timeout +soundfile From 64a860a093b0c0d73621cb6e119a9472f0747a03 Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Thu, 19 Feb 2026 05:47:50 -0800 Subject: [PATCH 11/11] update yml for jkeys and default key Signed-off-by: George Zelenfroind --- .github/workflows/tests.yml | 1 + tests/test_nvidia_inference_api.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 837ca2d58b..6f1e285408 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -85,6 +85,7 @@ jobs: . - name: Run all tests env: + NV_INFERENCE_API_KEY: ${{ secrets.NV_INFERENCE_API_KEY }} NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }} HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | diff --git a/tests/test_nvidia_inference_api.py b/tests/test_nvidia_inference_api.py index bc766c09d8..e6abe6d8b5 100644 --- a/tests/test_nvidia_inference_api.py +++ b/tests/test_nvidia_inference_api.py @@ -30,8 +30,8 @@ TEST_AUDIO_T3 = TEST_AUDIO_DIR / "t3_16.wav" # "sample 3 hello how are you today" requires_nvidia_api_key = pytest.mark.skipif( - not os.getenv("NVIDIA_API_KEY"), - reason="NVIDIA_API_KEY environment variable not set", + not (os.getenv("NV_INFERENCE_API_KEY") or os.getenv("NVIDIA_API_KEY")), + reason="NV_INFERENCE_API_KEY/NVIDIA_API_KEY environment variable not set", ) requires_test_audio = pytest.mark.skipif(