From 6136ec9c4e8191342a8c7e6a4ba147cda5b1c132 Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Thu, 5 Feb 2026 04:36:38 -0800
Subject: [PATCH 01/11] update vllm miltimodal for api calls convenience

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 .../inference/model/vllm_multimodal.py        | 219 ++++++++++++------
 1 file changed, 148 insertions(+), 71 deletions(-)

diff --git a/nemo_skills/inference/model/vllm_multimodal.py b/nemo_skills/inference/model/vllm_multimodal.py
index c2285b0ccc..482e638cdc 100644
--- a/nemo_skills/inference/model/vllm_multimodal.py
+++ b/nemo_skills/inference/model/vllm_multimodal.py
@@ -17,6 +17,7 @@
 This module provides a multimodal model class that handles:
 - Audio INPUT: encoding audio files to base64, chunking long audio
 - Audio OUTPUT: saving audio responses from the server to disk
+- External API support: NVIDIA Inference API, OpenAI, and other OpenAI-compatible APIs
 """
 
 import base64
@@ -32,6 +33,7 @@
     audio_file_to_base64,
     chunk_audio,
     load_audio_file,
+    make_audio_content_block,
     save_audio_chunk_to_base64,
 )
 from .vllm import VLLMModel
@@ -43,10 +45,10 @@
 
 
 class VLLMMultimodalModel(VLLMModel):
-    """VLLMModel with support for audio input and output.
+    """VLLMModel with support for audio input/output and external APIs.
 
     Audio INPUT capabilities:
-    1. Converts audio file paths to base64-encoded audio_url format
+    1. Converts audio file paths to base64-encoded input_audio format
     2. Chunks long audio files for models with duration limits
     3. Aggregates results from chunked audio processing
 
@@ -54,29 +56,55 @@ class VLLMMultimodalModel(VLLMModel):
     1. Saves audio responses from the server to disk / output_dir/audio/
     2. Replaces the base64 data with the file path in the result
 
+    External API support:
+    - Automatically detects external APIs (any non-localhost URL)
+    - Smart API key handling from environment variables
+    - Skips vLLM-specific params for external APIs
+
+    Example usage:
+        # Local vLLM server
+        model = VLLMMultimodalModel(model="Qwen/Qwen2-Audio-7B")
+
+        # NVIDIA Inference API
+        model = VLLMMultimodalModel(
+            model="gcp/google/gemini-2.5-pro",
+            base_url="https://inference-api.nvidia.com/v1"
+        )
     """
 
     def __init__(
         self,
+        model: str | None = None,
+        base_url: str | None = None,
         enable_audio_chunking: bool = True,
         audio_chunk_task_types: list[str] | None = None,
         chunk_audio_threshold_sec: int = 30,
+        audio_format: str = "input_audio",
         **kwargs,
     ):
-        """Initialize VLLMMultimodalModel with audio I/O support.
+        """Initialize VLLMMultimodalModel with audio I/O and external API support.
 
         Args:
+            model: Model name (e.g., "Qwen/Qwen2-Audio-7B" for local, "gcp/google/gemini-2.5-pro" for NVIDIA API).
+            base_url: API base URL. If None, defaults to local server. For external APIs, provide the full URL.
             enable_audio_chunking: Master switch for audio chunking.
             audio_chunk_task_types: If None, chunk all task types; if specified, only chunk these.
             chunk_audio_threshold_sec: Audio duration threshold for chunking (in seconds).
+            audio_format: Format for audio content (must be "input_audio").
             **kwargs: Other parameters passed to VLLMModel/BaseModel.
         """
-        super().__init__(**kwargs)
+        # Determine if this is an external API (non-local URL)
+        self._external_api_mode = not self._is_local_url(base_url)
+
+        super().__init__(model=model, base_url=base_url, **kwargs)
 
         # Audio INPUT config
         self.enable_audio_chunking = enable_audio_chunking
         self.audio_chunk_task_types = audio_chunk_task_types
         self.chunk_audio_threshold_sec = chunk_audio_threshold_sec
+        if audio_format != "input_audio":
+            raise ValueError(f"Unsupported audio_format '{audio_format}'. Only 'input_audio' is supported.")
+        self.audio_format = audio_format
 
         # Audio OUTPUT config
         self.output_audio_dir = None
@@ -85,6 +113,92 @@ def __init__(
             os.makedirs(self.output_audio_dir, exist_ok=True)
             LOG.info(f"Audio responses will be saved to: {self.output_audio_dir}")
 
+    def _is_local_url(self, base_url: str | None) -> bool:
+        """Check if the base_url points to a local server.
+
+        Args:
+            base_url: API base URL.
+
+        Returns:
+            True if local server, False otherwise.
+        """
+        if not base_url:
+            return True  # No URL means local server (will use default host:port)
+        local_patterns = ["127.0.0.1", "localhost", "0.0.0.0"]
+        return any(pattern in base_url for pattern in local_patterns)
+
+    def _get_api_key(self, api_key: str | None, api_key_env_var: str | None, base_url: str) -> str | None:
+        """Get API key with smart detection for external APIs.
+
+        Checks for API keys in the following order:
+        1. Explicit api_key argument
+        2. Environment variable specified by api_key_env_var
+        3. Auto-detect based on base_url (NVIDIA_API_KEY, OPENAI_API_KEY, etc.)
+
+        Args:
+            api_key: Explicit API key.
+            api_key_env_var: Environment variable name for API key.
+            base_url: API base URL for auto-detection.
+
+        Returns:
+            API key string or None.
+        """
+        # First, try parent class logic (explicit key or env var)
+        api_key = super()._get_api_key(api_key, api_key_env_var, base_url)
+
+        if api_key is not None:
+            return api_key
+
+        # Auto-detect API key based on base_url
+        if base_url:
+            if "api.nvidia.com" in base_url or "inference-api.nvidia.com" in base_url:
+                api_key = os.getenv("NVIDIA_API_KEY")
+                if not api_key:
+                    raise ValueError(
+                        "NVIDIA_API_KEY is required for NVIDIA APIs and could not be found. "
+                        "Set NVIDIA_API_KEY environment variable or pass api_key explicitly."
+                    )
+                return api_key
+
+            if "api.openai.com" in base_url:
+                api_key = os.getenv("OPENAI_API_KEY")
+                if not api_key:
+                    raise ValueError(
+                        "OPENAI_API_KEY is required for OpenAI APIs and could not be found. "
+                        "Set OPENAI_API_KEY environment variable or pass api_key explicitly."
+                    )
+                return api_key
+
+            if "generativelanguage.googleapis.com" in base_url:
+                api_key = os.getenv("GOOGLE_API_KEY")
+                if not api_key:
+                    raise ValueError(
+                        "GOOGLE_API_KEY is required for Google APIs and could not be found. "
+                        "Set GOOGLE_API_KEY environment variable or pass api_key explicitly."
+                    )
+                return api_key
+
+        return api_key
+
+    def _build_request_body(self, top_k, min_p, repetition_penalty, extra_body: dict = None):
+        """Build request body, skipping vLLM-specific params for external APIs.
+
+        Args:
+            top_k: Top-k sampling parameter.
+            min_p: Min-p sampling parameter.
+            repetition_penalty: Repetition penalty parameter.
+            extra_body: Additional parameters to include.
+
+        Returns:
+            Dictionary of extra body parameters for the request.
+        """
+        # For external APIs, skip vLLM-specific parameters
+        if self._external_api_mode:
+            return extra_body or {}
+
+        # For local vLLM server, use full parameter set
+        return super()._build_request_body(top_k, min_p, repetition_penalty, extra_body=extra_body)
+
     def _parse_chat_completion_response(self, response, include_response: bool = False, **kwargs) -> dict:
         """Parse chat completion response and save any audio to disk."""
         result = super()._parse_chat_completion_response(response, include_response=include_response, **kwargs)
@@ -158,62 +272,46 @@ def content_text_to_list(self, message: dict) -> dict:
         """Convert message content with audio to proper list format.
 
         Handles 'audio' or 'audios' keys in messages and converts them to
-        base64-encoded audio_url content items.
+        base64-encoded input_audio content items.
 
-        CRITICAL: Audio must come BEFORE text for Qwen models to transcribe correctly.
+        CRITICAL: Audio must come BEFORE text for models to process correctly.
 
         Args:
             message: Message dict that may contain 'audio' or 'audios' fields.
 
         Returns:
-            Message dict with content converted to list format including audio.
+            New message dict with content converted to list format including audio.
         """
         if "audio" not in message and "audios" not in message:
             return message
 
-        content = message.get("content", "")
+        result = copy.deepcopy(message)
+
+        if "content" not in result:
+            raise KeyError("Missing required 'content' in message")
+        content = result["content"]
         if isinstance(content, str):
-            message["content"] = [{"type": "text", "text": content}]
-        elif isinstance(content, list):
-            message["content"] = content
-        else:
+            result["content"] = [{"type": "text", "text": content}]
+        elif not isinstance(content, list):
             raise TypeError(f"Unexpected content type: {type(content)}")
 
         audio_items = []
 
-        if "audio" in message:
-            audio = message["audio"]
+        if "audio" in result:
+            audio = result.pop("audio")
             audio_path = os.path.join(self.data_dir, audio["path"])
             base64_audio = audio_file_to_base64(audio_path)
-            audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}}
-            audio_items.append(audio_message)
-            del message["audio"]  # Remove original audio field after conversion
-        elif "audios" in message:
-            for audio in message["audios"]:
+            audio_items.append(make_audio_content_block(base64_audio, self.audio_format))
+        elif "audios" in result:
+            for audio in result.pop("audios"):
                 audio_path = os.path.join(self.data_dir, audio["path"])
                 base64_audio = audio_file_to_base64(audio_path)
-                audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}}
-                audio_items.append(audio_message)
-            del message["audios"]  # Remove original audios field after conversion
+                audio_items.append(make_audio_content_block(base64_audio, self.audio_format))
 
-        # Insert audio items at the BEGINNING of content list (before text)
         if audio_items:
-            message["content"] = audio_items + message["content"]
-
-        return message
-
-    def _preprocess_messages_for_model(self, messages: list[dict]) -> list[dict]:
-        """Preprocess messages - creates copies to avoid mutation.
+            result["content"] = audio_items + result["content"]
 
-        Note: /no_think suffix is passed through unchanged (handled by the model).
-
-        Args:
-            messages: List of message dicts.
-
-        Returns:
-            Copy of message dicts.
-        """
-        return [copy.deepcopy(msg) for msg in messages]
+        return result
 
     def _needs_audio_chunking(self, messages: list[dict], task_type: str = None) -> tuple[bool, str, float]:
         """Check if audio in messages needs chunking.
@@ -235,11 +333,14 @@ def _needs_audio_chunking(self, messages: list[dict], task_type: str = None) ->
 
         # Find audio in messages
         for msg in messages:
-            if msg.get("role") == "user":
-                audio_info = msg.get("audio")
-                if not audio_info:
-                    audios = msg.get("audios", [])
+            if msg["role"] == "user":
+                if "audio" in msg:
+                    audio_info = msg["audio"]
+                elif "audios" in msg:
+                    audios = msg["audios"]
                     audio_info = audios[0] if audios else {}
+                else:
+                    continue
                 if audio_info and "path" in audio_info:
                     audio_path = os.path.join(self.data_dir, audio_info["path"])
 
@@ -300,16 +401,16 @@ async def _generate_with_chunking(
                 if msg_copy["role"] == "user" and ("audio" in msg_copy or "audios" in msg_copy):
                     chunk_base64 = save_audio_chunk_to_base64(audio_chunk, sampling_rate)
 
-                    content = msg_copy.get("content", "")
+                    if "content" not in msg_copy:
+                        raise KeyError("Missing required 'content' in message")
+                    content = msg_copy["content"]
                     if isinstance(content, str):
                         text_content = [{"type": "text", "text": content}]
                     else:
                         text_content = content
 
                     # Add audio chunk at the beginning (before text)
-                    msg_copy["content"] = [
-                        {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{chunk_base64}"}}
-                    ] + text_content
+                    msg_copy["content"] = [make_audio_content_block(chunk_base64, self.audio_format)] + text_content
 
                     # Remove original audio fields
                     msg_copy.pop("audio", None)
@@ -317,9 +418,6 @@ async def _generate_with_chunking(
 
                 chunk_messages.append(msg_copy)
 
-            # Preprocess messages (strip /no_think for Qwen)
-            chunk_messages = self._preprocess_messages_for_model(chunk_messages)
-
             # Generate for this chunk using parent's generate_async
             result = await super().generate_async(
                 prompt=chunk_messages, tokens_to_generate=tokens_to_generate, **kwargs
@@ -379,28 +477,7 @@ async def generate_async(
                 return await self._generate_with_chunking(messages, audio_path, duration, tokens_to_generate, **kwargs)
 
             # No chunking needed - convert audio fields to base64 format
-            messages = [self.content_text_to_list(copy.deepcopy(msg)) for msg in messages]
-            messages = self._preprocess_messages_for_model(messages)
-            prompt = messages
+            prompt = [self.content_text_to_list(msg) for msg in messages]
 
         # Call parent's generate_async (which handles audio OUTPUT via _parse_chat_completion_response)
         return await super().generate_async(prompt=prompt, tokens_to_generate=tokens_to_generate, **kwargs)
-
-    def _build_chat_request_params(
-        self,
-        messages: list[dict],
-        **kwargs,
-    ) -> dict:
-        """Build chat request parameters with audio preprocessing.
-
-        Args:
-            messages: List of message dicts.
-            **kwargs: Additional parameters for the request.
-
-        Returns:
-            Request parameters dict.
-        """
-        # content_text_to_list THEN preprocess
-        messages = [self.content_text_to_list(copy.deepcopy(msg)) for msg in messages]
-        messages = self._preprocess_messages_for_model(messages)
-        return super()._build_chat_request_params(messages=messages, **kwargs)

From 7695ab39ae2bc3c0176e0349ab215231ec10fc2c Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Thu, 5 Feb 2026 05:20:36 -0800
Subject: [PATCH 02/11] Update to contribution, tests for input audio  format

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 nemo_skills/inference/generate.py             |  2 +-
 nemo_skills/inference/model/audio_utils.py    |  4 +-
 .../inference/model/vllm_multimodal.py        | 49 +++++++------------
 tests/test_vllm_audio.py                      | 38 ++++++++++++--
 4 files changed, 57 insertions(+), 36 deletions(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index e05a532793..94c610a22d 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -207,7 +207,7 @@ class GenerationTaskConfig:
     enable_litellm_cache: bool = False
 
     # List of content types to drop from messages (e.g., base64 audio) to keep output files smaller
-    drop_content_types: list[str] = field(default_factory=lambda: ["audio_url"])
+    drop_content_types: list[str] = field(default_factory=lambda: ["audio_url", "input_audio"])
 
     # Audio configuration - set by benchmarks that need audio processing (mmau-pro, audiobench, etc.)
     enable_audio: bool = False  # Enable audio preprocessing (set by benchmark configs)
diff --git a/nemo_skills/inference/model/audio_utils.py b/nemo_skills/inference/model/audio_utils.py
index 02c8eaf459..d4b4c44dbd 100644
--- a/nemo_skills/inference/model/audio_utils.py
+++ b/nemo_skills/inference/model/audio_utils.py
@@ -145,6 +145,8 @@ def make_audio_content_block(base64_audio: str, audio_format: str = "audio_url")
     if audio_format == "input_audio":
         # OpenAI native format (works with NVIDIA API / Gemini / Azure)
         return {"type": "input_audio", "input_audio": {"data": base64_audio, "format": "wav"}}
-    else:
+    elif audio_format == "audio_url":
         # Data URI format (works with vLLM / Qwen)
         return {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}}
+    else:
+        raise ValueError(f"Unsupported audio_format '{audio_format}'. Use 'audio_url' or 'input_audio'.")
diff --git a/nemo_skills/inference/model/vllm_multimodal.py b/nemo_skills/inference/model/vllm_multimodal.py
index 482e638cdc..478ab83908 100644
--- a/nemo_skills/inference/model/vllm_multimodal.py
+++ b/nemo_skills/inference/model/vllm_multimodal.py
@@ -56,10 +56,7 @@ class VLLMMultimodalModel(VLLMModel):
     1. Saves audio responses from the server to disk / output_dir/audio/
     2. Replaces the base64 data with the file path in the result
 
-    External API support:
-    - Automatically detects external APIs (any non-localhost URL)
-    - Smart API key handling from environment variables
-    - Skips vLLM-specific params for external APIs
+    Also supports external APIs (NVIDIA, OpenAI) via base_url parameter.
 
     Example usage:
         # Local vLLM server
@@ -90,20 +87,19 @@ def __init__(
             enable_audio_chunking: Master switch for audio chunking.
             audio_chunk_task_types: If None, chunk all task types; if specified, only chunk these.
             chunk_audio_threshold_sec: Audio duration threshold for chunking (in seconds).
-            audio_format: Format for audio content (must be "input_audio").
+            audio_format: Format for audio content ("audio_url" or "input_audio").
             **kwargs: Other parameters passed to VLLMModel/BaseModel.
         """
-        # Determine if this is an external API (non-local URL)
-        self._external_api_mode = not self._is_local_url(base_url)
-
         super().__init__(model=model, base_url=base_url, **kwargs)
 
+        # Determine if this is an external API (non-local URL)
+        self._external_api_mode = not self._is_local_url(self.base_url)
+
         # Audio INPUT config
         self.enable_audio_chunking = enable_audio_chunking
         self.audio_chunk_task_types = audio_chunk_task_types
         self.chunk_audio_threshold_sec = chunk_audio_threshold_sec
-        if audio_format != "input_audio":
-            raise ValueError(f"Unsupported audio_format '{audio_format}'. Only 'input_audio' is supported.")
+
         self.audio_format = audio_format
 
         # Audio OUTPUT config
@@ -207,12 +203,9 @@ def _parse_chat_completion_response(self, response, include_response: bool = Fal
         if "generation" in result and result["generation"]:
             match = DEBUG_INFO_PATTERN.search(result["generation"])
             if match:
-                try:
-                    result["debug_info"] = json.loads(match.group(1))
-                    # Strip debug_info from generation
-                    result["generation"] = DEBUG_INFO_PATTERN.sub("", result["generation"])
-                except json.JSONDecodeError:
-                    LOG.warning("Failed to parse debug_info JSON from content")
+                result["debug_info"] = json.loads(match.group(1))
+                # Strip debug_info from generation
+                result["generation"] = DEBUG_INFO_PATTERN.sub("", result["generation"])
 
         choice = response.choices[0]
         if hasattr(choice.message, "audio") and choice.message.audio:
@@ -245,20 +238,16 @@ def _process_audio_response(self, audio_data, response_id: str) -> dict:
             return audio_info
 
         if self.output_audio_dir:
-            try:
-                audio_bytes = base64.b64decode(audio_base64)
-                filename = f"{response_id}.wav"
-                filepath = os.path.join(self.output_audio_dir, filename)
-
-                with open(filepath, "wb") as f:
-                    f.write(audio_bytes)
-
-                audio_info["path"] = filepath
-                audio_info["size_bytes"] = len(audio_bytes)
-                LOG.info(f"Saved audio: {filepath} ({len(audio_bytes)} bytes)")
-            except Exception as e:
-                LOG.warning(f"Failed to save audio: {e}")
-                audio_info["data"] = audio_base64
+            audio_bytes = base64.b64decode(audio_base64)
+            filename = f"{response_id}.wav"
+            filepath = os.path.join(self.output_audio_dir, filename)
+
+            with open(filepath, "wb") as f:
+                f.write(audio_bytes)
+
+            audio_info["path"] = filepath
+            audio_info["size_bytes"] = len(audio_bytes)
+            LOG.info(f"Saved audio: {filepath} ({len(audio_bytes)} bytes)")
         else:
             audio_info["data"] = audio_base64
 
diff --git a/tests/test_vllm_audio.py b/tests/test_vllm_audio.py
index 0c8ca1b89a..ca7fba0331 100644
--- a/tests/test_vllm_audio.py
+++ b/tests/test_vllm_audio.py
@@ -42,6 +42,15 @@ def test_audio_file_to_base64():
         os.unlink(temp_path)
 
 
+def _is_valid_audio_content(content_item: dict) -> bool:
+    """Check if content item is a valid audio block (either format)."""
+    if content_item.get("type") == "audio_url":
+        return content_item.get("audio_url", {}).get("url", "").startswith("data:audio/wav;base64,")
+    elif content_item.get("type") == "input_audio":
+        return "data" in content_item.get("input_audio", {})
+    return False
+
+
 @pytest.fixture
 def mock_vllm_multimodal_model(tmp_path):
     """Create a mock VLLMMultimodalModel for testing audio preprocessing."""
@@ -53,6 +62,7 @@ def mock_vllm_multimodal_model(tmp_path):
         model.enable_audio_chunking = True
         model.audio_chunk_task_types = None
         model.chunk_audio_threshold_sec = 30
+        model.audio_format = "audio_url"  # Test audio_url format (for vLLM/Qwen)
         model._tunnel = None
         return model
 
@@ -72,8 +82,28 @@ def test_content_text_to_list_with_audio(mock_vllm_multimodal_model, tmp_path):
 
     assert isinstance(result["content"], list)
     assert len(result["content"]) == 2
-    assert result["content"][0]["type"] == "audio_url"
-    assert result["content"][0]["audio_url"]["url"].startswith("data:audio/wav;base64,")
+    assert _is_valid_audio_content(result["content"][0])
+    assert result["content"][1]["type"] == "text"
+
+
+def test_content_text_to_list_with_input_audio_format(mock_vllm_multimodal_model, tmp_path):
+    """Test audio conversion with input_audio format (OpenAI native)."""
+    audio_path = tmp_path / "test.wav"
+    with open(audio_path, "wb") as f:
+        f.write(b"RIFF" + b"\x00" * 100)
+
+    # Switch to input_audio format
+    mock_vllm_multimodal_model.audio_format = "input_audio"
+
+    message = {"role": "user", "content": "Describe this audio", "audio": {"path": "test.wav"}}
+    result = mock_vllm_multimodal_model.content_text_to_list(message)
+
+    assert isinstance(result["content"], list)
+    assert len(result["content"]) == 2
+    # Verify input_audio format structure
+    assert result["content"][0]["type"] == "input_audio"
+    assert "data" in result["content"][0]["input_audio"]
+    assert result["content"][0]["input_audio"]["format"] == "wav"
     assert result["content"][1]["type"] == "text"
 
 
@@ -100,8 +130,8 @@ def test_content_text_to_list_with_multiple_audios(mock_vllm_multimodal_model, t
     assert isinstance(result["content"], list)
     assert len(result["content"]) == 3
     # Audio MUST come before text for Qwen Audio
-    assert result["content"][0]["type"] == "audio_url"
-    assert result["content"][1]["type"] == "audio_url"
+    assert _is_valid_audio_content(result["content"][0])
+    assert _is_valid_audio_content(result["content"][1])
     assert result["content"][2]["type"] == "text"
 
 

From 8a3fb5dbd2f0f237ce0867f0607237308af757fa Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Thu, 5 Feb 2026 05:24:13 -0800
Subject: [PATCH 03/11] handling of topk top p params

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 .../inference/model/vllm_multimodal.py        | 24 +++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/nemo_skills/inference/model/vllm_multimodal.py b/nemo_skills/inference/model/vllm_multimodal.py
index 478ab83908..01997df7a7 100644
--- a/nemo_skills/inference/model/vllm_multimodal.py
+++ b/nemo_skills/inference/model/vllm_multimodal.py
@@ -180,16 +180,32 @@ def _build_request_body(self, top_k, min_p, repetition_penalty, extra_body: dict
         """Build request body, skipping vLLM-specific params for external APIs.
 
         Args:
-            top_k: Top-k sampling parameter.
-            min_p: Min-p sampling parameter.
-            repetition_penalty: Repetition penalty parameter.
+            top_k: Top-k sampling parameter (vLLM, default -1).
+            min_p: Min-p sampling parameter (vLLM, default 0.0).
+            repetition_penalty: Repetition penalty parameter (vLLM, default 1.0).
             extra_body: Additional parameters to include.
 
         Returns:
             Dictionary of extra body parameters for the request.
+
+        Raises:
+            ValueError: If vLLM-specific params are set to non-default values in external API mode.
         """
-        # For external APIs, skip vLLM-specific parameters
+        # For external APIs, fail if user explicitly set vLLM-specific parameters
         if self._external_api_mode:
+            non_default_params = []
+            if top_k != -1:
+                non_default_params.append(f"top_k={top_k}")
+            if min_p != 0.0:
+                non_default_params.append(f"min_p={min_p}")
+            if repetition_penalty != 1.0:
+                non_default_params.append(f"repetition_penalty={repetition_penalty}")
+
+            if non_default_params:
+                raise ValueError(
+                    f"vLLM-specific parameters are not supported for external APIs: {', '.join(non_default_params)}. "
+                    "These parameters only work with local vLLM servers."
+                )
             return extra_body or {}
 
         # For local vLLM server, use full parameter set

From 7d4a5ccfc6155f872a2f1ac363f07f8be3e02bad Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Thu, 5 Feb 2026 05:32:57 -0800
Subject: [PATCH 04/11] adress comments

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 .../inference/model/vllm_multimodal.py        | 22 ++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/nemo_skills/inference/model/vllm_multimodal.py b/nemo_skills/inference/model/vllm_multimodal.py
index 01997df7a7..1f57f92fa1 100644
--- a/nemo_skills/inference/model/vllm_multimodal.py
+++ b/nemo_skills/inference/model/vllm_multimodal.py
@@ -100,6 +100,8 @@ def __init__(
         self.audio_chunk_task_types = audio_chunk_task_types
         self.chunk_audio_threshold_sec = chunk_audio_threshold_sec
 
+        if audio_format not in ("audio_url", "input_audio"):
+            raise ValueError(f"Unsupported audio_format '{audio_format}'. Use 'audio_url' or 'input_audio'.")
         self.audio_format = audio_format
 
         # Audio OUTPUT config
@@ -273,6 +275,19 @@ def _process_audio_response(self, audio_data, response_id: str) -> dict:
     # Audio INPUT methods
     # =====================
 
+    def _preprocess_messages_for_model(self, messages: list[dict]) -> list[dict]:
+        """Preprocess messages - creates copies to avoid mutation.
+
+        Note: /no_think suffix is passed through unchanged (handled by the model).
+
+        Args:
+            messages: List of message dicts.
+
+        Returns:
+            Copy of message dicts.
+        """
+        return [copy.deepcopy(msg) for msg in messages]
+
     def content_text_to_list(self, message: dict) -> dict:
         """Convert message content with audio to proper list format.
 
@@ -423,6 +438,9 @@ async def _generate_with_chunking(
 
                 chunk_messages.append(msg_copy)
 
+            # Preprocess messages before sending to model
+            chunk_messages = self._preprocess_messages_for_model(chunk_messages)
+
             # Generate for this chunk using parent's generate_async
             result = await super().generate_async(
                 prompt=chunk_messages, tokens_to_generate=tokens_to_generate, **kwargs
@@ -482,7 +500,9 @@ async def generate_async(
                 return await self._generate_with_chunking(messages, audio_path, duration, tokens_to_generate, **kwargs)
 
             # No chunking needed - convert audio fields to base64 format
-            prompt = [self.content_text_to_list(msg) for msg in messages]
+            messages = [self.content_text_to_list(copy.deepcopy(msg)) for msg in messages]
+            messages = self._preprocess_messages_for_model(messages)
+            prompt = messages
 
         # Call parent's generate_async (which handles audio OUTPUT via _parse_chat_completion_response)
         return await super().generate_async(prompt=prompt, tokens_to_generate=tokens_to_generate, **kwargs)

From e7d0d92d49bac7e2dbddbf4f764fe1489da504b6 Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Thu, 5 Feb 2026 05:54:07 -0800
Subject: [PATCH 05/11] add multimodal to allowerd

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 .gitignore                           | 3 +++
 nemo_skills/pipeline/utils/server.py | 7 +++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 1f602ec71f..8f3a307e62 100644
--- a/.gitignore
+++ b/.gitignore
@@ -48,3 +48,6 @@ CLAUDE.md
 .codex
 
 .idea
+
+#scripts at root level
+/*.sh
diff --git a/nemo_skills/pipeline/utils/server.py b/nemo_skills/pipeline/utils/server.py
index 87abca4a99..9fb84a680f 100644
--- a/nemo_skills/pipeline/utils/server.py
+++ b/nemo_skills/pipeline/utils/server.py
@@ -24,6 +24,7 @@
 class SupportedServersSelfHosted(str, Enum):
     trtllm = "trtllm"
     vllm = "vllm"
+    vllm_multimodal = "vllm_multimodal"
     sglang = "sglang"
     megatron = "megatron"
     generic = "generic"
@@ -32,6 +33,7 @@ class SupportedServersSelfHosted(str, Enum):
 class SupportedServers(str, Enum):
     trtllm = "trtllm"
     vllm = "vllm"
+    vllm_multimodal = "vllm_multimodal"
     sglang = "sglang"
     megatron = "megatron"
     openai = "openai"
@@ -125,7 +127,7 @@ def get_server_command(
 
     # check if the model path is mounted if not vllm, sglang, or trtllm;
     # vllm, sglang, trtllm can also pass model name as "model_path" so we need special processing
-    if server_type not in ["vllm", "sglang", "trtllm", "generic"]:
+    if server_type not in ["vllm", "vllm_multimodal", "sglang", "trtllm", "generic"]:
         check_if_mounted(cluster_config, model_path)
 
     # the model path will be mounted, so generally it will start with /
@@ -158,7 +160,8 @@ def get_server_command(
             f"    --micro-batch-size 1 "  # that's a training argument, ignored here, but required to specify..
             f"    {server_args} "
         )
-    elif server_type == "vllm":
+    elif server_type in ["vllm", "vllm_multimodal"]:
+        # vllm_multimodal uses the same vLLM server; multimodal handling is on the client side
         server_entrypoint = server_entrypoint or "-m nemo_skills.inference.server.serve_vllm"
         start_vllm_cmd = (
             f"python3 {server_entrypoint} "

From 0363027a7863c1eec86e4398248314663e94cc10 Mon Sep 17 00:00:00 2001
From: mmkrtchyan <mmkrtchyan@nvidia.com>
Date: Thu, 19 Feb 2026 15:43:11 +0400
Subject: [PATCH 06/11] adding nvidia inference api test

Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 tests/test_nvidia_inference_api.py | 134 +++++++++++++++++++++++++++++
 1 file changed, 134 insertions(+)
 create mode 100644 tests/test_nvidia_inference_api.py

diff --git a/tests/test_nvidia_inference_api.py b/tests/test_nvidia_inference_api.py
new file mode 100644
index 0000000000..86befccda1
--- /dev/null
+++ b/tests/test_nvidia_inference_api.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Integration tests for NVIDIA Inference API with VLLMMultimodalModel (audio input)."""
+
+import asyncio
+import os
+from pathlib import Path
+
+import pytest
+
+from nemo_skills.inference.model.vllm_multimodal import VLLMMultimodalModel
+
+NVIDIA_BASE_URL = "https://inference-api.nvidia.com/v1"
+MODEL = "gcp/google/gemini-2.5-flash-lite"
+
+TEST_AUDIO_DIR = Path(__file__).parent / "slurm-tests" / "asr_nim" / "wavs"
+TEST_AUDIO_T2 = TEST_AUDIO_DIR / "t2_16.wav"  # "sample 2 this is a test of text to speech synthesis"
+TEST_AUDIO_T3 = TEST_AUDIO_DIR / "t3_16.wav"  # "sample 3 hello how are you today"
+
+requires_nvidia_api_key = pytest.mark.skipif(
+    not os.getenv("NVIDIA_API_KEY"),
+    reason="NVIDIA_API_KEY environment variable not set",
+)
+
+requires_test_audio = pytest.mark.skipif(
+    not TEST_AUDIO_T2.exists() or not TEST_AUDIO_T3.exists(),
+    reason="Test audio files not found at tests/slurm-tests/asr_nim/wavs/",
+)
+
+
+@requires_nvidia_api_key
+def test_nvidia_api_text_only():
+    """Smoke test: text-only chat completion via NVIDIA Inference API."""
+    model = VLLMMultimodalModel(
+        model=MODEL,
+        base_url=NVIDIA_BASE_URL,
+        audio_format="input_audio",
+    )
+
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Hello! Can you help me?"},
+    ]
+
+    result = asyncio.run(
+        model.generate_async(
+            prompt=messages,
+            tokens_to_generate=1024,
+            temperature=0.7,
+        )
+    )
+
+    assert "generation" in result
+    assert len(result["generation"]) > 0
+    print(f"[text-only] generation: {result['generation'][:200]}")
+
+
+@requires_nvidia_api_key
+@requires_test_audio
+def test_nvidia_api_audio_input():
+    model = VLLMMultimodalModel(
+        model=MODEL,
+        base_url=NVIDIA_BASE_URL,
+        audio_format="input_audio",
+        data_dir=str(TEST_AUDIO_DIR),
+    )
+
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {
+            "role": "user",
+            "content": "What do you hear in this audio? Describe it briefly.",
+            "audio": {"path": "t2_16.wav"},
+        },
+    ]
+
+    result = asyncio.run(
+        model.generate_async(
+            prompt=messages,
+            tokens_to_generate=1024,
+            temperature=0.7,
+        )
+    )
+
+    assert "generation" in result
+    assert len(result["generation"]) > 0
+    print(f"[audio-input] generation: {result['generation'][:300]}")
+
+
+@requires_nvidia_api_key
+@requires_test_audio
+def test_nvidia_api_audio_with_transcription_prompt():
+    """Integration test: ask the model to transcribe audio content."""
+    model = VLLMMultimodalModel(
+        model=MODEL,
+        base_url=NVIDIA_BASE_URL,
+        audio_format="input_audio",
+        data_dir=str(TEST_AUDIO_DIR),
+    )
+
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant that can listen to audio."},
+        {
+            "role": "user",
+            "content": "Please listen to this audio and tell me what you hear.",
+            "audio": {"path": "t3_16.wav"},
+        },
+    ]
+
+    result = asyncio.run(
+        model.generate_async(
+            prompt=messages,
+            tokens_to_generate=1024,
+            temperature=0.7,
+        )
+    )
+
+    assert "generation" in result
+    assert len(result["generation"]) > 0
+    assert result.get("num_generated_tokens", 0) > 0
+    print(f"[transcription] generation: {result['generation'][:300]}")
+    print(f"[transcription] tokens: {result.get('num_generated_tokens')}")

From 12a76c947f1d1a82b233224103e220714a0c055d Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Thu, 19 Feb 2026 04:16:35 -0800
Subject: [PATCH 07/11] update tests and adress comments

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 .../inference/model/vllm_multimodal.py        | 39 ++++++++++++-------
 tests/test_vllm_audio.py                      | 23 ++++++++---
 2 files changed, 42 insertions(+), 20 deletions(-)

diff --git a/nemo_skills/inference/model/vllm_multimodal.py b/nemo_skills/inference/model/vllm_multimodal.py
index 1f57f92fa1..80a27bc6fb 100644
--- a/nemo_skills/inference/model/vllm_multimodal.py
+++ b/nemo_skills/inference/model/vllm_multimodal.py
@@ -76,7 +76,7 @@ def __init__(
         enable_audio_chunking: bool = True,
         audio_chunk_task_types: list[str] | None = None,
         chunk_audio_threshold_sec: int = 30,
-        audio_format: str = "input_audio",
+        audio_format: str | None = None,
         **kwargs,
     ):
         """Initialize VLLMMultimodalModel with audio I/O and external API support.
@@ -87,7 +87,7 @@ def __init__(
             enable_audio_chunking: Master switch for audio chunking.
             audio_chunk_task_types: If None, chunk all task types; if specified, only chunk these.
             chunk_audio_threshold_sec: Audio duration threshold for chunking (in seconds).
-            audio_format: Format for audio content ("audio_url" or "input_audio").
+            audio_format: Format for audio content ("audio_url" or "input_audio"). If None, select by mode.
             **kwargs: Other parameters passed to VLLMModel/BaseModel.
         """
         super().__init__(model=model, base_url=base_url, **kwargs)
@@ -100,6 +100,8 @@ def __init__(
         self.audio_chunk_task_types = audio_chunk_task_types
         self.chunk_audio_threshold_sec = chunk_audio_threshold_sec
 
+        if audio_format is None:
+            audio_format = "input_audio" if self._external_api_mode else "audio_url"
         if audio_format not in ("audio_url", "input_audio"):
             raise ValueError(f"Unsupported audio_format '{audio_format}'. Use 'audio_url' or 'input_audio'.")
         self.audio_format = audio_format
@@ -221,9 +223,12 @@ def _parse_chat_completion_response(self, response, include_response: bool = Fal
         if "generation" in result and result["generation"]:
             match = DEBUG_INFO_PATTERN.search(result["generation"])
             if match:
-                result["debug_info"] = json.loads(match.group(1))
-                # Strip debug_info from generation
-                result["generation"] = DEBUG_INFO_PATTERN.sub("", result["generation"])
+                try:
+                    result["debug_info"] = json.loads(match.group(1))
+                    # Strip debug_info from generation
+                    result["generation"] = DEBUG_INFO_PATTERN.sub("", result["generation"])
+                except json.JSONDecodeError:
+                    LOG.warning("Failed to parse debug_info JSON from content")
 
         choice = response.choices[0]
         if hasattr(choice.message, "audio") and choice.message.audio:
@@ -256,16 +261,20 @@ def _process_audio_response(self, audio_data, response_id: str) -> dict:
             return audio_info
 
         if self.output_audio_dir:
-            audio_bytes = base64.b64decode(audio_base64)
-            filename = f"{response_id}.wav"
-            filepath = os.path.join(self.output_audio_dir, filename)
-
-            with open(filepath, "wb") as f:
-                f.write(audio_bytes)
-
-            audio_info["path"] = filepath
-            audio_info["size_bytes"] = len(audio_bytes)
-            LOG.info(f"Saved audio: {filepath} ({len(audio_bytes)} bytes)")
+            try:
+                audio_bytes = base64.b64decode(audio_base64)
+                filename = f"{response_id}.wav"
+                filepath = os.path.join(self.output_audio_dir, filename)
+
+                with open(filepath, "wb") as f:
+                    f.write(audio_bytes)
+
+                audio_info["path"] = filepath
+                audio_info["size_bytes"] = len(audio_bytes)
+                LOG.info(f"Saved audio: {filepath} ({len(audio_bytes)} bytes)")
+            except Exception as e:
+                LOG.warning(f"Failed to save audio: {e}")
+                audio_info["data"] = audio_base64
         else:
             audio_info["data"] = audio_base64
 
diff --git a/tests/test_vllm_audio.py b/tests/test_vllm_audio.py
index ca7fba0331..a919ab33cb 100644
--- a/tests/test_vllm_audio.py
+++ b/tests/test_vllm_audio.py
@@ -67,6 +67,22 @@ def mock_vllm_multimodal_model(tmp_path):
         return model
 
 
+@pytest.fixture
+def mock_vllm_multimodal_model_input_audio(tmp_path):
+    """Create a mock VLLMMultimodalModel configured for input_audio."""
+    with patch.object(VLLMMultimodalModel, "__init__", lambda self, **kwargs: None):
+        model = VLLMMultimodalModel()
+        model.data_dir = str(tmp_path)
+        model.output_dir = None
+        model.output_audio_dir = None
+        model.enable_audio_chunking = True
+        model.audio_chunk_task_types = None
+        model.chunk_audio_threshold_sec = 30
+        model.audio_format = "input_audio"
+        model._tunnel = None
+        return model
+
+
 def test_content_text_to_list_with_audio(mock_vllm_multimodal_model, tmp_path):
     """Test converting string content with audio to list format.
 
@@ -86,17 +102,14 @@ def test_content_text_to_list_with_audio(mock_vllm_multimodal_model, tmp_path):
     assert result["content"][1]["type"] == "text"
 
 
-def test_content_text_to_list_with_input_audio_format(mock_vllm_multimodal_model, tmp_path):
+def test_content_text_to_list_with_input_audio_format(mock_vllm_multimodal_model_input_audio, tmp_path):
     """Test audio conversion with input_audio format (OpenAI native)."""
     audio_path = tmp_path / "test.wav"
     with open(audio_path, "wb") as f:
         f.write(b"RIFF" + b"\x00" * 100)
 
-    # Switch to input_audio format
-    mock_vllm_multimodal_model.audio_format = "input_audio"
-
     message = {"role": "user", "content": "Describe this audio", "audio": {"path": "test.wav"}}
-    result = mock_vllm_multimodal_model.content_text_to_list(message)
+    result = mock_vllm_multimodal_model_input_audio.content_text_to_list(message)
 
     assert isinstance(result["content"], list)
     assert len(result["content"]) == 2

From f5b57dcc0edd279faa932b975b77bd39160cb1f3 Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Thu, 19 Feb 2026 04:37:02 -0800
Subject: [PATCH 08/11] address review nits

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 nemo_skills/inference/generate.py  | 2 +-
 tests/test_nvidia_inference_api.py | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index ef6986f83e..21b559e10f 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -632,7 +632,7 @@ def drop_fields_from_messages(self, output):
 
             # Filter out content types specified in drop_content_types config
             message["content"] = [
-                content for content in message["content"] if content.get("type") not in self.cfg.drop_content_types
+                content for content in message["content"] if content["type"] not in self.cfg.drop_content_types
             ]
 
     async def postprocess_single_output(self, output, original_data_point):
diff --git a/tests/test_nvidia_inference_api.py b/tests/test_nvidia_inference_api.py
index 86befccda1..bc766c09d8 100644
--- a/tests/test_nvidia_inference_api.py
+++ b/tests/test_nvidia_inference_api.py
@@ -70,6 +70,7 @@ def test_nvidia_api_text_only():
 @requires_nvidia_api_key
 @requires_test_audio
 def test_nvidia_api_audio_input():
+    """Integration test: audio-input generation using a local test audio file."""
     model = VLLMMultimodalModel(
         model=MODEL,
         base_url=NVIDIA_BASE_URL,
@@ -129,6 +130,6 @@ def test_nvidia_api_audio_with_transcription_prompt():
 
     assert "generation" in result
     assert len(result["generation"]) > 0
-    assert result.get("num_generated_tokens", 0) > 0
+    assert result["num_generated_tokens"] > 0
     print(f"[transcription] generation: {result['generation'][:300]}")
-    print(f"[transcription] tokens: {result.get('num_generated_tokens')}")
+    print(f"[transcription] tokens: {result['num_generated_tokens']}")

From 28f44b8a05986d56e2d88b3c3453420437ab32ce Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Thu, 19 Feb 2026 04:52:50 -0800
Subject: [PATCH 09/11] avoid extra deepcopy after audio injection

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 .../inference/model/vllm_multimodal.py        | 27 +++++--------------
 1 file changed, 7 insertions(+), 20 deletions(-)

diff --git a/nemo_skills/inference/model/vllm_multimodal.py b/nemo_skills/inference/model/vllm_multimodal.py
index 80a27bc6fb..da2d13eb6a 100644
--- a/nemo_skills/inference/model/vllm_multimodal.py
+++ b/nemo_skills/inference/model/vllm_multimodal.py
@@ -180,7 +180,7 @@ def _get_api_key(self, api_key: str | None, api_key_env_var: str | None, base_ur
 
         return api_key
 
-    def _build_request_body(self, top_k, min_p, repetition_penalty, extra_body: dict = None):
+    def _build_request_body(self, top_k, min_p, repetition_penalty, extra_body: dict | None = None):
         """Build request body, skipping vLLM-specific params for external APIs.
 
         Args:
@@ -312,7 +312,7 @@ def content_text_to_list(self, message: dict) -> dict:
             New message dict with content converted to list format including audio.
         """
         if "audio" not in message and "audios" not in message:
-            return message
+            return copy.deepcopy(message)
 
         result = copy.deepcopy(message)
 
@@ -417,9 +417,7 @@ async def _generate_with_chunking(
         result = None
 
         # Track cumulative statistics across chunks
-        total_input_tokens = 0
-        total_generated_tokens = 0
-        total_time = 0.0
+        total_num_generated_tokens = 0
 
         for chunk_idx, audio_chunk in enumerate(chunks):
             chunk_messages = []
@@ -447,18 +445,13 @@ async def _generate_with_chunking(
 
                 chunk_messages.append(msg_copy)
 
-            # Preprocess messages before sending to model
-            chunk_messages = self._preprocess_messages_for_model(chunk_messages)
-
             # Generate for this chunk using parent's generate_async
             result = await super().generate_async(
                 prompt=chunk_messages, tokens_to_generate=tokens_to_generate, **kwargs
             )
 
             # Sum statistics from each chunk
-            total_input_tokens += result.get("input_tokens", 0)
-            total_generated_tokens += result.get("generated_tokens", 0)
-            total_time += result.get("time_elapsed", 0.0)
+            total_num_generated_tokens += result["num_generated_tokens"]
 
             generation = result["generation"]
             chunk_results.append(generation.strip())
@@ -466,17 +459,12 @@ async def _generate_with_chunking(
         # Aggregate results
         aggregated_text = " ".join(chunk_results)
 
-        if not result:
-            raise RuntimeError("Audio chunk generation returned no result")
-
         final_result = result.copy()
         final_result["generation"] = aggregated_text
         final_result["num_audio_chunks"] = len(chunks)
         final_result["audio_duration"] = duration
         # Update with summed statistics
-        final_result["input_tokens"] = total_input_tokens
-        final_result["generated_tokens"] = total_generated_tokens
-        final_result["time_elapsed"] = total_time
+        final_result["num_generated_tokens"] = total_num_generated_tokens
 
         return final_result
 
@@ -484,7 +472,7 @@ async def generate_async(
         self,
         prompt: str | list[dict] | None = None,
         tokens_to_generate: int | None = None,
-        task_type: str = None,
+        task_type: str | None = None,
         **kwargs,
     ) -> dict:
         """Generate with automatic audio chunking for long audio files.
@@ -509,8 +497,7 @@ async def generate_async(
                 return await self._generate_with_chunking(messages, audio_path, duration, tokens_to_generate, **kwargs)
 
             # No chunking needed - convert audio fields to base64 format
-            messages = [self.content_text_to_list(copy.deepcopy(msg)) for msg in messages]
-            messages = self._preprocess_messages_for_model(messages)
+            messages = [self.content_text_to_list(msg) for msg in messages]
             prompt = messages
 
         # Call parent's generate_async (which handles audio OUTPUT via _parse_chat_completion_response)

From ac97b44a29d730c4e919714f5010b040bf705599 Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Thu, 19 Feb 2026 05:20:19 -0800
Subject: [PATCH 10/11] use nv inference api key and add soundfile

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 nemo_skills/inference/model/vllm_multimodal.py | 6 +++---
 requirements/common-tests.txt                  | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/nemo_skills/inference/model/vllm_multimodal.py b/nemo_skills/inference/model/vllm_multimodal.py
index da2d13eb6a..4b8006abe5 100644
--- a/nemo_skills/inference/model/vllm_multimodal.py
+++ b/nemo_skills/inference/model/vllm_multimodal.py
@@ -152,11 +152,11 @@ def _get_api_key(self, api_key: str | None, api_key_env_var: str | None, base_ur
         # Auto-detect API key based on base_url
         if base_url:
             if "api.nvidia.com" in base_url or "inference-api.nvidia.com" in base_url:
-                api_key = os.getenv("NVIDIA_API_KEY")
+                api_key = os.getenv("NV_INFERENCE_API_KEY") or os.getenv("NVIDIA_API_KEY")
                 if not api_key:
                     raise ValueError(
-                        "NVIDIA_API_KEY is required for NVIDIA APIs and could not be found. "
-                        "Set NVIDIA_API_KEY environment variable or pass api_key explicitly."
+                        "NV_INFERENCE_API_KEY or NVIDIA_API_KEY is required for NVIDIA APIs and could not be found. "
+                        "Set NV_INFERENCE_API_KEY/NVIDIA_API_KEY environment variable or pass api_key explicitly."
                     )
                 return api_key
 
diff --git a/requirements/common-tests.txt b/requirements/common-tests.txt
index 4eca96f92a..8b2fbcce4d 100644
--- a/requirements/common-tests.txt
+++ b/requirements/common-tests.txt
@@ -16,3 +16,4 @@ pytest
 pytest-asyncio
 pytest-cov
 pytest-timeout
+soundfile

From 64a860a093b0c0d73621cb6e119a9472f0747a03 Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Thu, 19 Feb 2026 05:47:50 -0800
Subject: [PATCH 11/11] update yml for jkeys and default key

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 .github/workflows/tests.yml        | 1 +
 tests/test_nvidia_inference_api.py | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 837ca2d58b..6f1e285408 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -85,6 +85,7 @@ jobs:
           .
     - name: Run all tests
       env:
+        NV_INFERENCE_API_KEY: ${{ secrets.NV_INFERENCE_API_KEY }}
         NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
         HF_TOKEN: ${{ secrets.HF_TOKEN }}
       run: |
diff --git a/tests/test_nvidia_inference_api.py b/tests/test_nvidia_inference_api.py
index bc766c09d8..e6abe6d8b5 100644
--- a/tests/test_nvidia_inference_api.py
+++ b/tests/test_nvidia_inference_api.py
@@ -30,8 +30,8 @@
 TEST_AUDIO_T3 = TEST_AUDIO_DIR / "t3_16.wav"  # "sample 3 hello how are you today"
 
 requires_nvidia_api_key = pytest.mark.skipif(
-    not os.getenv("NVIDIA_API_KEY"),
-    reason="NVIDIA_API_KEY environment variable not set",
+    not (os.getenv("NV_INFERENCE_API_KEY") or os.getenv("NVIDIA_API_KEY")),
+    reason="NV_INFERENCE_API_KEY/NVIDIA_API_KEY environment variable not set",
 )
 
 requires_test_audio = pytest.mark.skipif(