From 07a408e18115b3d54e869730066eb7eaf995849c Mon Sep 17 00:00:00 2001
From: Nikolay Karpov <nkarpov@nvidia.com>
Date: Thu, 13 Nov 2025 12:18:31 -0800
Subject: [PATCH 01/38] print

Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/model/vllm.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py
index e9a2146520..1b4d9f7998 100644
--- a/nemo_skills/inference/model/vllm.py
+++ b/nemo_skills/inference/model/vllm.py
@@ -117,6 +117,7 @@ def _build_chat_request_params(
         tools: list[dict] | None = None,
         extra_body: dict = None,
     ) -> dict:
+        print("messages", messages)
         request = {
             "messages": messages,
             "max_tokens": tokens_to_generate,

From 6acbc9f61e2b571e6becb09fea95e0eb04f90b21 Mon Sep 17 00:00:00 2001
From: Nikolay Karpov <nkarpov@nvidia.com>
Date: Thu, 13 Nov 2025 13:48:12 -0800
Subject: [PATCH 02/38] content_text_to_list

Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/model/vllm.py | 35 ++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py
index 1b4d9f7998..b5250654ae 100644
--- a/nemo_skills/inference/model/vllm.py
+++ b/nemo_skills/inference/model/vllm.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import logging
-
+import base64
 import requests
 
 from nemo_skills.utils import get_logger_name
@@ -23,6 +23,38 @@
 
 LOG = logging.getLogger(get_logger_name(__file__))
 
+def audio_file_to_base64(audio_file_path: str):
+    """Encodes an audio file into a base64 string."""
+    with open(audio_file_path, "rb") as audio_file:
+        audio_content = audio_file.read()
+        return base64.b64encode(audio_content).decode("utf-8")
+
+def content_text_to_list(message):
+    content = message["content"]
+    if isinstance(content, str):
+        message["content"] = [{"type": "text", "text": content}]
+    elif isinstance(content, list):
+        message["content"] = content
+    else:
+        raise TypeError(str(content))
+
+    if "audios" in message:
+        for audio in message["audios"]:
+            base64_audio = audio_file_to_base64(audio["path"])
+            audio_message = {
+                "type": "audio_url",
+                "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}
+            }
+            message["content"].append(audio_message)
+    if "audio" in message:
+        audio = message["audio"]
+        base64_audio = audio_file_to_base64(audio["path"])
+        audio_message = {
+            "type": "audio_url",
+            "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}
+        }
+        message["content"].append(audio_message)
+    return message
 
 class VLLMModel(BaseModel):
     def __init__(self, **kwargs):
@@ -117,6 +149,7 @@ def _build_chat_request_params(
         tools: list[dict] | None = None,
         extra_body: dict = None,
     ) -> dict:
+        messages = [content_text_to_list(message) for message in messages]
         print("messages", messages)
         request = {
             "messages": messages,

From 78c6313655a721fb56844f0e9da5d940431ee992 Mon Sep 17 00:00:00 2001
From: Nikolay Karpov <nkarpov@nvidia.com>
Date: Thu, 13 Nov 2025 14:23:29 -0800
Subject: [PATCH 03/38] rm print

Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/model/vllm.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py
index b5250654ae..3bfbd25a6c 100644
--- a/nemo_skills/inference/model/vllm.py
+++ b/nemo_skills/inference/model/vllm.py
@@ -150,7 +150,6 @@ def _build_chat_request_params(
         extra_body: dict = None,
     ) -> dict:
         messages = [content_text_to_list(message) for message in messages]
-        print("messages", messages)
         request = {
             "messages": messages,
             "max_tokens": tokens_to_generate,

From 8f5c3a96939dfa2940cc7b875f7f5c88328e5716 Mon Sep 17 00:00:00 2001
From: Nikolay Karpov <nkarpov@nvidia.com>
Date: Thu, 13 Nov 2025 15:53:41 -0800
Subject: [PATCH 04/38] self.discard_binary_types

Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py   | 11 ++++++++
 nemo_skills/inference/model/vllm.py | 39 ++++++++++++++++++-----------
 2 files changed, 35 insertions(+), 15 deletions(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index aae36c7351..6a1cef71e9 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -525,6 +525,14 @@ def dump_outputs(self, outputs, data_points, fout):
         for output in outputs:
             fout.write(json.dumps(output) + "\n")
 
+    def discard_binary_types(self, output):
+        binary_types = {"audio_url"}
+        if isinstance(output["content"], list):
+            for message in output["content"]:
+                if "type" in message and message["type"] in binary_types:
+                    message[message["type"]]["url"] = ""
+        return output
+        
     async def postprocess_single_output(self, output, original_data_point):
         # to make it easier to follow up with other generations and limit accidental errors, we are adding
         # all of the original data to the output file alongside the new generations
@@ -540,6 +548,9 @@ async def postprocess_single_output(self, output, original_data_point):
         for key in output:
             original_data_point.pop(key, None)
         output.update(original_data_point)
+        
+        output = self.discard_binary_types(output)
+
         if self.cfg.parse_reasoning:
             parse_reasoning(
                 output,
diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py
index 3bfbd25a6c..bf783eb9e5 100644
--- a/nemo_skills/inference/model/vllm.py
+++ b/nemo_skills/inference/model/vllm.py
@@ -30,15 +30,32 @@ def audio_file_to_base64(audio_file_path: str):
         return base64.b64encode(audio_content).decode("utf-8")
 
 def content_text_to_list(message):
-    content = message["content"]
-    if isinstance(content, str):
-        message["content"] = [{"type": "text", "text": content}]
-    elif isinstance(content, list):
-        message["content"] = content
-    else:
-        raise TypeError(str(content))
+    if "audio" in message:
+        content = message["content"]
+        if isinstance(content, str):
+            message["content"] = [{"type": "text", "text": content}]
+        elif isinstance(content, list):
+            message["content"] = content
+        else:
+            raise TypeError(str(content))
+
+        audio = message["audio"]
+        base64_audio = audio_file_to_base64(audio["path"])
+        audio_message = {
+            "type": "audio_url",
+            "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}
+        }
+        message["content"].append(audio_message)
 
     if "audios" in message:
+        content = message["content"]
+        if isinstance(content, str):
+            message["content"] = [{"type": "text", "text": content}]
+        elif isinstance(content, list):
+            message["content"] = content
+        else:
+            raise TypeError(str(content))
+        
         for audio in message["audios"]:
             base64_audio = audio_file_to_base64(audio["path"])
             audio_message = {
@@ -46,14 +63,6 @@ def content_text_to_list(message):
                 "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}
             }
             message["content"].append(audio_message)
-    if "audio" in message:
-        audio = message["audio"]
-        base64_audio = audio_file_to_base64(audio["path"])
-        audio_message = {
-            "type": "audio_url",
-            "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}
-        }
-        message["content"].append(audio_message)
     return message
 
 class VLLMModel(BaseModel):

From 5e36e553e843540b265d4891261734bbb71e94d7 Mon Sep 17 00:00:00 2001
From: Nikolay Karpov <nkarpov@nvidia.com>
Date: Thu, 13 Nov 2025 15:59:57 -0800
Subject: [PATCH 05/38] print

Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index 6a1cef71e9..5fd987fffd 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -527,6 +527,7 @@ def dump_outputs(self, outputs, data_points, fout):
 
     def discard_binary_types(self, output):
         binary_types = {"audio_url"}
+        print("output", output)
         if isinstance(output["content"], list):
             for message in output["content"]:
                 if "type" in message and message["type"] in binary_types:

From 862be043e7fa514c40b113669ff35b140b6b0df5 Mon Sep 17 00:00:00 2001
From: Nikolay Karpov <nkarpov@nvidia.com>
Date: Thu, 13 Nov 2025 16:09:48 -0800
Subject: [PATCH 06/38] messages

Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index 5fd987fffd..b829599916 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -525,14 +525,12 @@ def dump_outputs(self, outputs, data_points, fout):
         for output in outputs:
             fout.write(json.dumps(output) + "\n")
 
-    def discard_binary_types(self, output):
+    def drop_binary_data(self, output):
         binary_types = {"audio_url"}
-        print("output", output)
-        if isinstance(output["content"], list):
-            for message in output["content"]:
-                if "type" in message and message["type"] in binary_types:
-                    message[message["type"]]["url"] = ""
-        return output
+        if isinstance(output["messages"][0]["content"], list):
+            for content in output["messages"][0]["content"]:
+                if "type" in content and content["type"] in binary_types:
+                    content[content["type"]]["url"] = ""
         
     async def postprocess_single_output(self, output, original_data_point):
         # to make it easier to follow up with other generations and limit accidental errors, we are adding
@@ -550,7 +548,7 @@ async def postprocess_single_output(self, output, original_data_point):
             original_data_point.pop(key, None)
         output.update(original_data_point)
         
-        output = self.discard_binary_types(output)
+        self.drop_binary_data(output)
 
         if self.cfg.parse_reasoning:
             parse_reasoning(

From 6768b00c65f06d6ff247289074e2d34b3acf00b8 Mon Sep 17 00:00:00 2001
From: Nikolay Karpov <nkarpov@nvidia.com>
Date: Thu, 13 Nov 2025 16:22:07 -0800
Subject: [PATCH 07/38] binary_data_to_drop_from_output

Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py   |  4 ++--
 nemo_skills/inference/model/vllm.py | 17 -----------------
 2 files changed, 2 insertions(+), 19 deletions(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index b829599916..b8ee7c4e13 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -526,10 +526,10 @@ def dump_outputs(self, outputs, data_points, fout):
             fout.write(json.dumps(output) + "\n")
 
     def drop_binary_data(self, output):
-        binary_types = {"audio_url"}
+        binary_data_to_drop_from_output = {"audio_url"}
         if isinstance(output["messages"][0]["content"], list):
             for content in output["messages"][0]["content"]:
-                if "type" in content and content["type"] in binary_types:
+                if "type" in content and content["type"] in binary_data_to_drop_from_output:
                     content[content["type"]]["url"] = ""
         
     async def postprocess_single_output(self, output, original_data_point):
diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py
index bf783eb9e5..ab4a3ab029 100644
--- a/nemo_skills/inference/model/vllm.py
+++ b/nemo_skills/inference/model/vllm.py
@@ -30,23 +30,6 @@ def audio_file_to_base64(audio_file_path: str):
         return base64.b64encode(audio_content).decode("utf-8")
 
 def content_text_to_list(message):
-    if "audio" in message:
-        content = message["content"]
-        if isinstance(content, str):
-            message["content"] = [{"type": "text", "text": content}]
-        elif isinstance(content, list):
-            message["content"] = content
-        else:
-            raise TypeError(str(content))
-
-        audio = message["audio"]
-        base64_audio = audio_file_to_base64(audio["path"])
-        audio_message = {
-            "type": "audio_url",
-            "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}
-        }
-        message["content"].append(audio_message)
-
     if "audios" in message:
         content = message["content"]
         if isinstance(content, str):

From f522c0b719c0643d8e5409a424893a69e2850f01 Mon Sep 17 00:00:00 2001
From: Nikolay Karpov <nkarpov@nvidia.com>
Date: Fri, 14 Nov 2025 11:31:06 -0800
Subject: [PATCH 08/38] print

Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/model/vllm.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py
index ab4a3ab029..509516238d 100644
--- a/nemo_skills/inference/model/vllm.py
+++ b/nemo_skills/inference/model/vllm.py
@@ -50,6 +50,7 @@ def content_text_to_list(message):
 
 class VLLMModel(BaseModel):
     def __init__(self, **kwargs):
+        print("kwargskwargs", str(kwargs))
         super().__init__(**kwargs)
 
     def _get_tokenizer_endpoint(self):

From d48d43f1b7ee1fb8d1178f89c055caecbfc26105 Mon Sep 17 00:00:00 2001
From: Nikolay Karpov <nkarpov@nvidia.com>
Date: Fri, 14 Nov 2025 11:37:21 -0800
Subject: [PATCH 09/38] print

Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py   | 1 +
 nemo_skills/inference/model/vllm.py | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index b8ee7c4e13..c33ba57d4c 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -607,6 +607,7 @@ async def generate_with_semaphore(self, **generation_params):
         as long as those requests also use this function.
         """
         async with self.semaphore:
+            print("generation_paramsgeneration_params", str(generation_params))
             return await self.llm.generate_async(**generation_params)
 
     async def evaluate_single_datapoint(self, data_point):
diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py
index 509516238d..ab4a3ab029 100644
--- a/nemo_skills/inference/model/vllm.py
+++ b/nemo_skills/inference/model/vllm.py
@@ -50,7 +50,6 @@ def content_text_to_list(message):
 
 class VLLMModel(BaseModel):
     def __init__(self, **kwargs):
-        print("kwargskwargs", str(kwargs))
         super().__init__(**kwargs)
 
     def _get_tokenizer_endpoint(self):

From 4487ca1478f297e3f32d757922b6ad94e6279e0d Mon Sep 17 00:00:00 2001
From: Nikolay Karpov <nkarpov@nvidia.com>
Date: Fri, 14 Nov 2025 14:46:14 -0800
Subject: [PATCH 10/38] data_dir

Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py   |  1 -
 nemo_skills/inference/model/vllm.py | 43 +++++++++++++++--------------
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index c33ba57d4c..b8ee7c4e13 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -607,7 +607,6 @@ async def generate_with_semaphore(self, **generation_params):
         as long as those requests also use this function.
         """
         async with self.semaphore:
-            print("generation_paramsgeneration_params", str(generation_params))
             return await self.llm.generate_async(**generation_params)
 
     async def evaluate_single_datapoint(self, data_point):
diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py
index ab4a3ab029..02a07f1fc2 100644
--- a/nemo_skills/inference/model/vllm.py
+++ b/nemo_skills/inference/model/vllm.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import logging
+import os
 import base64
 import requests
 
@@ -29,27 +30,10 @@ def audio_file_to_base64(audio_file_path: str):
         audio_content = audio_file.read()
         return base64.b64encode(audio_content).decode("utf-8")
 
-def content_text_to_list(message):
-    if "audios" in message:
-        content = message["content"]
-        if isinstance(content, str):
-            message["content"] = [{"type": "text", "text": content}]
-        elif isinstance(content, list):
-            message["content"] = content
-        else:
-            raise TypeError(str(content))
-        
-        for audio in message["audios"]:
-            base64_audio = audio_file_to_base64(audio["path"])
-            audio_message = {
-                "type": "audio_url",
-                "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}
-            }
-            message["content"].append(audio_message)
-    return message
 
 class VLLMModel(BaseModel):
-    def __init__(self, **kwargs):
+    def __init__(self, data_dir, **kwargs):
+        self.data_dir = data_dir
         super().__init__(**kwargs)
 
     def _get_tokenizer_endpoint(self):
@@ -123,6 +107,25 @@ def _build_completion_request_params(
             "extra_body": self._build_request_body(top_k, min_p, repetition_penalty, extra_body=extra_body),
         }
 
+    def content_text_to_list(self, message):
+        if "audios" in message:
+            content = message["content"]
+            if isinstance(content, str):
+                message["content"] = [{"type": "text", "text": content}]
+            elif isinstance(content, list):
+                message["content"] = content
+            else:
+                raise TypeError(str(content))
+            
+            for audio in message["audios"]:
+                base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"]))
+                audio_message = {
+                    "type": "audio_url",
+                    "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}
+                }
+                message["content"].append(audio_message)
+        return message
+
     def _build_chat_request_params(
         self,
         messages: list[dict],
@@ -141,7 +144,7 @@ def _build_chat_request_params(
         tools: list[dict] | None = None,
         extra_body: dict = None,
     ) -> dict:
-        messages = [content_text_to_list(message) for message in messages]
+        messages = [self.content_text_to_list(message) for message in messages]
         request = {
             "messages": messages,
             "max_tokens": tokens_to_generate,

From c53d4059844dc6a782b759a7750f37688417f758 Mon Sep 17 00:00:00 2001
From: Nikolay Karpov <nkarpov@nvidia.com>
Date: Fri, 14 Nov 2025 15:08:19 -0800
Subject: [PATCH 11/38] data_dir=''

Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py   | 3 ++-
 nemo_skills/inference/model/vllm.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index b8ee7c4e13..32b457cfce 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -391,7 +391,8 @@ def setup_llm(self):
                 additional_config={"sandbox": self.cfg.sandbox},
             )
         else:
-            llm = get_model(**self.cfg.server, tokenizer=self.tokenizer)
+            print("self.cfgself.cfg", str(self.cfg))
+            llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir=data_dir)
 
         if self.cfg.parallel_thinking.mode is not None:
             # We don't want to override these key variables which overlap with self.cfg
diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py
index 02a07f1fc2..6036bc8869 100644
--- a/nemo_skills/inference/model/vllm.py
+++ b/nemo_skills/inference/model/vllm.py
@@ -32,7 +32,7 @@ def audio_file_to_base64(audio_file_path: str):
 
 
 class VLLMModel(BaseModel):
-    def __init__(self, data_dir, **kwargs):
+    def __init__(self, data_dir: str = "", **kwargs):
         self.data_dir = data_dir
         super().__init__(**kwargs)
 

From a7cf5b5d688a4b71291b51cd322b48fd077af5a3 Mon Sep 17 00:00:00 2001
From: Nikolay Karpov <nkarpov@nvidia.com>
Date: Fri, 14 Nov 2025 15:21:22 -0800
Subject: [PATCH 12/38] eval_config

Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index 32b457cfce..ac6c39218d 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -392,7 +392,8 @@ def setup_llm(self):
             )
         else:
             print("self.cfgself.cfg", str(self.cfg))
-            llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir=data_dir)
+            print("self.cfg.eval_config[]", self.cfg.eval_config["data_dir"])
+            llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir=self.cfg.eval_config["data_dir"])
 
         if self.cfg.parallel_thinking.mode is not None:
             # We don't want to override these key variables which overlap with self.cfg

From 689e6fcbfb96aa58cea7903ec6cfa91daf291052 Mon Sep 17 00:00:00 2001
From: Nikolay Karpov <nkarpov@nvidia.com>
Date: Fri, 14 Nov 2025 15:39:29 -0800
Subject: [PATCH 13/38] eval_type

Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index ac6c39218d..2181bd6589 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import asyncio
+import os
 import json
 import logging
 import random
@@ -391,9 +392,7 @@ def setup_llm(self):
                 additional_config={"sandbox": self.cfg.sandbox},
             )
         else:
-            print("self.cfgself.cfg", str(self.cfg))
-            print("self.cfg.eval_config[]", self.cfg.eval_config["data_dir"])
-            llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir=self.cfg.eval_config["data_dir"])
+            llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir=os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type))
 
         if self.cfg.parallel_thinking.mode is not None:
             # We don't want to override these key variables which overlap with self.cfg

From 9536b8b00e7edc06b6d41aafe692791346d167b1 Mon Sep 17 00:00:00 2001
From: Nikolay Karpov <nkarpov@nvidia.com>
Date: Fri, 14 Nov 2025 16:37:18 -0800
Subject: [PATCH 14/38] if

Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index 2181bd6589..a1dc5e6ac8 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -392,7 +392,11 @@ def setup_llm(self):
                 additional_config={"sandbox": self.cfg.sandbox},
             )
         else:
-            llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir=os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type))
+            if self.cfg.eval_config["data_dir"]:
+                data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type)
+            else:
+                data_dir = ""
+            llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir = data_dir)
 
         if self.cfg.parallel_thinking.mode is not None:
             # We don't want to override these key variables which overlap with self.cfg

From 155a4806cb5deb875235bc2e4ef9b7361ad03780 Mon Sep 17 00:00:00 2001
From: Nikolay Karpov <nkarpov@nvidia.com>
Date: Fri, 14 Nov 2025 16:45:57 -0800
Subject: [PATCH 15/38] or

Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index a1dc5e6ac8..afe665dd3e 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -392,10 +392,10 @@ def setup_llm(self):
                 additional_config={"sandbox": self.cfg.sandbox},
             )
         else:
-            if self.cfg.eval_config["data_dir"]:
-                data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type)
-            else:
+            if isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None)):
                 data_dir = ""
+            else:
+                data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type)
             llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir = data_dir)
 
         if self.cfg.parallel_thinking.mode is not None:

From 0d9292d711892070c1bb0d58f8c5100827ad3c57 Mon Sep 17 00:00:00 2001
From: Nikolay Karpov <nkarpov@nvidia.com>
Date: Fri, 14 Nov 2025 16:54:04 -0800
Subject: [PATCH 16/38] print

Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index afe665dd3e..4876b29d51 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -392,6 +392,7 @@ def setup_llm(self):
                 additional_config={"sandbox": self.cfg.sandbox},
             )
         else:
+            print("self.cfg.eval_config", self.cfg)
             if isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None)):
                 data_dir = ""
             else:

From c9d99a52661ca564a3f28d9184dbd699978ba114 Mon Sep 17 00:00:00 2001
From: Nikolay Karpov <nkarpov@nvidia.com>
Date: Fri, 14 Nov 2025 17:03:02 -0800
Subject: [PATCH 17/38] print

Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index 4876b29d51..364bd73375 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -393,10 +393,10 @@ def setup_llm(self):
             )
         else:
             print("self.cfg.eval_config", self.cfg)
-            if isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None)):
-                data_dir = ""
-            else:
-                data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type)
+            # if isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None)):
+                # data_dir = ""
+            # else:
+            data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type)
             llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir = data_dir)
 
         if self.cfg.parallel_thinking.mode is not None:

From 0c54cad5efea067d10e54f57d7d28946c2b4aebc Mon Sep 17 00:00:00 2001
From: Nikolay Karpov <nkarpov@nvidia.com>
Date: Fri, 14 Nov 2025 17:25:42 -0800
Subject: [PATCH 18/38] print

Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index 364bd73375..7caf31a0fc 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -392,11 +392,11 @@ def setup_llm(self):
                 additional_config={"sandbox": self.cfg.sandbox},
             )
         else:
-            print("self.cfg.eval_config", self.cfg)
-            # if isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None)):
-                # data_dir = ""
-            # else:
-            data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type)
+            if isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None)):
+                print("self.cfg.eval_config", self.cfg)
+                data_dir = ""
+            else:
+                data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type)
             llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir = data_dir)
 
         if self.cfg.parallel_thinking.mode is not None:

From fc7a3445b3de2023264392a2b7624292948204c8 Mon Sep 17 00:00:00 2001
From: Nikolay Karpov <nkarpov@nvidia.com>
Date: Fri, 14 Nov 2025 17:33:23 -0800
Subject: [PATCH 19/38] ++val_type

Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/dataset/mmau-pro/closed_form/__init__.py | 1 +
 nemo_skills/dataset/mmau-pro/open_ended/__init__.py  | 1 +
 nemo_skills/inference/generate.py                    | 1 -
 3 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/nemo_skills/dataset/mmau-pro/closed_form/__init__.py b/nemo_skills/dataset/mmau-pro/closed_form/__init__.py
index 4e3b424d84..4390c1d887 100644
--- a/nemo_skills/dataset/mmau-pro/closed_form/__init__.py
+++ b/nemo_skills/dataset/mmau-pro/closed_form/__init__.py
@@ -16,6 +16,7 @@
 METRICS_TYPE = "mmau_pro_closed_form"
 SCORE_MODULE = "nemo_skills.evaluation.metrics.mmau_pro_metrics"
 GENERATION_ARGS = "++prompt_format=openai"
+EVAL_ARGS = "++eval_type=mmau-pro"
 
 # NVEmbed judge configuration for closed-form evaluation
 JUDGE_PIPELINE_ARGS = {
diff --git a/nemo_skills/dataset/mmau-pro/open_ended/__init__.py b/nemo_skills/dataset/mmau-pro/open_ended/__init__.py
index 22773d6fed..9e1e812a97 100644
--- a/nemo_skills/dataset/mmau-pro/open_ended/__init__.py
+++ b/nemo_skills/dataset/mmau-pro/open_ended/__init__.py
@@ -16,6 +16,7 @@
 METRICS_TYPE = "mmau_pro_open_ended"
 SCORE_MODULE = "nemo_skills.evaluation.metrics.mmau_pro_metrics"
 GENERATION_ARGS = "++prompt_format=openai"
+EVAL_ARGS = "++eval_type=mmau-pro"
 
 # Judge configuration for open-ended evaluation using NVIDIA API
 JUDGE_PIPELINE_ARGS = {
diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index 7caf31a0fc..afe665dd3e 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -393,7 +393,6 @@ def setup_llm(self):
             )
         else:
             if isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None)):
-                print("self.cfg.eval_config", self.cfg)
                 data_dir = ""
             else:
                 data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type)

From c68f8af81c5551e1ee818109a31dd4f0b16c1ed3 Mon Sep 17 00:00:00 2001
From: Nikolay Karpov <nkarpov@nvidia.com>
Date: Thu, 20 Nov 2025 10:08:30 -0800
Subject: [PATCH 20/38] audio

Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py   |  5 ++---
 nemo_skills/inference/model/vllm.py | 11 ++++++++++-
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index afe665dd3e..b60cfd3eae 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -392,9 +392,8 @@ def setup_llm(self):
                 additional_config={"sandbox": self.cfg.sandbox},
             )
         else:
-            if isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None)):
-                data_dir = ""
-            else:
+            data_dir = ""
+            if "data_dir" in self.cfg.eval_config and not (isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None))):
                 data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type)
             llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir = data_dir)
 
diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py
index 6036bc8869..e50e2c90c4 100644
--- a/nemo_skills/inference/model/vllm.py
+++ b/nemo_skills/inference/model/vllm.py
@@ -108,7 +108,7 @@ def _build_completion_request_params(
         }
 
     def content_text_to_list(self, message):
-        if "audios" in message:
+        if "audio" in message or "audios" in message:
             content = message["content"]
             if isinstance(content, str):
                 message["content"] = [{"type": "text", "text": content}]
@@ -117,6 +117,15 @@ def content_text_to_list(self, message):
             else:
                 raise TypeError(str(content))
             
+        if "audio" in message:
+            audio = message["audio"]
+            base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"]))
+            audio_message = {
+                "type": "audio_url",
+                "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}
+            }
+            message["content"].append(audio_message)
+        elif "audios" in message:
             for audio in message["audios"]:
                 base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"]))
                 audio_message = {

From cfaf38a4dcf4969144cc265aeac829ccbba2301e Mon Sep 17 00:00:00 2001
From: Nikolay Karpov <nkarpov@nvidia.com>
Date: Thu, 20 Nov 2025 17:51:15 -0800
Subject: [PATCH 21/38] rm data_dir

Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index b60cfd3eae..9bd81e297b 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -392,10 +392,10 @@ def setup_llm(self):
                 additional_config={"sandbox": self.cfg.sandbox},
             )
         else:
-            data_dir = ""
             if "data_dir" in self.cfg.eval_config and not (isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None))):
                 data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type)
-            llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir = data_dir)
+                llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir = data_dir)
+            llm = get_model(**self.cfg.server, tokenizer=self.tokenizer)
 
         if self.cfg.parallel_thinking.mode is not None:
             # We don't want to override these key variables which overlap with self.cfg

From 81861646b1310bac6b59231b8fb5892250142a6b Mon Sep 17 00:00:00 2001
From: Nikolay Karpov <nkarpov@nvidia.com>
Date: Thu, 20 Nov 2025 18:02:29 -0800
Subject: [PATCH 22/38] else

Signed-off-by: Nikolay Karpov <nkarpov@nvidia.com>
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index 9bd81e297b..776bbfe420 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -395,7 +395,8 @@ def setup_llm(self):
             if "data_dir" in self.cfg.eval_config and not (isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None))):
                 data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type)
                 llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir = data_dir)
-            llm = get_model(**self.cfg.server, tokenizer=self.tokenizer)
+            else:
+                llm = get_model(**self.cfg.server, tokenizer=self.tokenizer)
 
         if self.cfg.parallel_thinking.mode is not None:
             # We don't want to override these key variables which overlap with self.cfg

From 813d675d7214ccba076fe9de686256a7bd5d6911 Mon Sep 17 00:00:00 2001
From: mmkrtchyan <mmkrtchyan@nvidia.com>
Date: Wed, 26 Nov 2025 15:23:42 +0400
Subject: [PATCH 23/38] mmau-pro open-ended metrics improvement

Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 .../dataset/mmau-pro/open_ended/__init__.py   |   3 +-
 nemo_skills/dataset/mmau-pro/prepare.py       |  23 +--
 .../evaluation/metrics/mmau_pro_metrics.py    | 132 ++++++++++++++----
 nemo_skills/inference/generate.py             |  16 ++-
 nemo_skills/prompt/config/judge/mmau-pro.yaml |  30 ++++
 nemo_skills/prompt/config/judge/speechlm.yaml |  28 ----
 6 files changed, 163 insertions(+), 69 deletions(-)
 create mode 100644 nemo_skills/prompt/config/judge/mmau-pro.yaml
 delete mode 100644 nemo_skills/prompt/config/judge/speechlm.yaml

diff --git a/nemo_skills/dataset/mmau-pro/open_ended/__init__.py b/nemo_skills/dataset/mmau-pro/open_ended/__init__.py
index 9e1e812a97..c5f09272d2 100644
--- a/nemo_skills/dataset/mmau-pro/open_ended/__init__.py
+++ b/nemo_skills/dataset/mmau-pro/open_ended/__init__.py
@@ -16,7 +16,6 @@
 METRICS_TYPE = "mmau_pro_open_ended"
 SCORE_MODULE = "nemo_skills.evaluation.metrics.mmau_pro_metrics"
 GENERATION_ARGS = "++prompt_format=openai"
-EVAL_ARGS = "++eval_type=mmau-pro"
 
 # Judge configuration for open-ended evaluation using NVIDIA API
 JUDGE_PIPELINE_ARGS = {
@@ -24,4 +23,4 @@
     "server_type": "openai",
     "server_address": "https://integrate.api.nvidia.com/v1",
 }
-JUDGE_ARGS = "++prompt_config=judge/speechlm ++generation_key=judgement"
+JUDGE_ARGS = "++prompt_config=judge/mmau-pro ++generation_key=judgement"
diff --git a/nemo_skills/dataset/mmau-pro/prepare.py b/nemo_skills/dataset/mmau-pro/prepare.py
index a6f04d621b..0ea66ec2b7 100644
--- a/nemo_skills/dataset/mmau-pro/prepare.py
+++ b/nemo_skills/dataset/mmau-pro/prepare.py
@@ -75,8 +75,8 @@ def format_entry(entry, with_audio=False):
     if category == "open":
         content = entry["question"]
     elif choices and len(choices) > 1:
-        options_text = "\n".join(f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices))
-        content = f"{entry['question']}\n\n{options_text}"
+        options_text = "\n".join(f"{chr(65 + i)}) {choice}" for i, choice in enumerate(choices))
+        content = f"{entry['question']}\n\n{options_text}\n\nRespond with the complete text of the correct option, not just the letter."
     else:
         content = entry["question"]
 
@@ -84,13 +84,18 @@ def format_entry(entry, with_audio=False):
 
     if entry.get("audio_path"):
         audio_path = entry["audio_path"]
-
-        if isinstance(audio_path, list) and audio_path:
-            user_message["audios"] = [{"path": path, "duration": 10.0} for path in audio_path]
-        elif isinstance(audio_path, str):
-            user_message["audio"] = {"path": audio_path, "duration": 10.0}
-
-    formatted_entry["messages"] = [user_message]
+        # Prepend /dataset/mmau-pro/ to make paths absolute for cluster
+        if len(audio_path) == 1:
+            user_message["audio"] = {"path": f"/dataset/mmau-pro/{audio_path[0]}"}
+        else:
+            user_message["audios"] = [{"path": f"/dataset/mmau-pro/{path}"} for path in audio_path]
+
+    # Don't use /no_think for open-ended questions to allow reasoning
+    system_content = "You are a helpful assistant."
+    if category != "open":
+        system_content += " /no_think"
+
+    formatted_entry["messages"] = [{"role": "system", "content": system_content}, user_message]
     return formatted_entry
 
 
diff --git a/nemo_skills/evaluation/metrics/mmau_pro_metrics.py b/nemo_skills/evaluation/metrics/mmau_pro_metrics.py
index f079049cc1..5e619fce90 100644
--- a/nemo_skills/evaluation/metrics/mmau_pro_metrics.py
+++ b/nemo_skills/evaluation/metrics/mmau_pro_metrics.py
@@ -1,26 +1,48 @@
-# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 import logging
-
+import re
+import numpy as np
 from nemo_skills.evaluation.metrics.base import BaseMetrics, as_int, as_percentage
-from nemo_skills.evaluation.metrics.utils import is_correct_judgement
 from nemo_skills.utils import get_logger_name
 
 LOG = logging.getLogger(get_logger_name(__file__))
 
 
+def extract_multicriteria_scores(judgement_text: str) -> dict[str, float]:
+    """Extract multi-criteria scores (1-5 scale) from LLM judge evaluation.
+
+    Expected format:
+        CORRECTNESS: [score] - [justification]
+        RELEVANCE: [score] - [justification]
+        COMPLETENESS: [score] - [justification]
+        CLARITY: [score] - [justification]
+        OVERALL: [score] - [overall assessment]
+
+    Returns:
+        Dictionary with keys: correctness, relevance, completeness, clarity, overall
+        Defaults to 3.0 if score not found.
+    """
+    scores = {}
+
+    patterns = {
+        "correctness": r"CORRECTNESS:\s*(\d+(?:\.\d+)?)",
+        "relevance": r"RELEVANCE:\s*(\d+(?:\.\d+)?)",
+        "completeness": r"COMPLETENESS:\s*(\d+(?:\.\d+)?)",
+        "clarity": r"CLARITY:\s*(\d+(?:\.\d+)?)",
+        "overall": r"OVERALL:\s*(\d+(?:\.\d+)?)",
+    }
+
+    for criterion, pattern in patterns.items():
+        match = re.search(pattern, judgement_text, re.IGNORECASE)
+        scores[criterion] = float(match.group(1)) if match else 3.0
+
+    # Fallback: compute overall if missing or still 3.0
+    if "overall" not in scores or scores["overall"] == 3.0:
+        criteria_scores = [scores.get(k, 3.0) for k in ["correctness", "relevance", "completeness", "clarity"]]
+        scores["overall"] = sum(criteria_scores) / len(criteria_scores)
+
+    return scores
+
+
 class MMAUProMetrics(BaseMetrics):
     """Metrics class for MMAU-Pro benchmark (all subgroups)."""
 
@@ -28,16 +50,24 @@ def __init__(self, compute_no_answer: bool = True, max_k: int = 1):
         super().__init__(compute_no_answer=compute_no_answer)
         self.max_k = max_k
 
+        # Track multi-criteria scores for open-ended questions (1-5 scale)
+        self.multicriteria_scores = {
+            "correctness": [],
+            "relevance": [],
+            "completeness": [],
+            "clarity": [],
+            "overall": [],
+        }
+
     def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]:
         """Extract correctness scores from prediction."""
         score_dict = {}
 
-        # Open-ended: extract from judge result
+        # Open-ended: use LLM judge correctness score >= 3 as correct
         if "judgement" in prediction:
-            judge_result = is_correct_judgement(prediction["judgement"])
-            score_dict["judge_correct"] = judge_result
-            score_dict["correct"] = judge_result
-        # Closed-form and instruction following: use is_correct
+            multicriteria = extract_multicriteria_scores(prediction["judgement"])
+            score_dict["correct"] = multicriteria.get("correctness", 3.0) >= 3.0
+        # Closed-form / instruction-following: use binary correctness
         elif "is_correct" in prediction:
             score_dict["correct"] = prediction["is_correct"]
         else:
@@ -58,24 +88,61 @@ def get_incorrect_sample(self, prediction: dict) -> dict:
     def update(self, predictions):
         """Update metrics with new predictions."""
         super().update(predictions)
+
         predicted_answers = [pred.get("generation", None).strip() or None for pred in predictions]
         self._compute_pass_at_k(predictions=predictions, predicted_answers=predicted_answers)
         self._compute_majority_at_k(predictions=predictions, predicted_answers=predicted_answers)
 
+        # Collect multi-criteria scores for open-ended questions
+        for pred in predictions:
+            if "judgement" in pred:
+                multicriteria = extract_multicriteria_scores(pred["judgement"])
+                for criterion in self.multicriteria_scores:
+                    self.multicriteria_scores[criterion].append(multicriteria.get(criterion, 3.0))
+
     def get_metrics(self):
         """Get computed metrics."""
         metrics_dict = super().get_metrics()
+
         for agg_mode, agg_metrics in metrics_dict.items():
-            # Ensure avg_tokens is always present for MMAU-Pro
+            # Ensure avg_tokens is present
             if "avg_tokens" not in agg_metrics:
                 agg_metrics["avg_tokens"] = 0
             if "no_answer" in agg_metrics:
                 agg_metrics["no_answer"] = agg_metrics["no_answer"] / 2.0
-            # Set success_rate from correct or judge_correct
-            if "judge_correct" in agg_metrics:
-                agg_metrics["success_rate"] = agg_metrics["judge_correct"]
+
+            # Add multi-criteria averages for open-ended (convert 1-5 scale to percentage)
+            if self.multicriteria_scores["overall"]:
+                for criterion in self.multicriteria_scores:
+                    scores = self.multicriteria_scores[criterion]
+                    if scores:
+                        # Convert 1-5 scale to 0-100 percentage scale
+                        avg_score = np.mean(scores)
+                        std_score = np.std(scores)
+                        agg_metrics[f"avg_{criterion}"] = (avg_score / 5.0) * 100
+                        agg_metrics[f"std_{criterion}"] = (std_score / 5.0) * 100
+
+                # Set correct and success_rate to avg_correctness for open-ended
+                agg_metrics["correct"] = agg_metrics["avg_correctness"]
+                agg_metrics["success_rate"] = agg_metrics["avg_correctness"]
+                
+                # Calculate good/poor response rates based on overall >= 4 or <= 2
+                overall_scores = self.multicriteria_scores["overall"]
+                good_responses = sum(1 for score in overall_scores if score >= 4.0)
+                poor_responses = sum(1 for score in overall_scores if score <= 2.0)
+
+                agg_metrics["good_response_rate"] = (good_responses / len(overall_scores)) * 100
+                agg_metrics["poor_response_rate"] = (poor_responses / len(overall_scores)) * 100
+
+            # For closed-form / instruction-following: use binary correctness
             elif "correct" in agg_metrics:
                 agg_metrics["success_rate"] = agg_metrics["correct"]
+
+            # Round all numeric values to 2 decimal places
+            for key, value in agg_metrics.items():
+                if isinstance(value, float) and not isinstance(value, bool):
+                    agg_metrics[key] = round(value, 2)
+
         return metrics_dict
 
     def metrics_to_print(self):
@@ -87,5 +154,20 @@ def metrics_to_print(self):
         }
         if self.compute_no_answer:
             base_metrics["no_answer"] = as_percentage
+
+        # Add multi-criteria metrics for open-ended questions (now in percentage format)
+        if self.multicriteria_scores["overall"]:
+            base_metrics.update(
+                {
+                    "avg_overall": as_percentage,
+                    "avg_correctness": as_percentage,
+                    "avg_relevance": as_percentage,
+                    "avg_completeness": as_percentage,
+                    "avg_clarity": as_percentage,
+                    "good_response_rate": as_percentage,
+                    "poor_response_rate": as_percentage,
+                }
+            )
+
         base_metrics["num_entries"] = as_int
         return base_metrics
diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index 776bbfe420..75886767b4 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -531,11 +531,17 @@ def dump_outputs(self, outputs, data_points, fout):
             fout.write(json.dumps(output) + "\n")
 
     def drop_binary_data(self, output):
-        binary_data_to_drop_from_output = {"audio_url"}
-        if isinstance(output["messages"][0]["content"], list):
-            for content in output["messages"][0]["content"]:
-                if "type" in content and content["type"] in binary_data_to_drop_from_output:
-                    content[content["type"]]["url"] = ""
+        """Remove binary data (like base64 audio) from messages to keep output files smaller."""            
+        for message in output["messages"]:
+            # Skip if content is not a list (e.g., string content in system messages)
+            if not isinstance(message.get("content"), list):
+                continue
+                
+            # Filter out audio_url items from list-style content
+            message["content"] = [
+                content for content in message["content"]
+                if content.get("type") != "audio_url"
+            ]
         
     async def postprocess_single_output(self, output, original_data_point):
         # to make it easier to follow up with other generations and limit accidental errors, we are adding
diff --git a/nemo_skills/prompt/config/judge/mmau-pro.yaml b/nemo_skills/prompt/config/judge/mmau-pro.yaml
new file mode 100644
index 0000000000..5339e4ab0d
--- /dev/null
+++ b/nemo_skills/prompt/config/judge/mmau-pro.yaml
@@ -0,0 +1,30 @@
+# Judge prompt configuration for Speech/Audio Language Model evaluation
+# Used for evaluating open-ended responses in MMAU-Pro benchmark
+# Uses multi-criteria scoring on 1-5 scale
+
+user: |-
+  You are an expert evaluator for audio and speech-related questions. Please evaluate the quality of a model's response to a question.
+
+  Question: {question}
+
+  Reference Answer: {expected_answer}
+
+  Model Response: {generation}
+
+  Please evaluate the model response on the following criteria and provide scores from 1-5 (where 5 is best):
+
+  1. **Correctness**: How factually accurate is the response compared to the reference?
+  2. **Relevance**: How well does the response address the specific question asked?
+  3. **Completeness**: Does the response cover all important aspects mentioned in the reference?
+  4. **Clarity**: How clear and well-structured is the response?
+
+  For each criterion, provide:
+  - A score from 1-5
+  - A brief justification (1-2 sentences)
+
+  Format your response as:
+  CORRECTNESS: [score] - [justification]
+  RELEVANCE: [score] - [justification]
+  COMPLETENESS: [score] - [justification]
+  CLARITY: [score] - [justification]
+  OVERALL: [average score] - [overall assessment]
diff --git a/nemo_skills/prompt/config/judge/speechlm.yaml b/nemo_skills/prompt/config/judge/speechlm.yaml
deleted file mode 100644
index 4862558145..0000000000
--- a/nemo_skills/prompt/config/judge/speechlm.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-# Judge prompt configuration for Speech/Audio Language Model evaluation
-# Used for evaluating open-ended responses in MMAU-Pro benchmark
-# Follows nemo-skills standard Yes/No judgement pattern
-
-user: |-
-  You are an expert evaluator for audio and speech-related questions. Please evaluate whether the model's response correctly answers the question.
-
-  Question: {question}
-
-  Reference Answer: {expected_answer}
-
-  Model Response: {generation}
-
-  Your task is to determine if the model's response is correct based on the reference answer. Consider:
-
-  1. **Factual Accuracy**: Is the information in the response factually correct?
-  2. **Relevance**: Does the response address the specific question asked?
-  3. **Completeness**: Does the response cover the key points from the reference answer?
-
-  Please first explain your reasoning in 2-3 sentences, then provide your final judgement.
-
-  Your final judgement must be either "Yes" or "No":
-  - "Yes" if the model response is correct and adequately answers the question
-  - "No" if the model response is incorrect, irrelevant, or inadequate
-
-  Format your response as:
-  Reasoning: [Your explanation]
-  Judgement: [Yes or No]

From 1b7e7151c4bc3665665356a0a4bf9caa2c5d80fa Mon Sep 17 00:00:00 2001
From: mmkrtchyan <mmkrtchyan@nvidia.com>
Date: Wed, 26 Nov 2025 18:17:43 +0400
Subject: [PATCH 24/38] pre-commit fixes

Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 .../evaluation/metrics/mmau_pro_metrics.py    | 18 ++++++++++++++-
 nemo_skills/inference/generate.py             | 23 +++++++++----------
 nemo_skills/inference/model/vllm.py           | 16 +++++--------
 3 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/nemo_skills/evaluation/metrics/mmau_pro_metrics.py b/nemo_skills/evaluation/metrics/mmau_pro_metrics.py
index 5e619fce90..000dbcf13f 100644
--- a/nemo_skills/evaluation/metrics/mmau_pro_metrics.py
+++ b/nemo_skills/evaluation/metrics/mmau_pro_metrics.py
@@ -1,6 +1,22 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import logging
 import re
+
 import numpy as np
+
 from nemo_skills.evaluation.metrics.base import BaseMetrics, as_int, as_percentage
 from nemo_skills.utils import get_logger_name
 
@@ -125,7 +141,7 @@ def get_metrics(self):
                 # Set correct and success_rate to avg_correctness for open-ended
                 agg_metrics["correct"] = agg_metrics["avg_correctness"]
                 agg_metrics["success_rate"] = agg_metrics["avg_correctness"]
-                
+
                 # Calculate good/poor response rates based on overall >= 4 or <= 2
                 overall_scores = self.multicriteria_scores["overall"]
                 good_responses = sum(1 for score in overall_scores if score >= 4.0)
diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index 75886767b4..f78e3baef3 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -13,9 +13,9 @@
 # limitations under the License.
 
 import asyncio
-import os
 import json
 import logging
+import os
 import random
 import shutil
 import subprocess
@@ -392,9 +392,11 @@ def setup_llm(self):
                 additional_config={"sandbox": self.cfg.sandbox},
             )
         else:
-            if "data_dir" in self.cfg.eval_config and not (isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None))):
-                data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type)
-                llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir = data_dir)
+            if "data_dir" in self.cfg.eval_config and not (
+                isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None))
+            ):
+                data_dir = os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type)
+                llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir=data_dir)
             else:
                 llm = get_model(**self.cfg.server, tokenizer=self.tokenizer)
 
@@ -531,18 +533,15 @@ def dump_outputs(self, outputs, data_points, fout):
             fout.write(json.dumps(output) + "\n")
 
     def drop_binary_data(self, output):
-        """Remove binary data (like base64 audio) from messages to keep output files smaller."""            
+        """Remove binary data (like base64 audio) from messages to keep output files smaller."""
         for message in output["messages"]:
             # Skip if content is not a list (e.g., string content in system messages)
             if not isinstance(message.get("content"), list):
                 continue
-                
+
             # Filter out audio_url items from list-style content
-            message["content"] = [
-                content for content in message["content"]
-                if content.get("type") != "audio_url"
-            ]
-        
+            message["content"] = [content for content in message["content"] if content.get("type") != "audio_url"]
+
     async def postprocess_single_output(self, output, original_data_point):
         # to make it easier to follow up with other generations and limit accidental errors, we are adding
         # all of the original data to the output file alongside the new generations
@@ -558,7 +557,7 @@ async def postprocess_single_output(self, output, original_data_point):
         for key in output:
             original_data_point.pop(key, None)
         output.update(original_data_point)
-        
+
         self.drop_binary_data(output)
 
         if self.cfg.parse_reasoning:
diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py
index e50e2c90c4..cff46cf0e6 100644
--- a/nemo_skills/inference/model/vllm.py
+++ b/nemo_skills/inference/model/vllm.py
@@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import base64
 import logging
 import os
-import base64
+
 import requests
 
 from nemo_skills.utils import get_logger_name
@@ -24,6 +25,7 @@
 
 LOG = logging.getLogger(get_logger_name(__file__))
 
+
 def audio_file_to_base64(audio_file_path: str):
     """Encodes an audio file into a base64 string."""
     with open(audio_file_path, "rb") as audio_file:
@@ -116,22 +118,16 @@ def content_text_to_list(self, message):
                 message["content"] = content
             else:
                 raise TypeError(str(content))
-            
+
         if "audio" in message:
             audio = message["audio"]
             base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"]))
-            audio_message = {
-                "type": "audio_url",
-                "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}
-            }
+            audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}}
             message["content"].append(audio_message)
         elif "audios" in message:
             for audio in message["audios"]:
                 base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"]))
-                audio_message = {
-                    "type": "audio_url",
-                    "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}
-                }
+                audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}}
                 message["content"].append(audio_message)
         return message
 

From ea8f2943c0ab0dfb416f2bb2006e8f83e48a1181 Mon Sep 17 00:00:00 2001
From: mmkrtchyan <mmkrtchyan@nvidia.com>
Date: Wed, 26 Nov 2025 19:26:16 +0400
Subject: [PATCH 25/38] Add audio tests for vLLM server

Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 tests/gpu-tests/test_vllm_audio.py |  84 ++++++++++++++++
 tests/test_vllm_audio.py           | 156 +++++++++++++++++++++++++++++
 2 files changed, 240 insertions(+)
 create mode 100644 tests/gpu-tests/test_vllm_audio.py
 create mode 100644 tests/test_vllm_audio.py

diff --git a/tests/gpu-tests/test_vllm_audio.py b/tests/gpu-tests/test_vllm_audio.py
new file mode 100644
index 0000000000..8183adaa80
--- /dev/null
+++ b/tests/gpu-tests/test_vllm_audio.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import shutil
+import subprocess
+import tempfile
+from pathlib import Path
+
+import pytest
+from utils import require_env_var
+
+
+@pytest.mark.gpu
+def test_vllm_audio_generation():
+    """Integration test: Generate with vLLM server using audio input."""
+    model_path = require_env_var("NEMO_SKILLS_TEST_HF_MODEL")
+    model_type = require_env_var("NEMO_SKILLS_TEST_MODEL_TYPE")
+
+    output_dir = f"/tmp/nemo-skills-tests/{model_type}/vllm-audio-generation"
+    # Clean up output directory
+    if Path(output_dir).exists():
+        shutil.rmtree(output_dir)
+
+    # Create test input file with audio
+    with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f:
+        test_data = [
+            {
+                "problem": "Transcribe this audio",
+                "audio": {"path": "/nemo_run/code/tests/slurm-tests/asr_nim/wavs/t2_16.wav"},
+            },
+            {
+                "problem": "What is in this audio?",
+                "audio": {"path": "/nemo_run/code/tests/slurm-tests/asr_nim/wavs/t3_16.wav"},
+            },
+        ]
+        for item in test_data:
+            f.write(json.dumps(item) + '\n')
+        input_file = f.name
+
+    try:
+        cmd = (
+            f"ns generate "
+            f"    --cluster test-local --config_dir {Path(__file__).absolute().parent} "
+            f"    --model {model_path} "
+            f"    --output_dir {output_dir} "
+            f"    --server_type vllm "
+            f"    --server_gpus 1 "
+            f"    --server_nodes 1 "
+            f"    --server_args '--enforce-eager' "
+            f"    --input_file={input_file} "
+            f"    ++prompt_config=openai "
+            f"    ++skip_filled=False "
+        )
+        subprocess.run(cmd, shell=True, check=True)
+
+        # Verify output exists and has audio-related generation
+        with open(f"{output_dir}/output.jsonl") as fin:
+            lines = fin.readlines()
+        
+        assert len(lines) == 2, "Should have 2 output lines"
+        
+        for line in lines:
+            data = json.loads(line)
+            assert "generation" in data, "Should have generation field"
+            assert len(data["generation"]) > 0, "Generation should not be empty"
+            # If model supports audio, generation should contain something
+            print(f"Generated: {data['generation']}")
+
+    finally:
+        # Cleanup temp file
+        Path(input_file).unlink(missing_ok=True)
+
diff --git a/tests/test_vllm_audio.py b/tests/test_vllm_audio.py
new file mode 100644
index 0000000000..56bee85aa2
--- /dev/null
+++ b/tests/test_vllm_audio.py
@@ -0,0 +1,156 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import base64
+import os
+import tempfile
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+from nemo_skills.inference.model.vllm import VLLMModel, audio_file_to_base64
+
+
+# -----------------------
+# Unit tests - no server required
+# -----------------------
+
+def test_audio_file_to_base64():
+    """Test basic audio file encoding to base64."""
+    with tempfile.NamedTemporaryFile(mode='wb', suffix='.wav', delete=False) as f:
+        test_content = b'RIFF' + b'\x00' * 100
+        f.write(test_content)
+        temp_path = f.name
+
+    try:
+        result = audio_file_to_base64(temp_path)
+        assert isinstance(result, str)
+        assert len(result) > 0
+        decoded = base64.b64decode(result)
+        assert decoded == test_content
+    finally:
+        os.unlink(temp_path)
+
+
+@pytest.fixture
+def vllm_model(tmp_path):
+    """Create a VLLMModel instance for testing."""
+    audio_dir = tmp_path / "audio"
+    audio_dir.mkdir()
+    model = VLLMModel(model="test-model", data_dir=str(tmp_path), base_url="http://localhost:5000")
+    return model
+
+
+def test_content_text_to_list_with_audio(vllm_model, tmp_path):
+    """Test converting string content with audio to list format."""
+    audio_path = tmp_path / "audio" / "test.wav"
+    audio_path.parent.mkdir(exist_ok=True)
+    with open(audio_path, 'wb') as f:
+        f.write(b'RIFF' + b'\x00' * 100)
+
+    message = {"role": "user", "content": "Describe this audio", "audio": {"path": "audio/test.wav"}}
+
+    result = vllm_model.content_text_to_list(message)
+
+    assert isinstance(result["content"], list)
+    assert len(result["content"]) == 2
+    assert result["content"][0]["type"] == "text"
+    assert result["content"][1]["type"] == "audio_url"
+    assert result["content"][1]["audio_url"]["url"].startswith("data:audio/wav;base64,")
+
+
+def test_content_text_to_list_with_multiple_audios(vllm_model, tmp_path):
+    """Test handling message with multiple audio files."""
+    audio_dir = tmp_path / "audio"
+    audio_dir.mkdir(exist_ok=True)
+
+    for i in range(2):
+        with open(audio_dir / f"test_{i}.wav", 'wb') as f:
+            f.write(b'RIFF' + b'\x00' * 100)
+
+    message = {
+        "role": "user",
+        "content": "Compare these",
+        "audios": [{"path": "audio/test_0.wav"}, {"path": "audio/test_1.wav"}],
+    }
+
+    result = vllm_model.content_text_to_list(message)
+
+    assert isinstance(result["content"], list)
+    assert len(result["content"]) == 3
+    assert result["content"][0]["type"] == "text"
+    assert result["content"][1]["type"] == "audio_url"
+    assert result["content"][2]["type"] == "audio_url"
+
+
+# -----------------------
+# Request building tests with audio
+# -----------------------
+
+def test_build_chat_request_with_audio(tmp_path, vllm_model):
+    """Test that chat request params are correctly built with audio content."""
+    # Create audio file
+    audio_path = tmp_path / "audio" / "test.wav"
+    audio_path.parent.mkdir(exist_ok=True)
+    with open(audio_path, 'wb') as f:
+        f.write(b'RIFF' + b'\x00' * 100)
+
+    messages = [{"role": "user", "content": "Test audio", "audio": {"path": "audio/test.wav"}}]
+
+    # Build request params - this doesn't make any network calls
+    params = vllm_model._build_chat_request_params(messages=messages, stream=False, tokens_to_generate=10)
+
+    # Validate request structure
+    assert "messages" in params
+    assert len(params["messages"]) == 1
+    content_items = params["messages"][0]["content"]
+    assert isinstance(content_items, list)
+    assert len(content_items) == 2
+    assert content_items[0]["type"] == "text"
+    assert content_items[1]["type"] == "audio_url"
+
+    # Verify base64 encoding is valid
+    audio_url = content_items[1]["audio_url"]["url"]
+    assert audio_url.startswith("data:audio/wav;base64,")
+    audio_b64 = audio_url.split(",", 1)[1]
+    decoded = base64.b64decode(audio_b64)
+    assert decoded.startswith(b'RIFF')
+
+
+@pytest.mark.asyncio
+async def test_generate_with_audio_mocked_response(tmp_path, vllm_model):
+    """Test generate_async with audio by mocking the response (no real server call)."""
+    # Create audio file
+    audio_path = tmp_path / "audio" / "test.wav"
+    audio_path.parent.mkdir(exist_ok=True)
+    with open(audio_path, 'wb') as f:
+        f.write(b'RIFF' + b'\x00' * 100)
+
+    messages = [{"role": "user", "content": "Describe this audio", "audio": {"path": "audio/test.wav"}}]
+
+    # Mock the entire generate_async method - no actual API call made
+    mock_response = {"generation": "This audio contains speech", "num_generated_tokens": 5}
+    
+    with patch.object(vllm_model, "generate_async", new_callable=AsyncMock) as mock_generate:
+        mock_generate.return_value = mock_response
+
+        # Call the mocked method
+        response = await vllm_model.generate_async(prompt=messages, tokens_to_generate=50, temperature=0.0)
+
+        # Verify the mock was called correctly
+        assert response["generation"] == "This audio contains speech"
+        assert response["num_generated_tokens"] == 5
+        mock_generate.assert_awaited_once()
+
+

From 206c7d60096ef6cc8e237be98de910f0ef93d9c0 Mon Sep 17 00:00:00 2001
From: mmkrtchyan <mmkrtchyan@nvidia.com>
Date: Thu, 27 Nov 2025 19:56:11 +0400
Subject: [PATCH 26/38] making audio paths generic

Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index f78e3baef3..38d9d340a3 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -15,7 +15,6 @@
 import asyncio
 import json
 import logging
-import os
 import random
 import shutil
 import subprocess
@@ -392,11 +391,9 @@ def setup_llm(self):
                 additional_config={"sandbox": self.cfg.sandbox},
             )
         else:
-            if "data_dir" in self.cfg.eval_config and not (
-                isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None))
-            ):
-                data_dir = os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type)
-                llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir=data_dir)
+            # data_dir is only needed if audio/media paths are relative
+            if "data_dir" in self.cfg.eval_config and not isinstance(self.cfg.eval_config["data_dir"], type(None)):
+                llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir=self.cfg.eval_config["data_dir"])
             else:
                 llm = get_model(**self.cfg.server, tokenizer=self.tokenizer)
 

From 289c694e110e20c11d052a6904cfd42e0ea26e0e Mon Sep 17 00:00:00 2001
From: mmkrtchyan <mmkrtchyan@nvidia.com>
Date: Tue, 2 Dec 2025 15:32:54 +0400
Subject: [PATCH 27/38] fix generate.py

Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index 38d9d340a3..70145639e0 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -380,6 +380,10 @@ def setup_prompt(self):
     def setup_llm(self):
         self.sandbox = get_sandbox(**self.cfg.sandbox) if self.cfg.sandbox is not None else None
 
+        self.data_dir = None
+        if "data_dir" in self.cfg.eval_config and not isinstance(self.cfg.eval_config.get("data_dir"), type(None)):
+            self.data_dir = self.cfg.eval_config["data_dir"]
+
         if self.cfg.code_execution:
             llm = get_code_execution_model(**self.cfg.server, tokenizer=self.tokenizer, sandbox=self.sandbox)
         elif self.cfg.tool_modules is not None:
@@ -391,11 +395,7 @@ def setup_llm(self):
                 additional_config={"sandbox": self.cfg.sandbox},
             )
         else:
-            # data_dir is only needed if audio/media paths are relative
-            if "data_dir" in self.cfg.eval_config and not isinstance(self.cfg.eval_config["data_dir"], type(None)):
-                llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir=self.cfg.eval_config["data_dir"])
-            else:
-                llm = get_model(**self.cfg.server, tokenizer=self.tokenizer)
+            llm = get_model(**self.cfg.server, tokenizer=self.tokenizer)
 
         if self.cfg.parallel_thinking.mode is not None:
             # We don't want to override these key variables which overlap with self.cfg

From 3cc7ae5a392a6cd330ef65be7d1fc8ed07ea3821 Mon Sep 17 00:00:00 2001
From: mmkrtchyan <mmkrtchyan@nvidia.com>
Date: Fri, 5 Dec 2025 16:36:03 +0400
Subject: [PATCH 28/38] requested changes and audio/text order fix

Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py   |  80 ++++++++++++++--
 nemo_skills/inference/model/vllm.py |  36 --------
 tests/test_vllm_audio.py            | 136 +++++++++-------------------
 3 files changed, 117 insertions(+), 135 deletions(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index 70145639e0..7c721a5d3f 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import asyncio
+import base64
 import json
 import logging
 import random
@@ -186,6 +187,9 @@ class GenerateSolutionsConfig:
     # If True, will enable litellm disk cache (useful for keeping intermediate results in case of job timelimit failures)
     enable_litellm_cache: bool = False
 
+    # List of content types to drop from messages (e.g., base64 audio) to keep output files smaller
+    drop_content_types: list[str] = field(default_factory=lambda: ["audio_url"])
+
     # Evaluation setup if requested. If eval_type is set to None, evaluation is skipped
     eval_type: str | None = None  # "lean4-proof", "math", etc.
     eval_config: dict = field(default_factory=dict)  # Config for the evaluator
@@ -380,10 +384,6 @@ def setup_prompt(self):
     def setup_llm(self):
         self.sandbox = get_sandbox(**self.cfg.sandbox) if self.cfg.sandbox is not None else None
 
-        self.data_dir = None
-        if "data_dir" in self.cfg.eval_config and not isinstance(self.cfg.eval_config.get("data_dir"), type(None)):
-            self.data_dir = self.cfg.eval_config["data_dir"]
-
         if self.cfg.code_execution:
             llm = get_code_execution_model(**self.cfg.server, tokenizer=self.tokenizer, sandbox=self.sandbox)
         elif self.cfg.tool_modules is not None:
@@ -525,6 +525,67 @@ def fill_prompt(self, data_point, data):
                 filled_prompt += self.cfg.prompt_suffix
         return filled_prompt
 
+    def preprocess_prompt(self, prompt: str | list[dict]) -> str | list[dict]:
+        """Preprocess the prompt before sending to the model.
+
+        Override this method to add custom preprocessing logic.
+        By default, auto-detects and handles audio file path to base64 conversion.
+        """
+        if not isinstance(prompt, list):
+            return prompt
+
+        # Auto-detect and convert audio file paths to base64
+        prompt = [self._convert_audio_to_base64(message) for message in prompt]
+
+        return prompt
+
+    def _audio_file_to_base64(self, audio_file_path: str) -> str:
+        """Encodes an audio file into a base64 string."""
+        with open(audio_file_path, "rb") as audio_file:
+            audio_content = audio_file.read()
+            return base64.b64encode(audio_content).decode("utf-8")
+
+    def _convert_audio_to_base64(self, message: dict) -> dict:
+        """Convert audio file paths in a message to base64 inline content.
+
+        Looks for 'audio' or 'audios' keys in the message and converts them
+        to base64-encoded audio_url content items. Removes the original keys
+        after conversion to prevent double-processing by model-specific code.
+
+        CRITICAL: Audio must come BEFORE text for Qwen Audio to transcribe correctly.
+        """
+        if "audio" not in message and "audios" not in message:
+            return message
+
+        audio_items = []
+
+        # Handle single audio
+        if "audio" in message:
+            audio = message.pop("audio")
+            base64_audio = self._audio_file_to_base64(audio["path"])
+            audio_items.append({"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}})
+
+        # Handle multiple audios
+        if "audios" in message:
+            for audio in message.pop("audios"):
+                base64_audio = self._audio_file_to_base64(audio["path"])
+                audio_items.append(
+                    {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}}
+                )
+
+        # Convert existing content to list format and place AFTER audio
+        content = message.get("content", "")
+        if isinstance(content, str):
+            text_items = [{"type": "text", "text": content}] if content else []
+        elif isinstance(content, list):
+            text_items = content
+        else:
+            raise TypeError(f"Unsupported content type: {type(content)}")
+
+        message["content"] = audio_items + text_items
+
+        return message
+
     def dump_outputs(self, outputs, data_points, fout):
         for output in outputs:
             fout.write(json.dumps(output) + "\n")
@@ -536,8 +597,10 @@ def drop_binary_data(self, output):
             if not isinstance(message.get("content"), list):
                 continue
 
-            # Filter out audio_url items from list-style content
-            message["content"] = [content for content in message["content"] if content.get("type") != "audio_url"]
+            # Filter out content types specified in drop_content_types config
+            message["content"] = [
+                content for content in message["content"] if content.get("type") not in self.cfg.drop_content_types
+            ]
 
     async def postprocess_single_output(self, output, original_data_point):
         # to make it easier to follow up with other generations and limit accidental errors, we are adding
@@ -587,10 +650,13 @@ async def process_single_datapoint(self, data_point, all_data):
             # Already a dict from Hydra
             inference_params = dict(self.cfg.inference)
 
+        prompt = self.fill_prompt(data_point, all_data)
+        prompt = self.preprocess_prompt(prompt)
+
         generation_params = {
             **inference_params,
             **self.extra_generate_params,
-            "prompt": self.fill_prompt(data_point, all_data),
+            "prompt": prompt,
             "stop_phrases": [self.cfg.stop_phrase] if self.cfg.stop_phrase else None,
         }
 
diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py
index cff46cf0e6..fb3de3f0e7 100644
--- a/nemo_skills/inference/model/vllm.py
+++ b/nemo_skills/inference/model/vllm.py
@@ -12,9 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import base64
 import logging
-import os
 
 import requests
 
@@ -26,18 +24,7 @@
 LOG = logging.getLogger(get_logger_name(__file__))
 
 
-def audio_file_to_base64(audio_file_path: str):
-    """Encodes an audio file into a base64 string."""
-    with open(audio_file_path, "rb") as audio_file:
-        audio_content = audio_file.read()
-        return base64.b64encode(audio_content).decode("utf-8")
-
-
 class VLLMModel(BaseModel):
-    def __init__(self, data_dir: str = "", **kwargs):
-        self.data_dir = data_dir
-        super().__init__(**kwargs)
-
     def _get_tokenizer_endpoint(self):
         """
         Returns the tokenizer endpoint if available, otherwise returns None.
@@ -109,28 +96,6 @@ def _build_completion_request_params(
             "extra_body": self._build_request_body(top_k, min_p, repetition_penalty, extra_body=extra_body),
         }
 
-    def content_text_to_list(self, message):
-        if "audio" in message or "audios" in message:
-            content = message["content"]
-            if isinstance(content, str):
-                message["content"] = [{"type": "text", "text": content}]
-            elif isinstance(content, list):
-                message["content"] = content
-            else:
-                raise TypeError(str(content))
-
-        if "audio" in message:
-            audio = message["audio"]
-            base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"]))
-            audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}}
-            message["content"].append(audio_message)
-        elif "audios" in message:
-            for audio in message["audios"]:
-                base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"]))
-                audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}}
-                message["content"].append(audio_message)
-        return message
-
     def _build_chat_request_params(
         self,
         messages: list[dict],
@@ -149,7 +114,6 @@ def _build_chat_request_params(
         tools: list[dict] | None = None,
         extra_body: dict = None,
     ) -> dict:
-        messages = [self.content_text_to_list(message) for message in messages]
         request = {
             "messages": messages,
             "max_tokens": tokens_to_generate,
diff --git a/tests/test_vllm_audio.py b/tests/test_vllm_audio.py
index 56bee85aa2..d806c46121 100644
--- a/tests/test_vllm_audio.py
+++ b/tests/test_vllm_audio.py
@@ -15,18 +15,28 @@
 import base64
 import os
 import tempfile
-from unittest.mock import AsyncMock, patch
+from unittest.mock import MagicMock
 
 import pytest
 
-from nemo_skills.inference.model.vllm import VLLMModel, audio_file_to_base64
+from nemo_skills.inference.generate import GenerationTask
 
 
-# -----------------------
-# Unit tests - no server required
-# -----------------------
+@pytest.fixture
+def mock_generation_task():
+    """Create a mock GenerationTask for testing audio preprocessing."""
+    mock_cfg = MagicMock()
+    mock_cfg.drop_content_types = ["audio_url"]
+
+    task = MagicMock(spec=GenerationTask)
+    task.cfg = mock_cfg
+    # Use the real methods
+    task._audio_file_to_base64 = GenerationTask._audio_file_to_base64.__get__(task)
+    task._convert_audio_to_base64 = GenerationTask._convert_audio_to_base64.__get__(task)
+    return task
 
-def test_audio_file_to_base64():
+
+def test_audio_file_to_base64(mock_generation_task):
     """Test basic audio file encoding to base64."""
     with tempfile.NamedTemporaryFile(mode='wb', suffix='.wav', delete=False) as f:
         test_content = b'RIFF' + b'\x00' * 100
@@ -34,7 +44,7 @@ def test_audio_file_to_base64():
         temp_path = f.name
 
     try:
-        result = audio_file_to_base64(temp_path)
+        result = mock_generation_task._audio_file_to_base64(temp_path)
         assert isinstance(result, str)
         assert len(result) > 0
         decoded = base64.b64decode(result)
@@ -43,114 +53,56 @@ def test_audio_file_to_base64():
         os.unlink(temp_path)
 
 
-@pytest.fixture
-def vllm_model(tmp_path):
-    """Create a VLLMModel instance for testing."""
-    audio_dir = tmp_path / "audio"
-    audio_dir.mkdir()
-    model = VLLMModel(model="test-model", data_dir=str(tmp_path), base_url="http://localhost:5000")
-    return model
-
+def test_convert_audio_to_base64_with_audio(mock_generation_task, tmp_path):
+    """Test converting string content with audio to list format.
 
-def test_content_text_to_list_with_audio(vllm_model, tmp_path):
-    """Test converting string content with audio to list format."""
+    CRITICAL: Audio must come BEFORE text for Qwen Audio to transcribe correctly.
+    """
     audio_path = tmp_path / "audio" / "test.wav"
     audio_path.parent.mkdir(exist_ok=True)
     with open(audio_path, 'wb') as f:
         f.write(b'RIFF' + b'\x00' * 100)
 
-    message = {"role": "user", "content": "Describe this audio", "audio": {"path": "audio/test.wav"}}
+    message = {"role": "user", "content": "Describe this audio", "audio": {"path": str(audio_path)}}
 
-    result = vllm_model.content_text_to_list(message)
+    result = mock_generation_task._convert_audio_to_base64(message)
 
     assert isinstance(result["content"], list)
     assert len(result["content"]) == 2
-    assert result["content"][0]["type"] == "text"
-    assert result["content"][1]["type"] == "audio_url"
-    assert result["content"][1]["audio_url"]["url"].startswith("data:audio/wav;base64,")
+    assert result["content"][0]["type"] == "audio_url"
+    assert result["content"][0]["audio_url"]["url"].startswith("data:audio/wav;base64,")
+    assert result["content"][1]["type"] == "text"
+    assert "audio" not in result
+
 
+def test_convert_audio_to_base64_with_multiple_audios(mock_generation_task, tmp_path):
+    """Test handling message with multiple audio files.
 
-def test_content_text_to_list_with_multiple_audios(vllm_model, tmp_path):
-    """Test handling message with multiple audio files."""
+    CRITICAL: Audio must come BEFORE text for Qwen Audio to transcribe correctly.
+    """
     audio_dir = tmp_path / "audio"
     audio_dir.mkdir(exist_ok=True)
 
+    audio_paths = []
     for i in range(2):
-        with open(audio_dir / f"test_{i}.wav", 'wb') as f:
-            f.write(b'RIFF' + b'\x00' * 100)
+        audio_path = audio_dir / f"test_{i}.wav"
+        with open(audio_path, "wb") as f:
+            f.write(b"RIFF" + b"\x00" * 100)
+        audio_paths.append(str(audio_path))
 
     message = {
         "role": "user",
         "content": "Compare these",
-        "audios": [{"path": "audio/test_0.wav"}, {"path": "audio/test_1.wav"}],
+        "audios": [{"path": audio_paths[0]}, {"path": audio_paths[1]}],
     }
 
-    result = vllm_model.content_text_to_list(message)
+    result = mock_generation_task._convert_audio_to_base64(message)
 
     assert isinstance(result["content"], list)
     assert len(result["content"]) == 3
-    assert result["content"][0]["type"] == "text"
+    # Audio MUST come before text for Qwen Audio
+    assert result["content"][0]["type"] == "audio_url"
     assert result["content"][1]["type"] == "audio_url"
-    assert result["content"][2]["type"] == "audio_url"
-
-
-# -----------------------
-# Request building tests with audio
-# -----------------------
-
-def test_build_chat_request_with_audio(tmp_path, vllm_model):
-    """Test that chat request params are correctly built with audio content."""
-    # Create audio file
-    audio_path = tmp_path / "audio" / "test.wav"
-    audio_path.parent.mkdir(exist_ok=True)
-    with open(audio_path, 'wb') as f:
-        f.write(b'RIFF' + b'\x00' * 100)
-
-    messages = [{"role": "user", "content": "Test audio", "audio": {"path": "audio/test.wav"}}]
-
-    # Build request params - this doesn't make any network calls
-    params = vllm_model._build_chat_request_params(messages=messages, stream=False, tokens_to_generate=10)
-
-    # Validate request structure
-    assert "messages" in params
-    assert len(params["messages"]) == 1
-    content_items = params["messages"][0]["content"]
-    assert isinstance(content_items, list)
-    assert len(content_items) == 2
-    assert content_items[0]["type"] == "text"
-    assert content_items[1]["type"] == "audio_url"
-
-    # Verify base64 encoding is valid
-    audio_url = content_items[1]["audio_url"]["url"]
-    assert audio_url.startswith("data:audio/wav;base64,")
-    audio_b64 = audio_url.split(",", 1)[1]
-    decoded = base64.b64decode(audio_b64)
-    assert decoded.startswith(b'RIFF')
-
-
-@pytest.mark.asyncio
-async def test_generate_with_audio_mocked_response(tmp_path, vllm_model):
-    """Test generate_async with audio by mocking the response (no real server call)."""
-    # Create audio file
-    audio_path = tmp_path / "audio" / "test.wav"
-    audio_path.parent.mkdir(exist_ok=True)
-    with open(audio_path, 'wb') as f:
-        f.write(b'RIFF' + b'\x00' * 100)
-
-    messages = [{"role": "user", "content": "Describe this audio", "audio": {"path": "audio/test.wav"}}]
-
-    # Mock the entire generate_async method - no actual API call made
-    mock_response = {"generation": "This audio contains speech", "num_generated_tokens": 5}
-    
-    with patch.object(vllm_model, "generate_async", new_callable=AsyncMock) as mock_generate:
-        mock_generate.return_value = mock_response
-
-        # Call the mocked method
-        response = await vllm_model.generate_async(prompt=messages, tokens_to_generate=50, temperature=0.0)
-
-        # Verify the mock was called correctly
-        assert response["generation"] == "This audio contains speech"
-        assert response["num_generated_tokens"] == 5
-        mock_generate.assert_awaited_once()
-
-
+    assert result["content"][2]["type"] == "text"
+    # Original audios key should be removed
+    assert "audios" not in result

From 5825f277ff90b372371d6adba3737da50882f9b8 Mon Sep 17 00:00:00 2001
From: mmkrtchyan <mmkrtchyan@nvidia.com>
Date: Fri, 5 Dec 2025 18:10:02 +0400
Subject: [PATCH 29/38] resolving conflicts and lint fix

Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 tests/gpu-tests/test_vllm_audio.py | 9 ++++-----
 tests/test_vllm_audio.py           | 8 ++++----
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/tests/gpu-tests/test_vllm_audio.py b/tests/gpu-tests/test_vllm_audio.py
index 8183adaa80..9272a6d85f 100644
--- a/tests/gpu-tests/test_vllm_audio.py
+++ b/tests/gpu-tests/test_vllm_audio.py
@@ -34,7 +34,7 @@ def test_vllm_audio_generation():
         shutil.rmtree(output_dir)
 
     # Create test input file with audio
-    with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f:
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
         test_data = [
             {
                 "problem": "Transcribe this audio",
@@ -46,7 +46,7 @@ def test_vllm_audio_generation():
             },
         ]
         for item in test_data:
-            f.write(json.dumps(item) + '\n')
+            f.write(json.dumps(item) + "\n")
         input_file = f.name
 
     try:
@@ -68,9 +68,9 @@ def test_vllm_audio_generation():
         # Verify output exists and has audio-related generation
         with open(f"{output_dir}/output.jsonl") as fin:
             lines = fin.readlines()
-        
+
         assert len(lines) == 2, "Should have 2 output lines"
-        
+
         for line in lines:
             data = json.loads(line)
             assert "generation" in data, "Should have generation field"
@@ -81,4 +81,3 @@ def test_vllm_audio_generation():
     finally:
         # Cleanup temp file
         Path(input_file).unlink(missing_ok=True)
-
diff --git a/tests/test_vllm_audio.py b/tests/test_vllm_audio.py
index d806c46121..1d62b4e8b1 100644
--- a/tests/test_vllm_audio.py
+++ b/tests/test_vllm_audio.py
@@ -38,8 +38,8 @@ def mock_generation_task():
 
 def test_audio_file_to_base64(mock_generation_task):
     """Test basic audio file encoding to base64."""
-    with tempfile.NamedTemporaryFile(mode='wb', suffix='.wav', delete=False) as f:
-        test_content = b'RIFF' + b'\x00' * 100
+    with tempfile.NamedTemporaryFile(mode="wb", suffix=".wav", delete=False) as f:
+        test_content = b"RIFF" + b"\x00" * 100
         f.write(test_content)
         temp_path = f.name
 
@@ -60,8 +60,8 @@ def test_convert_audio_to_base64_with_audio(mock_generation_task, tmp_path):
     """
     audio_path = tmp_path / "audio" / "test.wav"
     audio_path.parent.mkdir(exist_ok=True)
-    with open(audio_path, 'wb') as f:
-        f.write(b'RIFF' + b'\x00' * 100)
+    with open(audio_path, "wb") as f:
+        f.write(b"RIFF" + b"\x00" * 100)
 
     message = {"role": "user", "content": "Describe this audio", "audio": {"path": str(audio_path)}}
 

From 6f58a1bfd351e1c58c07d419971539f5b715db91 Mon Sep 17 00:00:00 2001
From: mmkrtchyan <mmkrtchyan@nvidia.com>
Date: Fri, 5 Dec 2025 18:29:34 +0400
Subject: [PATCH 30/38] Fix drop_binary_data missing messages check

Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index 7c721a5d3f..8e72c1ae77 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -592,6 +592,10 @@ def dump_outputs(self, outputs, data_points, fout):
 
     def drop_binary_data(self, output):
         """Remove binary data (like base64 audio) from messages to keep output files smaller."""
+        # Skip if output doesn't have messages (e.g., text completion mode or error cases)
+        if "messages" not in output:
+            return
+
         for message in output["messages"]:
             # Skip if content is not a list (e.g., string content in system messages)
             if not isinstance(message.get("content"), list):

From 0f1b86d10efd8fb160062d9ac35bbcb4b821ef26 Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Wed, 10 Dec 2025 08:37:35 -0800
Subject: [PATCH 31/38] Enable audio chunking

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>

clean up

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 nemo_skills/inference/generate.py   |  82 +++-----
 nemo_skills/inference/model/vllm.py | 304 ++++++++++++++++++++++++++++
 2 files changed, 331 insertions(+), 55 deletions(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index 8e72c1ae77..f6d7a8a145 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -13,9 +13,9 @@
 # limitations under the License.
 
 import asyncio
-import base64
 import json
 import logging
+import os
 import random
 import shutil
 import subprocess
@@ -190,6 +190,11 @@ class GenerateSolutionsConfig:
     # List of content types to drop from messages (e.g., base64 audio) to keep output files smaller
     drop_content_types: list[str] = field(default_factory=lambda: ["audio_url"])
 
+    # Audio chunking configuration
+    enable_audio_chunking: bool = True
+    audio_chunk_task_types: list[str] | None = None  # If None, chunk all task types; if specified, only chunk these
+    chunk_audio_threshold_sec: int = 30  # Duration in seconds for each audio chunk
+
     # Evaluation setup if requested. If eval_type is set to None, evaluation is skipped
     eval_type: str | None = None  # "lean4-proof", "math", etc.
     eval_config: dict = field(default_factory=dict)  # Config for the evaluator
@@ -384,6 +389,21 @@ def setup_prompt(self):
     def setup_llm(self):
         self.sandbox = get_sandbox(**self.cfg.sandbox) if self.cfg.sandbox is not None else None
 
+        # Prepare data_dir for audio processing
+        data_dir = ""
+        if "data_dir" in self.cfg.eval_config and not (
+            isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None))
+        ):
+            data_dir = os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type)
+
+        # Prepare audio chunking config
+        audio_chunking_config = {
+            "data_dir": data_dir,
+            "enable_audio_chunking": self.cfg.enable_audio_chunking,
+            "audio_chunk_task_types": self.cfg.audio_chunk_task_types,
+            "chunk_audio_threshold_sec": self.cfg.chunk_audio_threshold_sec,
+        }
+
         if self.cfg.code_execution:
             llm = get_code_execution_model(**self.cfg.server, tokenizer=self.tokenizer, sandbox=self.sandbox)
         elif self.cfg.tool_modules is not None:
@@ -395,7 +415,7 @@ def setup_llm(self):
                 additional_config={"sandbox": self.cfg.sandbox},
             )
         else:
-            llm = get_model(**self.cfg.server, tokenizer=self.tokenizer)
+            llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, **audio_chunking_config)
 
         if self.cfg.parallel_thinking.mode is not None:
             # We don't want to override these key variables which overlap with self.cfg
@@ -529,63 +549,11 @@ def preprocess_prompt(self, prompt: str | list[dict]) -> str | list[dict]:
         """Preprocess the prompt before sending to the model.
 
         Override this method to add custom preprocessing logic.
-        By default, auto-detects and handles audio file path to base64 conversion.
+        Audio conversion is now handled by the model (e.g., VLLMModel).
         """
         if not isinstance(prompt, list):
             return prompt
 
-        # Auto-detect and convert audio file paths to base64
-        prompt = [self._convert_audio_to_base64(message) for message in prompt]
-
-        return prompt
-
-    def _audio_file_to_base64(self, audio_file_path: str) -> str:
-        """Encodes an audio file into a base64 string."""
-        with open(audio_file_path, "rb") as audio_file:
-            audio_content = audio_file.read()
-            return base64.b64encode(audio_content).decode("utf-8")
-
-    def _convert_audio_to_base64(self, message: dict) -> dict:
-        """Convert audio file paths in a message to base64 inline content.
-
-        Looks for 'audio' or 'audios' keys in the message and converts them
-        to base64-encoded audio_url content items. Removes the original keys
-        after conversion to prevent double-processing by model-specific code.
-
-        CRITICAL: Audio must come BEFORE text for Qwen Audio to transcribe correctly.
-        """
-        if "audio" not in message and "audios" not in message:
-            return message
-
-        audio_items = []
-
-        # Handle single audio
-        if "audio" in message:
-            audio = message.pop("audio")
-            base64_audio = self._audio_file_to_base64(audio["path"])
-            audio_items.append({"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}})
-
-        # Handle multiple audios
-        if "audios" in message:
-            for audio in message.pop("audios"):
-                base64_audio = self._audio_file_to_base64(audio["path"])
-                audio_items.append(
-                    {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}}
-                )
-
-        # Convert existing content to list format and place AFTER audio
-        content = message.get("content", "")
-        if isinstance(content, str):
-            text_items = [{"type": "text", "text": content}] if content else []
-        elif isinstance(content, list):
-            text_items = content
-        else:
-            raise TypeError(f"Unsupported content type: {type(content)}")
-
-        message["content"] = audio_items + text_items
-
-        return message
-
     def dump_outputs(self, outputs, data_points, fout):
         for output in outputs:
             fout.write(json.dumps(output) + "\n")
@@ -664,6 +632,10 @@ async def process_single_datapoint(self, data_point, all_data):
             "stop_phrases": [self.cfg.stop_phrase] if self.cfg.stop_phrase else None,
         }
 
+        # Pass task_type for audio chunking
+        if "task_type" in data_point:
+            generation_params["task_type"] = data_point["task_type"]
+
         if self.cfg.code_execution:
             if self.cfg.override_max_code_executions and self.cfg.total_code_executions_in_prompt is not None:
                 generation_params["max_code_executions"] = data_point["total_code_executions"]
diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py
index fb3de3f0e7..894425a874 100644
--- a/nemo_skills/inference/model/vllm.py
+++ b/nemo_skills/inference/model/vllm.py
@@ -12,7 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import base64
 import logging
+import os
 
 import requests
 
@@ -24,7 +26,102 @@
 LOG = logging.getLogger(get_logger_name(__file__))
 
 
+def audio_file_to_base64(audio_file_path: str) -> str:
+    """Encodes an audio file into a base64 string."""
+    with open(audio_file_path, "rb") as audio_file:
+        audio_content = audio_file.read()
+        return base64.b64encode(audio_content).decode("utf-8")
+
+
+def load_audio_file(audio_file_path: str):
+    """Load audio file and return array and sampling rate."""
+    import soundfile as sf
+
+    audio_array, sampling_rate = sf.read(audio_file_path)
+    return audio_array, sampling_rate
+
+
+def chunk_audio(audio_array, sampling_rate, chunk_duration_sec=30):
+    """Chunk audio array into segments of specified duration.
+
+    Args:
+        audio_array: Audio data as numpy array
+        sampling_rate: Sampling rate in Hz
+        chunk_duration_sec: Duration of each chunk in seconds
+
+    Returns:
+        List of audio chunks
+    """
+    import numpy as np
+
+    chunk_samples = int(chunk_duration_sec * sampling_rate)
+    num_chunks = int(np.ceil(len(audio_array) / chunk_samples))
+
+    chunks = []
+    for i in range(num_chunks):
+        start = i * chunk_samples
+        end = min((i + 1) * chunk_samples, len(audio_array))
+        chunks.append(audio_array[start:end])
+
+    return chunks
+
+
+def save_audio_chunk_to_base64(audio_chunk, sampling_rate):
+    """Save audio chunk to temporary file and convert to base64.
+
+    Args:
+        audio_chunk: Audio data as numpy array
+        sampling_rate: Sampling rate in Hz
+
+    Returns:
+        Base64 encoded audio string
+    """
+    import tempfile
+
+    import soundfile as sf
+
+    # Create temporary file
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+        tmp_path = tmp_file.name
+        sf.write(tmp_path, audio_chunk, sampling_rate)
+
+    try:
+        # Read and encode
+        with open(tmp_path, "rb") as f:
+            audio_content = f.read()
+            encoded = base64.b64encode(audio_content).decode("utf-8")
+    finally:
+        # Clean up
+        if os.path.exists(tmp_path):
+            os.unlink(tmp_path)
+
+    return encoded
+
+
 class VLLMModel(BaseModel):
+    def __init__(
+        self,
+        data_dir: str = "",
+        enable_audio_chunking: bool = True,
+        audio_chunk_task_types: list[str] | None = None,
+        chunk_audio_threshold_sec: int = 30,
+        **kwargs,
+    ):
+        """Initialize VLLMModel with audio chunking support.
+
+        Args:
+            data_dir: Base directory for audio files
+            enable_audio_chunking: Master switch for audio chunking
+            audio_chunk_task_types: If None, chunk all task types; if specified, only chunk these
+            chunk_audio_threshold_sec: Audio duration threshold for chunking
+            **kwargs: Other parameters passed to BaseModel
+        """
+        self.data_dir = data_dir
+        self.enable_audio_chunking = enable_audio_chunking
+        self.audio_chunk_task_types = audio_chunk_task_types
+        self.chunk_audio_threshold_sec = chunk_audio_threshold_sec
+        super().__init__(**kwargs)
+
     def _get_tokenizer_endpoint(self):
         """
         Returns the tokenizer endpoint if available, otherwise returns None.
@@ -42,6 +139,64 @@ def _get_tokenizer_endpoint(self):
         except requests.exceptions.RequestException:
             return None
 
+    def content_text_to_list(self, message):
+        """Convert message content with audio to proper list format.
+
+        Handles 'audio' or 'audios' keys in messages and converts them to
+        base64-encoded audio_url content items.
+
+        CRITICAL: Audio must come BEFORE text for Qwen models to transcribe correctly.
+        """
+        if "audio" in message or "audios" in message:
+            content = message["content"]
+            if isinstance(content, str):
+                message["content"] = [{"type": "text", "text": content}]
+            elif isinstance(content, list):
+                message["content"] = content
+            else:
+                raise TypeError(str(content))
+
+        audio_items = []
+
+        if "audio" in message:
+            audio = message["audio"]
+            base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"]))
+            audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}}
+            audio_items.append(audio_message)
+        elif "audios" in message:
+            for audio in message["audios"]:
+                base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"]))
+                audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}}
+                audio_items.append(audio_message)
+
+        # Insert audio items at the BEGINNING of content list (before text)
+        if audio_items:
+            message["content"] = audio_items + message["content"]
+
+        return message
+
+    def _preprocess_messages_for_model(self, messages: list[dict]) -> list[dict]:
+        """Preprocess messages based on model-specific requirements.
+
+        Remove /no_think suffix from system message as many models don't
+        recognize it and it degrades performance (especially Qwen).
+        """
+        processed_messages = []
+        for msg in messages:
+            msg_copy = msg.copy()
+
+            if msg_copy.get("role") == "system" and isinstance(msg_copy.get("content"), str):
+                content = msg_copy["content"]
+                if "/no_think" in content:
+                    LOG.info(f"[PREPROCESS] BEFORE: '{content}'")
+                    content = content.replace(" /no_think", "").replace("/no_think", "")
+                    msg_copy["content"] = content.strip()
+                    LOG.info(f"[PREPROCESS] AFTER: '{msg_copy['content']}'")
+
+            processed_messages.append(msg_copy)
+
+        return processed_messages
+
     def _build_request_body(self, top_k, min_p, repetition_penalty, extra_body: dict = None):
         full_extra_body = {
             "min_p": min_p,
@@ -96,6 +251,152 @@ def _build_completion_request_params(
             "extra_body": self._build_request_body(top_k, min_p, repetition_penalty, extra_body=extra_body),
         }
 
+    async def generate_async(
+        self,
+        prompt: str | list[dict] | None = None,
+        tokens_to_generate: int | None = None,
+        task_type: str = None,
+        **kwargs,
+    ) -> dict:
+        """Generate with automatic audio chunking for long audio files.
+
+        This override checks if the prompt (messages) contains long audio.
+        If so, it chunks the audio, processes each chunk separately, and aggregates results.
+        """
+        if isinstance(prompt, list):
+            messages = prompt
+            needs_chunking, audio_path, duration = self._needs_audio_chunking(messages, task_type)
+
+            if needs_chunking:
+                import re
+
+                audio_array, sampling_rate = load_audio_file(audio_path)
+                chunks = chunk_audio(audio_array, sampling_rate, self.chunk_audio_threshold_sec)
+
+                chunk_results = []
+                for chunk_idx, audio_chunk in enumerate(chunks):
+                    chunk_messages = []
+                    for msg in messages:
+                        msg_copy = msg.copy()
+                        if msg_copy.get("role") == "user" and ("audio" in msg_copy or "audios" in msg_copy):
+                            chunk_base64 = save_audio_chunk_to_base64(audio_chunk, sampling_rate)
+
+                            content = msg_copy.get("content", "")
+                            if isinstance(content, str):
+                                msg_copy["content"] = [{"type": "text", "text": content}]
+
+                            # Add audio chunk at the beginning (before text)
+                            msg_copy["content"] = [
+                                {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{chunk_base64}"}}
+                            ] + msg_copy["content"]
+
+                            # Remove original audio fields to avoid double processing
+                            msg_copy.pop("audio", None)
+                            msg_copy.pop("audios", None)
+
+                        chunk_messages.append(msg_copy)
+
+                    # Preprocess messages to strip /no_think for Qwen models
+                    chunk_messages = self._preprocess_messages_for_model(chunk_messages)
+
+                    # Generate for this chunk (pass as prompt, which accepts list[dict])
+                    result = await super().generate_async(
+                        prompt=chunk_messages, tokens_to_generate=tokens_to_generate, **kwargs
+                    )
+
+                    generation = result.get("generation", "")
+
+                    # Post-process Qwen2-Audio output using AudioBench's
+                    # https://github.com/AudioLLMs/AudioBench/blob/main/src/model_src/qwen2_audio_7b_instruct.py
+
+                    # Possible issue: Remove SRT subtitle timestamps
+                    timestamp_pattern = r"\d+\s+\d{2}:\d{2}:\d{2},\d{3}\s+-->\s+\d{2}:\d{2}:\d{2},\d{3}\s+"
+                    generation = re.sub(timestamp_pattern, "", generation)
+                    timestamp_pattern_no_num = r"\d{2}:\d{2}:\d{2},\d{3}\s+-->\s+\d{2}:\d{2}:\d{2},\d{3}\s*"
+                    generation = re.sub(timestamp_pattern_no_num, "", generation)
+                    generation = re.sub(r"\\n\d+\s+(?=\d{2}:\d{2})", " ", generation)
+                    generation = re.sub(r"\n\d+\s+(?=\d{2}:\d{2})", " ", generation)
+
+                    # === AudioBench's way of processing output ===
+
+                    # Step 1: Extract from double quotes
+                    match = re.search(r'"((?:\\.|[^"\\])*)"', generation)
+                    if match:
+                        generation = match.group(1)
+
+                    # Handle colon-quote patterns
+                    if ":'" in generation:
+                        generation = "'" + generation.split(":'")[1]
+                    elif ": '" in generation:
+                        generation = "'" + generation.split(": '")[1]
+
+                    # Step 3: ALWAYS extract from single quotes
+                    match = re.search(r"'(.*)'", generation)
+                    if match:
+                        generation = match.group(1)
+
+                    chunk_results.append(generation.strip())
+
+                # Aggregate all chunks
+                aggregated_text = " ".join(chunk_results)
+
+                # Return result with aggregated generation
+                # Use the last chunk's result structure but replace generation
+                if result:
+                    final_result = result.copy()
+                    final_result["generation"] = aggregated_text
+                    final_result["num_audio_chunks"] = len(chunks)
+                    final_result["audio_duration"] = duration
+                else:
+                    final_result = {
+                        "generation": aggregated_text,
+                        "num_audio_chunks": len(chunks),
+                        "audio_duration": duration,
+                    }
+
+                return final_result
+
+        # Default behavior for non-chunked audio or non-list prompts
+        return await super().generate_async(prompt=prompt, tokens_to_generate=tokens_to_generate, **kwargs)
+
+    def _needs_audio_chunking(self, messages: list[dict], task_type: str = None) -> tuple[bool, str, float]:
+        """Check if audio in messages needs chunking.
+
+        Modified to support all task types by default, with optional filtering.
+
+        Returns:
+            Tuple of (needs_chunking, audio_path, duration)
+        """
+        if not self.enable_audio_chunking:
+            return False, None, 0.0
+
+        # Check if task type should be chunked (if filter is specified)
+        if self.audio_chunk_task_types is not None:
+            if task_type not in self.audio_chunk_task_types:
+                return False, None, 0.0
+
+        # Find audio in messages
+        for msg in messages:
+            if msg.get("role") == "user":
+                audio_info = msg.get("audio") or (msg.get("audios", [{}])[0] if msg.get("audios") else {})
+                if audio_info and "path" in audio_info:
+                    audio_path = os.path.join(self.data_dir, audio_info["path"])
+
+                    if not os.path.exists(audio_path):
+                        return False, None, 0.0
+
+                    # Load audio to check duration
+                    try:
+                        audio_array, sampling_rate = load_audio_file(audio_path)
+                        duration = len(audio_array) / sampling_rate
+
+                        if duration > self.chunk_audio_threshold_sec:
+                            return True, audio_path, duration
+                    except Exception:
+                        pass
+
+        return False, None, 0.0
+
     def _build_chat_request_params(
         self,
         messages: list[dict],
@@ -114,6 +415,9 @@ def _build_chat_request_params(
         tools: list[dict] | None = None,
         extra_body: dict = None,
     ) -> dict:
+        # Preprocess messages for model-specific requirements (e.g., remove /no_think for Qwen)
+        messages = self._preprocess_messages_for_model(messages)
+        messages = [self.content_text_to_list(message) for message in messages]
         request = {
             "messages": messages,
             "max_tokens": tokens_to_generate,

From 32ed94cef86d0bc0a8e7d7563e9f3ad19ff98838 Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Wed, 10 Dec 2025 09:35:59 -0800
Subject: [PATCH 32/38] remove qwen specific post processing

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 nemo_skills/inference/model/vllm.py | 35 -----------------------------
 1 file changed, 35 deletions(-)

diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py
index 894425a874..8dea2a5071 100644
--- a/nemo_skills/inference/model/vllm.py
+++ b/nemo_skills/inference/model/vllm.py
@@ -268,8 +268,6 @@ async def generate_async(
             needs_chunking, audio_path, duration = self._needs_audio_chunking(messages, task_type)
 
             if needs_chunking:
-                import re
-
                 audio_array, sampling_rate = load_audio_file(audio_path)
                 chunks = chunk_audio(audio_array, sampling_rate, self.chunk_audio_threshold_sec)
 
@@ -296,48 +294,15 @@ async def generate_async(
 
                         chunk_messages.append(msg_copy)
 
-                    # Preprocess messages to strip /no_think for Qwen models
                     chunk_messages = self._preprocess_messages_for_model(chunk_messages)
 
-                    # Generate for this chunk (pass as prompt, which accepts list[dict])
                     result = await super().generate_async(
                         prompt=chunk_messages, tokens_to_generate=tokens_to_generate, **kwargs
                     )
 
                     generation = result.get("generation", "")
-
-                    # Post-process Qwen2-Audio output using AudioBench's
-                    # https://github.com/AudioLLMs/AudioBench/blob/main/src/model_src/qwen2_audio_7b_instruct.py
-
-                    # Possible issue: Remove SRT subtitle timestamps
-                    timestamp_pattern = r"\d+\s+\d{2}:\d{2}:\d{2},\d{3}\s+-->\s+\d{2}:\d{2}:\d{2},\d{3}\s+"
-                    generation = re.sub(timestamp_pattern, "", generation)
-                    timestamp_pattern_no_num = r"\d{2}:\d{2}:\d{2},\d{3}\s+-->\s+\d{2}:\d{2}:\d{2},\d{3}\s*"
-                    generation = re.sub(timestamp_pattern_no_num, "", generation)
-                    generation = re.sub(r"\\n\d+\s+(?=\d{2}:\d{2})", " ", generation)
-                    generation = re.sub(r"\n\d+\s+(?=\d{2}:\d{2})", " ", generation)
-
-                    # === AudioBench's way of processing output ===
-
-                    # Step 1: Extract from double quotes
-                    match = re.search(r'"((?:\\.|[^"\\])*)"', generation)
-                    if match:
-                        generation = match.group(1)
-
-                    # Handle colon-quote patterns
-                    if ":'" in generation:
-                        generation = "'" + generation.split(":'")[1]
-                    elif ": '" in generation:
-                        generation = "'" + generation.split(": '")[1]
-
-                    # Step 3: ALWAYS extract from single quotes
-                    match = re.search(r"'(.*)'", generation)
-                    if match:
-                        generation = match.group(1)
-
                     chunk_results.append(generation.strip())
 
-                # Aggregate all chunks
                 aggregated_text = " ".join(chunk_results)
 
                 # Return result with aggregated generation

From 15f30fd6c91bd5d2810ff933eafd76d65364fbc5 Mon Sep 17 00:00:00 2001
From: mmkrtchyan <mmkrtchyan@nvidia.com>
Date: Tue, 16 Dec 2025 13:27:48 +0400
Subject: [PATCH 33/38] fixing audio chunking config

Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index f6d7a8a145..ddb67ad9a1 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -404,6 +404,10 @@ def setup_llm(self):
             "chunk_audio_threshold_sec": self.cfg.chunk_audio_threshold_sec,
         }
 
+        # Only pass audio_chunking_config to models that support it (VLLM-based servers)
+        audio_supported_servers = {"vllm"}
+        server_type = self.cfg.server.get("server_type", "").lower()
+
         if self.cfg.code_execution:
             llm = get_code_execution_model(**self.cfg.server, tokenizer=self.tokenizer, sandbox=self.sandbox)
         elif self.cfg.tool_modules is not None:
@@ -414,8 +418,10 @@ def setup_llm(self):
                 tokenizer=self.tokenizer,
                 additional_config={"sandbox": self.cfg.sandbox},
             )
-        else:
+        elif server_type in audio_supported_servers:
             llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, **audio_chunking_config)
+        else:
+            llm = get_model(**self.cfg.server, tokenizer=self.tokenizer)
 
         if self.cfg.parallel_thinking.mode is not None:
             # We don't want to override these key variables which overlap with self.cfg

From 8a0765a180d527a5a9f8dfd00cd1dc0895a545d9 Mon Sep 17 00:00:00 2001
From: mmkrtchyan <mmkrtchyan@nvidia.com>
Date: Tue, 16 Dec 2025 14:40:40 +0400
Subject: [PATCH 34/38] Fix audio tests to use VLLMModel instead of
 GenerationTask

Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py |  3 +-
 tests/test_vllm_audio.py          | 59 ++++++++++++++-----------------
 2 files changed, 27 insertions(+), 35 deletions(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index ddb67ad9a1..ec3aeee248 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -557,8 +557,7 @@ def preprocess_prompt(self, prompt: str | list[dict]) -> str | list[dict]:
         Override this method to add custom preprocessing logic.
         Audio conversion is now handled by the model (e.g., VLLMModel).
         """
-        if not isinstance(prompt, list):
-            return prompt
+        return prompt
 
     def dump_outputs(self, outputs, data_points, fout):
         for output in outputs:
diff --git a/tests/test_vllm_audio.py b/tests/test_vllm_audio.py
index 1d62b4e8b1..05cc2645eb 100644
--- a/tests/test_vllm_audio.py
+++ b/tests/test_vllm_audio.py
@@ -15,28 +15,14 @@
 import base64
 import os
 import tempfile
-from unittest.mock import MagicMock
+from unittest.mock import patch
 
 import pytest
 
-from nemo_skills.inference.generate import GenerationTask
+from nemo_skills.inference.model.vllm import VLLMModel, audio_file_to_base64
 
 
-@pytest.fixture
-def mock_generation_task():
-    """Create a mock GenerationTask for testing audio preprocessing."""
-    mock_cfg = MagicMock()
-    mock_cfg.drop_content_types = ["audio_url"]
-
-    task = MagicMock(spec=GenerationTask)
-    task.cfg = mock_cfg
-    # Use the real methods
-    task._audio_file_to_base64 = GenerationTask._audio_file_to_base64.__get__(task)
-    task._convert_audio_to_base64 = GenerationTask._convert_audio_to_base64.__get__(task)
-    return task
-
-
-def test_audio_file_to_base64(mock_generation_task):
+def test_audio_file_to_base64():
     """Test basic audio file encoding to base64."""
     with tempfile.NamedTemporaryFile(mode="wb", suffix=".wav", delete=False) as f:
         test_content = b"RIFF" + b"\x00" * 100
@@ -44,7 +30,7 @@ def test_audio_file_to_base64(mock_generation_task):
         temp_path = f.name
 
     try:
-        result = mock_generation_task._audio_file_to_base64(temp_path)
+        result = audio_file_to_base64(temp_path)
         assert isinstance(result, str)
         assert len(result) > 0
         decoded = base64.b64decode(result)
@@ -53,42 +39,51 @@ def test_audio_file_to_base64(mock_generation_task):
         os.unlink(temp_path)
 
 
-def test_convert_audio_to_base64_with_audio(mock_generation_task, tmp_path):
+@pytest.fixture
+def mock_vllm_model():
+    """Create a mock VLLMModel for testing audio preprocessing."""
+    with patch.object(VLLMModel, "__init__", lambda self, **kwargs: None):
+        model = VLLMModel()
+        model.data_dir = ""
+        return model
+
+
+def test_content_text_to_list_with_audio(mock_vllm_model, tmp_path):
     """Test converting string content with audio to list format.
 
     CRITICAL: Audio must come BEFORE text for Qwen Audio to transcribe correctly.
     """
-    audio_path = tmp_path / "audio" / "test.wav"
-    audio_path.parent.mkdir(exist_ok=True)
+    audio_path = tmp_path / "test.wav"
     with open(audio_path, "wb") as f:
         f.write(b"RIFF" + b"\x00" * 100)
 
-    message = {"role": "user", "content": "Describe this audio", "audio": {"path": str(audio_path)}}
+    # Set data_dir to tmp_path parent so path resolution works
+    mock_vllm_model.data_dir = str(tmp_path)
+
+    message = {"role": "user", "content": "Describe this audio", "audio": {"path": "test.wav"}}
 
-    result = mock_generation_task._convert_audio_to_base64(message)
+    result = mock_vllm_model.content_text_to_list(message)
 
     assert isinstance(result["content"], list)
     assert len(result["content"]) == 2
     assert result["content"][0]["type"] == "audio_url"
     assert result["content"][0]["audio_url"]["url"].startswith("data:audio/wav;base64,")
     assert result["content"][1]["type"] == "text"
-    assert "audio" not in result
 
 
-def test_convert_audio_to_base64_with_multiple_audios(mock_generation_task, tmp_path):
+def test_content_text_to_list_with_multiple_audios(mock_vllm_model, tmp_path):
     """Test handling message with multiple audio files.
 
     CRITICAL: Audio must come BEFORE text for Qwen Audio to transcribe correctly.
     """
-    audio_dir = tmp_path / "audio"
-    audio_dir.mkdir(exist_ok=True)
-
     audio_paths = []
     for i in range(2):
-        audio_path = audio_dir / f"test_{i}.wav"
+        audio_path = tmp_path / f"test_{i}.wav"
         with open(audio_path, "wb") as f:
             f.write(b"RIFF" + b"\x00" * 100)
-        audio_paths.append(str(audio_path))
+        audio_paths.append(f"test_{i}.wav")
+
+    mock_vllm_model.data_dir = str(tmp_path)
 
     message = {
         "role": "user",
@@ -96,7 +91,7 @@ def test_convert_audio_to_base64_with_multiple_audios(mock_generation_task, tmp_
         "audios": [{"path": audio_paths[0]}, {"path": audio_paths[1]}],
     }
 
-    result = mock_generation_task._convert_audio_to_base64(message)
+    result = mock_vllm_model.content_text_to_list(message)
 
     assert isinstance(result["content"], list)
     assert len(result["content"]) == 3
@@ -104,5 +99,3 @@ def test_convert_audio_to_base64_with_multiple_audios(mock_generation_task, tmp_
     assert result["content"][0]["type"] == "audio_url"
     assert result["content"][1]["type"] == "audio_url"
     assert result["content"][2]["type"] == "text"
-    # Original audios key should be removed
-    assert "audios" not in result

From f8c3089daaca298bb38724e516ece553aea24cb0 Mon Sep 17 00:00:00 2001
From: mmkrtchyan <mmkrtchyan@nvidia.com>
Date: Tue, 16 Dec 2025 17:10:06 +0400
Subject: [PATCH 35/38] adding vllm integration test to run_qwen.sh

Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
---
 nemo_skills/inference/generate.py  | 9 +++++++++
 tests/gpu-tests/run_qwen.sh        | 4 ++++
 tests/gpu-tests/test_vllm_audio.py | 6 +++---
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index ec3aeee248..820a175479 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -549,6 +549,15 @@ def fill_prompt(self, data_point, data):
                 filled_prompt[-1]["content"] += self.cfg.prompt_suffix
             else:
                 filled_prompt += self.cfg.prompt_suffix
+
+        # Copy audio fields from data_point to the user message for audio models
+        if isinstance(filled_prompt, list):
+            for msg in filled_prompt:
+                if msg.get("role") == "user":
+                    if "audio" in data_point:
+                        msg["audio"] = data_point["audio"]
+                    break
+
         return filled_prompt
 
     def preprocess_prompt(self, prompt: str | list[dict]) -> str | list[dict]:
diff --git a/tests/gpu-tests/run_qwen.sh b/tests/gpu-tests/run_qwen.sh
index 10201cf9ed..01803c1dae 100755
--- a/tests/gpu-tests/run_qwen.sh
+++ b/tests/gpu-tests/run_qwen.sh
@@ -20,6 +20,10 @@ pytest tests/gpu-tests/test_contamination.py -s -x
 # Tool calling tests (uses same Qwen3-4B-Instruct model)
 pytest tests/gpu-tests/test_tool_calling.py -s -x
 
+# Audio tests (requires Qwen2.5-Omni model)
+export NEMO_SKILLS_TEST_HF_MODEL=Qwen/Qwen2.5-Omni-7B
+pytest tests/gpu-tests/test_vllm_audio.py -s -x
+
 # TODO: Add fast context retry tests
 # pytest tests/gpu-tests/test_context_retry.py -s -x
 
diff --git a/tests/gpu-tests/test_vllm_audio.py b/tests/gpu-tests/test_vllm_audio.py
index 9272a6d85f..2f5d33ef94 100644
--- a/tests/gpu-tests/test_vllm_audio.py
+++ b/tests/gpu-tests/test_vllm_audio.py
@@ -37,11 +37,11 @@ def test_vllm_audio_generation():
     with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
         test_data = [
             {
-                "problem": "Transcribe this audio",
+                "question": "Transcribe this audio",
                 "audio": {"path": "/nemo_run/code/tests/slurm-tests/asr_nim/wavs/t2_16.wav"},
             },
             {
-                "problem": "What is in this audio?",
+                "question": "What is in this audio?",
                 "audio": {"path": "/nemo_run/code/tests/slurm-tests/asr_nim/wavs/t3_16.wav"},
             },
         ]
@@ -60,7 +60,7 @@ def test_vllm_audio_generation():
             f"    --server_nodes 1 "
             f"    --server_args '--enforce-eager' "
             f"    --input_file={input_file} "
-            f"    ++prompt_config=openai "
+            f"    ++prompt_config=generic/default "
             f"    ++skip_filled=False "
         )
         subprocess.run(cmd, shell=True, check=True)

From a88736e3c6dba18407256cfe358262f7bdbace80 Mon Sep 17 00:00:00 2001
From: George Armstrong <georgea@nvidia.com>
Date: Mon, 22 Dec 2025 16:41:47 -0800
Subject: [PATCH 36/38] move audio processing into model (#1137)

Signed-off-by: George Armstrong <georgea@nvidia.com>
---
 nemo_skills/inference/generate.py             |  50 ++-
 nemo_skills/inference/model/__init__.py       |  42 ++
 .../inference/model/audio_processor.py        | 374 ++++++++++++++++++
 nemo_skills/inference/model/vllm.py           | 271 +------------
 4 files changed, 443 insertions(+), 294 deletions(-)
 create mode 100644 nemo_skills/inference/model/audio_processor.py

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index 820a175479..d994feaca8 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -15,7 +15,6 @@
 import asyncio
 import json
 import logging
-import os
 import random
 import shutil
 import subprocess
@@ -39,6 +38,8 @@
     supports_single_eval,
 )
 from nemo_skills.inference.model import (
+    AudioProcessor,
+    AudioProcessorConfig,
     ParallelThinkingConfig,
     get_code_execution_model,
     get_model,
@@ -190,10 +191,10 @@ class GenerateSolutionsConfig:
     # List of content types to drop from messages (e.g., base64 audio) to keep output files smaller
     drop_content_types: list[str] = field(default_factory=lambda: ["audio_url"])
 
-    # Audio chunking configuration
-    enable_audio_chunking: bool = True
-    audio_chunk_task_types: list[str] | None = None  # If None, chunk all task types; if specified, only chunk these
-    chunk_audio_threshold_sec: int = 30  # Duration in seconds for each audio chunk
+    # Audio processing configuration (EXPERIMENTAL)
+    # Set to enable audio file preprocessing (file->base64 conversion, chunking for long audio)
+    # Example: ++audio.data_dir=/path/to/audio ++audio.enable_chunking=true
+    audio: AudioProcessorConfig | None = None
 
     # Evaluation setup if requested. If eval_type is set to None, evaluation is skipped
     eval_type: str | None = None  # "lean4-proof", "math", etc.
@@ -389,25 +390,6 @@ def setup_prompt(self):
     def setup_llm(self):
         self.sandbox = get_sandbox(**self.cfg.sandbox) if self.cfg.sandbox is not None else None
 
-        # Prepare data_dir for audio processing
-        data_dir = ""
-        if "data_dir" in self.cfg.eval_config and not (
-            isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None))
-        ):
-            data_dir = os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type)
-
-        # Prepare audio chunking config
-        audio_chunking_config = {
-            "data_dir": data_dir,
-            "enable_audio_chunking": self.cfg.enable_audio_chunking,
-            "audio_chunk_task_types": self.cfg.audio_chunk_task_types,
-            "chunk_audio_threshold_sec": self.cfg.chunk_audio_threshold_sec,
-        }
-
-        # Only pass audio_chunking_config to models that support it (VLLM-based servers)
-        audio_supported_servers = {"vllm"}
-        server_type = self.cfg.server.get("server_type", "").lower()
-
         if self.cfg.code_execution:
             llm = get_code_execution_model(**self.cfg.server, tokenizer=self.tokenizer, sandbox=self.sandbox)
         elif self.cfg.tool_modules is not None:
@@ -418,11 +400,25 @@ def setup_llm(self):
                 tokenizer=self.tokenizer,
                 additional_config={"sandbox": self.cfg.sandbox},
             )
-        elif server_type in audio_supported_servers:
-            llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, **audio_chunking_config)
         else:
             llm = get_model(**self.cfg.server, tokenizer=self.tokenizer)
 
+        # Audio wrapper (preprocesses messages before they reach the model)
+        if self.cfg.audio is not None:
+            audio_supported_servers = {"vllm"}
+            server_type = self.cfg.server.get("server_type", "").lower()
+            if server_type not in audio_supported_servers:
+                raise ValueError(
+                    f"Audio processing is not supported for server_type='{server_type}'. "
+                    f"Supported server types: {audio_supported_servers}"
+                )
+            llm = AudioProcessor(
+                llm,
+                self.cfg.audio,
+                eval_config=dict(self.cfg.eval_config),
+                eval_type=self.cfg.eval_type,
+            )
+
         if self.cfg.parallel_thinking.mode is not None:
             # We don't want to override these key variables which overlap with self.cfg
             inference_override_config = {
@@ -564,7 +560,7 @@ def preprocess_prompt(self, prompt: str | list[dict]) -> str | list[dict]:
         """Preprocess the prompt before sending to the model.
 
         Override this method to add custom preprocessing logic.
-        Audio conversion is now handled by the model (e.g., VLLMModel).
+        Audio conversion is handled by the AudioProcessor wrapper.
         """
         return prompt
 
diff --git a/nemo_skills/inference/model/__init__.py b/nemo_skills/inference/model/__init__.py
index bd2e246499..214c28b500 100644
--- a/nemo_skills/inference/model/__init__.py
+++ b/nemo_skills/inference/model/__init__.py
@@ -12,6 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+"""Model wrappers and factories for inference.
+
+This module provides:
+- Base model implementations (VLLMModel, OpenAIModel, etc.)
+- Wrappers for additional capabilities:
+  - AudioProcessor: Audio file preprocessing and chunking
+  - CodeExecutionWrapper: Code execution in sandboxes
+  - ToolCallingWrapper: Tool/function calling support
+  - ParallelThinkingTask: Parallel thinking/reasoning
+- Factory functions: get_model, get_code_execution_model, get_tool_calling_model, etc.
+"""
+
 import dataclasses
 
 from nemo_skills.mcp.utils import locate
@@ -19,6 +31,9 @@
 
 # NIM models (speech)
 from .asr_nim import ASRNIMModel
+
+# Audio processing
+from .audio_processor import AudioProcessor, AudioProcessorConfig
 from .azure import AzureOpenAIModel
 
 # Base classes
@@ -80,6 +95,33 @@ def get_model(server_type, tokenizer=None, model_class: str | None = None, **kwa
     return loaded_class(tokenizer=tokenizer, **kwargs)
 
 
+def get_audio_model(server_type, audio_config: dict | AudioProcessorConfig | None = None, **kwargs):
+    """A helper function to create a model with audio processing capabilities.
+
+    This wraps the base model with AudioProcessor for handling audio files in messages.
+
+    Args:
+        server_type: The type of server (vllm, sglang, openai, etc.)
+        audio_config: Audio processing configuration. Can be:
+            - None: No audio processing (returns base model)
+            - dict: Will be converted to AudioProcessorConfig
+            - AudioProcessorConfig: Used directly
+        **kwargs: Additional arguments passed to get_model()
+
+    Returns:
+        Model optionally wrapped with AudioProcessor
+    """
+    model = get_model(server_type=server_type, **kwargs)
+
+    if audio_config is None:
+        return model
+
+    if isinstance(audio_config, dict):
+        audio_config = AudioProcessorConfig(**audio_config)
+
+    return AudioProcessor(model, audio_config)
+
+
 def get_code_execution_model(server_type, tokenizer=None, code_execution=None, sandbox=None, **kwargs):
     """A helper function to make it easier to set server through cmd."""
     model = get_model(server_type=server_type, tokenizer=tokenizer, **kwargs)
diff --git a/nemo_skills/inference/model/audio_processor.py b/nemo_skills/inference/model/audio_processor.py
new file mode 100644
index 0000000000..29c49b80b1
--- /dev/null
+++ b/nemo_skills/inference/model/audio_processor.py
@@ -0,0 +1,374 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Audio processing wrapper for multimodal models.
+
+This module provides an AudioProcessor wrapper that can be composed with any
+BaseModel to add audio preprocessing capabilities. It handles:
+- Converting audio file paths to base64-encoded audio_url format
+- Chunking long audio files for models with duration limits
+- Aggregating results from chunked audio processing
+"""
+
+import base64
+import logging
+import os
+
+from nemo_skills.utils import get_logger_name, nested_dataclass
+
+LOG = logging.getLogger(get_logger_name(__file__))
+
+
+@nested_dataclass(kw_only=True)
+class AudioProcessorConfig:
+    """Configuration for audio preprocessing.
+
+    Attributes:
+        data_dir: Base directory for resolving relative audio file paths.
+        enable_chunking: Whether to chunk long audio files.
+        chunk_task_types: If None, chunk all task types; if specified, only chunk these.
+        chunk_threshold_sec: Audio duration threshold (in seconds) above which to chunk.
+    """
+
+    data_dir: str = ""
+    enable_chunking: bool = True
+    chunk_task_types: list[str] | None = None
+    chunk_threshold_sec: int = 30
+
+
+def audio_file_to_base64(audio_file_path: str) -> str:
+    """Encodes an audio file into a base64 string."""
+    with open(audio_file_path, "rb") as audio_file:
+        audio_content = audio_file.read()
+        return base64.b64encode(audio_content).decode("utf-8")
+
+
+def load_audio_file(audio_file_path: str):
+    """Load audio file and return array and sampling rate."""
+    import soundfile as sf
+
+    audio_array, sampling_rate = sf.read(audio_file_path)
+    return audio_array, sampling_rate
+
+
+def chunk_audio(audio_array, sampling_rate, chunk_duration_sec=30):
+    """Chunk audio array into segments of specified duration.
+
+    Args:
+        audio_array: Audio data as numpy array
+        sampling_rate: Sampling rate in Hz
+        chunk_duration_sec: Duration of each chunk in seconds
+
+    Returns:
+        List of audio chunks
+    """
+    import numpy as np
+
+    chunk_samples = int(chunk_duration_sec * sampling_rate)
+    num_chunks = int(np.ceil(len(audio_array) / chunk_samples))
+
+    chunks = []
+    for i in range(num_chunks):
+        start = i * chunk_samples
+        end = min((i + 1) * chunk_samples, len(audio_array))
+        chunks.append(audio_array[start:end])
+
+    return chunks
+
+
+def save_audio_chunk_to_base64(audio_chunk, sampling_rate) -> str:
+    """Save audio chunk to temporary file and convert to base64.
+
+    Args:
+        audio_chunk: Audio data as numpy array
+        sampling_rate: Sampling rate in Hz
+
+    Returns:
+        Base64 encoded audio string
+    """
+    import tempfile
+
+    import soundfile as sf
+
+    # Create temporary file
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+        tmp_path = tmp_file.name
+        sf.write(tmp_path, audio_chunk, sampling_rate)
+
+    try:
+        # Read and encode
+        with open(tmp_path, "rb") as f:
+            audio_content = f.read()
+            encoded = base64.b64encode(audio_content).decode("utf-8")
+    finally:
+        # Clean up
+        if os.path.exists(tmp_path):
+            os.unlink(tmp_path)
+
+    return encoded
+
+
+class AudioProcessor:
+    """Wraps any model to add audio preprocessing capabilities.
+
+    This wrapper handles:
+    - Converting audio file paths in messages to base64-encoded audio_url format
+    - Chunking long audio files and aggregating results
+    - Passing through all other requests unchanged
+
+    Example usage:
+        model = get_model(server_type="vllm", ...)
+        audio_model = AudioProcessor(model, AudioProcessorConfig(), eval_config={...}, eval_type="audio")
+        result = await audio_model.generate_async(prompt=messages, ...)
+    """
+
+    def __init__(
+        self,
+        model,
+        config: AudioProcessorConfig,
+        eval_config: dict | None = None,
+        eval_type: str | None = None,
+    ):
+        """Initialize AudioProcessor wrapper.
+
+        Args:
+            model: The underlying model to wrap (must have generate_async method)
+            config: Audio processing configuration
+            eval_config: Optional eval config dict (contains "data_dir" key) for inferring data_dir
+            eval_type: Optional eval type string for inferring data_dir
+        """
+        self.model = model
+        self.config = config
+
+        # Resolve data_dir: explicit config takes precedence, then infer from eval_config
+        if config.data_dir:
+            self.data_dir = config.data_dir
+        elif eval_config is not None and eval_type is not None:
+            eval_data_dir = eval_config.get("data_dir")
+            if eval_data_dir is not None:
+                self.data_dir = os.path.join(eval_data_dir, eval_type)
+            else:
+                self.data_dir = ""
+        else:
+            self.data_dir = ""
+
+        # Expose common model attributes for compatibility
+        if hasattr(model, "model_name_or_path"):
+            self.model_name_or_path = model.model_name_or_path
+        if hasattr(model, "tokenizer"):
+            self.tokenizer = model.tokenizer
+
+    async def generate_async(
+        self,
+        prompt: str | list[dict] | None = None,
+        task_type: str = None,
+        **kwargs,
+    ) -> dict:
+        """Generate with automatic audio preprocessing and chunking.
+
+        If the prompt contains audio that needs chunking, processes each chunk
+        separately and aggregates results. Otherwise, converts audio to base64
+        and passes through to the underlying model.
+
+        Args:
+            prompt: Either a string (text completion) or list of messages (chat)
+            task_type: Optional task type for chunking filtering
+            **kwargs: Additional arguments passed to the underlying model
+
+        Returns:
+            Generation result dict with 'generation' key and optional metadata
+        """
+        if isinstance(prompt, list):
+            messages = prompt
+            needs_chunking, audio_path, duration = self._check_chunking_needed(messages, task_type)
+
+            if needs_chunking:
+                return await self._generate_with_chunking(messages, audio_path, duration, **kwargs)
+
+            # Convert audio fields to base64 format
+            messages = self._prepare_audio_messages(messages)
+            prompt = messages
+
+        return await self.model.generate_async(prompt=prompt, **kwargs)
+
+    def _prepare_audio_messages(self, messages: list[dict]) -> list[dict]:
+        """Convert audio file references in messages to base64-encoded audio_url format.
+
+        Handles 'audio' or 'audios' keys in messages and converts them to
+        base64-encoded audio_url content items.
+
+        CRITICAL: Audio must come BEFORE text for Qwen models to transcribe correctly.
+        """
+        prepared_messages = []
+
+        for message in messages:
+            msg = message.copy()
+
+            if "audio" not in msg and "audios" not in msg:
+                prepared_messages.append(msg)
+                continue
+
+            # Convert content to list format if needed
+            content = msg.get("content", "")
+            if isinstance(content, str):
+                text_content = [{"type": "text", "text": content}]
+            elif isinstance(content, list):
+                text_content = content
+            else:
+                raise TypeError(f"Unexpected content type: {type(content)}")
+
+            # Build audio content items
+            audio_items = []
+
+            if "audio" in msg:
+                audio = msg["audio"]
+                audio_path = os.path.join(self.data_dir, audio["path"])
+                base64_audio = audio_file_to_base64(audio_path)
+                audio_items.append(
+                    {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}}
+                )
+                del msg["audio"]
+            elif "audios" in msg:
+                for audio in msg["audios"]:
+                    audio_path = os.path.join(self.data_dir, audio["path"])
+                    base64_audio = audio_file_to_base64(audio_path)
+                    audio_items.append(
+                        {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}}
+                    )
+                del msg["audios"]
+
+            # Audio items BEFORE text content (required for Qwen models)
+            msg["content"] = audio_items + text_content
+            prepared_messages.append(msg)
+
+        return prepared_messages
+
+    def _check_chunking_needed(self, messages: list[dict], task_type: str = None) -> tuple[bool, str, float]:
+        """Check if audio in messages needs chunking.
+
+        Returns:
+            Tuple of (needs_chunking, audio_path, duration)
+        """
+        if not self.config.enable_chunking:
+            return False, None, 0.0
+
+        # Check if task type should be chunked (if filter is specified)
+        if self.config.chunk_task_types is not None:
+            if task_type not in self.config.chunk_task_types:
+                return False, None, 0.0
+
+        # Find audio in messages
+        for msg in messages:
+            if msg.get("role") == "user":
+                audio_info = msg.get("audio") or (msg.get("audios", [{}])[0] if msg.get("audios") else {})
+                if audio_info and "path" in audio_info:
+                    audio_path = os.path.join(self.data_dir, audio_info["path"])
+
+                    if not os.path.exists(audio_path):
+                        return False, None, 0.0
+
+                    # Load audio to check duration
+                    try:
+                        audio_array, sampling_rate = load_audio_file(audio_path)
+                        duration = len(audio_array) / sampling_rate
+
+                        if duration > self.config.chunk_threshold_sec:
+                            return True, audio_path, duration
+                    except Exception:
+                        pass
+
+        return False, None, 0.0
+
+    async def _generate_with_chunking(
+        self,
+        messages: list[dict],
+        audio_path: str,
+        duration: float,
+        tokens_to_generate: int | None = None,
+        **kwargs,
+    ) -> dict:
+        """Generate by chunking long audio and aggregating results.
+
+        Args:
+            messages: Original messages containing audio reference
+            audio_path: Path to the audio file to chunk
+            duration: Duration of audio in seconds
+            tokens_to_generate: Max tokens per chunk
+            **kwargs: Additional generation parameters
+
+        Returns:
+            Aggregated result with combined generation from all chunks
+        """
+        audio_array, sampling_rate = load_audio_file(audio_path)
+        chunks = chunk_audio(audio_array, sampling_rate, self.config.chunk_threshold_sec)
+
+        LOG.info(f"Chunking audio ({duration:.1f}s) into {len(chunks)} chunks of {self.config.chunk_threshold_sec}s")
+
+        chunk_results = []
+        result = None
+
+        for chunk_idx, audio_chunk in enumerate(chunks):
+            chunk_messages = []
+
+            for msg in messages:
+                msg_copy = msg.copy()
+
+                if msg_copy.get("role") == "user" and ("audio" in msg_copy or "audios" in msg_copy):
+                    chunk_base64 = save_audio_chunk_to_base64(audio_chunk, sampling_rate)
+
+                    content = msg_copy.get("content", "")
+                    if isinstance(content, str):
+                        text_content = [{"type": "text", "text": content}]
+                    else:
+                        text_content = content
+
+                    # Add audio chunk at the beginning (before text)
+                    msg_copy["content"] = [
+                        {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{chunk_base64}"}}
+                    ] + text_content
+
+                    # Remove original audio fields
+                    msg_copy.pop("audio", None)
+                    msg_copy.pop("audios", None)
+
+                chunk_messages.append(msg_copy)
+
+            result = await self.model.generate_async(
+                prompt=chunk_messages, tokens_to_generate=tokens_to_generate, **kwargs
+            )
+
+            generation = result.get("generation", "")
+            chunk_results.append(generation.strip())
+
+        # Aggregate results
+        aggregated_text = " ".join(chunk_results)
+
+        if result:
+            final_result = result.copy()
+            final_result["generation"] = aggregated_text
+            final_result["num_audio_chunks"] = len(chunks)
+            final_result["audio_duration"] = duration
+        else:
+            final_result = {
+                "generation": aggregated_text,
+                "num_audio_chunks": len(chunks),
+                "audio_duration": duration,
+            }
+
+        return final_result
+
+    # Proxy other common methods to the underlying model
+    def __getattr__(self, name):
+        """Proxy attribute access to the underlying model."""
+        return getattr(self.model, name)
diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py
index 8dea2a5071..ecf75617e1 100644
--- a/nemo_skills/inference/model/vllm.py
+++ b/nemo_skills/inference/model/vllm.py
@@ -12,9 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import base64
 import logging
-import os
 
 import requests
 
@@ -26,101 +24,12 @@
 LOG = logging.getLogger(get_logger_name(__file__))
 
 
-def audio_file_to_base64(audio_file_path: str) -> str:
-    """Encodes an audio file into a base64 string."""
-    with open(audio_file_path, "rb") as audio_file:
-        audio_content = audio_file.read()
-        return base64.b64encode(audio_content).decode("utf-8")
-
-
-def load_audio_file(audio_file_path: str):
-    """Load audio file and return array and sampling rate."""
-    import soundfile as sf
-
-    audio_array, sampling_rate = sf.read(audio_file_path)
-    return audio_array, sampling_rate
-
-
-def chunk_audio(audio_array, sampling_rate, chunk_duration_sec=30):
-    """Chunk audio array into segments of specified duration.
-
-    Args:
-        audio_array: Audio data as numpy array
-        sampling_rate: Sampling rate in Hz
-        chunk_duration_sec: Duration of each chunk in seconds
-
-    Returns:
-        List of audio chunks
-    """
-    import numpy as np
-
-    chunk_samples = int(chunk_duration_sec * sampling_rate)
-    num_chunks = int(np.ceil(len(audio_array) / chunk_samples))
-
-    chunks = []
-    for i in range(num_chunks):
-        start = i * chunk_samples
-        end = min((i + 1) * chunk_samples, len(audio_array))
-        chunks.append(audio_array[start:end])
-
-    return chunks
-
-
-def save_audio_chunk_to_base64(audio_chunk, sampling_rate):
-    """Save audio chunk to temporary file and convert to base64.
-
-    Args:
-        audio_chunk: Audio data as numpy array
-        sampling_rate: Sampling rate in Hz
-
-    Returns:
-        Base64 encoded audio string
-    """
-    import tempfile
-
-    import soundfile as sf
-
-    # Create temporary file
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
-        tmp_path = tmp_file.name
-        sf.write(tmp_path, audio_chunk, sampling_rate)
-
-    try:
-        # Read and encode
-        with open(tmp_path, "rb") as f:
-            audio_content = f.read()
-            encoded = base64.b64encode(audio_content).decode("utf-8")
-    finally:
-        # Clean up
-        if os.path.exists(tmp_path):
-            os.unlink(tmp_path)
-
-    return encoded
-
-
 class VLLMModel(BaseModel):
-    def __init__(
-        self,
-        data_dir: str = "",
-        enable_audio_chunking: bool = True,
-        audio_chunk_task_types: list[str] | None = None,
-        chunk_audio_threshold_sec: int = 30,
-        **kwargs,
-    ):
-        """Initialize VLLMModel with audio chunking support.
+    """VLLM-compatible model client.
 
-        Args:
-            data_dir: Base directory for audio files
-            enable_audio_chunking: Master switch for audio chunking
-            audio_chunk_task_types: If None, chunk all task types; if specified, only chunk these
-            chunk_audio_threshold_sec: Audio duration threshold for chunking
-            **kwargs: Other parameters passed to BaseModel
-        """
-        self.data_dir = data_dir
-        self.enable_audio_chunking = enable_audio_chunking
-        self.audio_chunk_task_types = audio_chunk_task_types
-        self.chunk_audio_threshold_sec = chunk_audio_threshold_sec
-        super().__init__(**kwargs)
+    This is a clean OpenAI-compatible client for VLLM servers.
+    For audio processing capabilities, wrap this model with AudioProcessor.
+    """
 
     def _get_tokenizer_endpoint(self):
         """
@@ -139,64 +48,6 @@ def _get_tokenizer_endpoint(self):
         except requests.exceptions.RequestException:
             return None
 
-    def content_text_to_list(self, message):
-        """Convert message content with audio to proper list format.
-
-        Handles 'audio' or 'audios' keys in messages and converts them to
-        base64-encoded audio_url content items.
-
-        CRITICAL: Audio must come BEFORE text for Qwen models to transcribe correctly.
-        """
-        if "audio" in message or "audios" in message:
-            content = message["content"]
-            if isinstance(content, str):
-                message["content"] = [{"type": "text", "text": content}]
-            elif isinstance(content, list):
-                message["content"] = content
-            else:
-                raise TypeError(str(content))
-
-        audio_items = []
-
-        if "audio" in message:
-            audio = message["audio"]
-            base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"]))
-            audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}}
-            audio_items.append(audio_message)
-        elif "audios" in message:
-            for audio in message["audios"]:
-                base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"]))
-                audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}}
-                audio_items.append(audio_message)
-
-        # Insert audio items at the BEGINNING of content list (before text)
-        if audio_items:
-            message["content"] = audio_items + message["content"]
-
-        return message
-
-    def _preprocess_messages_for_model(self, messages: list[dict]) -> list[dict]:
-        """Preprocess messages based on model-specific requirements.
-
-        Remove /no_think suffix from system message as many models don't
-        recognize it and it degrades performance (especially Qwen).
-        """
-        processed_messages = []
-        for msg in messages:
-            msg_copy = msg.copy()
-
-            if msg_copy.get("role") == "system" and isinstance(msg_copy.get("content"), str):
-                content = msg_copy["content"]
-                if "/no_think" in content:
-                    LOG.info(f"[PREPROCESS] BEFORE: '{content}'")
-                    content = content.replace(" /no_think", "").replace("/no_think", "")
-                    msg_copy["content"] = content.strip()
-                    LOG.info(f"[PREPROCESS] AFTER: '{msg_copy['content']}'")
-
-            processed_messages.append(msg_copy)
-
-        return processed_messages
-
     def _build_request_body(self, top_k, min_p, repetition_penalty, extra_body: dict = None):
         full_extra_body = {
             "min_p": min_p,
@@ -251,117 +102,6 @@ def _build_completion_request_params(
             "extra_body": self._build_request_body(top_k, min_p, repetition_penalty, extra_body=extra_body),
         }
 
-    async def generate_async(
-        self,
-        prompt: str | list[dict] | None = None,
-        tokens_to_generate: int | None = None,
-        task_type: str = None,
-        **kwargs,
-    ) -> dict:
-        """Generate with automatic audio chunking for long audio files.
-
-        This override checks if the prompt (messages) contains long audio.
-        If so, it chunks the audio, processes each chunk separately, and aggregates results.
-        """
-        if isinstance(prompt, list):
-            messages = prompt
-            needs_chunking, audio_path, duration = self._needs_audio_chunking(messages, task_type)
-
-            if needs_chunking:
-                audio_array, sampling_rate = load_audio_file(audio_path)
-                chunks = chunk_audio(audio_array, sampling_rate, self.chunk_audio_threshold_sec)
-
-                chunk_results = []
-                for chunk_idx, audio_chunk in enumerate(chunks):
-                    chunk_messages = []
-                    for msg in messages:
-                        msg_copy = msg.copy()
-                        if msg_copy.get("role") == "user" and ("audio" in msg_copy or "audios" in msg_copy):
-                            chunk_base64 = save_audio_chunk_to_base64(audio_chunk, sampling_rate)
-
-                            content = msg_copy.get("content", "")
-                            if isinstance(content, str):
-                                msg_copy["content"] = [{"type": "text", "text": content}]
-
-                            # Add audio chunk at the beginning (before text)
-                            msg_copy["content"] = [
-                                {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{chunk_base64}"}}
-                            ] + msg_copy["content"]
-
-                            # Remove original audio fields to avoid double processing
-                            msg_copy.pop("audio", None)
-                            msg_copy.pop("audios", None)
-
-                        chunk_messages.append(msg_copy)
-
-                    chunk_messages = self._preprocess_messages_for_model(chunk_messages)
-
-                    result = await super().generate_async(
-                        prompt=chunk_messages, tokens_to_generate=tokens_to_generate, **kwargs
-                    )
-
-                    generation = result.get("generation", "")
-                    chunk_results.append(generation.strip())
-
-                aggregated_text = " ".join(chunk_results)
-
-                # Return result with aggregated generation
-                # Use the last chunk's result structure but replace generation
-                if result:
-                    final_result = result.copy()
-                    final_result["generation"] = aggregated_text
-                    final_result["num_audio_chunks"] = len(chunks)
-                    final_result["audio_duration"] = duration
-                else:
-                    final_result = {
-                        "generation": aggregated_text,
-                        "num_audio_chunks": len(chunks),
-                        "audio_duration": duration,
-                    }
-
-                return final_result
-
-        # Default behavior for non-chunked audio or non-list prompts
-        return await super().generate_async(prompt=prompt, tokens_to_generate=tokens_to_generate, **kwargs)
-
-    def _needs_audio_chunking(self, messages: list[dict], task_type: str = None) -> tuple[bool, str, float]:
-        """Check if audio in messages needs chunking.
-
-        Modified to support all task types by default, with optional filtering.
-
-        Returns:
-            Tuple of (needs_chunking, audio_path, duration)
-        """
-        if not self.enable_audio_chunking:
-            return False, None, 0.0
-
-        # Check if task type should be chunked (if filter is specified)
-        if self.audio_chunk_task_types is not None:
-            if task_type not in self.audio_chunk_task_types:
-                return False, None, 0.0
-
-        # Find audio in messages
-        for msg in messages:
-            if msg.get("role") == "user":
-                audio_info = msg.get("audio") or (msg.get("audios", [{}])[0] if msg.get("audios") else {})
-                if audio_info and "path" in audio_info:
-                    audio_path = os.path.join(self.data_dir, audio_info["path"])
-
-                    if not os.path.exists(audio_path):
-                        return False, None, 0.0
-
-                    # Load audio to check duration
-                    try:
-                        audio_array, sampling_rate = load_audio_file(audio_path)
-                        duration = len(audio_array) / sampling_rate
-
-                        if duration > self.chunk_audio_threshold_sec:
-                            return True, audio_path, duration
-                    except Exception:
-                        pass
-
-        return False, None, 0.0
-
     def _build_chat_request_params(
         self,
         messages: list[dict],
@@ -380,9 +120,6 @@ def _build_chat_request_params(
         tools: list[dict] | None = None,
         extra_body: dict = None,
     ) -> dict:
-        # Preprocess messages for model-specific requirements (e.g., remove /no_think for Qwen)
-        messages = self._preprocess_messages_for_model(messages)
-        messages = [self.content_text_to_list(message) for message in messages]
         request = {
             "messages": messages,
             "max_tokens": tokens_to_generate,

From 35219879fb6111aa523cef6822767a2429904f95 Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Tue, 23 Dec 2025 08:01:38 -0800
Subject: [PATCH 37/38] fixing task type issue.

1) after refactoring AudioProcessor worked only if we pass ++audio arg. this is inconvenient

2) now we audio-enabling in based on dataset_group=speechlm or task_type=audio

all audio containing benchmarks have one or both

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 nemo_skills/inference/generate.py  | 18 +++++++++++++++---
 nemo_skills/pipeline/utils/eval.py |  6 ++++++
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index d994feaca8..00993c5f66 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -199,6 +199,7 @@ class GenerateSolutionsConfig:
     # Evaluation setup if requested. If eval_type is set to None, evaluation is skipped
     eval_type: str | None = None  # "lean4-proof", "math", etc.
     eval_config: dict = field(default_factory=dict)  # Config for the evaluator
+    dataset_group: str | None = None  # "math", "code", "speechlm", etc. from benchmark's DATASET_GROUP
 
     def __post_init__(self):
         self._post_init_validate_data()
@@ -404,7 +405,14 @@ def setup_llm(self):
             llm = get_model(**self.cfg.server, tokenizer=self.tokenizer)
 
         # Audio wrapper (preprocesses messages before they reach the model)
-        if self.cfg.audio is not None:
+        # Auto-enable for audio benchmarks on vLLM (eval_type=audio OR dataset_group=speechlm)
+        should_enable_audio = (
+            self.cfg.audio is not None or 
+            (self.cfg.server.get("server_type", "").lower() == "vllm" and 
+             (self.cfg.eval_type == "audio" or self.cfg.dataset_group == "speechlm"))
+        )
+        
+        if should_enable_audio:
             audio_supported_servers = {"vllm"}
             server_type = self.cfg.server.get("server_type", "").lower()
             if server_type not in audio_supported_servers:
@@ -412,9 +420,13 @@ def setup_llm(self):
                     f"Audio processing is not supported for server_type='{server_type}'. "
                     f"Supported server types: {audio_supported_servers}"
                 )
+            
+            # Use provided config or create default
+            audio_config = self.cfg.audio if self.cfg.audio is not None else AudioProcessorConfig()
+            
             llm = AudioProcessor(
                 llm,
-                self.cfg.audio,
+                audio_config,
                 eval_config=dict(self.cfg.eval_config),
                 eval_type=self.cfg.eval_type,
             )
@@ -643,7 +655,7 @@ async def process_single_datapoint(self, data_point, all_data):
         }
 
         # Pass task_type for audio chunking
-        if "task_type" in data_point:
+        if isinstance(self.llm, AudioProcessor) and "task_type" in data_point:
             generation_params["task_type"] = data_point["task_type"]
 
         if self.cfg.code_execution:
diff --git a/nemo_skills/pipeline/utils/eval.py b/nemo_skills/pipeline/utils/eval.py
index 736d1c7cb6..d2b8a5f2ac 100644
--- a/nemo_skills/pipeline/utils/eval.py
+++ b/nemo_skills/pipeline/utils/eval.py
@@ -131,6 +131,12 @@ def get_benchmark_args_from_module(
     if eval_args:
         generation_args = f"{eval_args} {generation_args}"
     generation_args += f" ++eval_config.split={split} "
+    
+    # Pass dataset_group from benchmark config if defined
+    dataset_group = get_arg_from_module_or_dict(benchmark_module, "DATASET_GROUP", None, override_dict=override_dict)
+    if dataset_group:
+        generation_args += f" ++dataset_group={dataset_group} "
+    
     requires_sandbox = get_arg_from_module_or_dict(benchmark_module, "REQUIRES_SANDBOX", False, override_dict)
     keep_mounts_for_sandbox = get_arg_from_module_or_dict(
         benchmark_module, "KEEP_MOUNTS_FOR_SANDBOX", False, override_dict

From 1e418e7032bab7d266363b786fe441bb5f63c4a4 Mon Sep 17 00:00:00 2001
From: George Zelenfroind <gzelenfroind@nvidia.com>
Date: Tue, 23 Dec 2025 08:08:05 -0800
Subject: [PATCH 38/38] fix linter

Signed-off-by: George Zelenfroind <gzelenfroind@nvidia.com>
---
 nemo_skills/inference/generate.py  | 13 ++++++-------
 nemo_skills/pipeline/utils/eval.py |  4 ++--
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
index 00993c5f66..fd46cc3a9b 100644
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
@@ -406,12 +406,11 @@ def setup_llm(self):
 
         # Audio wrapper (preprocesses messages before they reach the model)
         # Auto-enable for audio benchmarks on vLLM (eval_type=audio OR dataset_group=speechlm)
-        should_enable_audio = (
-            self.cfg.audio is not None or 
-            (self.cfg.server.get("server_type", "").lower() == "vllm" and 
-             (self.cfg.eval_type == "audio" or self.cfg.dataset_group == "speechlm"))
+        should_enable_audio = self.cfg.audio is not None or (
+            self.cfg.server.get("server_type", "").lower() == "vllm"
+            and (self.cfg.eval_type == "audio" or self.cfg.dataset_group == "speechlm")
         )
-        
+
         if should_enable_audio:
             audio_supported_servers = {"vllm"}
             server_type = self.cfg.server.get("server_type", "").lower()
@@ -420,10 +419,10 @@ def setup_llm(self):
                     f"Audio processing is not supported for server_type='{server_type}'. "
                     f"Supported server types: {audio_supported_servers}"
                 )
-            
+
             # Use provided config or create default
             audio_config = self.cfg.audio if self.cfg.audio is not None else AudioProcessorConfig()
-            
+
             llm = AudioProcessor(
                 llm,
                 audio_config,
diff --git a/nemo_skills/pipeline/utils/eval.py b/nemo_skills/pipeline/utils/eval.py
index d2b8a5f2ac..9245337064 100644
--- a/nemo_skills/pipeline/utils/eval.py
+++ b/nemo_skills/pipeline/utils/eval.py
@@ -131,12 +131,12 @@ def get_benchmark_args_from_module(
     if eval_args:
         generation_args = f"{eval_args} {generation_args}"
     generation_args += f" ++eval_config.split={split} "
-    
+
     # Pass dataset_group from benchmark config if defined
     dataset_group = get_arg_from_module_or_dict(benchmark_module, "DATASET_GROUP", None, override_dict=override_dict)
     if dataset_group:
         generation_args += f" ++dataset_group={dataset_group} "
-    
+
     requires_sandbox = get_arg_from_module_or_dict(benchmark_module, "REQUIRES_SANDBOX", False, override_dict)
     keep_mounts_for_sandbox = get_arg_from_module_or_dict(
         benchmark_module, "KEEP_MOUNTS_FOR_SANDBOX", False, override_dict