From 07a408e18115b3d54e869730066eb7eaf995849c Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 13 Nov 2025 12:18:31 -0800 Subject: [PATCH 01/38] print Signed-off-by: Nikolay Karpov Signed-off-by: mmkrtchyan --- nemo_skills/inference/model/vllm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py index e9a2146520..1b4d9f7998 100644 --- a/nemo_skills/inference/model/vllm.py +++ b/nemo_skills/inference/model/vllm.py @@ -117,6 +117,7 @@ def _build_chat_request_params( tools: list[dict] | None = None, extra_body: dict = None, ) -> dict: + print("messages", messages) request = { "messages": messages, "max_tokens": tokens_to_generate, From 6acbc9f61e2b571e6becb09fea95e0eb04f90b21 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 13 Nov 2025 13:48:12 -0800 Subject: [PATCH 02/38] content_text_to_list Signed-off-by: Nikolay Karpov Signed-off-by: mmkrtchyan --- nemo_skills/inference/model/vllm.py | 35 ++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py index 1b4d9f7998..b5250654ae 100644 --- a/nemo_skills/inference/model/vllm.py +++ b/nemo_skills/inference/model/vllm.py @@ -13,7 +13,7 @@ # limitations under the License. import logging - +import base64 import requests from nemo_skills.utils import get_logger_name @@ -23,6 +23,38 @@ LOG = logging.getLogger(get_logger_name(__file__)) +def audio_file_to_base64(audio_file_path: str): + """Encodes an audio file into a base64 string.""" + with open(audio_file_path, "rb") as audio_file: + audio_content = audio_file.read() + return base64.b64encode(audio_content).decode("utf-8") + +def content_text_to_list(message): + content = message["content"] + if isinstance(content, str): + message["content"] = [{"type": "text", "text": content}] + elif isinstance(content, list): + message["content"] = content + else: + raise TypeError(str(content)) + + if "audios" in message: + for audio in message["audios"]: + base64_audio = audio_file_to_base64(audio["path"]) + audio_message = { + "type": "audio_url", + "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"} + } + message["content"].append(audio_message) + if "audio" in message: + audio = message["audio"] + base64_audio = audio_file_to_base64(audio["path"]) + audio_message = { + "type": "audio_url", + "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"} + } + message["content"].append(audio_message) + return message class VLLMModel(BaseModel): def __init__(self, **kwargs): @@ -117,6 +149,7 @@ def _build_chat_request_params( tools: list[dict] | None = None, extra_body: dict = None, ) -> dict: + messages = [content_text_to_list(message) for message in messages] print("messages", messages) request = { "messages": messages, From 78c6313655a721fb56844f0e9da5d940431ee992 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 13 Nov 2025 14:23:29 -0800 Subject: [PATCH 03/38] rm print Signed-off-by: Nikolay Karpov Signed-off-by: mmkrtchyan --- nemo_skills/inference/model/vllm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py index b5250654ae..3bfbd25a6c 100644 --- a/nemo_skills/inference/model/vllm.py +++ b/nemo_skills/inference/model/vllm.py @@ -150,7 +150,6 @@ def _build_chat_request_params( extra_body: dict = None, ) -> dict: messages = [content_text_to_list(message) for message in messages] - print("messages", messages) request = { "messages": messages, "max_tokens": tokens_to_generate, From 8f5c3a96939dfa2940cc7b875f7f5c88328e5716 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 13 Nov 2025 15:53:41 -0800 Subject: [PATCH 04/38] self.discard_binary_types Signed-off-by: Nikolay Karpov Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 11 ++++++++ nemo_skills/inference/model/vllm.py | 39 ++++++++++++++++++----------- 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index aae36c7351..6a1cef71e9 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -525,6 +525,14 @@ def dump_outputs(self, outputs, data_points, fout): for output in outputs: fout.write(json.dumps(output) + "\n") + def discard_binary_types(self, output): + binary_types = {"audio_url"} + if isinstance(output["content"], list): + for message in output["content"]: + if "type" in message and message["type"] in binary_types: + message[message["type"]]["url"] = "" + return output + async def postprocess_single_output(self, output, original_data_point): # to make it easier to follow up with other generations and limit accidental errors, we are adding # all of the original data to the output file alongside the new generations @@ -540,6 +548,9 @@ async def postprocess_single_output(self, output, original_data_point): for key in output: original_data_point.pop(key, None) output.update(original_data_point) + + output = self.discard_binary_types(output) + if self.cfg.parse_reasoning: parse_reasoning( output, diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py index 3bfbd25a6c..bf783eb9e5 100644 --- a/nemo_skills/inference/model/vllm.py +++ b/nemo_skills/inference/model/vllm.py @@ -30,15 +30,32 @@ def audio_file_to_base64(audio_file_path: str): return base64.b64encode(audio_content).decode("utf-8") def content_text_to_list(message): - content = message["content"] - if isinstance(content, str): - message["content"] = [{"type": "text", "text": content}] - elif isinstance(content, list): - message["content"] = content - else: - raise TypeError(str(content)) + if "audio" in message: + content = message["content"] + if isinstance(content, str): + message["content"] = [{"type": "text", "text": content}] + elif isinstance(content, list): + message["content"] = content + else: + raise TypeError(str(content)) + + audio = message["audio"] + base64_audio = audio_file_to_base64(audio["path"]) + audio_message = { + "type": "audio_url", + "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"} + } + message["content"].append(audio_message) if "audios" in message: + content = message["content"] + if isinstance(content, str): + message["content"] = [{"type": "text", "text": content}] + elif isinstance(content, list): + message["content"] = content + else: + raise TypeError(str(content)) + for audio in message["audios"]: base64_audio = audio_file_to_base64(audio["path"]) audio_message = { @@ -46,14 +63,6 @@ def content_text_to_list(message): "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"} } message["content"].append(audio_message) - if "audio" in message: - audio = message["audio"] - base64_audio = audio_file_to_base64(audio["path"]) - audio_message = { - "type": "audio_url", - "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"} - } - message["content"].append(audio_message) return message class VLLMModel(BaseModel): From 5e36e553e843540b265d4891261734bbb71e94d7 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 13 Nov 2025 15:59:57 -0800 Subject: [PATCH 05/38] print Signed-off-by: Nikolay Karpov Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index 6a1cef71e9..5fd987fffd 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -527,6 +527,7 @@ def dump_outputs(self, outputs, data_points, fout): def discard_binary_types(self, output): binary_types = {"audio_url"} + print("output", output) if isinstance(output["content"], list): for message in output["content"]: if "type" in message and message["type"] in binary_types: From 862be043e7fa514c40b113669ff35b140b6b0df5 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 13 Nov 2025 16:09:48 -0800 Subject: [PATCH 06/38] messages Signed-off-by: Nikolay Karpov Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index 5fd987fffd..b829599916 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -525,14 +525,12 @@ def dump_outputs(self, outputs, data_points, fout): for output in outputs: fout.write(json.dumps(output) + "\n") - def discard_binary_types(self, output): + def drop_binary_data(self, output): binary_types = {"audio_url"} - print("output", output) - if isinstance(output["content"], list): - for message in output["content"]: - if "type" in message and message["type"] in binary_types: - message[message["type"]]["url"] = "" - return output + if isinstance(output["messages"][0]["content"], list): + for content in output["messages"][0]["content"]: + if "type" in content and content["type"] in binary_types: + content[content["type"]]["url"] = "" async def postprocess_single_output(self, output, original_data_point): # to make it easier to follow up with other generations and limit accidental errors, we are adding @@ -550,7 +548,7 @@ async def postprocess_single_output(self, output, original_data_point): original_data_point.pop(key, None) output.update(original_data_point) - output = self.discard_binary_types(output) + self.drop_binary_data(output) if self.cfg.parse_reasoning: parse_reasoning( From 6768b00c65f06d6ff247289074e2d34b3acf00b8 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 13 Nov 2025 16:22:07 -0800 Subject: [PATCH 07/38] binary_data_to_drop_from_output Signed-off-by: Nikolay Karpov Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 4 ++-- nemo_skills/inference/model/vllm.py | 17 ----------------- 2 files changed, 2 insertions(+), 19 deletions(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index b829599916..b8ee7c4e13 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -526,10 +526,10 @@ def dump_outputs(self, outputs, data_points, fout): fout.write(json.dumps(output) + "\n") def drop_binary_data(self, output): - binary_types = {"audio_url"} + binary_data_to_drop_from_output = {"audio_url"} if isinstance(output["messages"][0]["content"], list): for content in output["messages"][0]["content"]: - if "type" in content and content["type"] in binary_types: + if "type" in content and content["type"] in binary_data_to_drop_from_output: content[content["type"]]["url"] = "" async def postprocess_single_output(self, output, original_data_point): diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py index bf783eb9e5..ab4a3ab029 100644 --- a/nemo_skills/inference/model/vllm.py +++ b/nemo_skills/inference/model/vllm.py @@ -30,23 +30,6 @@ def audio_file_to_base64(audio_file_path: str): return base64.b64encode(audio_content).decode("utf-8") def content_text_to_list(message): - if "audio" in message: - content = message["content"] - if isinstance(content, str): - message["content"] = [{"type": "text", "text": content}] - elif isinstance(content, list): - message["content"] = content - else: - raise TypeError(str(content)) - - audio = message["audio"] - base64_audio = audio_file_to_base64(audio["path"]) - audio_message = { - "type": "audio_url", - "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"} - } - message["content"].append(audio_message) - if "audios" in message: content = message["content"] if isinstance(content, str): From f522c0b719c0643d8e5409a424893a69e2850f01 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 14 Nov 2025 11:31:06 -0800 Subject: [PATCH 08/38] print Signed-off-by: Nikolay Karpov Signed-off-by: mmkrtchyan --- nemo_skills/inference/model/vllm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py index ab4a3ab029..509516238d 100644 --- a/nemo_skills/inference/model/vllm.py +++ b/nemo_skills/inference/model/vllm.py @@ -50,6 +50,7 @@ def content_text_to_list(message): class VLLMModel(BaseModel): def __init__(self, **kwargs): + print("kwargskwargs", str(kwargs)) super().__init__(**kwargs) def _get_tokenizer_endpoint(self): From d48d43f1b7ee1fb8d1178f89c055caecbfc26105 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 14 Nov 2025 11:37:21 -0800 Subject: [PATCH 09/38] print Signed-off-by: Nikolay Karpov Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 1 + nemo_skills/inference/model/vllm.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index b8ee7c4e13..c33ba57d4c 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -607,6 +607,7 @@ async def generate_with_semaphore(self, **generation_params): as long as those requests also use this function. """ async with self.semaphore: + print("generation_paramsgeneration_params", str(generation_params)) return await self.llm.generate_async(**generation_params) async def evaluate_single_datapoint(self, data_point): diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py index 509516238d..ab4a3ab029 100644 --- a/nemo_skills/inference/model/vllm.py +++ b/nemo_skills/inference/model/vllm.py @@ -50,7 +50,6 @@ def content_text_to_list(message): class VLLMModel(BaseModel): def __init__(self, **kwargs): - print("kwargskwargs", str(kwargs)) super().__init__(**kwargs) def _get_tokenizer_endpoint(self): From 4487ca1478f297e3f32d757922b6ad94e6279e0d Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 14 Nov 2025 14:46:14 -0800 Subject: [PATCH 10/38] data_dir Signed-off-by: Nikolay Karpov Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 1 - nemo_skills/inference/model/vllm.py | 43 +++++++++++++++-------------- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index c33ba57d4c..b8ee7c4e13 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -607,7 +607,6 @@ async def generate_with_semaphore(self, **generation_params): as long as those requests also use this function. """ async with self.semaphore: - print("generation_paramsgeneration_params", str(generation_params)) return await self.llm.generate_async(**generation_params) async def evaluate_single_datapoint(self, data_point): diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py index ab4a3ab029..02a07f1fc2 100644 --- a/nemo_skills/inference/model/vllm.py +++ b/nemo_skills/inference/model/vllm.py @@ -13,6 +13,7 @@ # limitations under the License. import logging +import os import base64 import requests @@ -29,27 +30,10 @@ def audio_file_to_base64(audio_file_path: str): audio_content = audio_file.read() return base64.b64encode(audio_content).decode("utf-8") -def content_text_to_list(message): - if "audios" in message: - content = message["content"] - if isinstance(content, str): - message["content"] = [{"type": "text", "text": content}] - elif isinstance(content, list): - message["content"] = content - else: - raise TypeError(str(content)) - - for audio in message["audios"]: - base64_audio = audio_file_to_base64(audio["path"]) - audio_message = { - "type": "audio_url", - "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"} - } - message["content"].append(audio_message) - return message class VLLMModel(BaseModel): - def __init__(self, **kwargs): + def __init__(self, data_dir, **kwargs): + self.data_dir = data_dir super().__init__(**kwargs) def _get_tokenizer_endpoint(self): @@ -123,6 +107,25 @@ def _build_completion_request_params( "extra_body": self._build_request_body(top_k, min_p, repetition_penalty, extra_body=extra_body), } + def content_text_to_list(self, message): + if "audios" in message: + content = message["content"] + if isinstance(content, str): + message["content"] = [{"type": "text", "text": content}] + elif isinstance(content, list): + message["content"] = content + else: + raise TypeError(str(content)) + + for audio in message["audios"]: + base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"])) + audio_message = { + "type": "audio_url", + "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"} + } + message["content"].append(audio_message) + return message + def _build_chat_request_params( self, messages: list[dict], @@ -141,7 +144,7 @@ def _build_chat_request_params( tools: list[dict] | None = None, extra_body: dict = None, ) -> dict: - messages = [content_text_to_list(message) for message in messages] + messages = [self.content_text_to_list(message) for message in messages] request = { "messages": messages, "max_tokens": tokens_to_generate, From c53d4059844dc6a782b759a7750f37688417f758 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 14 Nov 2025 15:08:19 -0800 Subject: [PATCH 11/38] data_dir='' Signed-off-by: Nikolay Karpov Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 3 ++- nemo_skills/inference/model/vllm.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index b8ee7c4e13..32b457cfce 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -391,7 +391,8 @@ def setup_llm(self): additional_config={"sandbox": self.cfg.sandbox}, ) else: - llm = get_model(**self.cfg.server, tokenizer=self.tokenizer) + print("self.cfgself.cfg", str(self.cfg)) + llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir=data_dir) if self.cfg.parallel_thinking.mode is not None: # We don't want to override these key variables which overlap with self.cfg diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py index 02a07f1fc2..6036bc8869 100644 --- a/nemo_skills/inference/model/vllm.py +++ b/nemo_skills/inference/model/vllm.py @@ -32,7 +32,7 @@ def audio_file_to_base64(audio_file_path: str): class VLLMModel(BaseModel): - def __init__(self, data_dir, **kwargs): + def __init__(self, data_dir: str = "", **kwargs): self.data_dir = data_dir super().__init__(**kwargs) From a7cf5b5d688a4b71291b51cd322b48fd077af5a3 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 14 Nov 2025 15:21:22 -0800 Subject: [PATCH 12/38] eval_config Signed-off-by: Nikolay Karpov Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index 32b457cfce..ac6c39218d 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -392,7 +392,8 @@ def setup_llm(self): ) else: print("self.cfgself.cfg", str(self.cfg)) - llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir=data_dir) + print("self.cfg.eval_config[]", self.cfg.eval_config["data_dir"]) + llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir=self.cfg.eval_config["data_dir"]) if self.cfg.parallel_thinking.mode is not None: # We don't want to override these key variables which overlap with self.cfg From 689e6fcbfb96aa58cea7903ec6cfa91daf291052 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 14 Nov 2025 15:39:29 -0800 Subject: [PATCH 13/38] eval_type Signed-off-by: Nikolay Karpov Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index ac6c39218d..2181bd6589 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -13,6 +13,7 @@ # limitations under the License. import asyncio +import os import json import logging import random @@ -391,9 +392,7 @@ def setup_llm(self): additional_config={"sandbox": self.cfg.sandbox}, ) else: - print("self.cfgself.cfg", str(self.cfg)) - print("self.cfg.eval_config[]", self.cfg.eval_config["data_dir"]) - llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir=self.cfg.eval_config["data_dir"]) + llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir=os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type)) if self.cfg.parallel_thinking.mode is not None: # We don't want to override these key variables which overlap with self.cfg From 9536b8b00e7edc06b6d41aafe692791346d167b1 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 14 Nov 2025 16:37:18 -0800 Subject: [PATCH 14/38] if Signed-off-by: Nikolay Karpov Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index 2181bd6589..a1dc5e6ac8 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -392,7 +392,11 @@ def setup_llm(self): additional_config={"sandbox": self.cfg.sandbox}, ) else: - llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir=os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type)) + if self.cfg.eval_config["data_dir"]: + data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type) + else: + data_dir = "" + llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir = data_dir) if self.cfg.parallel_thinking.mode is not None: # We don't want to override these key variables which overlap with self.cfg From 155a4806cb5deb875235bc2e4ef9b7361ad03780 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 14 Nov 2025 16:45:57 -0800 Subject: [PATCH 15/38] or Signed-off-by: Nikolay Karpov Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index a1dc5e6ac8..afe665dd3e 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -392,10 +392,10 @@ def setup_llm(self): additional_config={"sandbox": self.cfg.sandbox}, ) else: - if self.cfg.eval_config["data_dir"]: - data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type) - else: + if isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None)): data_dir = "" + else: + data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type) llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir = data_dir) if self.cfg.parallel_thinking.mode is not None: From 0d9292d711892070c1bb0d58f8c5100827ad3c57 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 14 Nov 2025 16:54:04 -0800 Subject: [PATCH 16/38] print Signed-off-by: Nikolay Karpov Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index afe665dd3e..4876b29d51 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -392,6 +392,7 @@ def setup_llm(self): additional_config={"sandbox": self.cfg.sandbox}, ) else: + print("self.cfg.eval_config", self.cfg) if isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None)): data_dir = "" else: From c9d99a52661ca564a3f28d9184dbd699978ba114 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 14 Nov 2025 17:03:02 -0800 Subject: [PATCH 17/38] print Signed-off-by: Nikolay Karpov Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index 4876b29d51..364bd73375 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -393,10 +393,10 @@ def setup_llm(self): ) else: print("self.cfg.eval_config", self.cfg) - if isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None)): - data_dir = "" - else: - data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type) + # if isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None)): + # data_dir = "" + # else: + data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type) llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir = data_dir) if self.cfg.parallel_thinking.mode is not None: From 0c54cad5efea067d10e54f57d7d28946c2b4aebc Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 14 Nov 2025 17:25:42 -0800 Subject: [PATCH 18/38] print Signed-off-by: Nikolay Karpov Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index 364bd73375..7caf31a0fc 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -392,11 +392,11 @@ def setup_llm(self): additional_config={"sandbox": self.cfg.sandbox}, ) else: - print("self.cfg.eval_config", self.cfg) - # if isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None)): - # data_dir = "" - # else: - data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type) + if isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None)): + print("self.cfg.eval_config", self.cfg) + data_dir = "" + else: + data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type) llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir = data_dir) if self.cfg.parallel_thinking.mode is not None: From fc7a3445b3de2023264392a2b7624292948204c8 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 14 Nov 2025 17:33:23 -0800 Subject: [PATCH 19/38] ++val_type Signed-off-by: Nikolay Karpov Signed-off-by: mmkrtchyan --- nemo_skills/dataset/mmau-pro/closed_form/__init__.py | 1 + nemo_skills/dataset/mmau-pro/open_ended/__init__.py | 1 + nemo_skills/inference/generate.py | 1 - 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo_skills/dataset/mmau-pro/closed_form/__init__.py b/nemo_skills/dataset/mmau-pro/closed_form/__init__.py index 4e3b424d84..4390c1d887 100644 --- a/nemo_skills/dataset/mmau-pro/closed_form/__init__.py +++ b/nemo_skills/dataset/mmau-pro/closed_form/__init__.py @@ -16,6 +16,7 @@ METRICS_TYPE = "mmau_pro_closed_form" SCORE_MODULE = "nemo_skills.evaluation.metrics.mmau_pro_metrics" GENERATION_ARGS = "++prompt_format=openai" +EVAL_ARGS = "++eval_type=mmau-pro" # NVEmbed judge configuration for closed-form evaluation JUDGE_PIPELINE_ARGS = { diff --git a/nemo_skills/dataset/mmau-pro/open_ended/__init__.py b/nemo_skills/dataset/mmau-pro/open_ended/__init__.py index 22773d6fed..9e1e812a97 100644 --- a/nemo_skills/dataset/mmau-pro/open_ended/__init__.py +++ b/nemo_skills/dataset/mmau-pro/open_ended/__init__.py @@ -16,6 +16,7 @@ METRICS_TYPE = "mmau_pro_open_ended" SCORE_MODULE = "nemo_skills.evaluation.metrics.mmau_pro_metrics" GENERATION_ARGS = "++prompt_format=openai" +EVAL_ARGS = "++eval_type=mmau-pro" # Judge configuration for open-ended evaluation using NVIDIA API JUDGE_PIPELINE_ARGS = { diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index 7caf31a0fc..afe665dd3e 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -393,7 +393,6 @@ def setup_llm(self): ) else: if isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None)): - print("self.cfg.eval_config", self.cfg) data_dir = "" else: data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type) From c68f8af81c5551e1ee818109a31dd4f0b16c1ed3 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 20 Nov 2025 10:08:30 -0800 Subject: [PATCH 20/38] audio Signed-off-by: Nikolay Karpov Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 5 ++--- nemo_skills/inference/model/vllm.py | 11 ++++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index afe665dd3e..b60cfd3eae 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -392,9 +392,8 @@ def setup_llm(self): additional_config={"sandbox": self.cfg.sandbox}, ) else: - if isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None)): - data_dir = "" - else: + data_dir = "" + if "data_dir" in self.cfg.eval_config and not (isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None))): data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type) llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir = data_dir) diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py index 6036bc8869..e50e2c90c4 100644 --- a/nemo_skills/inference/model/vllm.py +++ b/nemo_skills/inference/model/vllm.py @@ -108,7 +108,7 @@ def _build_completion_request_params( } def content_text_to_list(self, message): - if "audios" in message: + if "audio" in message or "audios" in message: content = message["content"] if isinstance(content, str): message["content"] = [{"type": "text", "text": content}] @@ -117,6 +117,15 @@ def content_text_to_list(self, message): else: raise TypeError(str(content)) + if "audio" in message: + audio = message["audio"] + base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"])) + audio_message = { + "type": "audio_url", + "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"} + } + message["content"].append(audio_message) + elif "audios" in message: for audio in message["audios"]: base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"])) audio_message = { From cfaf38a4dcf4969144cc265aeac829ccbba2301e Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 20 Nov 2025 17:51:15 -0800 Subject: [PATCH 21/38] rm data_dir Signed-off-by: Nikolay Karpov Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index b60cfd3eae..9bd81e297b 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -392,10 +392,10 @@ def setup_llm(self): additional_config={"sandbox": self.cfg.sandbox}, ) else: - data_dir = "" if "data_dir" in self.cfg.eval_config and not (isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None))): data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type) - llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir = data_dir) + llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir = data_dir) + llm = get_model(**self.cfg.server, tokenizer=self.tokenizer) if self.cfg.parallel_thinking.mode is not None: # We don't want to override these key variables which overlap with self.cfg From 81861646b1310bac6b59231b8fb5892250142a6b Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 20 Nov 2025 18:02:29 -0800 Subject: [PATCH 22/38] else Signed-off-by: Nikolay Karpov Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index 9bd81e297b..776bbfe420 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -395,7 +395,8 @@ def setup_llm(self): if "data_dir" in self.cfg.eval_config and not (isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None))): data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type) llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir = data_dir) - llm = get_model(**self.cfg.server, tokenizer=self.tokenizer) + else: + llm = get_model(**self.cfg.server, tokenizer=self.tokenizer) if self.cfg.parallel_thinking.mode is not None: # We don't want to override these key variables which overlap with self.cfg From 813d675d7214ccba076fe9de686256a7bd5d6911 Mon Sep 17 00:00:00 2001 From: mmkrtchyan Date: Wed, 26 Nov 2025 15:23:42 +0400 Subject: [PATCH 23/38] mmau-pro open-ended metrics improvement Signed-off-by: mmkrtchyan --- .../dataset/mmau-pro/open_ended/__init__.py | 3 +- nemo_skills/dataset/mmau-pro/prepare.py | 23 +-- .../evaluation/metrics/mmau_pro_metrics.py | 132 ++++++++++++++---- nemo_skills/inference/generate.py | 16 ++- nemo_skills/prompt/config/judge/mmau-pro.yaml | 30 ++++ nemo_skills/prompt/config/judge/speechlm.yaml | 28 ---- 6 files changed, 163 insertions(+), 69 deletions(-) create mode 100644 nemo_skills/prompt/config/judge/mmau-pro.yaml delete mode 100644 nemo_skills/prompt/config/judge/speechlm.yaml diff --git a/nemo_skills/dataset/mmau-pro/open_ended/__init__.py b/nemo_skills/dataset/mmau-pro/open_ended/__init__.py index 9e1e812a97..c5f09272d2 100644 --- a/nemo_skills/dataset/mmau-pro/open_ended/__init__.py +++ b/nemo_skills/dataset/mmau-pro/open_ended/__init__.py @@ -16,7 +16,6 @@ METRICS_TYPE = "mmau_pro_open_ended" SCORE_MODULE = "nemo_skills.evaluation.metrics.mmau_pro_metrics" GENERATION_ARGS = "++prompt_format=openai" -EVAL_ARGS = "++eval_type=mmau-pro" # Judge configuration for open-ended evaluation using NVIDIA API JUDGE_PIPELINE_ARGS = { @@ -24,4 +23,4 @@ "server_type": "openai", "server_address": "https://integrate.api.nvidia.com/v1", } -JUDGE_ARGS = "++prompt_config=judge/speechlm ++generation_key=judgement" +JUDGE_ARGS = "++prompt_config=judge/mmau-pro ++generation_key=judgement" diff --git a/nemo_skills/dataset/mmau-pro/prepare.py b/nemo_skills/dataset/mmau-pro/prepare.py index a6f04d621b..0ea66ec2b7 100644 --- a/nemo_skills/dataset/mmau-pro/prepare.py +++ b/nemo_skills/dataset/mmau-pro/prepare.py @@ -75,8 +75,8 @@ def format_entry(entry, with_audio=False): if category == "open": content = entry["question"] elif choices and len(choices) > 1: - options_text = "\n".join(f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices)) - content = f"{entry['question']}\n\n{options_text}" + options_text = "\n".join(f"{chr(65 + i)}) {choice}" for i, choice in enumerate(choices)) + content = f"{entry['question']}\n\n{options_text}\n\nRespond with the complete text of the correct option, not just the letter." else: content = entry["question"] @@ -84,13 +84,18 @@ def format_entry(entry, with_audio=False): if entry.get("audio_path"): audio_path = entry["audio_path"] - - if isinstance(audio_path, list) and audio_path: - user_message["audios"] = [{"path": path, "duration": 10.0} for path in audio_path] - elif isinstance(audio_path, str): - user_message["audio"] = {"path": audio_path, "duration": 10.0} - - formatted_entry["messages"] = [user_message] + # Prepend /dataset/mmau-pro/ to make paths absolute for cluster + if len(audio_path) == 1: + user_message["audio"] = {"path": f"/dataset/mmau-pro/{audio_path[0]}"} + else: + user_message["audios"] = [{"path": f"/dataset/mmau-pro/{path}"} for path in audio_path] + + # Don't use /no_think for open-ended questions to allow reasoning + system_content = "You are a helpful assistant." + if category != "open": + system_content += " /no_think" + + formatted_entry["messages"] = [{"role": "system", "content": system_content}, user_message] return formatted_entry diff --git a/nemo_skills/evaluation/metrics/mmau_pro_metrics.py b/nemo_skills/evaluation/metrics/mmau_pro_metrics.py index f079049cc1..5e619fce90 100644 --- a/nemo_skills/evaluation/metrics/mmau_pro_metrics.py +++ b/nemo_skills/evaluation/metrics/mmau_pro_metrics.py @@ -1,26 +1,48 @@ -# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - import logging - +import re +import numpy as np from nemo_skills.evaluation.metrics.base import BaseMetrics, as_int, as_percentage -from nemo_skills.evaluation.metrics.utils import is_correct_judgement from nemo_skills.utils import get_logger_name LOG = logging.getLogger(get_logger_name(__file__)) +def extract_multicriteria_scores(judgement_text: str) -> dict[str, float]: + """Extract multi-criteria scores (1-5 scale) from LLM judge evaluation. + + Expected format: + CORRECTNESS: [score] - [justification] + RELEVANCE: [score] - [justification] + COMPLETENESS: [score] - [justification] + CLARITY: [score] - [justification] + OVERALL: [score] - [overall assessment] + + Returns: + Dictionary with keys: correctness, relevance, completeness, clarity, overall + Defaults to 3.0 if score not found. + """ + scores = {} + + patterns = { + "correctness": r"CORRECTNESS:\s*(\d+(?:\.\d+)?)", + "relevance": r"RELEVANCE:\s*(\d+(?:\.\d+)?)", + "completeness": r"COMPLETENESS:\s*(\d+(?:\.\d+)?)", + "clarity": r"CLARITY:\s*(\d+(?:\.\d+)?)", + "overall": r"OVERALL:\s*(\d+(?:\.\d+)?)", + } + + for criterion, pattern in patterns.items(): + match = re.search(pattern, judgement_text, re.IGNORECASE) + scores[criterion] = float(match.group(1)) if match else 3.0 + + # Fallback: compute overall if missing or still 3.0 + if "overall" not in scores or scores["overall"] == 3.0: + criteria_scores = [scores.get(k, 3.0) for k in ["correctness", "relevance", "completeness", "clarity"]] + scores["overall"] = sum(criteria_scores) / len(criteria_scores) + + return scores + + class MMAUProMetrics(BaseMetrics): """Metrics class for MMAU-Pro benchmark (all subgroups).""" @@ -28,16 +50,24 @@ def __init__(self, compute_no_answer: bool = True, max_k: int = 1): super().__init__(compute_no_answer=compute_no_answer) self.max_k = max_k + # Track multi-criteria scores for open-ended questions (1-5 scale) + self.multicriteria_scores = { + "correctness": [], + "relevance": [], + "completeness": [], + "clarity": [], + "overall": [], + } + def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]: """Extract correctness scores from prediction.""" score_dict = {} - # Open-ended: extract from judge result + # Open-ended: use LLM judge correctness score >= 3 as correct if "judgement" in prediction: - judge_result = is_correct_judgement(prediction["judgement"]) - score_dict["judge_correct"] = judge_result - score_dict["correct"] = judge_result - # Closed-form and instruction following: use is_correct + multicriteria = extract_multicriteria_scores(prediction["judgement"]) + score_dict["correct"] = multicriteria.get("correctness", 3.0) >= 3.0 + # Closed-form / instruction-following: use binary correctness elif "is_correct" in prediction: score_dict["correct"] = prediction["is_correct"] else: @@ -58,24 +88,61 @@ def get_incorrect_sample(self, prediction: dict) -> dict: def update(self, predictions): """Update metrics with new predictions.""" super().update(predictions) + predicted_answers = [pred.get("generation", None).strip() or None for pred in predictions] self._compute_pass_at_k(predictions=predictions, predicted_answers=predicted_answers) self._compute_majority_at_k(predictions=predictions, predicted_answers=predicted_answers) + # Collect multi-criteria scores for open-ended questions + for pred in predictions: + if "judgement" in pred: + multicriteria = extract_multicriteria_scores(pred["judgement"]) + for criterion in self.multicriteria_scores: + self.multicriteria_scores[criterion].append(multicriteria.get(criterion, 3.0)) + def get_metrics(self): """Get computed metrics.""" metrics_dict = super().get_metrics() + for agg_mode, agg_metrics in metrics_dict.items(): - # Ensure avg_tokens is always present for MMAU-Pro + # Ensure avg_tokens is present if "avg_tokens" not in agg_metrics: agg_metrics["avg_tokens"] = 0 if "no_answer" in agg_metrics: agg_metrics["no_answer"] = agg_metrics["no_answer"] / 2.0 - # Set success_rate from correct or judge_correct - if "judge_correct" in agg_metrics: - agg_metrics["success_rate"] = agg_metrics["judge_correct"] + + # Add multi-criteria averages for open-ended (convert 1-5 scale to percentage) + if self.multicriteria_scores["overall"]: + for criterion in self.multicriteria_scores: + scores = self.multicriteria_scores[criterion] + if scores: + # Convert 1-5 scale to 0-100 percentage scale + avg_score = np.mean(scores) + std_score = np.std(scores) + agg_metrics[f"avg_{criterion}"] = (avg_score / 5.0) * 100 + agg_metrics[f"std_{criterion}"] = (std_score / 5.0) * 100 + + # Set correct and success_rate to avg_correctness for open-ended + agg_metrics["correct"] = agg_metrics["avg_correctness"] + agg_metrics["success_rate"] = agg_metrics["avg_correctness"] + + # Calculate good/poor response rates based on overall >= 4 or <= 2 + overall_scores = self.multicriteria_scores["overall"] + good_responses = sum(1 for score in overall_scores if score >= 4.0) + poor_responses = sum(1 for score in overall_scores if score <= 2.0) + + agg_metrics["good_response_rate"] = (good_responses / len(overall_scores)) * 100 + agg_metrics["poor_response_rate"] = (poor_responses / len(overall_scores)) * 100 + + # For closed-form / instruction-following: use binary correctness elif "correct" in agg_metrics: agg_metrics["success_rate"] = agg_metrics["correct"] + + # Round all numeric values to 2 decimal places + for key, value in agg_metrics.items(): + if isinstance(value, float) and not isinstance(value, bool): + agg_metrics[key] = round(value, 2) + return metrics_dict def metrics_to_print(self): @@ -87,5 +154,20 @@ def metrics_to_print(self): } if self.compute_no_answer: base_metrics["no_answer"] = as_percentage + + # Add multi-criteria metrics for open-ended questions (now in percentage format) + if self.multicriteria_scores["overall"]: + base_metrics.update( + { + "avg_overall": as_percentage, + "avg_correctness": as_percentage, + "avg_relevance": as_percentage, + "avg_completeness": as_percentage, + "avg_clarity": as_percentage, + "good_response_rate": as_percentage, + "poor_response_rate": as_percentage, + } + ) + base_metrics["num_entries"] = as_int return base_metrics diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index 776bbfe420..75886767b4 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -531,11 +531,17 @@ def dump_outputs(self, outputs, data_points, fout): fout.write(json.dumps(output) + "\n") def drop_binary_data(self, output): - binary_data_to_drop_from_output = {"audio_url"} - if isinstance(output["messages"][0]["content"], list): - for content in output["messages"][0]["content"]: - if "type" in content and content["type"] in binary_data_to_drop_from_output: - content[content["type"]]["url"] = "" + """Remove binary data (like base64 audio) from messages to keep output files smaller.""" + for message in output["messages"]: + # Skip if content is not a list (e.g., string content in system messages) + if not isinstance(message.get("content"), list): + continue + + # Filter out audio_url items from list-style content + message["content"] = [ + content for content in message["content"] + if content.get("type") != "audio_url" + ] async def postprocess_single_output(self, output, original_data_point): # to make it easier to follow up with other generations and limit accidental errors, we are adding diff --git a/nemo_skills/prompt/config/judge/mmau-pro.yaml b/nemo_skills/prompt/config/judge/mmau-pro.yaml new file mode 100644 index 0000000000..5339e4ab0d --- /dev/null +++ b/nemo_skills/prompt/config/judge/mmau-pro.yaml @@ -0,0 +1,30 @@ +# Judge prompt configuration for Speech/Audio Language Model evaluation +# Used for evaluating open-ended responses in MMAU-Pro benchmark +# Uses multi-criteria scoring on 1-5 scale + +user: |- + You are an expert evaluator for audio and speech-related questions. Please evaluate the quality of a model's response to a question. + + Question: {question} + + Reference Answer: {expected_answer} + + Model Response: {generation} + + Please evaluate the model response on the following criteria and provide scores from 1-5 (where 5 is best): + + 1. **Correctness**: How factually accurate is the response compared to the reference? + 2. **Relevance**: How well does the response address the specific question asked? + 3. **Completeness**: Does the response cover all important aspects mentioned in the reference? + 4. **Clarity**: How clear and well-structured is the response? + + For each criterion, provide: + - A score from 1-5 + - A brief justification (1-2 sentences) + + Format your response as: + CORRECTNESS: [score] - [justification] + RELEVANCE: [score] - [justification] + COMPLETENESS: [score] - [justification] + CLARITY: [score] - [justification] + OVERALL: [average score] - [overall assessment] diff --git a/nemo_skills/prompt/config/judge/speechlm.yaml b/nemo_skills/prompt/config/judge/speechlm.yaml deleted file mode 100644 index 4862558145..0000000000 --- a/nemo_skills/prompt/config/judge/speechlm.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# Judge prompt configuration for Speech/Audio Language Model evaluation -# Used for evaluating open-ended responses in MMAU-Pro benchmark -# Follows nemo-skills standard Yes/No judgement pattern - -user: |- - You are an expert evaluator for audio and speech-related questions. Please evaluate whether the model's response correctly answers the question. - - Question: {question} - - Reference Answer: {expected_answer} - - Model Response: {generation} - - Your task is to determine if the model's response is correct based on the reference answer. Consider: - - 1. **Factual Accuracy**: Is the information in the response factually correct? - 2. **Relevance**: Does the response address the specific question asked? - 3. **Completeness**: Does the response cover the key points from the reference answer? - - Please first explain your reasoning in 2-3 sentences, then provide your final judgement. - - Your final judgement must be either "Yes" or "No": - - "Yes" if the model response is correct and adequately answers the question - - "No" if the model response is incorrect, irrelevant, or inadequate - - Format your response as: - Reasoning: [Your explanation] - Judgement: [Yes or No] From 1b7e7151c4bc3665665356a0a4bf9caa2c5d80fa Mon Sep 17 00:00:00 2001 From: mmkrtchyan Date: Wed, 26 Nov 2025 18:17:43 +0400 Subject: [PATCH 24/38] pre-commit fixes Signed-off-by: mmkrtchyan --- .../evaluation/metrics/mmau_pro_metrics.py | 18 ++++++++++++++- nemo_skills/inference/generate.py | 23 +++++++++---------- nemo_skills/inference/model/vllm.py | 16 +++++-------- 3 files changed, 34 insertions(+), 23 deletions(-) diff --git a/nemo_skills/evaluation/metrics/mmau_pro_metrics.py b/nemo_skills/evaluation/metrics/mmau_pro_metrics.py index 5e619fce90..000dbcf13f 100644 --- a/nemo_skills/evaluation/metrics/mmau_pro_metrics.py +++ b/nemo_skills/evaluation/metrics/mmau_pro_metrics.py @@ -1,6 +1,22 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import logging import re + import numpy as np + from nemo_skills.evaluation.metrics.base import BaseMetrics, as_int, as_percentage from nemo_skills.utils import get_logger_name @@ -125,7 +141,7 @@ def get_metrics(self): # Set correct and success_rate to avg_correctness for open-ended agg_metrics["correct"] = agg_metrics["avg_correctness"] agg_metrics["success_rate"] = agg_metrics["avg_correctness"] - + # Calculate good/poor response rates based on overall >= 4 or <= 2 overall_scores = self.multicriteria_scores["overall"] good_responses = sum(1 for score in overall_scores if score >= 4.0) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index 75886767b4..f78e3baef3 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -13,9 +13,9 @@ # limitations under the License. import asyncio -import os import json import logging +import os import random import shutil import subprocess @@ -392,9 +392,11 @@ def setup_llm(self): additional_config={"sandbox": self.cfg.sandbox}, ) else: - if "data_dir" in self.cfg.eval_config and not (isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None))): - data_dir =os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type) - llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir = data_dir) + if "data_dir" in self.cfg.eval_config and not ( + isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None)) + ): + data_dir = os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type) + llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir=data_dir) else: llm = get_model(**self.cfg.server, tokenizer=self.tokenizer) @@ -531,18 +533,15 @@ def dump_outputs(self, outputs, data_points, fout): fout.write(json.dumps(output) + "\n") def drop_binary_data(self, output): - """Remove binary data (like base64 audio) from messages to keep output files smaller.""" + """Remove binary data (like base64 audio) from messages to keep output files smaller.""" for message in output["messages"]: # Skip if content is not a list (e.g., string content in system messages) if not isinstance(message.get("content"), list): continue - + # Filter out audio_url items from list-style content - message["content"] = [ - content for content in message["content"] - if content.get("type") != "audio_url" - ] - + message["content"] = [content for content in message["content"] if content.get("type") != "audio_url"] + async def postprocess_single_output(self, output, original_data_point): # to make it easier to follow up with other generations and limit accidental errors, we are adding # all of the original data to the output file alongside the new generations @@ -558,7 +557,7 @@ async def postprocess_single_output(self, output, original_data_point): for key in output: original_data_point.pop(key, None) output.update(original_data_point) - + self.drop_binary_data(output) if self.cfg.parse_reasoning: diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py index e50e2c90c4..cff46cf0e6 100644 --- a/nemo_skills/inference/model/vllm.py +++ b/nemo_skills/inference/model/vllm.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import base64 import logging import os -import base64 + import requests from nemo_skills.utils import get_logger_name @@ -24,6 +25,7 @@ LOG = logging.getLogger(get_logger_name(__file__)) + def audio_file_to_base64(audio_file_path: str): """Encodes an audio file into a base64 string.""" with open(audio_file_path, "rb") as audio_file: @@ -116,22 +118,16 @@ def content_text_to_list(self, message): message["content"] = content else: raise TypeError(str(content)) - + if "audio" in message: audio = message["audio"] base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"])) - audio_message = { - "type": "audio_url", - "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"} - } + audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}} message["content"].append(audio_message) elif "audios" in message: for audio in message["audios"]: base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"])) - audio_message = { - "type": "audio_url", - "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"} - } + audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}} message["content"].append(audio_message) return message From ea8f2943c0ab0dfb416f2bb2006e8f83e48a1181 Mon Sep 17 00:00:00 2001 From: mmkrtchyan Date: Wed, 26 Nov 2025 19:26:16 +0400 Subject: [PATCH 25/38] Add audio tests for vLLM server Signed-off-by: mmkrtchyan --- tests/gpu-tests/test_vllm_audio.py | 84 ++++++++++++++++ tests/test_vllm_audio.py | 156 +++++++++++++++++++++++++++++ 2 files changed, 240 insertions(+) create mode 100644 tests/gpu-tests/test_vllm_audio.py create mode 100644 tests/test_vllm_audio.py diff --git a/tests/gpu-tests/test_vllm_audio.py b/tests/gpu-tests/test_vllm_audio.py new file mode 100644 index 0000000000..8183adaa80 --- /dev/null +++ b/tests/gpu-tests/test_vllm_audio.py @@ -0,0 +1,84 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import shutil +import subprocess +import tempfile +from pathlib import Path + +import pytest +from utils import require_env_var + + +@pytest.mark.gpu +def test_vllm_audio_generation(): + """Integration test: Generate with vLLM server using audio input.""" + model_path = require_env_var("NEMO_SKILLS_TEST_HF_MODEL") + model_type = require_env_var("NEMO_SKILLS_TEST_MODEL_TYPE") + + output_dir = f"/tmp/nemo-skills-tests/{model_type}/vllm-audio-generation" + # Clean up output directory + if Path(output_dir).exists(): + shutil.rmtree(output_dir) + + # Create test input file with audio + with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f: + test_data = [ + { + "problem": "Transcribe this audio", + "audio": {"path": "/nemo_run/code/tests/slurm-tests/asr_nim/wavs/t2_16.wav"}, + }, + { + "problem": "What is in this audio?", + "audio": {"path": "/nemo_run/code/tests/slurm-tests/asr_nim/wavs/t3_16.wav"}, + }, + ] + for item in test_data: + f.write(json.dumps(item) + '\n') + input_file = f.name + + try: + cmd = ( + f"ns generate " + f" --cluster test-local --config_dir {Path(__file__).absolute().parent} " + f" --model {model_path} " + f" --output_dir {output_dir} " + f" --server_type vllm " + f" --server_gpus 1 " + f" --server_nodes 1 " + f" --server_args '--enforce-eager' " + f" --input_file={input_file} " + f" ++prompt_config=openai " + f" ++skip_filled=False " + ) + subprocess.run(cmd, shell=True, check=True) + + # Verify output exists and has audio-related generation + with open(f"{output_dir}/output.jsonl") as fin: + lines = fin.readlines() + + assert len(lines) == 2, "Should have 2 output lines" + + for line in lines: + data = json.loads(line) + assert "generation" in data, "Should have generation field" + assert len(data["generation"]) > 0, "Generation should not be empty" + # If model supports audio, generation should contain something + print(f"Generated: {data['generation']}") + + finally: + # Cleanup temp file + Path(input_file).unlink(missing_ok=True) + diff --git a/tests/test_vllm_audio.py b/tests/test_vllm_audio.py new file mode 100644 index 0000000000..56bee85aa2 --- /dev/null +++ b/tests/test_vllm_audio.py @@ -0,0 +1,156 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import base64 +import os +import tempfile +from unittest.mock import AsyncMock, patch + +import pytest + +from nemo_skills.inference.model.vllm import VLLMModel, audio_file_to_base64 + + +# ----------------------- +# Unit tests - no server required +# ----------------------- + +def test_audio_file_to_base64(): + """Test basic audio file encoding to base64.""" + with tempfile.NamedTemporaryFile(mode='wb', suffix='.wav', delete=False) as f: + test_content = b'RIFF' + b'\x00' * 100 + f.write(test_content) + temp_path = f.name + + try: + result = audio_file_to_base64(temp_path) + assert isinstance(result, str) + assert len(result) > 0 + decoded = base64.b64decode(result) + assert decoded == test_content + finally: + os.unlink(temp_path) + + +@pytest.fixture +def vllm_model(tmp_path): + """Create a VLLMModel instance for testing.""" + audio_dir = tmp_path / "audio" + audio_dir.mkdir() + model = VLLMModel(model="test-model", data_dir=str(tmp_path), base_url="http://localhost:5000") + return model + + +def test_content_text_to_list_with_audio(vllm_model, tmp_path): + """Test converting string content with audio to list format.""" + audio_path = tmp_path / "audio" / "test.wav" + audio_path.parent.mkdir(exist_ok=True) + with open(audio_path, 'wb') as f: + f.write(b'RIFF' + b'\x00' * 100) + + message = {"role": "user", "content": "Describe this audio", "audio": {"path": "audio/test.wav"}} + + result = vllm_model.content_text_to_list(message) + + assert isinstance(result["content"], list) + assert len(result["content"]) == 2 + assert result["content"][0]["type"] == "text" + assert result["content"][1]["type"] == "audio_url" + assert result["content"][1]["audio_url"]["url"].startswith("data:audio/wav;base64,") + + +def test_content_text_to_list_with_multiple_audios(vllm_model, tmp_path): + """Test handling message with multiple audio files.""" + audio_dir = tmp_path / "audio" + audio_dir.mkdir(exist_ok=True) + + for i in range(2): + with open(audio_dir / f"test_{i}.wav", 'wb') as f: + f.write(b'RIFF' + b'\x00' * 100) + + message = { + "role": "user", + "content": "Compare these", + "audios": [{"path": "audio/test_0.wav"}, {"path": "audio/test_1.wav"}], + } + + result = vllm_model.content_text_to_list(message) + + assert isinstance(result["content"], list) + assert len(result["content"]) == 3 + assert result["content"][0]["type"] == "text" + assert result["content"][1]["type"] == "audio_url" + assert result["content"][2]["type"] == "audio_url" + + +# ----------------------- +# Request building tests with audio +# ----------------------- + +def test_build_chat_request_with_audio(tmp_path, vllm_model): + """Test that chat request params are correctly built with audio content.""" + # Create audio file + audio_path = tmp_path / "audio" / "test.wav" + audio_path.parent.mkdir(exist_ok=True) + with open(audio_path, 'wb') as f: + f.write(b'RIFF' + b'\x00' * 100) + + messages = [{"role": "user", "content": "Test audio", "audio": {"path": "audio/test.wav"}}] + + # Build request params - this doesn't make any network calls + params = vllm_model._build_chat_request_params(messages=messages, stream=False, tokens_to_generate=10) + + # Validate request structure + assert "messages" in params + assert len(params["messages"]) == 1 + content_items = params["messages"][0]["content"] + assert isinstance(content_items, list) + assert len(content_items) == 2 + assert content_items[0]["type"] == "text" + assert content_items[1]["type"] == "audio_url" + + # Verify base64 encoding is valid + audio_url = content_items[1]["audio_url"]["url"] + assert audio_url.startswith("data:audio/wav;base64,") + audio_b64 = audio_url.split(",", 1)[1] + decoded = base64.b64decode(audio_b64) + assert decoded.startswith(b'RIFF') + + +@pytest.mark.asyncio +async def test_generate_with_audio_mocked_response(tmp_path, vllm_model): + """Test generate_async with audio by mocking the response (no real server call).""" + # Create audio file + audio_path = tmp_path / "audio" / "test.wav" + audio_path.parent.mkdir(exist_ok=True) + with open(audio_path, 'wb') as f: + f.write(b'RIFF' + b'\x00' * 100) + + messages = [{"role": "user", "content": "Describe this audio", "audio": {"path": "audio/test.wav"}}] + + # Mock the entire generate_async method - no actual API call made + mock_response = {"generation": "This audio contains speech", "num_generated_tokens": 5} + + with patch.object(vllm_model, "generate_async", new_callable=AsyncMock) as mock_generate: + mock_generate.return_value = mock_response + + # Call the mocked method + response = await vllm_model.generate_async(prompt=messages, tokens_to_generate=50, temperature=0.0) + + # Verify the mock was called correctly + assert response["generation"] == "This audio contains speech" + assert response["num_generated_tokens"] == 5 + mock_generate.assert_awaited_once() + + From 206c7d60096ef6cc8e237be98de910f0ef93d9c0 Mon Sep 17 00:00:00 2001 From: mmkrtchyan Date: Thu, 27 Nov 2025 19:56:11 +0400 Subject: [PATCH 26/38] making audio paths generic Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index f78e3baef3..38d9d340a3 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -15,7 +15,6 @@ import asyncio import json import logging -import os import random import shutil import subprocess @@ -392,11 +391,9 @@ def setup_llm(self): additional_config={"sandbox": self.cfg.sandbox}, ) else: - if "data_dir" in self.cfg.eval_config and not ( - isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None)) - ): - data_dir = os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type) - llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir=data_dir) + # data_dir is only needed if audio/media paths are relative + if "data_dir" in self.cfg.eval_config and not isinstance(self.cfg.eval_config["data_dir"], type(None)): + llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir=self.cfg.eval_config["data_dir"]) else: llm = get_model(**self.cfg.server, tokenizer=self.tokenizer) From 289c694e110e20c11d052a6904cfd42e0ea26e0e Mon Sep 17 00:00:00 2001 From: mmkrtchyan Date: Tue, 2 Dec 2025 15:32:54 +0400 Subject: [PATCH 27/38] fix generate.py Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index 38d9d340a3..70145639e0 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -380,6 +380,10 @@ def setup_prompt(self): def setup_llm(self): self.sandbox = get_sandbox(**self.cfg.sandbox) if self.cfg.sandbox is not None else None + self.data_dir = None + if "data_dir" in self.cfg.eval_config and not isinstance(self.cfg.eval_config.get("data_dir"), type(None)): + self.data_dir = self.cfg.eval_config["data_dir"] + if self.cfg.code_execution: llm = get_code_execution_model(**self.cfg.server, tokenizer=self.tokenizer, sandbox=self.sandbox) elif self.cfg.tool_modules is not None: @@ -391,11 +395,7 @@ def setup_llm(self): additional_config={"sandbox": self.cfg.sandbox}, ) else: - # data_dir is only needed if audio/media paths are relative - if "data_dir" in self.cfg.eval_config and not isinstance(self.cfg.eval_config["data_dir"], type(None)): - llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, data_dir=self.cfg.eval_config["data_dir"]) - else: - llm = get_model(**self.cfg.server, tokenizer=self.tokenizer) + llm = get_model(**self.cfg.server, tokenizer=self.tokenizer) if self.cfg.parallel_thinking.mode is not None: # We don't want to override these key variables which overlap with self.cfg From 3cc7ae5a392a6cd330ef65be7d1fc8ed07ea3821 Mon Sep 17 00:00:00 2001 From: mmkrtchyan Date: Fri, 5 Dec 2025 16:36:03 +0400 Subject: [PATCH 28/38] requested changes and audio/text order fix Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 80 ++++++++++++++-- nemo_skills/inference/model/vllm.py | 36 -------- tests/test_vllm_audio.py | 136 +++++++++------------------- 3 files changed, 117 insertions(+), 135 deletions(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index 70145639e0..7c721a5d3f 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -13,6 +13,7 @@ # limitations under the License. import asyncio +import base64 import json import logging import random @@ -186,6 +187,9 @@ class GenerateSolutionsConfig: # If True, will enable litellm disk cache (useful for keeping intermediate results in case of job timelimit failures) enable_litellm_cache: bool = False + # List of content types to drop from messages (e.g., base64 audio) to keep output files smaller + drop_content_types: list[str] = field(default_factory=lambda: ["audio_url"]) + # Evaluation setup if requested. If eval_type is set to None, evaluation is skipped eval_type: str | None = None # "lean4-proof", "math", etc. eval_config: dict = field(default_factory=dict) # Config for the evaluator @@ -380,10 +384,6 @@ def setup_prompt(self): def setup_llm(self): self.sandbox = get_sandbox(**self.cfg.sandbox) if self.cfg.sandbox is not None else None - self.data_dir = None - if "data_dir" in self.cfg.eval_config and not isinstance(self.cfg.eval_config.get("data_dir"), type(None)): - self.data_dir = self.cfg.eval_config["data_dir"] - if self.cfg.code_execution: llm = get_code_execution_model(**self.cfg.server, tokenizer=self.tokenizer, sandbox=self.sandbox) elif self.cfg.tool_modules is not None: @@ -525,6 +525,67 @@ def fill_prompt(self, data_point, data): filled_prompt += self.cfg.prompt_suffix return filled_prompt + def preprocess_prompt(self, prompt: str | list[dict]) -> str | list[dict]: + """Preprocess the prompt before sending to the model. + + Override this method to add custom preprocessing logic. + By default, auto-detects and handles audio file path to base64 conversion. + """ + if not isinstance(prompt, list): + return prompt + + # Auto-detect and convert audio file paths to base64 + prompt = [self._convert_audio_to_base64(message) for message in prompt] + + return prompt + + def _audio_file_to_base64(self, audio_file_path: str) -> str: + """Encodes an audio file into a base64 string.""" + with open(audio_file_path, "rb") as audio_file: + audio_content = audio_file.read() + return base64.b64encode(audio_content).decode("utf-8") + + def _convert_audio_to_base64(self, message: dict) -> dict: + """Convert audio file paths in a message to base64 inline content. + + Looks for 'audio' or 'audios' keys in the message and converts them + to base64-encoded audio_url content items. Removes the original keys + after conversion to prevent double-processing by model-specific code. + + CRITICAL: Audio must come BEFORE text for Qwen Audio to transcribe correctly. + """ + if "audio" not in message and "audios" not in message: + return message + + audio_items = [] + + # Handle single audio + if "audio" in message: + audio = message.pop("audio") + base64_audio = self._audio_file_to_base64(audio["path"]) + audio_items.append({"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}}) + + # Handle multiple audios + if "audios" in message: + for audio in message.pop("audios"): + base64_audio = self._audio_file_to_base64(audio["path"]) + audio_items.append( + {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}} + ) + + # Convert existing content to list format and place AFTER audio + content = message.get("content", "") + if isinstance(content, str): + text_items = [{"type": "text", "text": content}] if content else [] + elif isinstance(content, list): + text_items = content + else: + raise TypeError(f"Unsupported content type: {type(content)}") + + message["content"] = audio_items + text_items + + return message + def dump_outputs(self, outputs, data_points, fout): for output in outputs: fout.write(json.dumps(output) + "\n") @@ -536,8 +597,10 @@ def drop_binary_data(self, output): if not isinstance(message.get("content"), list): continue - # Filter out audio_url items from list-style content - message["content"] = [content for content in message["content"] if content.get("type") != "audio_url"] + # Filter out content types specified in drop_content_types config + message["content"] = [ + content for content in message["content"] if content.get("type") not in self.cfg.drop_content_types + ] async def postprocess_single_output(self, output, original_data_point): # to make it easier to follow up with other generations and limit accidental errors, we are adding @@ -587,10 +650,13 @@ async def process_single_datapoint(self, data_point, all_data): # Already a dict from Hydra inference_params = dict(self.cfg.inference) + prompt = self.fill_prompt(data_point, all_data) + prompt = self.preprocess_prompt(prompt) + generation_params = { **inference_params, **self.extra_generate_params, - "prompt": self.fill_prompt(data_point, all_data), + "prompt": prompt, "stop_phrases": [self.cfg.stop_phrase] if self.cfg.stop_phrase else None, } diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py index cff46cf0e6..fb3de3f0e7 100644 --- a/nemo_skills/inference/model/vllm.py +++ b/nemo_skills/inference/model/vllm.py @@ -12,9 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import base64 import logging -import os import requests @@ -26,18 +24,7 @@ LOG = logging.getLogger(get_logger_name(__file__)) -def audio_file_to_base64(audio_file_path: str): - """Encodes an audio file into a base64 string.""" - with open(audio_file_path, "rb") as audio_file: - audio_content = audio_file.read() - return base64.b64encode(audio_content).decode("utf-8") - - class VLLMModel(BaseModel): - def __init__(self, data_dir: str = "", **kwargs): - self.data_dir = data_dir - super().__init__(**kwargs) - def _get_tokenizer_endpoint(self): """ Returns the tokenizer endpoint if available, otherwise returns None. @@ -109,28 +96,6 @@ def _build_completion_request_params( "extra_body": self._build_request_body(top_k, min_p, repetition_penalty, extra_body=extra_body), } - def content_text_to_list(self, message): - if "audio" in message or "audios" in message: - content = message["content"] - if isinstance(content, str): - message["content"] = [{"type": "text", "text": content}] - elif isinstance(content, list): - message["content"] = content - else: - raise TypeError(str(content)) - - if "audio" in message: - audio = message["audio"] - base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"])) - audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}} - message["content"].append(audio_message) - elif "audios" in message: - for audio in message["audios"]: - base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"])) - audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}} - message["content"].append(audio_message) - return message - def _build_chat_request_params( self, messages: list[dict], @@ -149,7 +114,6 @@ def _build_chat_request_params( tools: list[dict] | None = None, extra_body: dict = None, ) -> dict: - messages = [self.content_text_to_list(message) for message in messages] request = { "messages": messages, "max_tokens": tokens_to_generate, diff --git a/tests/test_vllm_audio.py b/tests/test_vllm_audio.py index 56bee85aa2..d806c46121 100644 --- a/tests/test_vllm_audio.py +++ b/tests/test_vllm_audio.py @@ -15,18 +15,28 @@ import base64 import os import tempfile -from unittest.mock import AsyncMock, patch +from unittest.mock import MagicMock import pytest -from nemo_skills.inference.model.vllm import VLLMModel, audio_file_to_base64 +from nemo_skills.inference.generate import GenerationTask -# ----------------------- -# Unit tests - no server required -# ----------------------- +@pytest.fixture +def mock_generation_task(): + """Create a mock GenerationTask for testing audio preprocessing.""" + mock_cfg = MagicMock() + mock_cfg.drop_content_types = ["audio_url"] + + task = MagicMock(spec=GenerationTask) + task.cfg = mock_cfg + # Use the real methods + task._audio_file_to_base64 = GenerationTask._audio_file_to_base64.__get__(task) + task._convert_audio_to_base64 = GenerationTask._convert_audio_to_base64.__get__(task) + return task -def test_audio_file_to_base64(): + +def test_audio_file_to_base64(mock_generation_task): """Test basic audio file encoding to base64.""" with tempfile.NamedTemporaryFile(mode='wb', suffix='.wav', delete=False) as f: test_content = b'RIFF' + b'\x00' * 100 @@ -34,7 +44,7 @@ def test_audio_file_to_base64(): temp_path = f.name try: - result = audio_file_to_base64(temp_path) + result = mock_generation_task._audio_file_to_base64(temp_path) assert isinstance(result, str) assert len(result) > 0 decoded = base64.b64decode(result) @@ -43,114 +53,56 @@ def test_audio_file_to_base64(): os.unlink(temp_path) -@pytest.fixture -def vllm_model(tmp_path): - """Create a VLLMModel instance for testing.""" - audio_dir = tmp_path / "audio" - audio_dir.mkdir() - model = VLLMModel(model="test-model", data_dir=str(tmp_path), base_url="http://localhost:5000") - return model - +def test_convert_audio_to_base64_with_audio(mock_generation_task, tmp_path): + """Test converting string content with audio to list format. -def test_content_text_to_list_with_audio(vllm_model, tmp_path): - """Test converting string content with audio to list format.""" + CRITICAL: Audio must come BEFORE text for Qwen Audio to transcribe correctly. + """ audio_path = tmp_path / "audio" / "test.wav" audio_path.parent.mkdir(exist_ok=True) with open(audio_path, 'wb') as f: f.write(b'RIFF' + b'\x00' * 100) - message = {"role": "user", "content": "Describe this audio", "audio": {"path": "audio/test.wav"}} + message = {"role": "user", "content": "Describe this audio", "audio": {"path": str(audio_path)}} - result = vllm_model.content_text_to_list(message) + result = mock_generation_task._convert_audio_to_base64(message) assert isinstance(result["content"], list) assert len(result["content"]) == 2 - assert result["content"][0]["type"] == "text" - assert result["content"][1]["type"] == "audio_url" - assert result["content"][1]["audio_url"]["url"].startswith("data:audio/wav;base64,") + assert result["content"][0]["type"] == "audio_url" + assert result["content"][0]["audio_url"]["url"].startswith("data:audio/wav;base64,") + assert result["content"][1]["type"] == "text" + assert "audio" not in result + +def test_convert_audio_to_base64_with_multiple_audios(mock_generation_task, tmp_path): + """Test handling message with multiple audio files. -def test_content_text_to_list_with_multiple_audios(vllm_model, tmp_path): - """Test handling message with multiple audio files.""" + CRITICAL: Audio must come BEFORE text for Qwen Audio to transcribe correctly. + """ audio_dir = tmp_path / "audio" audio_dir.mkdir(exist_ok=True) + audio_paths = [] for i in range(2): - with open(audio_dir / f"test_{i}.wav", 'wb') as f: - f.write(b'RIFF' + b'\x00' * 100) + audio_path = audio_dir / f"test_{i}.wav" + with open(audio_path, "wb") as f: + f.write(b"RIFF" + b"\x00" * 100) + audio_paths.append(str(audio_path)) message = { "role": "user", "content": "Compare these", - "audios": [{"path": "audio/test_0.wav"}, {"path": "audio/test_1.wav"}], + "audios": [{"path": audio_paths[0]}, {"path": audio_paths[1]}], } - result = vllm_model.content_text_to_list(message) + result = mock_generation_task._convert_audio_to_base64(message) assert isinstance(result["content"], list) assert len(result["content"]) == 3 - assert result["content"][0]["type"] == "text" + # Audio MUST come before text for Qwen Audio + assert result["content"][0]["type"] == "audio_url" assert result["content"][1]["type"] == "audio_url" - assert result["content"][2]["type"] == "audio_url" - - -# ----------------------- -# Request building tests with audio -# ----------------------- - -def test_build_chat_request_with_audio(tmp_path, vllm_model): - """Test that chat request params are correctly built with audio content.""" - # Create audio file - audio_path = tmp_path / "audio" / "test.wav" - audio_path.parent.mkdir(exist_ok=True) - with open(audio_path, 'wb') as f: - f.write(b'RIFF' + b'\x00' * 100) - - messages = [{"role": "user", "content": "Test audio", "audio": {"path": "audio/test.wav"}}] - - # Build request params - this doesn't make any network calls - params = vllm_model._build_chat_request_params(messages=messages, stream=False, tokens_to_generate=10) - - # Validate request structure - assert "messages" in params - assert len(params["messages"]) == 1 - content_items = params["messages"][0]["content"] - assert isinstance(content_items, list) - assert len(content_items) == 2 - assert content_items[0]["type"] == "text" - assert content_items[1]["type"] == "audio_url" - - # Verify base64 encoding is valid - audio_url = content_items[1]["audio_url"]["url"] - assert audio_url.startswith("data:audio/wav;base64,") - audio_b64 = audio_url.split(",", 1)[1] - decoded = base64.b64decode(audio_b64) - assert decoded.startswith(b'RIFF') - - -@pytest.mark.asyncio -async def test_generate_with_audio_mocked_response(tmp_path, vllm_model): - """Test generate_async with audio by mocking the response (no real server call).""" - # Create audio file - audio_path = tmp_path / "audio" / "test.wav" - audio_path.parent.mkdir(exist_ok=True) - with open(audio_path, 'wb') as f: - f.write(b'RIFF' + b'\x00' * 100) - - messages = [{"role": "user", "content": "Describe this audio", "audio": {"path": "audio/test.wav"}}] - - # Mock the entire generate_async method - no actual API call made - mock_response = {"generation": "This audio contains speech", "num_generated_tokens": 5} - - with patch.object(vllm_model, "generate_async", new_callable=AsyncMock) as mock_generate: - mock_generate.return_value = mock_response - - # Call the mocked method - response = await vllm_model.generate_async(prompt=messages, tokens_to_generate=50, temperature=0.0) - - # Verify the mock was called correctly - assert response["generation"] == "This audio contains speech" - assert response["num_generated_tokens"] == 5 - mock_generate.assert_awaited_once() - - + assert result["content"][2]["type"] == "text" + # Original audios key should be removed + assert "audios" not in result From 5825f277ff90b372371d6adba3737da50882f9b8 Mon Sep 17 00:00:00 2001 From: mmkrtchyan Date: Fri, 5 Dec 2025 18:10:02 +0400 Subject: [PATCH 29/38] resolving conflicts and lint fix Signed-off-by: mmkrtchyan --- tests/gpu-tests/test_vllm_audio.py | 9 ++++----- tests/test_vllm_audio.py | 8 ++++---- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/gpu-tests/test_vllm_audio.py b/tests/gpu-tests/test_vllm_audio.py index 8183adaa80..9272a6d85f 100644 --- a/tests/gpu-tests/test_vllm_audio.py +++ b/tests/gpu-tests/test_vllm_audio.py @@ -34,7 +34,7 @@ def test_vllm_audio_generation(): shutil.rmtree(output_dir) # Create test input file with audio - with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f: test_data = [ { "problem": "Transcribe this audio", @@ -46,7 +46,7 @@ def test_vllm_audio_generation(): }, ] for item in test_data: - f.write(json.dumps(item) + '\n') + f.write(json.dumps(item) + "\n") input_file = f.name try: @@ -68,9 +68,9 @@ def test_vllm_audio_generation(): # Verify output exists and has audio-related generation with open(f"{output_dir}/output.jsonl") as fin: lines = fin.readlines() - + assert len(lines) == 2, "Should have 2 output lines" - + for line in lines: data = json.loads(line) assert "generation" in data, "Should have generation field" @@ -81,4 +81,3 @@ def test_vllm_audio_generation(): finally: # Cleanup temp file Path(input_file).unlink(missing_ok=True) - diff --git a/tests/test_vllm_audio.py b/tests/test_vllm_audio.py index d806c46121..1d62b4e8b1 100644 --- a/tests/test_vllm_audio.py +++ b/tests/test_vllm_audio.py @@ -38,8 +38,8 @@ def mock_generation_task(): def test_audio_file_to_base64(mock_generation_task): """Test basic audio file encoding to base64.""" - with tempfile.NamedTemporaryFile(mode='wb', suffix='.wav', delete=False) as f: - test_content = b'RIFF' + b'\x00' * 100 + with tempfile.NamedTemporaryFile(mode="wb", suffix=".wav", delete=False) as f: + test_content = b"RIFF" + b"\x00" * 100 f.write(test_content) temp_path = f.name @@ -60,8 +60,8 @@ def test_convert_audio_to_base64_with_audio(mock_generation_task, tmp_path): """ audio_path = tmp_path / "audio" / "test.wav" audio_path.parent.mkdir(exist_ok=True) - with open(audio_path, 'wb') as f: - f.write(b'RIFF' + b'\x00' * 100) + with open(audio_path, "wb") as f: + f.write(b"RIFF" + b"\x00" * 100) message = {"role": "user", "content": "Describe this audio", "audio": {"path": str(audio_path)}} From 6f58a1bfd351e1c58c07d419971539f5b715db91 Mon Sep 17 00:00:00 2001 From: mmkrtchyan Date: Fri, 5 Dec 2025 18:29:34 +0400 Subject: [PATCH 30/38] Fix drop_binary_data missing messages check Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index 7c721a5d3f..8e72c1ae77 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -592,6 +592,10 @@ def dump_outputs(self, outputs, data_points, fout): def drop_binary_data(self, output): """Remove binary data (like base64 audio) from messages to keep output files smaller.""" + # Skip if output doesn't have messages (e.g., text completion mode or error cases) + if "messages" not in output: + return + for message in output["messages"]: # Skip if content is not a list (e.g., string content in system messages) if not isinstance(message.get("content"), list): From 0f1b86d10efd8fb160062d9ac35bbcb4b821ef26 Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Wed, 10 Dec 2025 08:37:35 -0800 Subject: [PATCH 31/38] Enable audio chunking Signed-off-by: George Zelenfroind clean up Signed-off-by: George Zelenfroind --- nemo_skills/inference/generate.py | 82 +++----- nemo_skills/inference/model/vllm.py | 304 ++++++++++++++++++++++++++++ 2 files changed, 331 insertions(+), 55 deletions(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index 8e72c1ae77..f6d7a8a145 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -13,9 +13,9 @@ # limitations under the License. import asyncio -import base64 import json import logging +import os import random import shutil import subprocess @@ -190,6 +190,11 @@ class GenerateSolutionsConfig: # List of content types to drop from messages (e.g., base64 audio) to keep output files smaller drop_content_types: list[str] = field(default_factory=lambda: ["audio_url"]) + # Audio chunking configuration + enable_audio_chunking: bool = True + audio_chunk_task_types: list[str] | None = None # If None, chunk all task types; if specified, only chunk these + chunk_audio_threshold_sec: int = 30 # Duration in seconds for each audio chunk + # Evaluation setup if requested. If eval_type is set to None, evaluation is skipped eval_type: str | None = None # "lean4-proof", "math", etc. eval_config: dict = field(default_factory=dict) # Config for the evaluator @@ -384,6 +389,21 @@ def setup_prompt(self): def setup_llm(self): self.sandbox = get_sandbox(**self.cfg.sandbox) if self.cfg.sandbox is not None else None + # Prepare data_dir for audio processing + data_dir = "" + if "data_dir" in self.cfg.eval_config and not ( + isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None)) + ): + data_dir = os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type) + + # Prepare audio chunking config + audio_chunking_config = { + "data_dir": data_dir, + "enable_audio_chunking": self.cfg.enable_audio_chunking, + "audio_chunk_task_types": self.cfg.audio_chunk_task_types, + "chunk_audio_threshold_sec": self.cfg.chunk_audio_threshold_sec, + } + if self.cfg.code_execution: llm = get_code_execution_model(**self.cfg.server, tokenizer=self.tokenizer, sandbox=self.sandbox) elif self.cfg.tool_modules is not None: @@ -395,7 +415,7 @@ def setup_llm(self): additional_config={"sandbox": self.cfg.sandbox}, ) else: - llm = get_model(**self.cfg.server, tokenizer=self.tokenizer) + llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, **audio_chunking_config) if self.cfg.parallel_thinking.mode is not None: # We don't want to override these key variables which overlap with self.cfg @@ -529,63 +549,11 @@ def preprocess_prompt(self, prompt: str | list[dict]) -> str | list[dict]: """Preprocess the prompt before sending to the model. Override this method to add custom preprocessing logic. - By default, auto-detects and handles audio file path to base64 conversion. + Audio conversion is now handled by the model (e.g., VLLMModel). """ if not isinstance(prompt, list): return prompt - # Auto-detect and convert audio file paths to base64 - prompt = [self._convert_audio_to_base64(message) for message in prompt] - - return prompt - - def _audio_file_to_base64(self, audio_file_path: str) -> str: - """Encodes an audio file into a base64 string.""" - with open(audio_file_path, "rb") as audio_file: - audio_content = audio_file.read() - return base64.b64encode(audio_content).decode("utf-8") - - def _convert_audio_to_base64(self, message: dict) -> dict: - """Convert audio file paths in a message to base64 inline content. - - Looks for 'audio' or 'audios' keys in the message and converts them - to base64-encoded audio_url content items. Removes the original keys - after conversion to prevent double-processing by model-specific code. - - CRITICAL: Audio must come BEFORE text for Qwen Audio to transcribe correctly. - """ - if "audio" not in message and "audios" not in message: - return message - - audio_items = [] - - # Handle single audio - if "audio" in message: - audio = message.pop("audio") - base64_audio = self._audio_file_to_base64(audio["path"]) - audio_items.append({"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}}) - - # Handle multiple audios - if "audios" in message: - for audio in message.pop("audios"): - base64_audio = self._audio_file_to_base64(audio["path"]) - audio_items.append( - {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}} - ) - - # Convert existing content to list format and place AFTER audio - content = message.get("content", "") - if isinstance(content, str): - text_items = [{"type": "text", "text": content}] if content else [] - elif isinstance(content, list): - text_items = content - else: - raise TypeError(f"Unsupported content type: {type(content)}") - - message["content"] = audio_items + text_items - - return message - def dump_outputs(self, outputs, data_points, fout): for output in outputs: fout.write(json.dumps(output) + "\n") @@ -664,6 +632,10 @@ async def process_single_datapoint(self, data_point, all_data): "stop_phrases": [self.cfg.stop_phrase] if self.cfg.stop_phrase else None, } + # Pass task_type for audio chunking + if "task_type" in data_point: + generation_params["task_type"] = data_point["task_type"] + if self.cfg.code_execution: if self.cfg.override_max_code_executions and self.cfg.total_code_executions_in_prompt is not None: generation_params["max_code_executions"] = data_point["total_code_executions"] diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py index fb3de3f0e7..894425a874 100644 --- a/nemo_skills/inference/model/vllm.py +++ b/nemo_skills/inference/model/vllm.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import base64 import logging +import os import requests @@ -24,7 +26,102 @@ LOG = logging.getLogger(get_logger_name(__file__)) +def audio_file_to_base64(audio_file_path: str) -> str: + """Encodes an audio file into a base64 string.""" + with open(audio_file_path, "rb") as audio_file: + audio_content = audio_file.read() + return base64.b64encode(audio_content).decode("utf-8") + + +def load_audio_file(audio_file_path: str): + """Load audio file and return array and sampling rate.""" + import soundfile as sf + + audio_array, sampling_rate = sf.read(audio_file_path) + return audio_array, sampling_rate + + +def chunk_audio(audio_array, sampling_rate, chunk_duration_sec=30): + """Chunk audio array into segments of specified duration. + + Args: + audio_array: Audio data as numpy array + sampling_rate: Sampling rate in Hz + chunk_duration_sec: Duration of each chunk in seconds + + Returns: + List of audio chunks + """ + import numpy as np + + chunk_samples = int(chunk_duration_sec * sampling_rate) + num_chunks = int(np.ceil(len(audio_array) / chunk_samples)) + + chunks = [] + for i in range(num_chunks): + start = i * chunk_samples + end = min((i + 1) * chunk_samples, len(audio_array)) + chunks.append(audio_array[start:end]) + + return chunks + + +def save_audio_chunk_to_base64(audio_chunk, sampling_rate): + """Save audio chunk to temporary file and convert to base64. + + Args: + audio_chunk: Audio data as numpy array + sampling_rate: Sampling rate in Hz + + Returns: + Base64 encoded audio string + """ + import tempfile + + import soundfile as sf + + # Create temporary file + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file: + tmp_path = tmp_file.name + sf.write(tmp_path, audio_chunk, sampling_rate) + + try: + # Read and encode + with open(tmp_path, "rb") as f: + audio_content = f.read() + encoded = base64.b64encode(audio_content).decode("utf-8") + finally: + # Clean up + if os.path.exists(tmp_path): + os.unlink(tmp_path) + + return encoded + + class VLLMModel(BaseModel): + def __init__( + self, + data_dir: str = "", + enable_audio_chunking: bool = True, + audio_chunk_task_types: list[str] | None = None, + chunk_audio_threshold_sec: int = 30, + **kwargs, + ): + """Initialize VLLMModel with audio chunking support. + + Args: + data_dir: Base directory for audio files + enable_audio_chunking: Master switch for audio chunking + audio_chunk_task_types: If None, chunk all task types; if specified, only chunk these + chunk_audio_threshold_sec: Audio duration threshold for chunking + **kwargs: Other parameters passed to BaseModel + """ + self.data_dir = data_dir + self.enable_audio_chunking = enable_audio_chunking + self.audio_chunk_task_types = audio_chunk_task_types + self.chunk_audio_threshold_sec = chunk_audio_threshold_sec + super().__init__(**kwargs) + def _get_tokenizer_endpoint(self): """ Returns the tokenizer endpoint if available, otherwise returns None. @@ -42,6 +139,64 @@ def _get_tokenizer_endpoint(self): except requests.exceptions.RequestException: return None + def content_text_to_list(self, message): + """Convert message content with audio to proper list format. + + Handles 'audio' or 'audios' keys in messages and converts them to + base64-encoded audio_url content items. + + CRITICAL: Audio must come BEFORE text for Qwen models to transcribe correctly. + """ + if "audio" in message or "audios" in message: + content = message["content"] + if isinstance(content, str): + message["content"] = [{"type": "text", "text": content}] + elif isinstance(content, list): + message["content"] = content + else: + raise TypeError(str(content)) + + audio_items = [] + + if "audio" in message: + audio = message["audio"] + base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"])) + audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}} + audio_items.append(audio_message) + elif "audios" in message: + for audio in message["audios"]: + base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"])) + audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}} + audio_items.append(audio_message) + + # Insert audio items at the BEGINNING of content list (before text) + if audio_items: + message["content"] = audio_items + message["content"] + + return message + + def _preprocess_messages_for_model(self, messages: list[dict]) -> list[dict]: + """Preprocess messages based on model-specific requirements. + + Remove /no_think suffix from system message as many models don't + recognize it and it degrades performance (especially Qwen). + """ + processed_messages = [] + for msg in messages: + msg_copy = msg.copy() + + if msg_copy.get("role") == "system" and isinstance(msg_copy.get("content"), str): + content = msg_copy["content"] + if "/no_think" in content: + LOG.info(f"[PREPROCESS] BEFORE: '{content}'") + content = content.replace(" /no_think", "").replace("/no_think", "") + msg_copy["content"] = content.strip() + LOG.info(f"[PREPROCESS] AFTER: '{msg_copy['content']}'") + + processed_messages.append(msg_copy) + + return processed_messages + def _build_request_body(self, top_k, min_p, repetition_penalty, extra_body: dict = None): full_extra_body = { "min_p": min_p, @@ -96,6 +251,152 @@ def _build_completion_request_params( "extra_body": self._build_request_body(top_k, min_p, repetition_penalty, extra_body=extra_body), } + async def generate_async( + self, + prompt: str | list[dict] | None = None, + tokens_to_generate: int | None = None, + task_type: str = None, + **kwargs, + ) -> dict: + """Generate with automatic audio chunking for long audio files. + + This override checks if the prompt (messages) contains long audio. + If so, it chunks the audio, processes each chunk separately, and aggregates results. + """ + if isinstance(prompt, list): + messages = prompt + needs_chunking, audio_path, duration = self._needs_audio_chunking(messages, task_type) + + if needs_chunking: + import re + + audio_array, sampling_rate = load_audio_file(audio_path) + chunks = chunk_audio(audio_array, sampling_rate, self.chunk_audio_threshold_sec) + + chunk_results = [] + for chunk_idx, audio_chunk in enumerate(chunks): + chunk_messages = [] + for msg in messages: + msg_copy = msg.copy() + if msg_copy.get("role") == "user" and ("audio" in msg_copy or "audios" in msg_copy): + chunk_base64 = save_audio_chunk_to_base64(audio_chunk, sampling_rate) + + content = msg_copy.get("content", "") + if isinstance(content, str): + msg_copy["content"] = [{"type": "text", "text": content}] + + # Add audio chunk at the beginning (before text) + msg_copy["content"] = [ + {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{chunk_base64}"}} + ] + msg_copy["content"] + + # Remove original audio fields to avoid double processing + msg_copy.pop("audio", None) + msg_copy.pop("audios", None) + + chunk_messages.append(msg_copy) + + # Preprocess messages to strip /no_think for Qwen models + chunk_messages = self._preprocess_messages_for_model(chunk_messages) + + # Generate for this chunk (pass as prompt, which accepts list[dict]) + result = await super().generate_async( + prompt=chunk_messages, tokens_to_generate=tokens_to_generate, **kwargs + ) + + generation = result.get("generation", "") + + # Post-process Qwen2-Audio output using AudioBench's + # https://github.com/AudioLLMs/AudioBench/blob/main/src/model_src/qwen2_audio_7b_instruct.py + + # Possible issue: Remove SRT subtitle timestamps + timestamp_pattern = r"\d+\s+\d{2}:\d{2}:\d{2},\d{3}\s+-->\s+\d{2}:\d{2}:\d{2},\d{3}\s+" + generation = re.sub(timestamp_pattern, "", generation) + timestamp_pattern_no_num = r"\d{2}:\d{2}:\d{2},\d{3}\s+-->\s+\d{2}:\d{2}:\d{2},\d{3}\s*" + generation = re.sub(timestamp_pattern_no_num, "", generation) + generation = re.sub(r"\\n\d+\s+(?=\d{2}:\d{2})", " ", generation) + generation = re.sub(r"\n\d+\s+(?=\d{2}:\d{2})", " ", generation) + + # === AudioBench's way of processing output === + + # Step 1: Extract from double quotes + match = re.search(r'"((?:\\.|[^"\\])*)"', generation) + if match: + generation = match.group(1) + + # Handle colon-quote patterns + if ":'" in generation: + generation = "'" + generation.split(":'")[1] + elif ": '" in generation: + generation = "'" + generation.split(": '")[1] + + # Step 3: ALWAYS extract from single quotes + match = re.search(r"'(.*)'", generation) + if match: + generation = match.group(1) + + chunk_results.append(generation.strip()) + + # Aggregate all chunks + aggregated_text = " ".join(chunk_results) + + # Return result with aggregated generation + # Use the last chunk's result structure but replace generation + if result: + final_result = result.copy() + final_result["generation"] = aggregated_text + final_result["num_audio_chunks"] = len(chunks) + final_result["audio_duration"] = duration + else: + final_result = { + "generation": aggregated_text, + "num_audio_chunks": len(chunks), + "audio_duration": duration, + } + + return final_result + + # Default behavior for non-chunked audio or non-list prompts + return await super().generate_async(prompt=prompt, tokens_to_generate=tokens_to_generate, **kwargs) + + def _needs_audio_chunking(self, messages: list[dict], task_type: str = None) -> tuple[bool, str, float]: + """Check if audio in messages needs chunking. + + Modified to support all task types by default, with optional filtering. + + Returns: + Tuple of (needs_chunking, audio_path, duration) + """ + if not self.enable_audio_chunking: + return False, None, 0.0 + + # Check if task type should be chunked (if filter is specified) + if self.audio_chunk_task_types is not None: + if task_type not in self.audio_chunk_task_types: + return False, None, 0.0 + + # Find audio in messages + for msg in messages: + if msg.get("role") == "user": + audio_info = msg.get("audio") or (msg.get("audios", [{}])[0] if msg.get("audios") else {}) + if audio_info and "path" in audio_info: + audio_path = os.path.join(self.data_dir, audio_info["path"]) + + if not os.path.exists(audio_path): + return False, None, 0.0 + + # Load audio to check duration + try: + audio_array, sampling_rate = load_audio_file(audio_path) + duration = len(audio_array) / sampling_rate + + if duration > self.chunk_audio_threshold_sec: + return True, audio_path, duration + except Exception: + pass + + return False, None, 0.0 + def _build_chat_request_params( self, messages: list[dict], @@ -114,6 +415,9 @@ def _build_chat_request_params( tools: list[dict] | None = None, extra_body: dict = None, ) -> dict: + # Preprocess messages for model-specific requirements (e.g., remove /no_think for Qwen) + messages = self._preprocess_messages_for_model(messages) + messages = [self.content_text_to_list(message) for message in messages] request = { "messages": messages, "max_tokens": tokens_to_generate, From 32ed94cef86d0bc0a8e7d7563e9f3ad19ff98838 Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Wed, 10 Dec 2025 09:35:59 -0800 Subject: [PATCH 32/38] remove qwen specific post processing Signed-off-by: George Zelenfroind --- nemo_skills/inference/model/vllm.py | 35 ----------------------------- 1 file changed, 35 deletions(-) diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py index 894425a874..8dea2a5071 100644 --- a/nemo_skills/inference/model/vllm.py +++ b/nemo_skills/inference/model/vllm.py @@ -268,8 +268,6 @@ async def generate_async( needs_chunking, audio_path, duration = self._needs_audio_chunking(messages, task_type) if needs_chunking: - import re - audio_array, sampling_rate = load_audio_file(audio_path) chunks = chunk_audio(audio_array, sampling_rate, self.chunk_audio_threshold_sec) @@ -296,48 +294,15 @@ async def generate_async( chunk_messages.append(msg_copy) - # Preprocess messages to strip /no_think for Qwen models chunk_messages = self._preprocess_messages_for_model(chunk_messages) - # Generate for this chunk (pass as prompt, which accepts list[dict]) result = await super().generate_async( prompt=chunk_messages, tokens_to_generate=tokens_to_generate, **kwargs ) generation = result.get("generation", "") - - # Post-process Qwen2-Audio output using AudioBench's - # https://github.com/AudioLLMs/AudioBench/blob/main/src/model_src/qwen2_audio_7b_instruct.py - - # Possible issue: Remove SRT subtitle timestamps - timestamp_pattern = r"\d+\s+\d{2}:\d{2}:\d{2},\d{3}\s+-->\s+\d{2}:\d{2}:\d{2},\d{3}\s+" - generation = re.sub(timestamp_pattern, "", generation) - timestamp_pattern_no_num = r"\d{2}:\d{2}:\d{2},\d{3}\s+-->\s+\d{2}:\d{2}:\d{2},\d{3}\s*" - generation = re.sub(timestamp_pattern_no_num, "", generation) - generation = re.sub(r"\\n\d+\s+(?=\d{2}:\d{2})", " ", generation) - generation = re.sub(r"\n\d+\s+(?=\d{2}:\d{2})", " ", generation) - - # === AudioBench's way of processing output === - - # Step 1: Extract from double quotes - match = re.search(r'"((?:\\.|[^"\\])*)"', generation) - if match: - generation = match.group(1) - - # Handle colon-quote patterns - if ":'" in generation: - generation = "'" + generation.split(":'")[1] - elif ": '" in generation: - generation = "'" + generation.split(": '")[1] - - # Step 3: ALWAYS extract from single quotes - match = re.search(r"'(.*)'", generation) - if match: - generation = match.group(1) - chunk_results.append(generation.strip()) - # Aggregate all chunks aggregated_text = " ".join(chunk_results) # Return result with aggregated generation From 15f30fd6c91bd5d2810ff933eafd76d65364fbc5 Mon Sep 17 00:00:00 2001 From: mmkrtchyan Date: Tue, 16 Dec 2025 13:27:48 +0400 Subject: [PATCH 33/38] fixing audio chunking config Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index f6d7a8a145..ddb67ad9a1 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -404,6 +404,10 @@ def setup_llm(self): "chunk_audio_threshold_sec": self.cfg.chunk_audio_threshold_sec, } + # Only pass audio_chunking_config to models that support it (VLLM-based servers) + audio_supported_servers = {"vllm"} + server_type = self.cfg.server.get("server_type", "").lower() + if self.cfg.code_execution: llm = get_code_execution_model(**self.cfg.server, tokenizer=self.tokenizer, sandbox=self.sandbox) elif self.cfg.tool_modules is not None: @@ -414,8 +418,10 @@ def setup_llm(self): tokenizer=self.tokenizer, additional_config={"sandbox": self.cfg.sandbox}, ) - else: + elif server_type in audio_supported_servers: llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, **audio_chunking_config) + else: + llm = get_model(**self.cfg.server, tokenizer=self.tokenizer) if self.cfg.parallel_thinking.mode is not None: # We don't want to override these key variables which overlap with self.cfg From 8a0765a180d527a5a9f8dfd00cd1dc0895a545d9 Mon Sep 17 00:00:00 2001 From: mmkrtchyan Date: Tue, 16 Dec 2025 14:40:40 +0400 Subject: [PATCH 34/38] Fix audio tests to use VLLMModel instead of GenerationTask Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 3 +- tests/test_vllm_audio.py | 59 ++++++++++++++----------------- 2 files changed, 27 insertions(+), 35 deletions(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index ddb67ad9a1..ec3aeee248 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -557,8 +557,7 @@ def preprocess_prompt(self, prompt: str | list[dict]) -> str | list[dict]: Override this method to add custom preprocessing logic. Audio conversion is now handled by the model (e.g., VLLMModel). """ - if not isinstance(prompt, list): - return prompt + return prompt def dump_outputs(self, outputs, data_points, fout): for output in outputs: diff --git a/tests/test_vllm_audio.py b/tests/test_vllm_audio.py index 1d62b4e8b1..05cc2645eb 100644 --- a/tests/test_vllm_audio.py +++ b/tests/test_vllm_audio.py @@ -15,28 +15,14 @@ import base64 import os import tempfile -from unittest.mock import MagicMock +from unittest.mock import patch import pytest -from nemo_skills.inference.generate import GenerationTask +from nemo_skills.inference.model.vllm import VLLMModel, audio_file_to_base64 -@pytest.fixture -def mock_generation_task(): - """Create a mock GenerationTask for testing audio preprocessing.""" - mock_cfg = MagicMock() - mock_cfg.drop_content_types = ["audio_url"] - - task = MagicMock(spec=GenerationTask) - task.cfg = mock_cfg - # Use the real methods - task._audio_file_to_base64 = GenerationTask._audio_file_to_base64.__get__(task) - task._convert_audio_to_base64 = GenerationTask._convert_audio_to_base64.__get__(task) - return task - - -def test_audio_file_to_base64(mock_generation_task): +def test_audio_file_to_base64(): """Test basic audio file encoding to base64.""" with tempfile.NamedTemporaryFile(mode="wb", suffix=".wav", delete=False) as f: test_content = b"RIFF" + b"\x00" * 100 @@ -44,7 +30,7 @@ def test_audio_file_to_base64(mock_generation_task): temp_path = f.name try: - result = mock_generation_task._audio_file_to_base64(temp_path) + result = audio_file_to_base64(temp_path) assert isinstance(result, str) assert len(result) > 0 decoded = base64.b64decode(result) @@ -53,42 +39,51 @@ def test_audio_file_to_base64(mock_generation_task): os.unlink(temp_path) -def test_convert_audio_to_base64_with_audio(mock_generation_task, tmp_path): +@pytest.fixture +def mock_vllm_model(): + """Create a mock VLLMModel for testing audio preprocessing.""" + with patch.object(VLLMModel, "__init__", lambda self, **kwargs: None): + model = VLLMModel() + model.data_dir = "" + return model + + +def test_content_text_to_list_with_audio(mock_vllm_model, tmp_path): """Test converting string content with audio to list format. CRITICAL: Audio must come BEFORE text for Qwen Audio to transcribe correctly. """ - audio_path = tmp_path / "audio" / "test.wav" - audio_path.parent.mkdir(exist_ok=True) + audio_path = tmp_path / "test.wav" with open(audio_path, "wb") as f: f.write(b"RIFF" + b"\x00" * 100) - message = {"role": "user", "content": "Describe this audio", "audio": {"path": str(audio_path)}} + # Set data_dir to tmp_path parent so path resolution works + mock_vllm_model.data_dir = str(tmp_path) + + message = {"role": "user", "content": "Describe this audio", "audio": {"path": "test.wav"}} - result = mock_generation_task._convert_audio_to_base64(message) + result = mock_vllm_model.content_text_to_list(message) assert isinstance(result["content"], list) assert len(result["content"]) == 2 assert result["content"][0]["type"] == "audio_url" assert result["content"][0]["audio_url"]["url"].startswith("data:audio/wav;base64,") assert result["content"][1]["type"] == "text" - assert "audio" not in result -def test_convert_audio_to_base64_with_multiple_audios(mock_generation_task, tmp_path): +def test_content_text_to_list_with_multiple_audios(mock_vllm_model, tmp_path): """Test handling message with multiple audio files. CRITICAL: Audio must come BEFORE text for Qwen Audio to transcribe correctly. """ - audio_dir = tmp_path / "audio" - audio_dir.mkdir(exist_ok=True) - audio_paths = [] for i in range(2): - audio_path = audio_dir / f"test_{i}.wav" + audio_path = tmp_path / f"test_{i}.wav" with open(audio_path, "wb") as f: f.write(b"RIFF" + b"\x00" * 100) - audio_paths.append(str(audio_path)) + audio_paths.append(f"test_{i}.wav") + + mock_vllm_model.data_dir = str(tmp_path) message = { "role": "user", @@ -96,7 +91,7 @@ def test_convert_audio_to_base64_with_multiple_audios(mock_generation_task, tmp_ "audios": [{"path": audio_paths[0]}, {"path": audio_paths[1]}], } - result = mock_generation_task._convert_audio_to_base64(message) + result = mock_vllm_model.content_text_to_list(message) assert isinstance(result["content"], list) assert len(result["content"]) == 3 @@ -104,5 +99,3 @@ def test_convert_audio_to_base64_with_multiple_audios(mock_generation_task, tmp_ assert result["content"][0]["type"] == "audio_url" assert result["content"][1]["type"] == "audio_url" assert result["content"][2]["type"] == "text" - # Original audios key should be removed - assert "audios" not in result From f8c3089daaca298bb38724e516ece553aea24cb0 Mon Sep 17 00:00:00 2001 From: mmkrtchyan Date: Tue, 16 Dec 2025 17:10:06 +0400 Subject: [PATCH 35/38] adding vllm integration test to run_qwen.sh Signed-off-by: mmkrtchyan --- nemo_skills/inference/generate.py | 9 +++++++++ tests/gpu-tests/run_qwen.sh | 4 ++++ tests/gpu-tests/test_vllm_audio.py | 6 +++--- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index ec3aeee248..820a175479 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -549,6 +549,15 @@ def fill_prompt(self, data_point, data): filled_prompt[-1]["content"] += self.cfg.prompt_suffix else: filled_prompt += self.cfg.prompt_suffix + + # Copy audio fields from data_point to the user message for audio models + if isinstance(filled_prompt, list): + for msg in filled_prompt: + if msg.get("role") == "user": + if "audio" in data_point: + msg["audio"] = data_point["audio"] + break + return filled_prompt def preprocess_prompt(self, prompt: str | list[dict]) -> str | list[dict]: diff --git a/tests/gpu-tests/run_qwen.sh b/tests/gpu-tests/run_qwen.sh index 10201cf9ed..01803c1dae 100755 --- a/tests/gpu-tests/run_qwen.sh +++ b/tests/gpu-tests/run_qwen.sh @@ -20,6 +20,10 @@ pytest tests/gpu-tests/test_contamination.py -s -x # Tool calling tests (uses same Qwen3-4B-Instruct model) pytest tests/gpu-tests/test_tool_calling.py -s -x +# Audio tests (requires Qwen2.5-Omni model) +export NEMO_SKILLS_TEST_HF_MODEL=Qwen/Qwen2.5-Omni-7B +pytest tests/gpu-tests/test_vllm_audio.py -s -x + # TODO: Add fast context retry tests # pytest tests/gpu-tests/test_context_retry.py -s -x diff --git a/tests/gpu-tests/test_vllm_audio.py b/tests/gpu-tests/test_vllm_audio.py index 9272a6d85f..2f5d33ef94 100644 --- a/tests/gpu-tests/test_vllm_audio.py +++ b/tests/gpu-tests/test_vllm_audio.py @@ -37,11 +37,11 @@ def test_vllm_audio_generation(): with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f: test_data = [ { - "problem": "Transcribe this audio", + "question": "Transcribe this audio", "audio": {"path": "/nemo_run/code/tests/slurm-tests/asr_nim/wavs/t2_16.wav"}, }, { - "problem": "What is in this audio?", + "question": "What is in this audio?", "audio": {"path": "/nemo_run/code/tests/slurm-tests/asr_nim/wavs/t3_16.wav"}, }, ] @@ -60,7 +60,7 @@ def test_vllm_audio_generation(): f" --server_nodes 1 " f" --server_args '--enforce-eager' " f" --input_file={input_file} " - f" ++prompt_config=openai " + f" ++prompt_config=generic/default " f" ++skip_filled=False " ) subprocess.run(cmd, shell=True, check=True) From a88736e3c6dba18407256cfe358262f7bdbace80 Mon Sep 17 00:00:00 2001 From: George Armstrong Date: Mon, 22 Dec 2025 16:41:47 -0800 Subject: [PATCH 36/38] move audio processing into model (#1137) Signed-off-by: George Armstrong --- nemo_skills/inference/generate.py | 50 ++- nemo_skills/inference/model/__init__.py | 42 ++ .../inference/model/audio_processor.py | 374 ++++++++++++++++++ nemo_skills/inference/model/vllm.py | 271 +------------ 4 files changed, 443 insertions(+), 294 deletions(-) create mode 100644 nemo_skills/inference/model/audio_processor.py diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index 820a175479..d994feaca8 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -15,7 +15,6 @@ import asyncio import json import logging -import os import random import shutil import subprocess @@ -39,6 +38,8 @@ supports_single_eval, ) from nemo_skills.inference.model import ( + AudioProcessor, + AudioProcessorConfig, ParallelThinkingConfig, get_code_execution_model, get_model, @@ -190,10 +191,10 @@ class GenerateSolutionsConfig: # List of content types to drop from messages (e.g., base64 audio) to keep output files smaller drop_content_types: list[str] = field(default_factory=lambda: ["audio_url"]) - # Audio chunking configuration - enable_audio_chunking: bool = True - audio_chunk_task_types: list[str] | None = None # If None, chunk all task types; if specified, only chunk these - chunk_audio_threshold_sec: int = 30 # Duration in seconds for each audio chunk + # Audio processing configuration (EXPERIMENTAL) + # Set to enable audio file preprocessing (file->base64 conversion, chunking for long audio) + # Example: ++audio.data_dir=/path/to/audio ++audio.enable_chunking=true + audio: AudioProcessorConfig | None = None # Evaluation setup if requested. If eval_type is set to None, evaluation is skipped eval_type: str | None = None # "lean4-proof", "math", etc. @@ -389,25 +390,6 @@ def setup_prompt(self): def setup_llm(self): self.sandbox = get_sandbox(**self.cfg.sandbox) if self.cfg.sandbox is not None else None - # Prepare data_dir for audio processing - data_dir = "" - if "data_dir" in self.cfg.eval_config and not ( - isinstance(self.cfg.eval_config["data_dir"], type(None)) or isinstance(self.cfg.eval_type, type(None)) - ): - data_dir = os.path.join(self.cfg.eval_config["data_dir"], self.cfg.eval_type) - - # Prepare audio chunking config - audio_chunking_config = { - "data_dir": data_dir, - "enable_audio_chunking": self.cfg.enable_audio_chunking, - "audio_chunk_task_types": self.cfg.audio_chunk_task_types, - "chunk_audio_threshold_sec": self.cfg.chunk_audio_threshold_sec, - } - - # Only pass audio_chunking_config to models that support it (VLLM-based servers) - audio_supported_servers = {"vllm"} - server_type = self.cfg.server.get("server_type", "").lower() - if self.cfg.code_execution: llm = get_code_execution_model(**self.cfg.server, tokenizer=self.tokenizer, sandbox=self.sandbox) elif self.cfg.tool_modules is not None: @@ -418,11 +400,25 @@ def setup_llm(self): tokenizer=self.tokenizer, additional_config={"sandbox": self.cfg.sandbox}, ) - elif server_type in audio_supported_servers: - llm = get_model(**self.cfg.server, tokenizer=self.tokenizer, **audio_chunking_config) else: llm = get_model(**self.cfg.server, tokenizer=self.tokenizer) + # Audio wrapper (preprocesses messages before they reach the model) + if self.cfg.audio is not None: + audio_supported_servers = {"vllm"} + server_type = self.cfg.server.get("server_type", "").lower() + if server_type not in audio_supported_servers: + raise ValueError( + f"Audio processing is not supported for server_type='{server_type}'. " + f"Supported server types: {audio_supported_servers}" + ) + llm = AudioProcessor( + llm, + self.cfg.audio, + eval_config=dict(self.cfg.eval_config), + eval_type=self.cfg.eval_type, + ) + if self.cfg.parallel_thinking.mode is not None: # We don't want to override these key variables which overlap with self.cfg inference_override_config = { @@ -564,7 +560,7 @@ def preprocess_prompt(self, prompt: str | list[dict]) -> str | list[dict]: """Preprocess the prompt before sending to the model. Override this method to add custom preprocessing logic. - Audio conversion is now handled by the model (e.g., VLLMModel). + Audio conversion is handled by the AudioProcessor wrapper. """ return prompt diff --git a/nemo_skills/inference/model/__init__.py b/nemo_skills/inference/model/__init__.py index bd2e246499..214c28b500 100644 --- a/nemo_skills/inference/model/__init__.py +++ b/nemo_skills/inference/model/__init__.py @@ -12,6 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. +"""Model wrappers and factories for inference. + +This module provides: +- Base model implementations (VLLMModel, OpenAIModel, etc.) +- Wrappers for additional capabilities: + - AudioProcessor: Audio file preprocessing and chunking + - CodeExecutionWrapper: Code execution in sandboxes + - ToolCallingWrapper: Tool/function calling support + - ParallelThinkingTask: Parallel thinking/reasoning +- Factory functions: get_model, get_code_execution_model, get_tool_calling_model, etc. +""" + import dataclasses from nemo_skills.mcp.utils import locate @@ -19,6 +31,9 @@ # NIM models (speech) from .asr_nim import ASRNIMModel + +# Audio processing +from .audio_processor import AudioProcessor, AudioProcessorConfig from .azure import AzureOpenAIModel # Base classes @@ -80,6 +95,33 @@ def get_model(server_type, tokenizer=None, model_class: str | None = None, **kwa return loaded_class(tokenizer=tokenizer, **kwargs) +def get_audio_model(server_type, audio_config: dict | AudioProcessorConfig | None = None, **kwargs): + """A helper function to create a model with audio processing capabilities. + + This wraps the base model with AudioProcessor for handling audio files in messages. + + Args: + server_type: The type of server (vllm, sglang, openai, etc.) + audio_config: Audio processing configuration. Can be: + - None: No audio processing (returns base model) + - dict: Will be converted to AudioProcessorConfig + - AudioProcessorConfig: Used directly + **kwargs: Additional arguments passed to get_model() + + Returns: + Model optionally wrapped with AudioProcessor + """ + model = get_model(server_type=server_type, **kwargs) + + if audio_config is None: + return model + + if isinstance(audio_config, dict): + audio_config = AudioProcessorConfig(**audio_config) + + return AudioProcessor(model, audio_config) + + def get_code_execution_model(server_type, tokenizer=None, code_execution=None, sandbox=None, **kwargs): """A helper function to make it easier to set server through cmd.""" model = get_model(server_type=server_type, tokenizer=tokenizer, **kwargs) diff --git a/nemo_skills/inference/model/audio_processor.py b/nemo_skills/inference/model/audio_processor.py new file mode 100644 index 0000000000..29c49b80b1 --- /dev/null +++ b/nemo_skills/inference/model/audio_processor.py @@ -0,0 +1,374 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Audio processing wrapper for multimodal models. + +This module provides an AudioProcessor wrapper that can be composed with any +BaseModel to add audio preprocessing capabilities. It handles: +- Converting audio file paths to base64-encoded audio_url format +- Chunking long audio files for models with duration limits +- Aggregating results from chunked audio processing +""" + +import base64 +import logging +import os + +from nemo_skills.utils import get_logger_name, nested_dataclass + +LOG = logging.getLogger(get_logger_name(__file__)) + + +@nested_dataclass(kw_only=True) +class AudioProcessorConfig: + """Configuration for audio preprocessing. + + Attributes: + data_dir: Base directory for resolving relative audio file paths. + enable_chunking: Whether to chunk long audio files. + chunk_task_types: If None, chunk all task types; if specified, only chunk these. + chunk_threshold_sec: Audio duration threshold (in seconds) above which to chunk. + """ + + data_dir: str = "" + enable_chunking: bool = True + chunk_task_types: list[str] | None = None + chunk_threshold_sec: int = 30 + + +def audio_file_to_base64(audio_file_path: str) -> str: + """Encodes an audio file into a base64 string.""" + with open(audio_file_path, "rb") as audio_file: + audio_content = audio_file.read() + return base64.b64encode(audio_content).decode("utf-8") + + +def load_audio_file(audio_file_path: str): + """Load audio file and return array and sampling rate.""" + import soundfile as sf + + audio_array, sampling_rate = sf.read(audio_file_path) + return audio_array, sampling_rate + + +def chunk_audio(audio_array, sampling_rate, chunk_duration_sec=30): + """Chunk audio array into segments of specified duration. + + Args: + audio_array: Audio data as numpy array + sampling_rate: Sampling rate in Hz + chunk_duration_sec: Duration of each chunk in seconds + + Returns: + List of audio chunks + """ + import numpy as np + + chunk_samples = int(chunk_duration_sec * sampling_rate) + num_chunks = int(np.ceil(len(audio_array) / chunk_samples)) + + chunks = [] + for i in range(num_chunks): + start = i * chunk_samples + end = min((i + 1) * chunk_samples, len(audio_array)) + chunks.append(audio_array[start:end]) + + return chunks + + +def save_audio_chunk_to_base64(audio_chunk, sampling_rate) -> str: + """Save audio chunk to temporary file and convert to base64. + + Args: + audio_chunk: Audio data as numpy array + sampling_rate: Sampling rate in Hz + + Returns: + Base64 encoded audio string + """ + import tempfile + + import soundfile as sf + + # Create temporary file + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file: + tmp_path = tmp_file.name + sf.write(tmp_path, audio_chunk, sampling_rate) + + try: + # Read and encode + with open(tmp_path, "rb") as f: + audio_content = f.read() + encoded = base64.b64encode(audio_content).decode("utf-8") + finally: + # Clean up + if os.path.exists(tmp_path): + os.unlink(tmp_path) + + return encoded + + +class AudioProcessor: + """Wraps any model to add audio preprocessing capabilities. + + This wrapper handles: + - Converting audio file paths in messages to base64-encoded audio_url format + - Chunking long audio files and aggregating results + - Passing through all other requests unchanged + + Example usage: + model = get_model(server_type="vllm", ...) + audio_model = AudioProcessor(model, AudioProcessorConfig(), eval_config={...}, eval_type="audio") + result = await audio_model.generate_async(prompt=messages, ...) + """ + + def __init__( + self, + model, + config: AudioProcessorConfig, + eval_config: dict | None = None, + eval_type: str | None = None, + ): + """Initialize AudioProcessor wrapper. + + Args: + model: The underlying model to wrap (must have generate_async method) + config: Audio processing configuration + eval_config: Optional eval config dict (contains "data_dir" key) for inferring data_dir + eval_type: Optional eval type string for inferring data_dir + """ + self.model = model + self.config = config + + # Resolve data_dir: explicit config takes precedence, then infer from eval_config + if config.data_dir: + self.data_dir = config.data_dir + elif eval_config is not None and eval_type is not None: + eval_data_dir = eval_config.get("data_dir") + if eval_data_dir is not None: + self.data_dir = os.path.join(eval_data_dir, eval_type) + else: + self.data_dir = "" + else: + self.data_dir = "" + + # Expose common model attributes for compatibility + if hasattr(model, "model_name_or_path"): + self.model_name_or_path = model.model_name_or_path + if hasattr(model, "tokenizer"): + self.tokenizer = model.tokenizer + + async def generate_async( + self, + prompt: str | list[dict] | None = None, + task_type: str = None, + **kwargs, + ) -> dict: + """Generate with automatic audio preprocessing and chunking. + + If the prompt contains audio that needs chunking, processes each chunk + separately and aggregates results. Otherwise, converts audio to base64 + and passes through to the underlying model. + + Args: + prompt: Either a string (text completion) or list of messages (chat) + task_type: Optional task type for chunking filtering + **kwargs: Additional arguments passed to the underlying model + + Returns: + Generation result dict with 'generation' key and optional metadata + """ + if isinstance(prompt, list): + messages = prompt + needs_chunking, audio_path, duration = self._check_chunking_needed(messages, task_type) + + if needs_chunking: + return await self._generate_with_chunking(messages, audio_path, duration, **kwargs) + + # Convert audio fields to base64 format + messages = self._prepare_audio_messages(messages) + prompt = messages + + return await self.model.generate_async(prompt=prompt, **kwargs) + + def _prepare_audio_messages(self, messages: list[dict]) -> list[dict]: + """Convert audio file references in messages to base64-encoded audio_url format. + + Handles 'audio' or 'audios' keys in messages and converts them to + base64-encoded audio_url content items. + + CRITICAL: Audio must come BEFORE text for Qwen models to transcribe correctly. + """ + prepared_messages = [] + + for message in messages: + msg = message.copy() + + if "audio" not in msg and "audios" not in msg: + prepared_messages.append(msg) + continue + + # Convert content to list format if needed + content = msg.get("content", "") + if isinstance(content, str): + text_content = [{"type": "text", "text": content}] + elif isinstance(content, list): + text_content = content + else: + raise TypeError(f"Unexpected content type: {type(content)}") + + # Build audio content items + audio_items = [] + + if "audio" in msg: + audio = msg["audio"] + audio_path = os.path.join(self.data_dir, audio["path"]) + base64_audio = audio_file_to_base64(audio_path) + audio_items.append( + {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}} + ) + del msg["audio"] + elif "audios" in msg: + for audio in msg["audios"]: + audio_path = os.path.join(self.data_dir, audio["path"]) + base64_audio = audio_file_to_base64(audio_path) + audio_items.append( + {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}} + ) + del msg["audios"] + + # Audio items BEFORE text content (required for Qwen models) + msg["content"] = audio_items + text_content + prepared_messages.append(msg) + + return prepared_messages + + def _check_chunking_needed(self, messages: list[dict], task_type: str = None) -> tuple[bool, str, float]: + """Check if audio in messages needs chunking. + + Returns: + Tuple of (needs_chunking, audio_path, duration) + """ + if not self.config.enable_chunking: + return False, None, 0.0 + + # Check if task type should be chunked (if filter is specified) + if self.config.chunk_task_types is not None: + if task_type not in self.config.chunk_task_types: + return False, None, 0.0 + + # Find audio in messages + for msg in messages: + if msg.get("role") == "user": + audio_info = msg.get("audio") or (msg.get("audios", [{}])[0] if msg.get("audios") else {}) + if audio_info and "path" in audio_info: + audio_path = os.path.join(self.data_dir, audio_info["path"]) + + if not os.path.exists(audio_path): + return False, None, 0.0 + + # Load audio to check duration + try: + audio_array, sampling_rate = load_audio_file(audio_path) + duration = len(audio_array) / sampling_rate + + if duration > self.config.chunk_threshold_sec: + return True, audio_path, duration + except Exception: + pass + + return False, None, 0.0 + + async def _generate_with_chunking( + self, + messages: list[dict], + audio_path: str, + duration: float, + tokens_to_generate: int | None = None, + **kwargs, + ) -> dict: + """Generate by chunking long audio and aggregating results. + + Args: + messages: Original messages containing audio reference + audio_path: Path to the audio file to chunk + duration: Duration of audio in seconds + tokens_to_generate: Max tokens per chunk + **kwargs: Additional generation parameters + + Returns: + Aggregated result with combined generation from all chunks + """ + audio_array, sampling_rate = load_audio_file(audio_path) + chunks = chunk_audio(audio_array, sampling_rate, self.config.chunk_threshold_sec) + + LOG.info(f"Chunking audio ({duration:.1f}s) into {len(chunks)} chunks of {self.config.chunk_threshold_sec}s") + + chunk_results = [] + result = None + + for chunk_idx, audio_chunk in enumerate(chunks): + chunk_messages = [] + + for msg in messages: + msg_copy = msg.copy() + + if msg_copy.get("role") == "user" and ("audio" in msg_copy or "audios" in msg_copy): + chunk_base64 = save_audio_chunk_to_base64(audio_chunk, sampling_rate) + + content = msg_copy.get("content", "") + if isinstance(content, str): + text_content = [{"type": "text", "text": content}] + else: + text_content = content + + # Add audio chunk at the beginning (before text) + msg_copy["content"] = [ + {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{chunk_base64}"}} + ] + text_content + + # Remove original audio fields + msg_copy.pop("audio", None) + msg_copy.pop("audios", None) + + chunk_messages.append(msg_copy) + + result = await self.model.generate_async( + prompt=chunk_messages, tokens_to_generate=tokens_to_generate, **kwargs + ) + + generation = result.get("generation", "") + chunk_results.append(generation.strip()) + + # Aggregate results + aggregated_text = " ".join(chunk_results) + + if result: + final_result = result.copy() + final_result["generation"] = aggregated_text + final_result["num_audio_chunks"] = len(chunks) + final_result["audio_duration"] = duration + else: + final_result = { + "generation": aggregated_text, + "num_audio_chunks": len(chunks), + "audio_duration": duration, + } + + return final_result + + # Proxy other common methods to the underlying model + def __getattr__(self, name): + """Proxy attribute access to the underlying model.""" + return getattr(self.model, name) diff --git a/nemo_skills/inference/model/vllm.py b/nemo_skills/inference/model/vllm.py index 8dea2a5071..ecf75617e1 100644 --- a/nemo_skills/inference/model/vllm.py +++ b/nemo_skills/inference/model/vllm.py @@ -12,9 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import base64 import logging -import os import requests @@ -26,101 +24,12 @@ LOG = logging.getLogger(get_logger_name(__file__)) -def audio_file_to_base64(audio_file_path: str) -> str: - """Encodes an audio file into a base64 string.""" - with open(audio_file_path, "rb") as audio_file: - audio_content = audio_file.read() - return base64.b64encode(audio_content).decode("utf-8") - - -def load_audio_file(audio_file_path: str): - """Load audio file and return array and sampling rate.""" - import soundfile as sf - - audio_array, sampling_rate = sf.read(audio_file_path) - return audio_array, sampling_rate - - -def chunk_audio(audio_array, sampling_rate, chunk_duration_sec=30): - """Chunk audio array into segments of specified duration. - - Args: - audio_array: Audio data as numpy array - sampling_rate: Sampling rate in Hz - chunk_duration_sec: Duration of each chunk in seconds - - Returns: - List of audio chunks - """ - import numpy as np - - chunk_samples = int(chunk_duration_sec * sampling_rate) - num_chunks = int(np.ceil(len(audio_array) / chunk_samples)) - - chunks = [] - for i in range(num_chunks): - start = i * chunk_samples - end = min((i + 1) * chunk_samples, len(audio_array)) - chunks.append(audio_array[start:end]) - - return chunks - - -def save_audio_chunk_to_base64(audio_chunk, sampling_rate): - """Save audio chunk to temporary file and convert to base64. - - Args: - audio_chunk: Audio data as numpy array - sampling_rate: Sampling rate in Hz - - Returns: - Base64 encoded audio string - """ - import tempfile - - import soundfile as sf - - # Create temporary file - with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file: - tmp_path = tmp_file.name - sf.write(tmp_path, audio_chunk, sampling_rate) - - try: - # Read and encode - with open(tmp_path, "rb") as f: - audio_content = f.read() - encoded = base64.b64encode(audio_content).decode("utf-8") - finally: - # Clean up - if os.path.exists(tmp_path): - os.unlink(tmp_path) - - return encoded - - class VLLMModel(BaseModel): - def __init__( - self, - data_dir: str = "", - enable_audio_chunking: bool = True, - audio_chunk_task_types: list[str] | None = None, - chunk_audio_threshold_sec: int = 30, - **kwargs, - ): - """Initialize VLLMModel with audio chunking support. + """VLLM-compatible model client. - Args: - data_dir: Base directory for audio files - enable_audio_chunking: Master switch for audio chunking - audio_chunk_task_types: If None, chunk all task types; if specified, only chunk these - chunk_audio_threshold_sec: Audio duration threshold for chunking - **kwargs: Other parameters passed to BaseModel - """ - self.data_dir = data_dir - self.enable_audio_chunking = enable_audio_chunking - self.audio_chunk_task_types = audio_chunk_task_types - self.chunk_audio_threshold_sec = chunk_audio_threshold_sec - super().__init__(**kwargs) + This is a clean OpenAI-compatible client for VLLM servers. + For audio processing capabilities, wrap this model with AudioProcessor. + """ def _get_tokenizer_endpoint(self): """ @@ -139,64 +48,6 @@ def _get_tokenizer_endpoint(self): except requests.exceptions.RequestException: return None - def content_text_to_list(self, message): - """Convert message content with audio to proper list format. - - Handles 'audio' or 'audios' keys in messages and converts them to - base64-encoded audio_url content items. - - CRITICAL: Audio must come BEFORE text for Qwen models to transcribe correctly. - """ - if "audio" in message or "audios" in message: - content = message["content"] - if isinstance(content, str): - message["content"] = [{"type": "text", "text": content}] - elif isinstance(content, list): - message["content"] = content - else: - raise TypeError(str(content)) - - audio_items = [] - - if "audio" in message: - audio = message["audio"] - base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"])) - audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}} - audio_items.append(audio_message) - elif "audios" in message: - for audio in message["audios"]: - base64_audio = audio_file_to_base64(os.path.join(self.data_dir, audio["path"])) - audio_message = {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{base64_audio}"}} - audio_items.append(audio_message) - - # Insert audio items at the BEGINNING of content list (before text) - if audio_items: - message["content"] = audio_items + message["content"] - - return message - - def _preprocess_messages_for_model(self, messages: list[dict]) -> list[dict]: - """Preprocess messages based on model-specific requirements. - - Remove /no_think suffix from system message as many models don't - recognize it and it degrades performance (especially Qwen). - """ - processed_messages = [] - for msg in messages: - msg_copy = msg.copy() - - if msg_copy.get("role") == "system" and isinstance(msg_copy.get("content"), str): - content = msg_copy["content"] - if "/no_think" in content: - LOG.info(f"[PREPROCESS] BEFORE: '{content}'") - content = content.replace(" /no_think", "").replace("/no_think", "") - msg_copy["content"] = content.strip() - LOG.info(f"[PREPROCESS] AFTER: '{msg_copy['content']}'") - - processed_messages.append(msg_copy) - - return processed_messages - def _build_request_body(self, top_k, min_p, repetition_penalty, extra_body: dict = None): full_extra_body = { "min_p": min_p, @@ -251,117 +102,6 @@ def _build_completion_request_params( "extra_body": self._build_request_body(top_k, min_p, repetition_penalty, extra_body=extra_body), } - async def generate_async( - self, - prompt: str | list[dict] | None = None, - tokens_to_generate: int | None = None, - task_type: str = None, - **kwargs, - ) -> dict: - """Generate with automatic audio chunking for long audio files. - - This override checks if the prompt (messages) contains long audio. - If so, it chunks the audio, processes each chunk separately, and aggregates results. - """ - if isinstance(prompt, list): - messages = prompt - needs_chunking, audio_path, duration = self._needs_audio_chunking(messages, task_type) - - if needs_chunking: - audio_array, sampling_rate = load_audio_file(audio_path) - chunks = chunk_audio(audio_array, sampling_rate, self.chunk_audio_threshold_sec) - - chunk_results = [] - for chunk_idx, audio_chunk in enumerate(chunks): - chunk_messages = [] - for msg in messages: - msg_copy = msg.copy() - if msg_copy.get("role") == "user" and ("audio" in msg_copy or "audios" in msg_copy): - chunk_base64 = save_audio_chunk_to_base64(audio_chunk, sampling_rate) - - content = msg_copy.get("content", "") - if isinstance(content, str): - msg_copy["content"] = [{"type": "text", "text": content}] - - # Add audio chunk at the beginning (before text) - msg_copy["content"] = [ - {"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{chunk_base64}"}} - ] + msg_copy["content"] - - # Remove original audio fields to avoid double processing - msg_copy.pop("audio", None) - msg_copy.pop("audios", None) - - chunk_messages.append(msg_copy) - - chunk_messages = self._preprocess_messages_for_model(chunk_messages) - - result = await super().generate_async( - prompt=chunk_messages, tokens_to_generate=tokens_to_generate, **kwargs - ) - - generation = result.get("generation", "") - chunk_results.append(generation.strip()) - - aggregated_text = " ".join(chunk_results) - - # Return result with aggregated generation - # Use the last chunk's result structure but replace generation - if result: - final_result = result.copy() - final_result["generation"] = aggregated_text - final_result["num_audio_chunks"] = len(chunks) - final_result["audio_duration"] = duration - else: - final_result = { - "generation": aggregated_text, - "num_audio_chunks": len(chunks), - "audio_duration": duration, - } - - return final_result - - # Default behavior for non-chunked audio or non-list prompts - return await super().generate_async(prompt=prompt, tokens_to_generate=tokens_to_generate, **kwargs) - - def _needs_audio_chunking(self, messages: list[dict], task_type: str = None) -> tuple[bool, str, float]: - """Check if audio in messages needs chunking. - - Modified to support all task types by default, with optional filtering. - - Returns: - Tuple of (needs_chunking, audio_path, duration) - """ - if not self.enable_audio_chunking: - return False, None, 0.0 - - # Check if task type should be chunked (if filter is specified) - if self.audio_chunk_task_types is not None: - if task_type not in self.audio_chunk_task_types: - return False, None, 0.0 - - # Find audio in messages - for msg in messages: - if msg.get("role") == "user": - audio_info = msg.get("audio") or (msg.get("audios", [{}])[0] if msg.get("audios") else {}) - if audio_info and "path" in audio_info: - audio_path = os.path.join(self.data_dir, audio_info["path"]) - - if not os.path.exists(audio_path): - return False, None, 0.0 - - # Load audio to check duration - try: - audio_array, sampling_rate = load_audio_file(audio_path) - duration = len(audio_array) / sampling_rate - - if duration > self.chunk_audio_threshold_sec: - return True, audio_path, duration - except Exception: - pass - - return False, None, 0.0 - def _build_chat_request_params( self, messages: list[dict], @@ -380,9 +120,6 @@ def _build_chat_request_params( tools: list[dict] | None = None, extra_body: dict = None, ) -> dict: - # Preprocess messages for model-specific requirements (e.g., remove /no_think for Qwen) - messages = self._preprocess_messages_for_model(messages) - messages = [self.content_text_to_list(message) for message in messages] request = { "messages": messages, "max_tokens": tokens_to_generate, From 35219879fb6111aa523cef6822767a2429904f95 Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Tue, 23 Dec 2025 08:01:38 -0800 Subject: [PATCH 37/38] fixing task type issue. 1) after refactoring AudioProcessor worked only if we pass ++audio arg. this is inconvenient 2) now we audio-enabling in based on dataset_group=speechlm or task_type=audio all audio containing benchmarks have one or both Signed-off-by: George Zelenfroind --- nemo_skills/inference/generate.py | 18 +++++++++++++++--- nemo_skills/pipeline/utils/eval.py | 6 ++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index d994feaca8..00993c5f66 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -199,6 +199,7 @@ class GenerateSolutionsConfig: # Evaluation setup if requested. If eval_type is set to None, evaluation is skipped eval_type: str | None = None # "lean4-proof", "math", etc. eval_config: dict = field(default_factory=dict) # Config for the evaluator + dataset_group: str | None = None # "math", "code", "speechlm", etc. from benchmark's DATASET_GROUP def __post_init__(self): self._post_init_validate_data() @@ -404,7 +405,14 @@ def setup_llm(self): llm = get_model(**self.cfg.server, tokenizer=self.tokenizer) # Audio wrapper (preprocesses messages before they reach the model) - if self.cfg.audio is not None: + # Auto-enable for audio benchmarks on vLLM (eval_type=audio OR dataset_group=speechlm) + should_enable_audio = ( + self.cfg.audio is not None or + (self.cfg.server.get("server_type", "").lower() == "vllm" and + (self.cfg.eval_type == "audio" or self.cfg.dataset_group == "speechlm")) + ) + + if should_enable_audio: audio_supported_servers = {"vllm"} server_type = self.cfg.server.get("server_type", "").lower() if server_type not in audio_supported_servers: @@ -412,9 +420,13 @@ def setup_llm(self): f"Audio processing is not supported for server_type='{server_type}'. " f"Supported server types: {audio_supported_servers}" ) + + # Use provided config or create default + audio_config = self.cfg.audio if self.cfg.audio is not None else AudioProcessorConfig() + llm = AudioProcessor( llm, - self.cfg.audio, + audio_config, eval_config=dict(self.cfg.eval_config), eval_type=self.cfg.eval_type, ) @@ -643,7 +655,7 @@ async def process_single_datapoint(self, data_point, all_data): } # Pass task_type for audio chunking - if "task_type" in data_point: + if isinstance(self.llm, AudioProcessor) and "task_type" in data_point: generation_params["task_type"] = data_point["task_type"] if self.cfg.code_execution: diff --git a/nemo_skills/pipeline/utils/eval.py b/nemo_skills/pipeline/utils/eval.py index 736d1c7cb6..d2b8a5f2ac 100644 --- a/nemo_skills/pipeline/utils/eval.py +++ b/nemo_skills/pipeline/utils/eval.py @@ -131,6 +131,12 @@ def get_benchmark_args_from_module( if eval_args: generation_args = f"{eval_args} {generation_args}" generation_args += f" ++eval_config.split={split} " + + # Pass dataset_group from benchmark config if defined + dataset_group = get_arg_from_module_or_dict(benchmark_module, "DATASET_GROUP", None, override_dict=override_dict) + if dataset_group: + generation_args += f" ++dataset_group={dataset_group} " + requires_sandbox = get_arg_from_module_or_dict(benchmark_module, "REQUIRES_SANDBOX", False, override_dict) keep_mounts_for_sandbox = get_arg_from_module_or_dict( benchmark_module, "KEEP_MOUNTS_FOR_SANDBOX", False, override_dict From 1e418e7032bab7d266363b786fe441bb5f63c4a4 Mon Sep 17 00:00:00 2001 From: George Zelenfroind Date: Tue, 23 Dec 2025 08:08:05 -0800 Subject: [PATCH 38/38] fix linter Signed-off-by: George Zelenfroind --- nemo_skills/inference/generate.py | 13 ++++++------- nemo_skills/pipeline/utils/eval.py | 4 ++-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py index 00993c5f66..fd46cc3a9b 100644 --- a/nemo_skills/inference/generate.py +++ b/nemo_skills/inference/generate.py @@ -406,12 +406,11 @@ def setup_llm(self): # Audio wrapper (preprocesses messages before they reach the model) # Auto-enable for audio benchmarks on vLLM (eval_type=audio OR dataset_group=speechlm) - should_enable_audio = ( - self.cfg.audio is not None or - (self.cfg.server.get("server_type", "").lower() == "vllm" and - (self.cfg.eval_type == "audio" or self.cfg.dataset_group == "speechlm")) + should_enable_audio = self.cfg.audio is not None or ( + self.cfg.server.get("server_type", "").lower() == "vllm" + and (self.cfg.eval_type == "audio" or self.cfg.dataset_group == "speechlm") ) - + if should_enable_audio: audio_supported_servers = {"vllm"} server_type = self.cfg.server.get("server_type", "").lower() @@ -420,10 +419,10 @@ def setup_llm(self): f"Audio processing is not supported for server_type='{server_type}'. " f"Supported server types: {audio_supported_servers}" ) - + # Use provided config or create default audio_config = self.cfg.audio if self.cfg.audio is not None else AudioProcessorConfig() - + llm = AudioProcessor( llm, audio_config, diff --git a/nemo_skills/pipeline/utils/eval.py b/nemo_skills/pipeline/utils/eval.py index d2b8a5f2ac..9245337064 100644 --- a/nemo_skills/pipeline/utils/eval.py +++ b/nemo_skills/pipeline/utils/eval.py @@ -131,12 +131,12 @@ def get_benchmark_args_from_module( if eval_args: generation_args = f"{eval_args} {generation_args}" generation_args += f" ++eval_config.split={split} " - + # Pass dataset_group from benchmark config if defined dataset_group = get_arg_from_module_or_dict(benchmark_module, "DATASET_GROUP", None, override_dict=override_dict) if dataset_group: generation_args += f" ++dataset_group={dataset_group} " - + requires_sandbox = get_arg_from_module_or_dict(benchmark_module, "REQUIRES_SANDBOX", False, override_dict) keep_mounts_for_sandbox = get_arg_from_module_or_dict( benchmark_module, "KEEP_MOUNTS_FOR_SANDBOX", False, override_dict