Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
07a408e
print
karpnv Nov 13, 2025
6acbc9f
content_text_to_list
karpnv Nov 13, 2025
78c6313
rm print
karpnv Nov 13, 2025
8f5c3a9
self.discard_binary_types
karpnv Nov 13, 2025
5e36e55
print
karpnv Nov 13, 2025
862be04
messages
karpnv Nov 14, 2025
6768b00
binary_data_to_drop_from_output
karpnv Nov 14, 2025
f522c0b
print
karpnv Nov 14, 2025
d48d43f
print
karpnv Nov 14, 2025
4487ca1
data_dir
karpnv Nov 14, 2025
c53d405
data_dir=''
karpnv Nov 14, 2025
a7cf5b5
eval_config
karpnv Nov 14, 2025
689e6fc
eval_type
karpnv Nov 14, 2025
9536b8b
if
karpnv Nov 15, 2025
155a480
or
karpnv Nov 15, 2025
0d9292d
print
karpnv Nov 15, 2025
c9d99a5
print
karpnv Nov 15, 2025
0c54cad
print
karpnv Nov 15, 2025
fc7a344
++val_type
karpnv Nov 15, 2025
c68f8af
audio
karpnv Nov 20, 2025
cfaf38a
rm data_dir
karpnv Nov 21, 2025
8186164
else
karpnv Nov 21, 2025
813d675
mmau-pro open-ended metrics improvement
melllinia Nov 26, 2025
1b7e715
pre-commit fixes
melllinia Nov 26, 2025
ea8f294
Add audio tests for vLLM server
melllinia Nov 26, 2025
206c7d6
making audio paths generic
melllinia Nov 27, 2025
289c694
fix generate.py
melllinia Dec 2, 2025
3cc7ae5
requested changes and audio/text order fix
melllinia Dec 5, 2025
5825f27
resolving conflicts and lint fix
melllinia Dec 5, 2025
6f58a1b
Fix drop_binary_data missing messages check
melllinia Dec 5, 2025
0f1b86d
Enable audio chunking
Jorjeous Dec 10, 2025
32ed94c
remove qwen specific post processing
Jorjeous Dec 10, 2025
30a1c7f
Merge branch 'main' into audio_bin
Jorjeous Dec 15, 2025
15f30fd
fixing audio chunking config
melllinia Dec 16, 2025
eaaba9b
Merge branch 'main' into audio_bin
melllinia Dec 16, 2025
8a0765a
Fix audio tests to use VLLMModel instead of GenerationTask
melllinia Dec 16, 2025
f8c3089
adding vllm integration test to run_qwen.sh
melllinia Dec 16, 2025
a88736e
move audio processing into model (#1137)
gwarmstrong Dec 23, 2025
3521987
fixing task type issue.
Jorjeous Dec 23, 2025
1e418e7
fix linter
Jorjeous Dec 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions nemo_skills/dataset/mmau-pro/closed_form/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
METRICS_TYPE = "mmau_pro_closed_form"
SCORE_MODULE = "nemo_skills.evaluation.metrics.mmau_pro_metrics"
GENERATION_ARGS = "++prompt_format=openai"
EVAL_ARGS = "++eval_type=mmau-pro"

# NVEmbed judge configuration for closed-form evaluation
JUDGE_PIPELINE_ARGS = {
Expand Down
2 changes: 1 addition & 1 deletion nemo_skills/dataset/mmau-pro/open_ended/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@
"server_type": "openai",
"server_address": "https://integrate.api.nvidia.com/v1",
}
JUDGE_ARGS = "++prompt_config=judge/speechlm ++generation_key=judgement"
JUDGE_ARGS = "++prompt_config=judge/mmau-pro ++generation_key=judgement"
23 changes: 14 additions & 9 deletions nemo_skills/dataset/mmau-pro/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,22 +75,27 @@ def format_entry(entry, with_audio=False):
if category == "open":
content = entry["question"]
elif choices and len(choices) > 1:
options_text = "\n".join(f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices))
content = f"{entry['question']}\n\n{options_text}"
options_text = "\n".join(f"{chr(65 + i)}) {choice}" for i, choice in enumerate(choices))
content = f"{entry['question']}\n\n{options_text}\n\nRespond with the complete text of the correct option, not just the letter."
else:
content = entry["question"]

user_message = {"role": "user", "content": content}

if entry.get("audio_path"):
audio_path = entry["audio_path"]

if isinstance(audio_path, list) and audio_path:
user_message["audios"] = [{"path": path, "duration": 10.0} for path in audio_path]
elif isinstance(audio_path, str):
user_message["audio"] = {"path": audio_path, "duration": 10.0}

formatted_entry["messages"] = [user_message]
# Prepend /dataset/mmau-pro/ to make paths absolute for cluster
if len(audio_path) == 1:
user_message["audio"] = {"path": f"/dataset/mmau-pro/{audio_path[0]}"}
else:
user_message["audios"] = [{"path": f"/dataset/mmau-pro/{path}"} for path in audio_path]

# Don't use /no_think for open-ended questions to allow reasoning
system_content = "You are a helpful assistant."
if category != "open":
system_content += " /no_think"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is specific to qwen models, I don't think we should be doing it globally here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Imho, this is not a problem for other models

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It certainly introduces a little mismatch, whether it's significant or not for accuracy is unclear, but I think it's much safer not to have it. What's the reason why we need to hardcode it here?


formatted_entry["messages"] = [{"role": "system", "content": system_content}, user_message]
return formatted_entry


Expand Down
118 changes: 108 additions & 10 deletions nemo_skills/evaluation/metrics/mmau_pro_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,31 +13,77 @@
# limitations under the License.

import logging
import re

import numpy as np

from nemo_skills.evaluation.metrics.base import BaseMetrics, as_int, as_percentage
from nemo_skills.evaluation.metrics.utils import is_correct_judgement
from nemo_skills.utils import get_logger_name

LOG = logging.getLogger(get_logger_name(__file__))


def extract_multicriteria_scores(judgement_text: str) -> dict[str, float]:
    """Extract multi-criteria scores (1-5 scale) from LLM judge evaluation.

    Expected format:
        CORRECTNESS: [score] - [justification]
        RELEVANCE: [score] - [justification]
        COMPLETENESS: [score] - [justification]
        CLARITY: [score] - [justification]
        OVERALL: [score] - [overall assessment]

    Args:
        judgement_text: Raw judge output scanned (case-insensitively) for the
            criterion lines above.

    Returns:
        Dictionary with keys: correctness, relevance, completeness, clarity,
        overall. Any criterion not found defaults to 3.0. A missing OVERALL
        line is replaced by the mean of the four criterion scores.
    """
    patterns = {
        "correctness": r"CORRECTNESS:\s*(\d+(?:\.\d+)?)",
        "relevance": r"RELEVANCE:\s*(\d+(?:\.\d+)?)",
        "completeness": r"COMPLETENESS:\s*(\d+(?:\.\d+)?)",
        "clarity": r"CLARITY:\s*(\d+(?:\.\d+)?)",
        "overall": r"OVERALL:\s*(\d+(?:\.\d+)?)",
    }

    scores = {}
    overall_matched = False
    for criterion, pattern in patterns.items():
        match = re.search(pattern, judgement_text, re.IGNORECASE)
        if match:
            scores[criterion] = float(match.group(1))
            if criterion == "overall":
                overall_matched = True
        else:
            scores[criterion] = 3.0

    # Fallback: derive OVERALL only when the judge genuinely omitted it.
    # (The previous value-based check `scores["overall"] == 3.0` clobbered a
    # legitimately judged overall score of 3.0 with the criterion average.)
    if not overall_matched:
        criteria_scores = [scores[k] for k in ("correctness", "relevance", "completeness", "clarity")]
        scores["overall"] = sum(criteria_scores) / len(criteria_scores)

    return scores


class MMAUProMetrics(BaseMetrics):
"""Metrics class for MMAU-Pro benchmark (all subgroups)."""

def __init__(self, compute_no_answer: bool = True, max_k: int = 1):
    """Set up MMAU-Pro metric accumulators.

    Args:
        compute_no_answer: Forwarded to the base class; tracks unanswered entries.
        max_k: Maximum k considered for pass@k / majority@k aggregation.
    """
    super().__init__(compute_no_answer=compute_no_answer)
    self.max_k = max_k

    # Per-criterion judge scores (1-5 scale) for open-ended questions.
    # Built with a comprehension so each criterion gets its own fresh list.
    criteria = ("correctness", "relevance", "completeness", "clarity", "overall")
    self.multicriteria_scores = {criterion: [] for criterion in criteria}

def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]:
"""Extract correctness scores from prediction."""
score_dict = {}

# Open-ended: extract from judge result
# Open-ended: use LLM judge correctness score >= 3 as correct
if "judgement" in prediction:
judge_result = is_correct_judgement(prediction["judgement"])
score_dict["judge_correct"] = judge_result
score_dict["correct"] = judge_result
# Closed-form and instruction following: use is_correct
multicriteria = extract_multicriteria_scores(prediction["judgement"])
score_dict["correct"] = multicriteria.get("correctness", 3.0) >= 3.0
# Closed-form / instruction-following: use binary correctness
elif "is_correct" in prediction:
score_dict["correct"] = prediction["is_correct"]
else:
Expand All @@ -58,24 +104,61 @@ def get_incorrect_sample(self, prediction: dict) -> dict:
def update(self, predictions):
    """Update metrics with new predictions.

    Args:
        predictions: List of prediction dicts; each may carry a "generation"
            string and, for open-ended questions, a "judgement" string.
    """
    super().update(predictions)

    # Guard against a missing or None "generation": the previous
    # `pred.get("generation", None).strip()` raised AttributeError in that
    # case. Blank/whitespace-only generations normalize to None.
    predicted_answers = [(pred.get("generation") or "").strip() or None for pred in predictions]
    self._compute_pass_at_k(predictions=predictions, predicted_answers=predicted_answers)
    self._compute_majority_at_k(predictions=predictions, predicted_answers=predicted_answers)

    # Collect multi-criteria judge scores for open-ended questions; any
    # criterion absent from the parsed judgement defaults to 3.0.
    for pred in predictions:
        if "judgement" in pred:
            multicriteria = extract_multicriteria_scores(pred["judgement"])
            for criterion in self.multicriteria_scores:
                self.multicriteria_scores[criterion].append(multicriteria.get(criterion, 3.0))

def get_metrics(self):
    """Get computed metrics.

    Post-processes the base-class aggregation per agg_mode: fills defaults,
    folds in multi-criteria judge averages for open-ended questions, and
    rounds all float values.
    """
    metrics_dict = super().get_metrics()

    for agg_mode, agg_metrics in metrics_dict.items():
        # Ensure avg_tokens is present
        if "avg_tokens" not in agg_metrics:
            agg_metrics["avg_tokens"] = 0
        if "no_answer" in agg_metrics:
            # NOTE(review): halving no_answer looks benchmark-specific
            # (possibly correcting a double count) — confirm the intent.
            agg_metrics["no_answer"] = agg_metrics["no_answer"] / 2.0
        # Set success_rate from correct or judge_correct
        # NOTE(review): presumably "judge_correct" comes from an older
        # _get_score_dict; verify this branch is still reachable.
        if "judge_correct" in agg_metrics:
            agg_metrics["success_rate"] = agg_metrics["judge_correct"]

        # Add multi-criteria averages for open-ended (convert 1-5 scale to percentage)
        if self.multicriteria_scores["overall"]:
            for criterion in self.multicriteria_scores:
                scores = self.multicriteria_scores[criterion]
                if scores:
                    # Convert 1-5 scale to 0-100 percentage scale
                    avg_score = np.mean(scores)
                    std_score = np.std(scores)
                    agg_metrics[f"avg_{criterion}"] = (avg_score / 5.0) * 100
                    agg_metrics[f"std_{criterion}"] = (std_score / 5.0) * 100

            # Set correct and success_rate to avg_correctness for open-ended
            # (avg_correctness exists whenever "overall" is non-empty, since
            # all criteria are appended together in update()).
            agg_metrics["correct"] = agg_metrics["avg_correctness"]
            agg_metrics["success_rate"] = agg_metrics["avg_correctness"]

            # Calculate good/poor response rates based on overall >= 4 or <= 2
            overall_scores = self.multicriteria_scores["overall"]
            good_responses = sum(1 for score in overall_scores if score >= 4.0)
            poor_responses = sum(1 for score in overall_scores if score <= 2.0)

            agg_metrics["good_response_rate"] = (good_responses / len(overall_scores)) * 100
            agg_metrics["poor_response_rate"] = (poor_responses / len(overall_scores)) * 100

        # For closed-form / instruction-following: use binary correctness
        elif "correct" in agg_metrics:
            agg_metrics["success_rate"] = agg_metrics["correct"]

        # Round all numeric values to 2 decimal places
        for key, value in agg_metrics.items():
            if isinstance(value, float) and not isinstance(value, bool):
                agg_metrics[key] = round(value, 2)

    return metrics_dict

def metrics_to_print(self):
Expand All @@ -87,5 +170,20 @@ def metrics_to_print(self):
}
if self.compute_no_answer:
base_metrics["no_answer"] = as_percentage

# Add multi-criteria metrics for open-ended questions (now in percentage format)
if self.multicriteria_scores["overall"]:
base_metrics.update(
{
"avg_overall": as_percentage,
"avg_correctness": as_percentage,
"avg_relevance": as_percentage,
"avg_completeness": as_percentage,
"avg_clarity": as_percentage,
"good_response_rate": as_percentage,
"poor_response_rate": as_percentage,
}
)

base_metrics["num_entries"] = as_int
return base_metrics
82 changes: 81 additions & 1 deletion nemo_skills/inference/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
supports_single_eval,
)
from nemo_skills.inference.model import (
AudioProcessor,
AudioProcessorConfig,
ParallelThinkingConfig,
get_code_execution_model,
get_model,
Expand Down Expand Up @@ -186,9 +188,18 @@ class GenerateSolutionsConfig:
# If True, will enable litellm disk cache (useful for keeping intermediate results in case of job timelimit failures)
enable_litellm_cache: bool = False

# List of content types to drop from messages (e.g., base64 audio) to keep output files smaller
drop_content_types: list[str] = field(default_factory=lambda: ["audio_url"])

# Audio processing configuration (EXPERIMENTAL)
# Set to enable audio file preprocessing (file->base64 conversion, chunking for long audio)
# Example: ++audio.data_dir=/path/to/audio ++audio.enable_chunking=true
audio: AudioProcessorConfig | None = None

# Evaluation setup if requested. If eval_type is set to None, evaluation is skipped
eval_type: str | None = None # "lean4-proof", "math", etc.
eval_config: dict = field(default_factory=dict) # Config for the evaluator
dataset_group: str | None = None # "math", "code", "speechlm", etc. from benchmark's DATASET_GROUP

def __post_init__(self):
self._post_init_validate_data()
Expand Down Expand Up @@ -393,6 +404,32 @@ def setup_llm(self):
else:
llm = get_model(**self.cfg.server, tokenizer=self.tokenizer)

# Audio wrapper (preprocesses messages before they reach the model)
# Auto-enable for audio benchmarks on vLLM (eval_type=audio OR dataset_group=speechlm)
should_enable_audio = self.cfg.audio is not None or (
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd rather configure this through prepare.py of relevant benchmarks. Otherwise this logic is too coupled with datasets but this module is very general. So instead of dataset_group, I'd add a flag should_enable_audio and you can just set it as default parameter in the init of those benchmarks. You can still have a check for vllm to be used as a server and other logic for wrapping, just let's remove the dataset_group based behaviour

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got your point, when moving to separate class PR #1157 this would be auto-resolved

self.cfg.server.get("server_type", "").lower() == "vllm"
and (self.cfg.eval_type == "audio" or self.cfg.dataset_group == "speechlm")
)

if should_enable_audio:
audio_supported_servers = {"vllm"}
server_type = self.cfg.server.get("server_type", "").lower()
if server_type not in audio_supported_servers:
raise ValueError(
f"Audio processing is not supported for server_type='{server_type}'. "
f"Supported server types: {audio_supported_servers}"
)

# Use provided config or create default
audio_config = self.cfg.audio if self.cfg.audio is not None else AudioProcessorConfig()

llm = AudioProcessor(
llm,
audio_config,
eval_config=dict(self.cfg.eval_config),
eval_type=self.cfg.eval_type,
)

if self.cfg.parallel_thinking.mode is not None:
# We don't want to override these key variables which overlap with self.cfg
inference_override_config = {
Expand Down Expand Up @@ -519,12 +556,45 @@ def fill_prompt(self, data_point, data):
filled_prompt[-1]["content"] += self.cfg.prompt_suffix
else:
filled_prompt += self.cfg.prompt_suffix

# Copy audio fields from data_point to the user message for audio models
if isinstance(filled_prompt, list):
for msg in filled_prompt:
if msg.get("role") == "user":
if "audio" in data_point:
msg["audio"] = data_point["audio"]
break

return filled_prompt

def preprocess_prompt(self, prompt: str | list[dict]) -> str | list[dict]:
"""Preprocess the prompt before sending to the model.

Override this method to add custom preprocessing logic.
Audio conversion is handled by the AudioProcessor wrapper.
"""
return prompt

def dump_outputs(self, outputs, data_points, fout):
    """Serialize each generated output as one JSON line to *fout*.

    Args:
        outputs: Iterable of JSON-serializable output dicts.
        data_points: Unused here; kept for interface compatibility.
        fout: Writable text file object receiving JSONL lines.
    """
    fout.writelines(json.dumps(entry) + "\n" for entry in outputs)

def drop_binary_data(self, output):
    """Remove binary data (like base64 audio) from messages to keep output files smaller."""
    # Nothing to strip when there are no messages (e.g. text completion mode
    # or error cases).
    if "messages" not in output:
        return

    to_drop = self.cfg.drop_content_types
    for message in output["messages"]:
        parts = message.get("content")
        # String content (e.g. system messages) carries no typed parts.
        if not isinstance(parts, list):
            continue
        # Keep only the content parts whose type is not configured for removal.
        message["content"] = [part for part in parts if part.get("type") not in to_drop]

async def postprocess_single_output(self, output, original_data_point):
# to make it easier to follow up with other generations and limit accidental errors, we are adding
# all of the original data to the output file alongside the new generations
Expand All @@ -540,6 +610,9 @@ async def postprocess_single_output(self, output, original_data_point):
for key in output:
original_data_point.pop(key, None)
output.update(original_data_point)

self.drop_binary_data(output)

if self.cfg.parse_reasoning:
parse_reasoning(
output,
Expand Down Expand Up @@ -570,13 +643,20 @@ async def process_single_datapoint(self, data_point, all_data):
# Already a dict from Hydra
inference_params = dict(self.cfg.inference)

prompt = self.fill_prompt(data_point, all_data)
prompt = self.preprocess_prompt(prompt)

generation_params = {
**inference_params,
**self.extra_generate_params,
"prompt": self.fill_prompt(data_point, all_data),
"prompt": prompt,
"stop_phrases": [self.cfg.stop_phrase] if self.cfg.stop_phrase else None,
}

# Pass task_type for audio chunking
if isinstance(self.llm, AudioProcessor) and "task_type" in data_point:
generation_params["task_type"] = data_point["task_type"]

if self.cfg.code_execution:
if self.cfg.override_max_code_executions and self.cfg.total_code_executions_in_prompt is not None:
generation_params["max_code_executions"] = data_point["total_code_executions"]
Expand Down
Loading
Loading