Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
07a408e
print
karpnv Nov 13, 2025
6acbc9f
content_text_to_list
karpnv Nov 13, 2025
78c6313
rm print
karpnv Nov 13, 2025
8f5c3a9
self.discard_binary_types
karpnv Nov 13, 2025
5e36e55
print
karpnv Nov 13, 2025
862be04
messages
karpnv Nov 14, 2025
6768b00
binary_data_to_drop_from_output
karpnv Nov 14, 2025
f522c0b
print
karpnv Nov 14, 2025
d48d43f
print
karpnv Nov 14, 2025
4487ca1
data_dir
karpnv Nov 14, 2025
c53d405
data_dir=''
karpnv Nov 14, 2025
a7cf5b5
eval_config
karpnv Nov 14, 2025
689e6fc
eval_type
karpnv Nov 14, 2025
9536b8b
if
karpnv Nov 15, 2025
155a480
or
karpnv Nov 15, 2025
0d9292d
print
karpnv Nov 15, 2025
c9d99a5
print
karpnv Nov 15, 2025
0c54cad
print
karpnv Nov 15, 2025
fc7a344
++val_type
karpnv Nov 15, 2025
c68f8af
audio
karpnv Nov 20, 2025
cfaf38a
rm data_dir
karpnv Nov 21, 2025
8186164
else
karpnv Nov 21, 2025
813d675
mmau-pro open-ended metrics improvement
melllinia Nov 26, 2025
1b7e715
pre-commit fixes
melllinia Nov 26, 2025
ea8f294
Add audio tests for vLLM server
melllinia Nov 26, 2025
206c7d6
making audio paths generic
melllinia Nov 27, 2025
289c694
fix generate.py
melllinia Dec 2, 2025
3cc7ae5
requested changes and audio/text order fix
melllinia Dec 5, 2025
5825f27
resolving conflicts and lint fix
melllinia Dec 5, 2025
6f58a1b
Fix drop_binary_data missing messages check
melllinia Dec 5, 2025
0f1b86d
Enable audio chunking
Jorjeous Dec 10, 2025
32ed94c
remove qwen specific post processing
Jorjeous Dec 10, 2025
30a1c7f
Merge branch 'main' into audio_bin
Jorjeous Dec 15, 2025
15f30fd
fixing audio chunking config
melllinia Dec 16, 2025
eaaba9b
Merge branch 'main' into audio_bin
melllinia Dec 16, 2025
8a0765a
Fix audio tests to use VLLMModel instead of GenerationTask
melllinia Dec 16, 2025
f8c3089
adding vllm integration test to run_qwen.sh
melllinia Dec 16, 2025
a88736e
move audio processing into model (#1137)
gwarmstrong Dec 23, 2025
3521987
fixing task type issue.
Jorjeous Dec 23, 2025
1e418e7
fix linter
Jorjeous Dec 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions nemo_skills/dataset/mmau-pro/closed_form/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
METRICS_TYPE = "mmau_pro_closed_form"
SCORE_MODULE = "nemo_skills.evaluation.metrics.mmau_pro_metrics"
GENERATION_ARGS = "++prompt_format=openai"
EVAL_ARGS = "++eval_type=mmau-pro"

# NVEmbed judge configuration for closed-form evaluation
JUDGE_PIPELINE_ARGS = {
Expand Down
2 changes: 1 addition & 1 deletion nemo_skills/dataset/mmau-pro/open_ended/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@
"server_type": "openai",
"server_address": "https://integrate.api.nvidia.com/v1",
}
JUDGE_ARGS = "++prompt_config=judge/speechlm ++generation_key=judgement"
JUDGE_ARGS = "++prompt_config=judge/mmau-pro ++generation_key=judgement"
23 changes: 14 additions & 9 deletions nemo_skills/dataset/mmau-pro/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,22 +75,27 @@ def format_entry(entry, with_audio=False):
if category == "open":
content = entry["question"]
elif choices and len(choices) > 1:
options_text = "\n".join(f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices))
content = f"{entry['question']}\n\n{options_text}"
options_text = "\n".join(f"{chr(65 + i)}) {choice}" for i, choice in enumerate(choices))
content = f"{entry['question']}\n\n{options_text}\n\nRespond with the complete text of the correct option, not just the letter."
else:
content = entry["question"]

user_message = {"role": "user", "content": content}

if entry.get("audio_path"):
audio_path = entry["audio_path"]

if isinstance(audio_path, list) and audio_path:
user_message["audios"] = [{"path": path, "duration": 10.0} for path in audio_path]
elif isinstance(audio_path, str):
user_message["audio"] = {"path": audio_path, "duration": 10.0}

formatted_entry["messages"] = [user_message]
# Prepend /dataset/mmau-pro/ to make paths absolute for cluster
if len(audio_path) == 1:
user_message["audio"] = {"path": f"/dataset/mmau-pro/{audio_path[0]}"}
else:
user_message["audios"] = [{"path": f"/dataset/mmau-pro/{path}"} for path in audio_path]

# Don't use /no_think for open-ended questions to allow reasoning
system_content = "You are a helpful assistant."
if category != "open":
system_content += " /no_think"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is specific to qwen models, I don't think we should be doing it globally here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Imho, this is not a problem for other models

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It certainly introduces a little mismatch, whether it's significant or not for accuracy is unclear, but I think it's much safer not to have it. What's the reason why we need to hardcode it here?


formatted_entry["messages"] = [{"role": "system", "content": system_content}, user_message]
return formatted_entry


Expand Down
118 changes: 108 additions & 10 deletions nemo_skills/evaluation/metrics/mmau_pro_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,31 +13,77 @@
# limitations under the License.

import logging
import re

import numpy as np

from nemo_skills.evaluation.metrics.base import BaseMetrics, as_int, as_percentage
from nemo_skills.evaluation.metrics.utils import is_correct_judgement
from nemo_skills.utils import get_logger_name

LOG = logging.getLogger(get_logger_name(__file__))


def extract_multicriteria_scores(judgement_text: str) -> dict[str, float]:
    """Extract multi-criteria scores (1-5 scale) from LLM judge evaluation.

    Expected format:
        CORRECTNESS: [score] - [justification]
        RELEVANCE: [score] - [justification]
        COMPLETENESS: [score] - [justification]
        CLARITY: [score] - [justification]
        OVERALL: [score] - [overall assessment]

    Args:
        judgement_text: Raw judge output scanned (case-insensitively) for the
            criterion lines above.

    Returns:
        Dictionary with keys: correctness, relevance, completeness, clarity,
        overall. Any criterion not found defaults to 3.0. A missing OVERALL
        line is replaced by the mean of the four criterion scores.
    """
    patterns = {
        "correctness": r"CORRECTNESS:\s*(\d+(?:\.\d+)?)",
        "relevance": r"RELEVANCE:\s*(\d+(?:\.\d+)?)",
        "completeness": r"COMPLETENESS:\s*(\d+(?:\.\d+)?)",
        "clarity": r"CLARITY:\s*(\d+(?:\.\d+)?)",
        "overall": r"OVERALL:\s*(\d+(?:\.\d+)?)",
    }

    scores = {}
    overall_matched = False
    for criterion, pattern in patterns.items():
        match = re.search(pattern, judgement_text, re.IGNORECASE)
        if match:
            scores[criterion] = float(match.group(1))
            if criterion == "overall":
                overall_matched = True
        else:
            scores[criterion] = 3.0

    # Fallback: derive OVERALL only when the judge genuinely omitted it.
    # (The previous value-based check `scores["overall"] == 3.0` clobbered a
    # legitimately judged overall score of 3.0 with the criterion average.)
    if not overall_matched:
        criteria_scores = [scores[k] for k in ("correctness", "relevance", "completeness", "clarity")]
        scores["overall"] = sum(criteria_scores) / len(criteria_scores)

    return scores


class MMAUProMetrics(BaseMetrics):
"""Metrics class for MMAU-Pro benchmark (all subgroups)."""

def __init__(self, compute_no_answer: bool = True, max_k: int = 1):
    """Set up MMAU-Pro metric accumulators.

    Args:
        compute_no_answer: Forwarded to the base class; tracks unanswered entries.
        max_k: Maximum k considered for pass@k / majority@k aggregation.
    """
    super().__init__(compute_no_answer=compute_no_answer)
    self.max_k = max_k

    # Per-criterion judge scores (1-5 scale) for open-ended questions.
    # Built with a comprehension so each criterion gets its own fresh list.
    criteria = ("correctness", "relevance", "completeness", "clarity", "overall")
    self.multicriteria_scores = {criterion: [] for criterion in criteria}

def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]:
"""Extract correctness scores from prediction."""
score_dict = {}

# Open-ended: extract from judge result
# Open-ended: use LLM judge correctness score >= 3 as correct
if "judgement" in prediction:
judge_result = is_correct_judgement(prediction["judgement"])
score_dict["judge_correct"] = judge_result
score_dict["correct"] = judge_result
# Closed-form and instruction following: use is_correct
multicriteria = extract_multicriteria_scores(prediction["judgement"])
score_dict["correct"] = multicriteria.get("correctness", 3.0) >= 3.0
# Closed-form / instruction-following: use binary correctness
elif "is_correct" in prediction:
score_dict["correct"] = prediction["is_correct"]
else:
Expand All @@ -58,24 +104,61 @@ def get_incorrect_sample(self, prediction: dict) -> dict:
def update(self, predictions):
    """Update metrics with new predictions.

    Args:
        predictions: List of prediction dicts; each may carry a "generation"
            string and, for open-ended questions, a "judgement" string.
    """
    super().update(predictions)

    # Guard against a missing or None "generation": the previous
    # `pred.get("generation", None).strip()` raised AttributeError in that
    # case. Blank/whitespace-only generations normalize to None.
    predicted_answers = [(pred.get("generation") or "").strip() or None for pred in predictions]
    self._compute_pass_at_k(predictions=predictions, predicted_answers=predicted_answers)
    self._compute_majority_at_k(predictions=predictions, predicted_answers=predicted_answers)

    # Collect multi-criteria judge scores for open-ended questions; any
    # criterion absent from the parsed judgement defaults to 3.0.
    for pred in predictions:
        if "judgement" in pred:
            multicriteria = extract_multicriteria_scores(pred["judgement"])
            for criterion in self.multicriteria_scores:
                self.multicriteria_scores[criterion].append(multicriteria.get(criterion, 3.0))

def get_metrics(self):
    """Get computed metrics.

    Post-processes the base-class aggregation per agg_mode: fills defaults,
    folds in multi-criteria judge averages for open-ended questions, and
    rounds all float values.
    """
    metrics_dict = super().get_metrics()

    for agg_mode, agg_metrics in metrics_dict.items():
        # Ensure avg_tokens is present
        if "avg_tokens" not in agg_metrics:
            agg_metrics["avg_tokens"] = 0
        if "no_answer" in agg_metrics:
            # NOTE(review): halving no_answer looks benchmark-specific
            # (possibly correcting a double count) — confirm the intent.
            agg_metrics["no_answer"] = agg_metrics["no_answer"] / 2.0
        # Set success_rate from correct or judge_correct
        # NOTE(review): presumably "judge_correct" comes from an older
        # _get_score_dict; verify this branch is still reachable.
        if "judge_correct" in agg_metrics:
            agg_metrics["success_rate"] = agg_metrics["judge_correct"]

        # Add multi-criteria averages for open-ended (convert 1-5 scale to percentage)
        if self.multicriteria_scores["overall"]:
            for criterion in self.multicriteria_scores:
                scores = self.multicriteria_scores[criterion]
                if scores:
                    # Convert 1-5 scale to 0-100 percentage scale
                    avg_score = np.mean(scores)
                    std_score = np.std(scores)
                    agg_metrics[f"avg_{criterion}"] = (avg_score / 5.0) * 100
                    agg_metrics[f"std_{criterion}"] = (std_score / 5.0) * 100

            # Set correct and success_rate to avg_correctness for open-ended
            # (avg_correctness exists whenever "overall" is non-empty, since
            # all criteria are appended together in update()).
            agg_metrics["correct"] = agg_metrics["avg_correctness"]
            agg_metrics["success_rate"] = agg_metrics["avg_correctness"]

            # Calculate good/poor response rates based on overall >= 4 or <= 2
            overall_scores = self.multicriteria_scores["overall"]
            good_responses = sum(1 for score in overall_scores if score >= 4.0)
            poor_responses = sum(1 for score in overall_scores if score <= 2.0)

            agg_metrics["good_response_rate"] = (good_responses / len(overall_scores)) * 100
            agg_metrics["poor_response_rate"] = (poor_responses / len(overall_scores)) * 100

        # For closed-form / instruction-following: use binary correctness
        elif "correct" in agg_metrics:
            agg_metrics["success_rate"] = agg_metrics["correct"]

        # Round all numeric values to 2 decimal places
        for key, value in agg_metrics.items():
            if isinstance(value, float) and not isinstance(value, bool):
                agg_metrics[key] = round(value, 2)

    return metrics_dict

def metrics_to_print(self):
Expand All @@ -87,5 +170,20 @@ def metrics_to_print(self):
}
if self.compute_no_answer:
base_metrics["no_answer"] = as_percentage

# Add multi-criteria metrics for open-ended questions (now in percentage format)
if self.multicriteria_scores["overall"]:
base_metrics.update(
{
"avg_overall": as_percentage,
"avg_correctness": as_percentage,
"avg_relevance": as_percentage,
"avg_completeness": as_percentage,
"avg_clarity": as_percentage,
"good_response_rate": as_percentage,
"poor_response_rate": as_percentage,
}
)

base_metrics["num_entries"] = as_int
return base_metrics
82 changes: 81 additions & 1 deletion nemo_skills/inference/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
supports_single_eval,
)
from nemo_skills.inference.model import (
AudioProcessor,
AudioProcessorConfig,
ParallelThinkingConfig,
get_code_execution_model,
get_model,
Expand Down Expand Up @@ -186,9 +188,18 @@ class GenerateSolutionsConfig:
# If True, will enable litellm disk cache (useful for keeping intermediate results in case of job timelimit failures)
enable_litellm_cache: bool = False

# List of content types to drop from messages (e.g., base64 audio) to keep output files smaller
drop_content_types: list[str] = field(default_factory=lambda: ["audio_url"])

# Audio processing configuration (EXPERIMENTAL)
# Set to enable audio file preprocessing (file->base64 conversion, chunking for long audio)
# Example: ++audio.data_dir=/path/to/audio ++audio.enable_chunking=true
audio: AudioProcessorConfig | None = None

# Evaluation setup if requested. If eval_type is set to None, evaluation is skipped
eval_type: str | None = None # "lean4-proof", "math", etc.
eval_config: dict = field(default_factory=dict) # Config for the evaluator
dataset_group: str | None = None # "math", "code", "speechlm", etc. from benchmark's DATASET_GROUP

def __post_init__(self):
self._post_init_validate_data()
Expand Down Expand Up @@ -393,6 +404,32 @@ def setup_llm(self):
else:
llm = get_model(**self.cfg.server, tokenizer=self.tokenizer)

# Audio wrapper (preprocesses messages before they reach the model)
# Auto-enable for audio benchmarks on vLLM (eval_type=audio OR dataset_group=speechlm)
should_enable_audio = self.cfg.audio is not None or (
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd rather configure this through prepare.py of relevant benchmarks. Otherwise this logic is too coupled with datasets but this module is very general. So instead of dataset_group, I'd add a flag should_enable_audio and you can just set it as default parameter in the init of those benchmarks. You can still have a check for vllm to be used as a server and other logic for wrapping, just let's remove the dataset_group based behaviour

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got your point, when moving to separate class PR #1157 this would be auto-resolved

self.cfg.server.get("server_type", "").lower() == "vllm"
and (self.cfg.eval_type == "audio" or self.cfg.dataset_group == "speechlm")
)

if should_enable_audio:
audio_supported_servers = {"vllm"}
server_type = self.cfg.server.get("server_type", "").lower()
if server_type not in audio_supported_servers:
raise ValueError(
f"Audio processing is not supported for server_type='{server_type}'. "
f"Supported server types: {audio_supported_servers}"
)

# Use provided config or create default
audio_config = self.cfg.audio if self.cfg.audio is not None else AudioProcessorConfig()

llm = AudioProcessor(
llm,
audio_config,
eval_config=dict(self.cfg.eval_config),
eval_type=self.cfg.eval_type,
)

if self.cfg.parallel_thinking.mode is not None:
# We don't want to override these key variables which overlap with self.cfg
inference_override_config = {
Expand Down Expand Up @@ -519,12 +556,45 @@ def fill_prompt(self, data_point, data):
filled_prompt[-1]["content"] += self.cfg.prompt_suffix
else:
filled_prompt += self.cfg.prompt_suffix

# Copy audio fields from data_point to the user message for audio models
if isinstance(filled_prompt, list):
for msg in filled_prompt:
if msg.get("role") == "user":
if "audio" in data_point:
msg["audio"] = data_point["audio"]
break

return filled_prompt

def preprocess_prompt(self, prompt: str | list[dict]) -> str | list[dict]:
"""Preprocess the prompt before sending to the model.

Override this method to add custom preprocessing logic.
Audio conversion is handled by the AudioProcessor wrapper.
"""
return prompt

def dump_outputs(self, outputs, data_points, fout):
    """Serialize each generated output as one JSON line to *fout*.

    Args:
        outputs: Iterable of JSON-serializable output dicts.
        data_points: Unused here; kept for interface compatibility.
        fout: Writable text file object receiving JSONL lines.
    """
    fout.writelines(json.dumps(entry) + "\n" for entry in outputs)

def drop_binary_data(self, output):
    """Remove binary data (like base64 audio) from messages to keep output files smaller."""
    # Nothing to strip when there are no messages (e.g. text completion mode
    # or error cases).
    if "messages" not in output:
        return

    to_drop = self.cfg.drop_content_types
    for message in output["messages"]:
        parts = message.get("content")
        # String content (e.g. system messages) carries no typed parts.
        if not isinstance(parts, list):
            continue
        # Keep only the content parts whose type is not configured for removal.
        message["content"] = [part for part in parts if part.get("type") not in to_drop]

async def postprocess_single_output(self, output, original_data_point):
# to make it easier to follow up with other generations and limit accidental errors, we are adding
# all of the original data to the output file alongside the new generations
Expand All @@ -540,6 +610,9 @@ async def postprocess_single_output(self, output, original_data_point):
for key in output:
original_data_point.pop(key, None)
output.update(original_data_point)

self.drop_binary_data(output)

if self.cfg.parse_reasoning:
parse_reasoning(
output,
Expand Down Expand Up @@ -570,13 +643,20 @@ async def process_single_datapoint(self, data_point, all_data):
# Already a dict from Hydra
inference_params = dict(self.cfg.inference)

prompt = self.fill_prompt(data_point, all_data)
prompt = self.preprocess_prompt(prompt)

generation_params = {
**inference_params,
**self.extra_generate_params,
"prompt": self.fill_prompt(data_point, all_data),
"prompt": prompt,
"stop_phrases": [self.cfg.stop_phrase] if self.cfg.stop_phrase else None,
}

# Pass task_type for audio chunking
if isinstance(self.llm, AudioProcessor) and "task_type" in data_point:
generation_params["task_type"] = data_point["task_type"]

if self.cfg.code_execution:
if self.cfg.override_max_code_executions and self.cfg.total_code_executions_in_prompt is not None:
generation_params["max_code_executions"] = data_point["total_code_executions"]
Expand Down
Loading
Loading