-
Notifications
You must be signed in to change notification settings - Fork 163
Request to vLLM with audio message #1042
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
07a408e
6acbc9f
78c6313
8f5c3a9
5e36e55
862be04
6768b00
f522c0b
d48d43f
4487ca1
c53d405
a7cf5b5
689e6fc
9536b8b
155a480
0d9292d
c9d99a5
0c54cad
fc7a344
c68f8af
cfaf38a
8186164
813d675
1b7e715
ea8f294
206c7d6
289c694
3cc7ae5
5825f27
6f58a1b
0f1b86d
32ed94c
30a1c7f
15f30fd
eaaba9b
8a0765a
f8c3089
a88736e
3521987
1e418e7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,31 +13,77 @@ | |
| # limitations under the License. | ||
|
|
||
| import logging | ||
| import re | ||
|
|
||
| import numpy as np | ||
|
|
||
| from nemo_skills.evaluation.metrics.base import BaseMetrics, as_int, as_percentage | ||
| from nemo_skills.evaluation.metrics.utils import is_correct_judgement | ||
| from nemo_skills.utils import get_logger_name | ||
|
|
||
| LOG = logging.getLogger(get_logger_name(__file__)) | ||
|
|
||
|
|
||
| def extract_multicriteria_scores(judgement_text: str) -> dict[str, float]: | ||
| """Extract multi-criteria scores (1-5 scale) from LLM judge evaluation. | ||
|
|
||
| Expected format: | ||
| CORRECTNESS: [score] - [justification] | ||
| RELEVANCE: [score] - [justification] | ||
| COMPLETENESS: [score] - [justification] | ||
| CLARITY: [score] - [justification] | ||
| OVERALL: [score] - [overall assessment] | ||
|
|
||
| Returns: | ||
| Dictionary with keys: correctness, relevance, completeness, clarity, overall | ||
| Defaults to 3.0 if score not found. | ||
| """ | ||
| scores = {} | ||
|
|
||
| patterns = { | ||
| "correctness": r"CORRECTNESS:\s*(\d+(?:\.\d+)?)", | ||
| "relevance": r"RELEVANCE:\s*(\d+(?:\.\d+)?)", | ||
| "completeness": r"COMPLETENESS:\s*(\d+(?:\.\d+)?)", | ||
| "clarity": r"CLARITY:\s*(\d+(?:\.\d+)?)", | ||
| "overall": r"OVERALL:\s*(\d+(?:\.\d+)?)", | ||
| } | ||
|
|
||
| for criterion, pattern in patterns.items(): | ||
| match = re.search(pattern, judgement_text, re.IGNORECASE) | ||
| scores[criterion] = float(match.group(1)) if match else 3.0 | ||
|
|
||
| # Fallback: compute overall if missing or still 3.0 | ||
| if "overall" not in scores or scores["overall"] == 3.0: | ||
| criteria_scores = [scores.get(k, 3.0) for k in ["correctness", "relevance", "completeness", "clarity"]] | ||
| scores["overall"] = sum(criteria_scores) / len(criteria_scores) | ||
|
Comment on lines
+54
to
+57
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fallback logic incorrectly overwrites legitimate score of 3.0. The condition Consider tracking whether the overall score was actually matched vs. defaulted: for criterion, pattern in patterns.items():
match = re.search(pattern, judgement_text, re.IGNORECASE)
- scores[criterion] = float(match.group(1)) if match else 3.0
+ if match:
+ scores[criterion] = float(match.group(1))
+ else:
+ scores[criterion] = 3.0
+ if criterion == "overall":
+ scores["_overall_missing"] = True
- # Fallback: compute overall if missing or still 3.0
- if "overall" not in scores or scores["overall"] == 3.0:
+ # Fallback: compute overall if missing
+ if scores.get("_overall_missing", False):
criteria_scores = [scores.get(k, 3.0) for k in ["correctness", "relevance", "completeness", "clarity"]]
scores["overall"] = sum(criteria_scores) / len(criteria_scores)
+ del scores["_overall_missing"]
return scores
🤖 Prompt for AI Agents |
||
|
|
||
| return scores | ||
|
|
||
|
|
||
| class MMAUProMetrics(BaseMetrics): | ||
| """Metrics class for MMAU-Pro benchmark (all subgroups).""" | ||
|
|
||
| def __init__(self, compute_no_answer: bool = True, max_k: int = 1): | ||
| super().__init__(compute_no_answer=compute_no_answer) | ||
| self.max_k = max_k | ||
|
|
||
| # Track multi-criteria scores for open-ended questions (1-5 scale) | ||
| self.multicriteria_scores = { | ||
| "correctness": [], | ||
| "relevance": [], | ||
| "completeness": [], | ||
| "clarity": [], | ||
| "overall": [], | ||
| } | ||
|
|
||
| def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]: | ||
| """Extract correctness scores from prediction.""" | ||
| score_dict = {} | ||
|
|
||
| # Open-ended: extract from judge result | ||
| # Open-ended: use LLM judge correctness score >= 3 as correct | ||
| if "judgement" in prediction: | ||
| judge_result = is_correct_judgement(prediction["judgement"]) | ||
| score_dict["judge_correct"] = judge_result | ||
| score_dict["correct"] = judge_result | ||
| # Closed-form and instruction following: use is_correct | ||
| multicriteria = extract_multicriteria_scores(prediction["judgement"]) | ||
| score_dict["correct"] = multicriteria.get("correctness", 3.0) >= 3.0 | ||
| # Closed-form / instruction-following: use binary correctness | ||
| elif "is_correct" in prediction: | ||
| score_dict["correct"] = prediction["is_correct"] | ||
| else: | ||
|
|
@@ -58,24 +104,61 @@ def get_incorrect_sample(self, prediction: dict) -> dict: | |
| def update(self, predictions): | ||
| """Update metrics with new predictions.""" | ||
| super().update(predictions) | ||
|
|
||
| predicted_answers = [pred.get("generation", None).strip() or None for pred in predictions] | ||
| self._compute_pass_at_k(predictions=predictions, predicted_answers=predicted_answers) | ||
| self._compute_majority_at_k(predictions=predictions, predicted_answers=predicted_answers) | ||
|
|
||
| # Collect multi-criteria scores for open-ended questions | ||
| for pred in predictions: | ||
| if "judgement" in pred: | ||
| multicriteria = extract_multicriteria_scores(pred["judgement"]) | ||
| for criterion in self.multicriteria_scores: | ||
| self.multicriteria_scores[criterion].append(multicriteria.get(criterion, 3.0)) | ||
|
|
||
| def get_metrics(self): | ||
| """Get computed metrics.""" | ||
| metrics_dict = super().get_metrics() | ||
|
|
||
| for agg_mode, agg_metrics in metrics_dict.items(): | ||
| # Ensure avg_tokens is always present for MMAU-Pro | ||
| # Ensure avg_tokens is present | ||
| if "avg_tokens" not in agg_metrics: | ||
| agg_metrics["avg_tokens"] = 0 | ||
| if "no_answer" in agg_metrics: | ||
| agg_metrics["no_answer"] = agg_metrics["no_answer"] / 2.0 | ||
| # Set success_rate from correct or judge_correct | ||
| if "judge_correct" in agg_metrics: | ||
| agg_metrics["success_rate"] = agg_metrics["judge_correct"] | ||
|
|
||
| # Add multi-criteria averages for open-ended (convert 1-5 scale to percentage) | ||
| if self.multicriteria_scores["overall"]: | ||
| for criterion in self.multicriteria_scores: | ||
| scores = self.multicriteria_scores[criterion] | ||
| if scores: | ||
| # Convert 1-5 scale to 0-100 percentage scale | ||
| avg_score = np.mean(scores) | ||
| std_score = np.std(scores) | ||
| agg_metrics[f"avg_{criterion}"] = (avg_score / 5.0) * 100 | ||
| agg_metrics[f"std_{criterion}"] = (std_score / 5.0) * 100 | ||
|
|
||
| # Set correct and success_rate to avg_correctness for open-ended | ||
| agg_metrics["correct"] = agg_metrics["avg_correctness"] | ||
| agg_metrics["success_rate"] = agg_metrics["avg_correctness"] | ||
|
|
||
| # Calculate good/poor response rates based on overall >= 4 or <= 2 | ||
| overall_scores = self.multicriteria_scores["overall"] | ||
| good_responses = sum(1 for score in overall_scores if score >= 4.0) | ||
| poor_responses = sum(1 for score in overall_scores if score <= 2.0) | ||
|
|
||
| agg_metrics["good_response_rate"] = (good_responses / len(overall_scores)) * 100 | ||
| agg_metrics["poor_response_rate"] = (poor_responses / len(overall_scores)) * 100 | ||
|
|
||
| # For closed-form / instruction-following: use binary correctness | ||
| elif "correct" in agg_metrics: | ||
| agg_metrics["success_rate"] = agg_metrics["correct"] | ||
|
|
||
| # Round all numeric values to 2 decimal places | ||
| for key, value in agg_metrics.items(): | ||
| if isinstance(value, float) and not isinstance(value, bool): | ||
| agg_metrics[key] = round(value, 2) | ||
|
|
||
| return metrics_dict | ||
|
|
||
| def metrics_to_print(self): | ||
|
|
@@ -87,5 +170,20 @@ def metrics_to_print(self): | |
| } | ||
| if self.compute_no_answer: | ||
| base_metrics["no_answer"] = as_percentage | ||
|
|
||
| # Add multi-criteria metrics for open-ended questions (now in percentage format) | ||
| if self.multicriteria_scores["overall"]: | ||
| base_metrics.update( | ||
| { | ||
| "avg_overall": as_percentage, | ||
| "avg_correctness": as_percentage, | ||
| "avg_relevance": as_percentage, | ||
| "avg_completeness": as_percentage, | ||
| "avg_clarity": as_percentage, | ||
| "good_response_rate": as_percentage, | ||
| "poor_response_rate": as_percentage, | ||
| } | ||
| ) | ||
|
|
||
| base_metrics["num_entries"] = as_int | ||
| return base_metrics | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,6 +38,8 @@ | |
| supports_single_eval, | ||
| ) | ||
| from nemo_skills.inference.model import ( | ||
| AudioProcessor, | ||
| AudioProcessorConfig, | ||
| ParallelThinkingConfig, | ||
| get_code_execution_model, | ||
| get_model, | ||
|
|
@@ -186,9 +188,18 @@ class GenerateSolutionsConfig: | |
| # If True, will enable litellm disk cache (useful for keeping intermediate results in case of job timelimit failures) | ||
| enable_litellm_cache: bool = False | ||
|
|
||
| # List of content types to drop from messages (e.g., base64 audio) to keep output files smaller | ||
| drop_content_types: list[str] = field(default_factory=lambda: ["audio_url"]) | ||
|
|
||
| # Audio processing configuration (EXPERIMENTAL) | ||
| # Set to enable audio file preprocessing (file->base64 conversion, chunking for long audio) | ||
| # Example: ++audio.data_dir=/path/to/audio ++audio.enable_chunking=true | ||
| audio: AudioProcessorConfig | None = None | ||
|
|
||
| # Evaluation setup if requested. If eval_type is set to None, evaluation is skipped | ||
| eval_type: str | None = None # "lean4-proof", "math", etc. | ||
| eval_config: dict = field(default_factory=dict) # Config for the evaluator | ||
| dataset_group: str | None = None # "math", "code", "speechlm", etc. from benchmark's DATASET_GROUP | ||
|
|
||
| def __post_init__(self): | ||
| self._post_init_validate_data() | ||
|
|
@@ -393,6 +404,32 @@ def setup_llm(self): | |
| else: | ||
| llm = get_model(**self.cfg.server, tokenizer=self.tokenizer) | ||
|
|
||
| # Audio wrapper (preprocesses messages before they reach the model) | ||
| # Auto-enable for audio benchmarks on vLLM (eval_type=audio OR dataset_group=speechlm) | ||
| should_enable_audio = self.cfg.audio is not None or ( | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd rather configure this through prepare.py of relevant benchmarks. Otherwise this logic is too coupled with datasets but this module is very general. So instead of dataset_group, I'd add a flag should_enable_audio and you can just set it as default parameter in the init of those benchmarks. You can still have a check for vllm to be used as a server and other logic for wrapping, just let's remove the dataset_group based behaviour
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Got your point, when moving to separate class PR #1157 this would be auto-resolved |
||
| self.cfg.server.get("server_type", "").lower() == "vllm" | ||
| and (self.cfg.eval_type == "audio" or self.cfg.dataset_group == "speechlm") | ||
| ) | ||
|
|
||
| if should_enable_audio: | ||
| audio_supported_servers = {"vllm"} | ||
| server_type = self.cfg.server.get("server_type", "").lower() | ||
| if server_type not in audio_supported_servers: | ||
| raise ValueError( | ||
| f"Audio processing is not supported for server_type='{server_type}'. " | ||
| f"Supported server types: {audio_supported_servers}" | ||
| ) | ||
|
|
||
| # Use provided config or create default | ||
| audio_config = self.cfg.audio if self.cfg.audio is not None else AudioProcessorConfig() | ||
|
|
||
| llm = AudioProcessor( | ||
| llm, | ||
| audio_config, | ||
| eval_config=dict(self.cfg.eval_config), | ||
| eval_type=self.cfg.eval_type, | ||
| ) | ||
|
|
||
| if self.cfg.parallel_thinking.mode is not None: | ||
| # We don't want to override these key variables which overlap with self.cfg | ||
| inference_override_config = { | ||
|
|
@@ -519,12 +556,45 @@ def fill_prompt(self, data_point, data): | |
| filled_prompt[-1]["content"] += self.cfg.prompt_suffix | ||
| else: | ||
| filled_prompt += self.cfg.prompt_suffix | ||
|
|
||
| # Copy audio fields from data_point to the user message for audio models | ||
| if isinstance(filled_prompt, list): | ||
| for msg in filled_prompt: | ||
| if msg.get("role") == "user": | ||
| if "audio" in data_point: | ||
| msg["audio"] = data_point["audio"] | ||
| break | ||
|
|
||
| return filled_prompt | ||
|
|
||
| def preprocess_prompt(self, prompt: str | list[dict]) -> str | list[dict]: | ||
| """Preprocess the prompt before sending to the model. | ||
|
|
||
| Override this method to add custom preprocessing logic. | ||
| Audio conversion is handled by the AudioProcessor wrapper. | ||
| """ | ||
| return prompt | ||
|
|
||
| def dump_outputs(self, outputs, data_points, fout): | ||
| for output in outputs: | ||
| fout.write(json.dumps(output) + "\n") | ||
|
|
||
| def drop_binary_data(self, output): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe just drop_data or better drop_fields_from_messages or something like this as it's not limited to binary but just checks what to drop from the parameter?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, why not |
||
| """Remove binary data (like base64 audio) from messages to keep output files smaller.""" | ||
| # Skip if output doesn't have messages (e.g., text completion mode or error cases) | ||
| if "messages" not in output: | ||
| return | ||
|
|
||
| for message in output["messages"]: | ||
| # Skip if content is not a list (e.g., string content in system messages) | ||
| if not isinstance(message.get("content"), list): | ||
| continue | ||
|
|
||
| # Filter out content types specified in drop_content_types config | ||
| message["content"] = [ | ||
| content for content in message["content"] if content.get("type") not in self.cfg.drop_content_types | ||
| ] | ||
|
|
||
| async def postprocess_single_output(self, output, original_data_point): | ||
| # to make it easier to follow up with other generations and limit accidental errors, we are adding | ||
| # all of the original data to the output file alongside the new generations | ||
|
|
@@ -540,6 +610,9 @@ async def postprocess_single_output(self, output, original_data_point): | |
| for key in output: | ||
| original_data_point.pop(key, None) | ||
| output.update(original_data_point) | ||
|
|
||
| self.drop_binary_data(output) | ||
|
|
||
| if self.cfg.parse_reasoning: | ||
| parse_reasoning( | ||
| output, | ||
|
|
@@ -570,13 +643,20 @@ async def process_single_datapoint(self, data_point, all_data): | |
| # Already a dict from Hydra | ||
| inference_params = dict(self.cfg.inference) | ||
|
|
||
| prompt = self.fill_prompt(data_point, all_data) | ||
| prompt = self.preprocess_prompt(prompt) | ||
|
|
||
| generation_params = { | ||
| **inference_params, | ||
| **self.extra_generate_params, | ||
| "prompt": self.fill_prompt(data_point, all_data), | ||
| "prompt": prompt, | ||
| "stop_phrases": [self.cfg.stop_phrase] if self.cfg.stop_phrase else None, | ||
| } | ||
|
|
||
| # Pass task_type for audio chunking | ||
| if isinstance(self.llm, AudioProcessor) and "task_type" in data_point: | ||
| generation_params["task_type"] = data_point["task_type"] | ||
|
|
||
| if self.cfg.code_execution: | ||
| if self.cfg.override_max_code_executions and self.cfg.total_code_executions_in_prompt is not None: | ||
| generation_params["max_code_executions"] = data_point["total_code_executions"] | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is specific to qwen models, I don't think we should be doing it globally here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@melllinia ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Imho, this is not problem for other models
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It certainly introduces a little mismatch, whether it's significant or not for accuracy is unclear, but I think it's much safer not to have it. What's the reason why we need to hardcode it here?