Skip to content
24 changes: 19 additions & 5 deletions nemo_skills/evaluation/evaluator/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@
eval_livebench_coding,
eval_livecodebench_pro,
)
from nemo_skills.evaluation.evaluator.compute_eval import ComputeEvalEvaluator

try:
Comment thread
Jorjeous marked this conversation as resolved.
from nemo_skills.evaluation.evaluator.compute_eval import ComputeEvalEvaluator
except ImportError:
ComputeEvalEvaluator = None
from nemo_skills.evaluation.evaluator.icpc import ICPCEvaluator
from nemo_skills.evaluation.evaluator.ifbench import eval_ifbench
from nemo_skills.evaluation.evaluator.ifeval import eval_if
Expand All @@ -40,7 +44,12 @@
from nemo_skills.evaluation.evaluator.mcq import eval_mcq
from nemo_skills.evaluation.evaluator.mmau_pro import eval_mmau_pro
from nemo_skills.evaluation.evaluator.mrcr import eval_mrcr
from nemo_skills.evaluation.evaluator.ruler import eval_ruler, eval_ruler2

try:
from nemo_skills.evaluation.evaluator.ruler import eval_ruler, eval_ruler2
except ImportError:
eval_ruler = None
eval_ruler2 = None
from nemo_skills.evaluation.evaluator.scicode import eval_scicode

EVALUATOR_MAP = {
Expand All @@ -50,8 +59,6 @@
"ifbench": eval_ifbench,
"bfcl": eval_bfcl,
"multichoice": eval_mcq,
"ruler": eval_ruler,
"ruler2": eval_ruler2,
"livecodebench": eval_livecodebench,
"livebench_coding": eval_livebench_coding,
"livecodebench_pro": eval_livecodebench_pro,
Expand All @@ -61,6 +68,11 @@
"human_eval_infilling": eval_human_eval_infilling,
"mmau-pro": eval_mmau_pro,
}
# Optional evaluators (require additional dependencies)
Comment thread
Jorjeous marked this conversation as resolved.
# Register the ruler evaluators only when they are available: each name is
# None when importing the ruler module raised ImportError, and such entries
# are left out of the map.
EVALUATOR_MAP.update(
    {
        eval_name: eval_fn
        for eval_name, eval_fn in (("ruler", eval_ruler), ("ruler2", eval_ruler2))
        if eval_fn is not None
    }
)

# Evaluator class mapping, other evaluators can be added here as they're converted to classes
EVALUATOR_CLASS_MAP = {
Expand All @@ -71,8 +83,10 @@
"icpc": ICPCEvaluator,
"audio": AudioEvaluator,
"bird": BirdEvaluator,
"compute-eval": ComputeEvalEvaluator,
}
# Optional evaluators (require additional dependencies)
Comment thread
Jorjeous marked this conversation as resolved.
# ComputeEvalEvaluator is None when its import raised ImportError; only
# expose the "compute-eval" entry when the class is actually available.
if ComputeEvalEvaluator is not None:
    EVALUATOR_CLASS_MAP.update({"compute-eval": ComputeEvalEvaluator})

# Validation: Ensure no overlap between class and function maps
_class_types = set(EVALUATOR_CLASS_MAP.keys())
Expand Down
22 changes: 17 additions & 5 deletions nemo_skills/inference/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,13 +207,15 @@ class GenerationTaskConfig:
enable_litellm_cache: bool = False

# List of content types to drop from messages (e.g., base64 audio) to keep output files smaller
drop_content_types: list[str] = field(default_factory=lambda: ["audio_url"])
drop_content_types: list[str] = field(default_factory=lambda: ["input_audio"])

# Audio configuration - set by benchmarks that need audio processing (mmau-pro, audiobench, etc.)
enable_audio: bool = False # Enable audio preprocessing (set by benchmark configs)
enable_audio_chunking: bool = True
audio_chunk_task_types: list[str] | None = None # If None, chunk all task types; if specified, only chunk these
chunk_audio_threshold_sec: int = 30 # Duration in seconds for each audio chunk
# Audio format for API requests (input_audio only)
audio_format: str = "input_audio"

# Evaluation setup if requested. If eval_type is set to None, evaluation is skipped
eval_type: str | None = None # "lean4-proof", "math", etc.
Expand Down Expand Up @@ -432,11 +434,11 @@ def setup_llm(self):

# Build server config, potentially switching to vllm_multimodal for audio tasks
server_config = dict(self.cfg.server)
if needs_audio and server_config.get("server_type") not in ["vllm", "vllm_multimodal"]:
if needs_audio and server_config.get("server_type") not in ["vllm", "vllm_multimodal", "api_multimodal"]:
LOG.warning(

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated the warning, as it was misleading.

f"enable_audio is set but server_type is '{server_config.get('server_type')}'. "
"Audio processing is only supported for vllm_multimodal server types. "
"Audio will not be processed."
f"Audio enabled with server_type='{server_config.get('server_type')}'. "
"Advanced audio preprocessing (base64 encoding, chunking) is only available for "
"vllm_multimodal and api_multimodal. Server will receive raw audio paths."
)
if needs_audio and server_config.get("server_type") in [
"vllm",
Expand All @@ -453,6 +455,16 @@ def setup_llm(self):
"chunk_audio_threshold_sec": self.cfg.chunk_audio_threshold_sec,
}
)
if needs_audio and server_config.get("server_type") == "api_multimodal":
# Pass audio config to api_multimodal (has built-in audio processing)
server_config.update(
{
"enable_audio_chunking": self.cfg.enable_audio_chunking,
"audio_chunk_task_types": self.cfg.audio_chunk_task_types,
"chunk_audio_threshold_sec": self.cfg.chunk_audio_threshold_sec,
"audio_format": self.cfg.audio_format,
}
)

if self.cfg.code_execution:
llm = get_code_execution_model(
Expand Down
3 changes: 3 additions & 0 deletions nemo_skills/inference/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
from nemo_skills.mcp.utils import locate
from nemo_skills.utils import python_doc_to_cmd_help

from .api_multimodal import APIMultimodal

# NIM models (speech)
from .asr_nim import ASRNIMModel

Expand Down Expand Up @@ -65,6 +67,7 @@
"sglang": SGLangModel,
"tts_nim": TTSNIMModel,
"asr_nim": ASRNIMModel,
"api_multimodal": APIMultimodal,
}


Expand Down
Loading
Loading