From bbcbf4882b268af221c3b6d10a088797c3b658f5 Mon Sep 17 00:00:00 2001
From: Nathan Habib
Date: Wed, 5 Nov 2025 15:19:59 +0100
Subject: [PATCH 1/3] run all hf-providers

---
 src/lighteval/main_inspect.py | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/src/lighteval/main_inspect.py b/src/lighteval/main_inspect.py
index be7958873..e13471526 100644
--- a/src/lighteval/main_inspect.py
+++ b/src/lighteval/main_inspect.py
@@ -24,6 +24,7 @@
 from collections import defaultdict
 from typing import Literal
 
+import requests
 from huggingface_hub import HfApi
 from inspect_ai import Epochs, Task, task
 from inspect_ai import eval_set as inspect_ai_eval_set
@@ -182,13 +183,31 @@ def _format_metric_cell(data: dict, col: str, metric: str, stderr_metric: str) -
     return "-"
 
 
+def _get_huggingface_providers(model_id: str) -> list[str]:
+    model_id = model_id.replace("hf-inference-providers/", "").replace(":all", "")
+    url = f"https://huggingface.co/api/models/{model_id}"
+    params = {"expand[]": "inferenceProviderMapping"}
+    response = requests.get(url, params=params, timeout=30)
+    response.raise_for_status()  # raise exception for HTTP errors
+    data = response.json()
+    # Extract provider mapping if available
+    providers = data.get("inferenceProviderMapping", {})
+
+    live_providers = []
+    for provider, info in providers.items():
+        if info.get("status") == "live":
+            live_providers.append(provider)
+
+    return live_providers
+
+
 HELP_PANEL_NAME_1 = "Modeling Parameters"
 HELP_PANEL_NAME_2 = "Task Parameters"
 HELP_PANEL_NAME_3 = "Connection and parallelization parameters"
 HELP_PANEL_NAME_4 = "Logging parameters"
 
 
-def eval(
+def eval(  # noqa: C901
     models: Annotated[list[str], Argument(help="Models to evaluate")],
     tasks: Annotated[str, Argument(help="Tasks to evaluate")],
     # model arguments
@@ -404,13 +423,25 @@ def eval(
     else:
         model_args = {}
 
+    # Expand "hf-inference-providers/<model>:all" into one entry per live provider,
+    # accumulating into a new list so the other models passed on the CLI are kept.
+    expanded_models = []
+    for model in models:
+        if model.split("/")[0] == "hf-inference-providers" and model.split(":")[-1] == "all":
+            providers = _get_huggingface_providers(model)
+            expanded_models.extend(f"{model.replace(':all', '')}:{provider}" for provider in providers)
+        else:
+            expanded_models.append(model)
+    models = expanded_models
+
     success, logs = inspect_ai_eval_set(
         inspect_ai_tasks,
         model=models,
         max_connections=max_connections,
         timeout=timeout,
         retry_on_error=retry_on_error,
-        max_retries=max_retries,
+        max_retries=max_retries,  # not counted
+        fail_on_error=True,
         limit=max_samples,
         max_tasks=max_tasks,
         log_dir=log_dir,

From 8c39e758739d2050a03714ca541aa6ac43a07baf Mon Sep 17 00:00:00 2001
From: Nathan Habib
Date: Wed, 5 Nov 2025 15:21:06 +0100
Subject: [PATCH 2/3] add example

---
 docs/source/inspect-ai.mdx | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/docs/source/inspect-ai.mdx b/docs/source/inspect-ai.mdx
index 9cdeb8802..30c53ce4d 100644
--- a/docs/source/inspect-ai.mdx
+++ b/docs/source/inspect-ai.mdx
@@ -40,6 +40,14 @@ lighteval eval \
 "lighteval|gpqa:diamond|0"
 ```
 
+You can also compare all providers serving a model in a single command:
+
+```bash
+lighteval eval \
+    hf-inference-providers/openai/gpt-oss-20b:all \
+    "lighteval|gpqa:diamond|0"
+```
+
 4. Evaluate a vLLM or SGLang model.
 
 ```bash

From 2486b64a29c8107a7ba5688a0d9634549789c100 Mon Sep 17 00:00:00 2001
From: Nathan Habib
Date: Wed, 5 Nov 2025 15:22:11 +0100
Subject: [PATCH 3/3] remove unneeded params

---
 src/lighteval/main_inspect.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/lighteval/main_inspect.py b/src/lighteval/main_inspect.py
index e13471526..0bc50c34f 100644
--- a/src/lighteval/main_inspect.py
+++ b/src/lighteval/main_inspect.py
@@ -440,8 +440,7 @@ def eval(  # noqa: C901
         max_connections=max_connections,
         timeout=timeout,
         retry_on_error=retry_on_error,
-        max_retries=max_retries,  # not counted
-        fail_on_error=True,
+        max_retries=max_retries,
         limit=max_samples,
         max_tasks=max_tasks,
         log_dir=log_dir,