diff --git a/README.md b/README.md
index ba5f698b8..d5200f557 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ Documentation
-
+Open Benchmark Index

@@ -44,7 +44,7 @@ sample-by-sample results* to debug and see how your models stack-up.
 
 Lighteval supports **1000+ evaluation tasks** across multiple domains and
 languages. Use [this
-space](https://huggingface.co/spaces/SaylorTwift/benchmark_finder) to find what
+space](https://huggingface.co/spaces/OpenEvals/open_benchmark_index) to find what
 you need, or, here's an overview of some *popular benchmarks*:
@@ -107,6 +107,7 @@ huggingface-cli login
 
 Lighteval offers the following entry points for model evaluation:
 
+- `lighteval eval`: Evaluate models using [inspect-ai](https://inspect.aisi.org.uk/) as a backend (preferred).
 - `lighteval accelerate`: Evaluate models on CPU or one or more GPUs using [🤗
   Accelerate](https://github.com/huggingface/accelerate)
 - `lighteval nanotron`: Evaluate models in distributed settings using [⚡️
@@ -126,9 +127,7 @@ Did not find what you need ? You can always make your custom model API by follow
 Here's a **quick command** to evaluate using the *Accelerate backend*:
 
 ```shell
-lighteval accelerate \
-    "model_name=gpt2" \
-    "leaderboard|truthfulqa:mc|0"
+lighteval eval "hf-inference-providers/openai/gpt-oss-20b" "lighteval|gpqa:diamond|0"
 ```
 
 Or use the **Python API** to run a model *already loaded in memory*!
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index d3b9c9d9b..d3c33cdab 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -7,6 +7,8 @@
     title: Quicktour
   title: Getting started
 - sections:
+  - local: inspect-ai
+    title: Examples using Inspect-AI
   - local: saving-and-reading-results
     title: Save and read results
   - local: caching
diff --git a/docs/source/available-tasks.mdx b/docs/source/available-tasks.mdx
index 450b7ed49..57605577a 100644
--- a/docs/source/available-tasks.mdx
+++ b/docs/source/available-tasks.mdx
@@ -1,6 +1,8 @@
+# Available tasks
+Browse and inspect tasks available in LightEval.
-### Save Results
+#### Run your benchmark and push details to the Hub
 
 ```bash
-# Save locally
-lighteval accelerate \
-    "model_name=openai-community/gpt2" \
-    "leaderboard|truthfulqa:mc|0" \
-    --output-dir ./results
-
-# Push to Hugging Face Hub
-lighteval accelerate \
-    "model_name=openai-community/gpt2" \
-    "leaderboard|truthfulqa:mc|0" \
-    --push-to-hub \
-    --results-org your-username
+lighteval eval "hf-inference-providers/openai/gpt-oss-20b" \
+    "lighteval|gpqa:diamond|0" \
+    --bundle-dir gpt-oss-bundle \
+    --repo-id OpenEvals/evals
 ```
+
+Resulting Space:
+
diff --git a/docs/source/inspect-ai.mdx b/docs/source/inspect-ai.mdx
new file mode 100644
index 000000000..9cdeb8802
--- /dev/null
+++ b/docs/source/inspect-ai.mdx
@@ -0,0 +1,120 @@
+# Evaluate your model with Inspect-AI
+
+Pick the right benchmarks with our benchmark finder:
+Search by language, task type, dataset name, or keywords.
+
+> [!WARNING]
+> Not all tasks are compatible with inspect-ai's API yet; we are working on converting all of them!
+
+Once you've chosen a benchmark, run it with `lighteval eval`. Below are examples for common setups.
+
+### Examples
+
+1. Evaluate a model via Hugging Face Inference Providers.
+
+```bash
+lighteval eval "hf-inference-providers/openai/gpt-oss-20b" "lighteval|gpqa:diamond|0"
+```
+
+2. Run multiple evals at the same time.
+
+```bash
+lighteval eval "hf-inference-providers/openai/gpt-oss-20b" "lighteval|gpqa:diamond|0,lighteval|aime25|0"
+```
+
+3. Compare providers for the same model.
+
+```bash
+lighteval eval \
+    hf-inference-providers/openai/gpt-oss-20b:fireworks-ai \
+    hf-inference-providers/openai/gpt-oss-20b:together \
+    hf-inference-providers/openai/gpt-oss-20b:nebius \
+    "lighteval|gpqa:diamond|0"
+```
+
+4. Evaluate a vLLM or SGLang model.
+
+```bash
+lighteval eval vllm/HuggingFaceTB/SmolLM-135M-Instruct "lighteval|gpqa:diamond|0"
+```
+
+5. See the impact of few-shot examples on your model.
+
+```bash
+lighteval eval hf-inference-providers/openai/gpt-oss-20b "lighteval|gsm8k|0,lighteval|gsm8k|5"
+```
+
+6. Optimize custom server connections.
+
+```bash
+lighteval eval hf-inference-providers/openai/gpt-oss-20b "lighteval|gsm8k|0" \
+    --max-connections 50 \
+    --timeout 30 \
+    --retry-on-error 1 \
+    --max-retries 1 \
+    --max-samples 10
+```
+
+7. Use multiple epochs for more reliable results.
+
+```bash
+lighteval eval hf-inference-providers/openai/gpt-oss-20b "lighteval|aime25|0" --epochs 16 --epochs-reducer "pass_at_4"
+```
+
+8. Push to the Hub to share results.
+
+```bash
+lighteval eval hf-inference-providers/openai/gpt-oss-20b "lighteval|hle|0" \
+    --bundle-dir gpt-oss-bundle \
+    --repo-id OpenEvals/evals \
+    --max-samples 100
+```
+
+Resulting Space:
+
+9. Change model behaviour.
+
+You can use any argument defined in inspect-ai's API.
+
+```bash
+lighteval eval hf-inference-providers/openai/gpt-oss-20b "lighteval|aime25|0" --temperature 0.1
+```
+
+10. Use `--model-args` to pass any provider-specific argument.
+
+```bash
+lighteval eval google/gemini-2.5-pro "lighteval|aime25|0" --model-args location=us-east5
+```
+
+```bash
+lighteval eval openai/gpt-4o "lighteval|gpqa:diamond|0" --model-args service_tier=flex,client_timeout=1200
+```
+
+LightEval prints a per-model results table:
+
+```
+Completed all tasks in 'lighteval-logs' successfully
+
+| Model                                 |gpqa|gpqa:diamond|
+|---------------------------------------|---:|-----------:|
+|vllm/HuggingFaceTB/SmolLM-135M-Instruct|0.01|        0.01|
+
+results saved to lighteval-logs
+run "inspect view --log-dir lighteval-logs" to view the results
+```
diff --git a/docs/source/quicktour.mdx b/docs/source/quicktour.mdx
index e22ed3223..a8cf504b5 100644
--- a/docs/source/quicktour.mdx
+++ b/docs/source/quicktour.mdx
@@ -11,7 +11,7 @@ Lighteval can be used with several different commands, each optimized for differ
 
 ## Find your benchmark