Merged
Changes from 39 commits
53 commits
68c0aa2
Add a ping option to aiq info mcp
AnuradhaKaruppiah Aug 5, 2025
4c4b001
Add a note in the README for "aiq info mcp ping"
AnuradhaKaruppiah Aug 5, 2025
1e5c7bc
Simplify error handling
AnuradhaKaruppiah Aug 5, 2025
08f557d
Apply user specified timeout to the ping
AnuradhaKaruppiah Aug 5, 2025
525b1c1
Make the timeout int every where
AnuradhaKaruppiah Aug 5, 2025
02e4619
Minor comments cleanup
AnuradhaKaruppiah Aug 5, 2025
13103c4
Simplify output printing
AnuradhaKaruppiah Aug 5, 2025
9d5cae1
Move all anyio calls to asyncio for consistency
AnuradhaKaruppiah Aug 5, 2025
3693af6
Remove unnecessary imports
AnuradhaKaruppiah Aug 5, 2025
fcad99a
Add a temporary health check route
AnuradhaKaruppiah Aug 6, 2025
f04584c
Drop unnecessary ping status
AnuradhaKaruppiah Aug 6, 2025
9b0f768
Add defaults to the description strings for CLI help
AnuradhaKaruppiah Aug 6, 2025
e6e0dd2
WIP
AnuradhaKaruppiah Aug 6, 2025
3e9c347
Provide a MCP front-end plugin worker that can be used for customization
AnuradhaKaruppiah Aug 6, 2025
7b374c6
Unit tests for custom MCP routes
AnuradhaKaruppiah Aug 6, 2025
f727aff
Add doc for the health route
AnuradhaKaruppiah Aug 6, 2025
1020f9a
Merge remote-tracking branch 'upstream/develop' into mcp-health-check
AnuradhaKaruppiah Aug 7, 2025
d9c4154
Merge remote-tracking branch 'upstream/develop' into mcp-health-check
AnuradhaKaruppiah Aug 8, 2025
459287c
Minor fixup
AnuradhaKaruppiah Aug 8, 2025
267e8d3
Fix unit test failures
AnuradhaKaruppiah Aug 8, 2025
3416bbb
Merge remote-tracking branch 'upstream/develop' into mcp-health-check
AnuradhaKaruppiah Aug 9, 2025
29a7ab4
Fix test link failures
AnuradhaKaruppiah Aug 9, 2025
9f1fe9f
More pylint fixes
AnuradhaKaruppiah Aug 9, 2025
1038b59
Fix pylint errors
AnuradhaKaruppiah Aug 9, 2025
5bf3ea9
Fix imcorrect import
AnuradhaKaruppiah Aug 9, 2025
369b3dc
More pyline fixes
AnuradhaKaruppiah Aug 9, 2025
184d679
Unit tests fixup
AnuradhaKaruppiah Aug 9, 2025
3afc321
Add support for custom dataset parser
AnuradhaKaruppiah Aug 10, 2025
f9baf33
Merge remote-tracking branch 'upstream/develop' into ak-customize-inp…
AnuradhaKaruppiah Aug 10, 2025
4e0879b
Update evaluation_and_profiling simple_calculator for custom datasets
AnuradhaKaruppiah Aug 11, 2025
b110742
example fixups
AnuradhaKaruppiah Aug 11, 2025
5f6a1ba
Update examples/evaluation_and_profiling/simple_calculator_eval/src/a…
AnuradhaKaruppiah Aug 11, 2025
71efd34
Update docs/source/reference/evaluate.md
AnuradhaKaruppiah Aug 11, 2025
bf545bd
Update docs/source/reference/evaluate.md
AnuradhaKaruppiah Aug 11, 2025
68976fb
Remove dup cpyright header
AnuradhaKaruppiah Aug 11, 2025
f7dffa4
Update src/aiq/eval/dataset_handler/dataset_handler.py
AnuradhaKaruppiah Aug 11, 2025
229bc0c
Limit the fields that needed to be filled by the custom dataset parser
AnuradhaKaruppiah Aug 11, 2025
d6bd553
Update docs/source/reference/evaluate.md
AnuradhaKaruppiah Aug 11, 2025
583f1bb
Update examples/evaluation_and_profiling/simple_calculator_eval/src/a…
AnuradhaKaruppiah Aug 11, 2025
eb21a09
Add unit tests
AnuradhaKaruppiah Aug 12, 2025
b5fdc4e
Update in response to review comments
AnuradhaKaruppiah Aug 12, 2025
f3c5dc6
Docs updates
AnuradhaKaruppiah Aug 12, 2025
de86306
Fix CI checks
AnuradhaKaruppiah Aug 12, 2025
f5cbee9
More CI check failures
AnuradhaKaruppiah Aug 12, 2025
c76bade
Fix path CI checks
AnuradhaKaruppiah Aug 12, 2025
685c96a
More CI fixes
AnuradhaKaruppiah Aug 12, 2025
d44310d
Revert "More CI fixes"
AnuradhaKaruppiah Aug 12, 2025
11a654e
Merge remote-tracking branch 'upstream/develop' into ak-customize-inp…
AnuradhaKaruppiah Aug 13, 2025
6c69b68
aiq to nat updates
AnuradhaKaruppiah Aug 14, 2025
2ad28c3
aiq to nat renaming
AnuradhaKaruppiah Aug 14, 2025
4aa0915
More nat renaming fixes
AnuradhaKaruppiah Aug 14, 2025
a9a9f76
Update symlinks
AnuradhaKaruppiah Aug 14, 2025
c1ee3e6
Merge remote-tracking branch 'upstream/develop' into ak-customize-inp…
AnuradhaKaruppiah Aug 14, 2025
Empty file.
36 changes: 36 additions & 0 deletions docs/source/reference/evaluate.md
Original file line number Diff line number Diff line change
@@ -138,6 +138,42 @@ eval:
- sympy__sympy-21055
```

### Custom Dataset Format
You can use a dataset in a custom format by providing a custom dataset parser function.

**Example:**
`examples/evaluation_and_profiling/simple_calculator_eval/configs/config-custom-dataset-format.yml`:
```yaml
eval:
general:
dataset:
_type: custom
file_path: examples/evaluation_and_profiling/simple_calculator_eval/data/simple_calculator_nested.json
function: aiq_simple_calculator_eval.scripts.custom_dataset_parser.extract_nested_questions
kwargs:
difficulty: "medium"
max_rows: 5
```
This example configuration uses a custom dataset parser function to extract the nested questions from the example dataset, filter them by difficulty, and return only the first five. The example dataset `simple_calculator_nested.json` is a nested JSON file containing questions and answers.

The custom dataset parser is a Python function that takes the dataset `file_path` and optional `kwargs`, and returns an `EvalInput` object. The signature of the sample parser function is:
```python
def extract_nested_questions(file_path: Path, difficulty: str | None = None, max_rows: int | None = None) -> EvalInput:
```

{py:class}`~aiq.eval.evaluator.evaluator_model.EvalInput` is a Pydantic model that contains a list of `EvalInputItem` objects.
{py:class}`~aiq.eval.evaluator.evaluator_model.EvalInputItem` is a Pydantic model that contains the fields for an item in the dataset.
The custom dataset parser function should fill the following fields in each `EvalInputItem` object:
- `id`: The unique identifier of the item, of type `str` or `int`. Every item in the dataset must have a unique id.
- `input_obj`: The question.
- `expected_output_obj`: The ground-truth answer.
- `full_dataset_entry`: The entire dataset entry; it is passed as-is to the evaluator.

Run the evaluation with the following command:
```bash
aiq eval --config_file=examples/evaluation_and_profiling/simple_calculator_eval/configs/config-custom-dataset-format.yml
```
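The `function` value in the config is resolved as a standard dotted import path. A self-contained sketch of that resolution logic (mirroring the `_load_custom_function` helper this PR adds to `src/aiq/data_models/dataset_handler.py`):

```python
import importlib


def load_custom_function(path: str):
    """Resolve 'module.path.function_name' to a callable."""
    # Split off the final attribute name; everything before it is the module path
    module_path, function_name = path.rsplit(".", 1)
    module = importlib.import_module(module_path)
    fn = getattr(module, function_name, None)
    if fn is None:
        raise AttributeError(f"Function '{function_name}' not found in module '{module_path}'")
    if not callable(fn):
        raise ValueError(f"'{path}' is not callable")
    return fn
```

For example, `load_custom_function("json.loads")` returns the standard library's `json.loads`; the parser module must therefore be importable (installed or on `PYTHONPATH`) when `aiq eval` runs.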

## NeMo Agent Toolkit Built-in Evaluators
NeMo Agent toolkit provides the following built-in evaluators:
- `ragas` - An evaluator to run and evaluate RAG-like workflows using the public RAGAS API.
@@ -2,9 +2,6 @@
build-backend = "setuptools.build_meta"
requires = ["setuptools >= 64", "setuptools-scm>=8"]

[tool.setuptools]
packages = []

[tool.setuptools_scm]
root = "../../.."

@@ -0,0 +1,14 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -0,0 +1,85 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

general:
use_uvloop: true

functions:
calculator_multiply:
_type: calculator_multiply
calculator_inequality:
_type: calculator_inequality
calculator_divide:
_type: aiq_simple_calculator/calculator_divide
current_datetime:
_type: current_datetime
calculator_subtract:
_type: calculator_subtract

llms:
nim_llm:
_type: nim
model_name: meta/llama-3.1-70b-instruct
temperature: 0.2
max_tokens: 2048
eval_llm:
_type: nim
model_name: mistralai/mixtral-8x22b-instruct-v0.1
temperature: 0.0
max_tokens: 1024

workflow:
_type: react_agent
tool_names:
- calculator_multiply
- calculator_inequality
- current_datetime
- calculator_divide
- calculator_subtract
llm_name: nim_llm
verbose: true
retry_agent_response_parsing_errors: true
parse_agent_response_max_retries: 3


eval:
general:
output_dir: .tmp/aiq/examples/simple_calculator/eval
dataset:
_type: custom
file_path: examples/evaluation_and_profiling/simple_calculator_eval/data/simple_calculator_nested.json
function: aiq_simple_calculator_eval.scripts.custom_dataset_parser.extract_nested_questions
kwargs:
difficulty: "medium"
max_rows: 5

evaluators:
tuneable_eval:
_type: tunable_rag_evaluator
llm_name: eval_llm
default_scoring: true
default_score_weights:
coverage: 0.5
correctness: 0.3
relevance: 0.2
judge_llm_prompt: >
You are an intelligent evaluator that scores the generated answer based on the description of the expected answer.
The score is a measure of how well the generated answer matches the description of the expected answer based on the question.
Take into account the question, the relevance of the answer to the question and the quality compared to the description of the expected answer.

Rules:
- The score must be a float of any value between 0.0 and 1.0 on a sliding scale.
- The reasoning string must be concise and to the point. It should be 1 sentence and 2 only if extra description is needed. It must explain why the score was given and what is different between the generated answer and the expected answer.
- The tags <image> and <chart> are real images and charts.
Git LFS file not shown
@@ -0,0 +1,14 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -0,0 +1,80 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from pathlib import Path

from aiq.eval.evaluator.evaluator_model import EvalInput
from aiq.eval.evaluator.evaluator_model import EvalInputItem


def extract_nested_questions(file_path: Path, difficulty: str | None = None, max_rows: int | None = None) -> EvalInput:
"""
This is a sample custom dataset parser that:
1. Loads a nested JSON file
2. Extracts the questions array from the nested structure
3. Applies optional filtering by difficulty (hard, medium, easy)
4. Applies an optional maximum number of questions to return
5. Creates an EvalInput object with the extracted questions and returns it

Expects JSON format:
{
"metadata": {...},
"configuration": {...},
"questions": [
{"id": 1, "question": "...", "answer": "...", "category": "...", "difficulty": "...", ...},
...
]
}

Args:
file_path: Path to the nested JSON file
difficulty: Optional difficulty to filter questions by
max_rows: Optional maximum number of questions to return

Returns:
EvalInput object containing the extracted questions
"""

# Load the nested JSON
with open(file_path, 'r') as f:
data = json.load(f)

# Extract questions array from the nested structure
questions = data.get('questions', [])

# Apply filtering if specified
if difficulty:
filtered_questions = []
for question in questions:
# Keep only questions whose difficulty matches the requested level (hard, medium, easy)
if question.get('difficulty', '').lower() == difficulty.lower():
filtered_questions.append(question)
questions = filtered_questions

# Apply max_rows limit if specified
if max_rows and max_rows > 0:
questions = questions[:max_rows]

eval_items = []

for item in questions:
eval_item = EvalInputItem(id=item['id'],
input_obj=item['question'],
expected_output_obj=item['answer'],
full_dataset_entry=item)
eval_items.append(eval_item)

return EvalInput(eval_input_items=eval_items)
62 changes: 54 additions & 8 deletions src/aiq/data_models/dataset_handler.py
@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import json
import typing
from collections.abc import Callable
@@ -112,12 +113,57 @@ def parser() -> tuple[Callable, dict]:
return pd.read_excel, {"engine": "openpyxl"}


class EvalDatasetCustomConfig(EvalDatasetBaseConfig, name="custom"):
"""
Configuration for custom dataset type that allows users to specify
a custom Python function to transform their dataset into EvalInput format.
"""

function: str # Direct import path to function, format: "module.path.function_name"
kwargs: dict[str, typing.Any] = {} # Additional arguments to pass to the custom function

def parser(self) -> tuple[Callable, dict]:
"""
Load and return the custom function for dataset transformation.

Returns:
Tuple of (custom_function, kwargs) where custom_function transforms
a dataset file into an EvalInput object.
"""
custom_function = self._load_custom_function()
return custom_function, self.kwargs

def _load_custom_function(self) -> Callable:
"""
Import and return the custom function using standard Python import path.
"""
if not self.function:
raise ValueError("Function path cannot be empty")

# Split the function path to get module and function name
module_path, function_name = self.function.rsplit(".", 1)

# Import the module
module = importlib.import_module(module_path)

# Get the function from the module
if not hasattr(module, function_name):
raise AttributeError(f"Function '{function_name}' not found in module '{module_path}'")

custom_function = getattr(module, function_name)

if not callable(custom_function):
raise ValueError(f"'{self.function}' is not callable")

return custom_function


# Union model with discriminator
EvalDatasetConfig = typing.Annotated[typing.Annotated[EvalDatasetJsonConfig, Tag(EvalDatasetJsonConfig.static_type())]
| typing.Annotated[EvalDatasetCsvConfig, Tag(EvalDatasetCsvConfig.static_type())]
| typing.Annotated[EvalDatasetXlsConfig, Tag(EvalDatasetXlsConfig.static_type())]
| typing.Annotated[EvalDatasetParquetConfig,
Tag(EvalDatasetParquetConfig.static_type())]
| typing.Annotated[EvalDatasetJsonlConfig,
Tag(EvalDatasetJsonlConfig.static_type())],
Discriminator(TypedBaseModel.discriminator)]
EvalDatasetConfig = typing.Annotated[
typing.Annotated[EvalDatasetJsonConfig, Tag(EvalDatasetJsonConfig.static_type())]
| typing.Annotated[EvalDatasetCsvConfig, Tag(EvalDatasetCsvConfig.static_type())]
| typing.Annotated[EvalDatasetXlsConfig, Tag(EvalDatasetXlsConfig.static_type())]
| typing.Annotated[EvalDatasetParquetConfig, Tag(EvalDatasetParquetConfig.static_type())]
| typing.Annotated[EvalDatasetJsonlConfig, Tag(EvalDatasetJsonlConfig.static_type())]
| typing.Annotated[EvalDatasetCustomConfig, Tag(EvalDatasetCustomConfig.static_type())],
Discriminator(TypedBaseModel.discriminator)]