From 770f39a246155f7a86ed22ef3d30b3e8093687dc Mon Sep 17 00:00:00 2001
From: Anuradha Karuppiah
Date: Fri, 2 May 2025 10:09:16 -0700
Subject: [PATCH 1/3] Config option to specify the intermediate step types in
 workflow_output.json

Currently, only TOOL_OUTPUT and LLM_OUTPUT are included in the published
workflow_output.json. But there are use cases where other types such as
TOOL_START may also be needed. This PR provides a config option in
config.yml to allow the user to specify the filter.

Sample config (examples/simple/configs/eval_config.yml):
eval:
  general:
    output:
      dir: ./.tmp/aiq/examples/simple/
      workflow_output_step_filter: [LLM_END, TOOL_START, TOOL_END]

Signed-off-by: Anuradha Karuppiah
---
 src/aiq/data_models/evaluate.py                 |  3 +++
 src/aiq/eval/dataset_handler/dataset_handler.py | 15 ++++++++++-----
 src/aiq/eval/evaluate.py                        |  3 ++-
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/src/aiq/data_models/evaluate.py b/src/aiq/data_models/evaluate.py
index 096446490..9a5cda9ee 100644
--- a/src/aiq/data_models/evaluate.py
+++ b/src/aiq/data_models/evaluate.py
@@ -24,6 +24,7 @@
 from aiq.data_models.dataset_handler import EvalDatasetConfig
 from aiq.data_models.dataset_handler import EvalS3Config
 from aiq.data_models.evaluator import EvaluatorBaseConfig
+from aiq.data_models.intermediate_step import IntermediateStepType
 from aiq.data_models.profiler import ProfilerConfig
 
 
@@ -45,6 +46,8 @@ class EvalOutputConfig(BaseModel):
     s3: EvalS3Config | None = None
     # Whether to cleanup the output directory before running the workflow
     cleanup: bool = True
+    # Filter for the workflow output steps
+    workflow_output_step_filter: list[IntermediateStepType] | None = None
 
 
 class EvalGeneralConfig(BaseModel):
diff --git a/src/aiq/eval/dataset_handler/dataset_handler.py b/src/aiq/eval/dataset_handler/dataset_handler.py
index c28f31ac0..ea2a94dbd 100644
--- a/src/aiq/eval/dataset_handler/dataset_handler.py
+++ b/src/aiq/eval/dataset_handler/dataset_handler.py
@@ -20,6 +20,7 @@
 from aiq.data_models.dataset_handler import EvalDatasetConfig
 from aiq.data_models.dataset_handler import EvalDatasetJsonConfig
 from aiq.data_models.intermediate_step import IntermediateStep
+from aiq.data_models.intermediate_step import IntermediateStepType
 from aiq.eval.dataset_handler.dataset_downloader import DatasetDownloader
 from aiq.eval.dataset_handler.dataset_filter import DatasetFilter
 from aiq.eval.evaluator.evaluator_model import EvalInput
@@ -132,20 +133,24 @@ def get_eval_input_from_dataset(self, dataset: str) -> EvalInput:
         # Convert the DataFrame to a list of EvalInput objects
         return self.get_eval_input_from_df(input_df)
 
-    def filter_intermediate_steps(self, intermediate_steps: list[IntermediateStep]) -> list[dict]:
+    def filter_intermediate_steps(self,
+                                  intermediate_steps: list[IntermediateStep],
+                                  event_filter: list[IntermediateStepType] = None) -> list[dict]:
         """
         Filter out the intermediate steps that are not relevant for evaluation. The output is written with with the
         intention of re-running the evaluation using the original config file.
         """
-        filtered_steps = self.intermediate_step_adapter.filter_intermediate_steps(
-            intermediate_steps, self.intermediate_step_adapter.DEFAULT_EVENT_FILTER)
+        if event_filter is None:
+            event_filter = self.intermediate_step_adapter.DEFAULT_EVENT_FILTER
+        filtered_steps = self.intermediate_step_adapter.filter_intermediate_steps(intermediate_steps, event_filter)
         return self.intermediate_step_adapter.serialize_intermediate_steps(filtered_steps)
 
-    def publish_eval_input(self, eval_input) -> str:
+    def publish_eval_input(self, eval_input, workflow_output_step_filter: list[IntermediateStepType] = None) -> str:
         """
         Convert the EvalInput object to a JSON output for storing in a file.
         Use the orginal keys to allow re-running evaluation using the orignal config file and '--skip_workflow'
         option.
         """
+        indent = 2
         if self.is_structured_input():
             # Extract structured data from EvalInputItems
@@ -154,7 +159,7 @@
                 self.question_key: item.input_obj,
                 self.answer_key: item.expected_output_obj,
                 self.generated_answer_key: item.output_obj,
-                self.trajectory_key: self.filter_intermediate_steps(item.trajectory),
+                self.trajectory_key: self.filter_intermediate_steps(item.trajectory, workflow_output_step_filter),
                 self.expected_trajectory_key: self.filter_intermediate_steps(item.expected_trajectory),
             } for item in eval_input.eval_input_items]
         else:
diff --git a/src/aiq/eval/evaluate.py b/src/aiq/eval/evaluate.py
index 57ab3600d..cfbe7fcbb 100644
--- a/src/aiq/eval/evaluate.py
+++ b/src/aiq/eval/evaluate.py
@@ -177,7 +177,8 @@ def write_output(self, dataset_handler: DatasetHandler):
         workflow_output_file.parent.mkdir(parents=True, exist_ok=True)
 
         # Write the workflow output to a file (this can be used for re-running the evaluation)
-        workflow_output = dataset_handler.publish_eval_input(self.eval_input)
+        workflow_output = dataset_handler.publish_eval_input(
+            self.eval_input, self.eval_config.general.output.workflow_output_step_filter)
         with open(workflow_output_file, "w", encoding="utf-8") as f:
             # set indent to 2 for pretty printing
             f.write(workflow_output)

From 2c98f752bc864c89f39d0cab7177cb198ffdcdcd Mon Sep 17 00:00:00 2001
From: Anuradha Karuppiah
Date: Fri, 2 May 2025 11:30:44 -0700
Subject: [PATCH 2/3] Add documentation.

Signed-off-by: Anuradha Karuppiah
---
 docs/source/concepts/evaluate.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/docs/source/concepts/evaluate.md b/docs/source/concepts/evaluate.md
index 3d54e6a4a..d4920ffcd 100644
--- a/docs/source/concepts/evaluate.md
+++ b/docs/source/concepts/evaluate.md
@@ -370,6 +370,18 @@ The output of the evaluators are stored in distinct files in the same `output_di
 }
 ```
 
+## Workflow Output Intermediate Step Filtering
+The `workflow_output.json` file contains the intermediate steps for each entry in the dataset. The intermediate steps are filtered using the `eval.general.output.workflow_output_step_filter` parameter in the `config.yml` file. The default value for the filter is `[LLM_END, TOOL_END]`. You can customize the filter by providing a list of intermediate step types to include in the output file.
+
+**Example:**
+`examples/simple/configs/eval_config.yml` can be modified to include the intermediate steps in the output by adding the following configuration:
+```yaml
+eval:
+  general:
+    output:
+      workflow_output_step_filter: [LLM_END, TOOL_START, TOOL_END]
+```
+
 ## Customizing the output
 You can customize the output of the pipeline by providing custom scripts.
 One or more Python scripts can be provided in the `eval.general.output_scripts` section of the `config.yml` file.
From b382caf38c864c89f39d0cab7177cb198ffdcdcd Mon Sep 17 00:00:00 2001
From: Anuradha Karuppiah
Date: Fri, 2 May 2025 11:53:11 -0700
Subject: [PATCH 3/3] Add a unit test

Signed-off-by: Anuradha Karuppiah
---
 .../dataset_handler/test_dataset_handler.py   | 46 +++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/tests/aiq/eval/dataset_handler/test_dataset_handler.py b/tests/aiq/eval/dataset_handler/test_dataset_handler.py
index 6f59dfb0b..3f4bbc9ca 100644
--- a/tests/aiq/eval/dataset_handler/test_dataset_handler.py
+++ b/tests/aiq/eval/dataset_handler/test_dataset_handler.py
@@ -18,6 +18,9 @@
 from aiq.data_models.dataset_handler import EvalDatasetJsonConfig
 from aiq.data_models.dataset_handler import EvalDatasetStructureConfig
+from aiq.data_models.intermediate_step import IntermediateStep
+from aiq.data_models.intermediate_step import IntermediateStepPayload
+from aiq.data_models.intermediate_step import IntermediateStepType
 from aiq.eval.dataset_handler.dataset_handler import DatasetHandler
 from aiq.eval.evaluator.evaluator_model import EvalInput
 
 
@@ -232,3 +235,46 @@ def test_setup_reps(dataset_handler, mock_input_df, dataset_id_key):
 
     assert len(replicated_df) == len(mock_input_df) * dataset_handler.reps, "Dataset should be replicated correctly"
     assert all("_rep" in str(i) for i in replicated_df[dataset_id_key]), "IDs should be suffixed with `_repX`"
+
+
+@pytest.fixture
+def mock_intermediate_steps():
+    """Create a list of mock intermediate steps with different event types."""
+    steps = []
+    # Add LLM_START step
+    steps.append(
+        IntermediateStep(payload=IntermediateStepPayload(event_type=IntermediateStepType.LLM_START, name="llm_start")))
+    # Add LLM_END step
+    steps.append(
+        IntermediateStep(payload=IntermediateStepPayload(event_type=IntermediateStepType.LLM_END, name="llm_end")))
+    # Add TOOL_START step
+    steps.append(
+        IntermediateStep(
+            payload=IntermediateStepPayload(event_type=IntermediateStepType.TOOL_START, name="tool_start")))
+    # Add TOOL_END step
+    steps.append(
+        IntermediateStep(payload=IntermediateStepPayload(event_type=IntermediateStepType.TOOL_END, name="tool_end")))
+    return steps
+
+
+def test_filter_intermediate_steps(dataset_handler, mock_intermediate_steps):
+    """Test that filter_intermediate_steps correctly filters steps based on event types."""
+    # Define the filter to include only LLM_END, TOOL_START, and TOOL_END
+    event_filter = [IntermediateStepType.LLM_END, IntermediateStepType.TOOL_START, IntermediateStepType.TOOL_END]
+
+    # Get the filtered steps
+    filtered_steps = dataset_handler.filter_intermediate_steps(mock_intermediate_steps, event_filter)
+
+    # Verify that only the specified event types are included (LLM_START is filtered out)
+    event_types = [step["payload"]["event_type"] for step in filtered_steps]
+    assert IntermediateStepType.LLM_START not in event_types, "LLM_START should be filtered out"
+    assert IntermediateStepType.LLM_END in event_types, "LLM_END should be included"
+    assert IntermediateStepType.TOOL_START in event_types, "TOOL_START should be included"
+    assert IntermediateStepType.TOOL_END in event_types, "TOOL_END should be included"
+
+    # Verify the order of steps is preserved
+    assert len(filtered_steps) == 3, "Should have exactly 3 steps after filtering"
+    assert filtered_steps[0]["payload"]["event_type"] == IntermediateStepType.LLM_END, "First step should be LLM_END"
+    assert filtered_steps[1]["payload"]["event_type"] == IntermediateStepType.TOOL_START, \
+        "Second step should be TOOL_START"
+    assert filtered_steps[2]["payload"]["event_type"] == IntermediateStepType.TOOL_END, "Third step should be TOOL_END"