From 770f39a246155f7a86ed22ef3d30b3e8093687dc Mon Sep 17 00:00:00 2001
From: Anuradha Karuppiah
Date: Fri, 2 May 2025 10:09:16 -0700
Subject: [PATCH 1/3] Config option to specify the intermediate step types in
 workflow_output.json

Currently, only TOOL_OUTPUT and LLM_OUTPUT are included in the published
workflow_output.json. But there are use cases where other types such as
TOOL_START may also be needed. This PR provides a config option in
config.yml to allow the user to specify the filter.

Sample config (examples/simple/configs/eval_config.yml):
eval:
  general:
    output:
      dir: ./.tmp/aiq/examples/simple/
      workflow_output_step_filter: [LLM_END, TOOL_START, TOOL_END]

Signed-off-by: Anuradha Karuppiah
---
 src/aiq/data_models/evaluate.py                 |  3 +++
 src/aiq/eval/dataset_handler/dataset_handler.py | 15 ++++++++++-----
 src/aiq/eval/evaluate.py                        |  3 ++-
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/src/aiq/data_models/evaluate.py b/src/aiq/data_models/evaluate.py
index 096446490..9a5cda9ee 100644
--- a/src/aiq/data_models/evaluate.py
+++ b/src/aiq/data_models/evaluate.py
@@ -24,6 +24,7 @@
 from aiq.data_models.dataset_handler import EvalDatasetConfig
 from aiq.data_models.dataset_handler import EvalS3Config
 from aiq.data_models.evaluator import EvaluatorBaseConfig
+from aiq.data_models.intermediate_step import IntermediateStepType
 from aiq.data_models.profiler import ProfilerConfig
 
 
@@ -45,6 +46,8 @@ class EvalOutputConfig(BaseModel):
     s3: EvalS3Config | None = None
     # Whether to cleanup the output directory before running the workflow
     cleanup: bool = True
+    # Filter for the workflow output steps
+    workflow_output_step_filter: list[IntermediateStepType] | None = None
 
 
 class EvalGeneralConfig(BaseModel):
diff --git a/src/aiq/eval/dataset_handler/dataset_handler.py b/src/aiq/eval/dataset_handler/dataset_handler.py
index c28f31ac0..ea2a94dbd 100644
--- a/src/aiq/eval/dataset_handler/dataset_handler.py
+++ b/src/aiq/eval/dataset_handler/dataset_handler.py
@@ -20,6 +20,7 @@
 from aiq.data_models.dataset_handler import EvalDatasetConfig
 from aiq.data_models.dataset_handler import EvalDatasetJsonConfig
 from aiq.data_models.intermediate_step import IntermediateStep
+from aiq.data_models.intermediate_step import IntermediateStepType
 from aiq.eval.dataset_handler.dataset_downloader import DatasetDownloader
 from aiq.eval.dataset_handler.dataset_filter import DatasetFilter
 from aiq.eval.evaluator.evaluator_model import EvalInput
@@ -132,20 +133,24 @@ def get_eval_input_from_dataset(self, dataset: str) -> EvalInput:
         # Convert the DataFrame to a list of EvalInput objects
         return self.get_eval_input_from_df(input_df)
 
-    def filter_intermediate_steps(self, intermediate_steps: list[IntermediateStep]) -> list[dict]:
+    def filter_intermediate_steps(self,
+                                  intermediate_steps: list[IntermediateStep],
+                                  event_filter: list[IntermediateStepType] = None) -> list[dict]:
         """
         Filter out the intermediate steps that are not relevant for evaluation. The output is written with with the
         intention of re-running the evaluation using the original config file.
         """
-        filtered_steps = self.intermediate_step_adapter.filter_intermediate_steps(
-            intermediate_steps, self.intermediate_step_adapter.DEFAULT_EVENT_FILTER)
+        if event_filter is None:
+            event_filter = self.intermediate_step_adapter.DEFAULT_EVENT_FILTER
+        filtered_steps = self.intermediate_step_adapter.filter_intermediate_steps(intermediate_steps, event_filter)
         return self.intermediate_step_adapter.serialize_intermediate_steps(filtered_steps)
 
-    def publish_eval_input(self, eval_input) -> str:
+    def publish_eval_input(self, eval_input, workflow_output_step_filter: list[IntermediateStepType] = None) -> str:
         """
         Convert the EvalInput object to a JSON output for storing in a file.
         Use the orginal keys to allow re-running evaluation using the orignal config file and '--skip_workflow'
         option.
         """
+        indent = 2
         if self.is_structured_input():
             # Extract structured data from EvalInputItems
@@ -154,7 +159,7 @@
                 self.question_key: item.input_obj,
                 self.answer_key: item.expected_output_obj,
                 self.generated_answer_key: item.output_obj,
-                self.trajectory_key: self.filter_intermediate_steps(item.trajectory),
+                self.trajectory_key: self.filter_intermediate_steps(item.trajectory, workflow_output_step_filter),
                 self.expected_trajectory_key: self.filter_intermediate_steps(item.expected_trajectory),
             } for item in eval_input.eval_input_items]
         else:
diff --git a/src/aiq/eval/evaluate.py b/src/aiq/eval/evaluate.py
index 57ab3600d..cfbe7fcbb 100644
--- a/src/aiq/eval/evaluate.py
+++ b/src/aiq/eval/evaluate.py
@@ -177,7 +177,8 @@ def write_output(self, dataset_handler: DatasetHandler):
         workflow_output_file.parent.mkdir(parents=True, exist_ok=True)
 
         # Write the workflow output to a file (this can be used for re-running the evaluation)
-        workflow_output = dataset_handler.publish_eval_input(self.eval_input)
+        workflow_output = dataset_handler.publish_eval_input(
+            self.eval_input, self.eval_config.general.output.workflow_output_step_filter)
         with open(workflow_output_file, "w", encoding="utf-8") as f:
             # set indent to 2 for pretty printing
             f.write(workflow_output)

From 2c98f752bc864c89f39d0cab7177cb198ffdcdcd Mon Sep 17 00:00:00 2001
From: Anuradha Karuppiah
Date: Fri, 2 May 2025 11:30:44 -0700
Subject: [PATCH 2/3] Add documentation.

Signed-off-by: Anuradha Karuppiah
---
 docs/source/concepts/evaluate.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/docs/source/concepts/evaluate.md b/docs/source/concepts/evaluate.md
index 3d54e6a4a..d4920ffcd 100644
--- a/docs/source/concepts/evaluate.md
+++ b/docs/source/concepts/evaluate.md
@@ -370,6 +370,18 @@ The output of the evaluators are stored in distinct files in the same `output_di
 }
 ```
 
+## Workflow Output Intermediate Step Filtering
+The `workflow_output.json` file contains the intermediate steps for each entry in the dataset. The intermediate steps are filtered using the `eval.general.output.workflow_output_step_filter` parameter in the `config.yml` file. The default value for the filter is `[LLM_END, TOOL_END]`. You can customize the filter by providing a list of intermediate step types to include in the output file.
+
+**Example:**
+`examples/simple/configs/eval_config.yml` can be modified to include the intermediate steps in the output by adding the following configuration:
+```yaml
+eval:
+  general:
+    output:
+      workflow_output_step_filter: [LLM_END, TOOL_START, TOOL_END]
+```
+
 ## Customizing the output
 You can customize the output of the pipeline by providing custom scripts.
 One or more Python scripts can be provided in the `eval.general.output_scripts` section of the `config.yml` file.
From b382caf38c864c89f39d0cab7177cb198ffdcdcd Mon Sep 17 00:00:00 2001
From: Anuradha Karuppiah
Date: Fri, 2 May 2025 11:53:11 -0700
Subject: [PATCH 3/3] Add a unit test

Signed-off-by: Anuradha Karuppiah
---
 .../dataset_handler/test_dataset_handler.py   | 46 +++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/tests/aiq/eval/dataset_handler/test_dataset_handler.py b/tests/aiq/eval/dataset_handler/test_dataset_handler.py
index 6f59dfb0b..3f4bbc9ca 100644
--- a/tests/aiq/eval/dataset_handler/test_dataset_handler.py
+++ b/tests/aiq/eval/dataset_handler/test_dataset_handler.py
@@ -18,6 +18,9 @@
 from aiq.data_models.dataset_handler import EvalDatasetJsonConfig
 from aiq.data_models.dataset_handler import EvalDatasetStructureConfig
+from aiq.data_models.intermediate_step import IntermediateStep
+from aiq.data_models.intermediate_step import IntermediateStepPayload
+from aiq.data_models.intermediate_step import IntermediateStepType
 from aiq.eval.dataset_handler.dataset_handler import DatasetHandler
 from aiq.eval.evaluator.evaluator_model import EvalInput
 
 
@@ -232,3 +235,46 @@ def test_setup_reps(dataset_handler, mock_input_df, dataset_id_key):
 
     assert len(replicated_df) == len(mock_input_df) * dataset_handler.reps, "Dataset should be replicated correctly"
     assert all("_rep" in str(i) for i in replicated_df[dataset_id_key]), "IDs should be suffixed with `_repX`"
+
+
+@pytest.fixture
+def mock_intermediate_steps():
+    """Create a list of mock intermediate steps with different event types."""
+    steps = []
+    # Add LLM_START step
+    steps.append(
+        IntermediateStep(payload=IntermediateStepPayload(event_type=IntermediateStepType.LLM_START, name="llm_start")))
+    # Add LLM_END step
+    steps.append(
+        IntermediateStep(payload=IntermediateStepPayload(event_type=IntermediateStepType.LLM_END, name="llm_end")))
+    # Add TOOL_START step
+    steps.append(
+        IntermediateStep(
+            payload=IntermediateStepPayload(event_type=IntermediateStepType.TOOL_START, name="tool_start")))
+    # Add TOOL_END step
+    steps.append(
+        IntermediateStep(payload=IntermediateStepPayload(event_type=IntermediateStepType.TOOL_END, name="tool_end")))
+    return steps
+
+
+def test_filter_intermediate_steps(dataset_handler, mock_intermediate_steps):
+    """Test that filter_intermediate_steps correctly filters steps based on event types."""
+    # Define the filter to include only LLM_END, TOOL_START, and TOOL_END
+    event_filter = [IntermediateStepType.LLM_END, IntermediateStepType.TOOL_START, IntermediateStepType.TOOL_END]
+
+    # Get the filtered steps
+    filtered_steps = dataset_handler.filter_intermediate_steps(mock_intermediate_steps, event_filter)
+
+    # Verify that only the specified event types are included (LLM_START is filtered out)
+    event_types = [step["payload"]["event_type"] for step in filtered_steps]
+    assert IntermediateStepType.LLM_START not in event_types, "LLM_START should be filtered out"
+    assert IntermediateStepType.LLM_END in event_types, "LLM_END should be included"
+    assert IntermediateStepType.TOOL_START in event_types, "TOOL_START should be included"
+    assert IntermediateStepType.TOOL_END in event_types, "TOOL_END should be included"
+
+    # Verify the order of steps is preserved
+    assert len(filtered_steps) == 3, "Should have exactly 3 steps after filtering"
+    assert filtered_steps[0]["payload"]["event_type"] == IntermediateStepType.LLM_END, "First step should be LLM_END"
+    assert filtered_steps[1]["payload"]["event_type"] == IntermediateStepType.TOOL_START, \
+        "Second step should be TOOL_START"
+    assert filtered_steps[2]["payload"]["event_type"] == IntermediateStepType.TOOL_END, "Third step should be TOOL_END"