Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions docs/source/concepts/evaluate.md
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,18 @@ The output of the evaluators are stored in distinct files in the same `output_di
}
```

## Workflow Output Intermediate Step Filtering
The `workflow_output.json` file contains the intermediate steps for each entry in the dataset. The intermediate steps are filtered using the `eval.general.output.workflow_output_step_filter` parameter in the `config.yml` file. The default value for the filter is `[LLM_END, TOOL_END]`. You can customize the filter by providing a list of intermediate step types to include in the output file.

**Example:**
`examples/simple/configs/eval_config.yml` can be modified to also include `TOOL_START` steps in the output by adding the following configuration:
```yaml
eval:
general:
output:
workflow_output_step_filter: [LLM_END, TOOL_START, TOOL_END]
```

## Customizing the output
You can customize the output of the pipeline by providing custom scripts. One or more Python scripts can be provided in the `eval.general.output_scripts` section of the `config.yml` file.

Expand Down
3 changes: 3 additions & 0 deletions src/aiq/data_models/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from aiq.data_models.dataset_handler import EvalDatasetConfig
from aiq.data_models.dataset_handler import EvalS3Config
from aiq.data_models.evaluator import EvaluatorBaseConfig
from aiq.data_models.intermediate_step import IntermediateStepType
from aiq.data_models.profiler import ProfilerConfig


Expand All @@ -45,6 +46,8 @@ class EvalOutputConfig(BaseModel):
s3: EvalS3Config | None = None
# Whether to cleanup the output directory before running the workflow
cleanup: bool = True
# Filter for the workflow output steps
workflow_output_step_filter: list[IntermediateStepType] | None = None


class EvalGeneralConfig(BaseModel):
Expand Down
15 changes: 10 additions & 5 deletions src/aiq/eval/dataset_handler/dataset_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from aiq.data_models.dataset_handler import EvalDatasetConfig
from aiq.data_models.dataset_handler import EvalDatasetJsonConfig
from aiq.data_models.intermediate_step import IntermediateStep
from aiq.data_models.intermediate_step import IntermediateStepType
from aiq.eval.dataset_handler.dataset_downloader import DatasetDownloader
from aiq.eval.dataset_handler.dataset_filter import DatasetFilter
from aiq.eval.evaluator.evaluator_model import EvalInput
Expand Down Expand Up @@ -132,20 +133,24 @@ def get_eval_input_from_dataset(self, dataset: str) -> EvalInput:
# Convert the DataFrame to a list of EvalInput objects
return self.get_eval_input_from_df(input_df)

def filter_intermediate_steps(self,
                              intermediate_steps: list[IntermediateStep],
                              event_filter: list[IntermediateStepType] | None = None) -> list[dict]:
    """
    Filter out the intermediate steps that are not relevant for evaluation.

    The output is written with the intention of re-running the evaluation using the
    original config file.

    Args:
        intermediate_steps: Full list of intermediate steps recorded for a dataset entry.
        event_filter: Step types to keep in the output. When None, the adapter's
            DEFAULT_EVENT_FILTER is used so existing callers keep the old behavior.

    Returns:
        The surviving steps serialized to a list of dicts.
    """
    if event_filter is None:
        event_filter = self.intermediate_step_adapter.DEFAULT_EVENT_FILTER
    filtered_steps = self.intermediate_step_adapter.filter_intermediate_steps(intermediate_steps, event_filter)
    return self.intermediate_step_adapter.serialize_intermediate_steps(filtered_steps)

def publish_eval_input(self, eval_input) -> str:
def publish_eval_input(self, eval_input, workflow_output_step_filter: list[IntermediateStepType] = None) -> str:
"""
Convert the EvalInput object to a JSON output for storing in a file. Use the orginal keys to
allow re-running evaluation using the orignal config file and '--skip_workflow' option.
"""

indent = 2
if self.is_structured_input():
# Extract structured data from EvalInputItems
Expand All @@ -154,7 +159,7 @@ def publish_eval_input(self, eval_input) -> str:
self.question_key: item.input_obj,
self.answer_key: item.expected_output_obj,
self.generated_answer_key: item.output_obj,
self.trajectory_key: self.filter_intermediate_steps(item.trajectory),
self.trajectory_key: self.filter_intermediate_steps(item.trajectory, workflow_output_step_filter),
self.expected_trajectory_key: self.filter_intermediate_steps(item.expected_trajectory),
} for item in eval_input.eval_input_items]
else:
Expand Down
3 changes: 2 additions & 1 deletion src/aiq/eval/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,8 @@ def write_output(self, dataset_handler: DatasetHandler):
workflow_output_file.parent.mkdir(parents=True, exist_ok=True)

# Write the workflow output to a file (this can be used for re-running the evaluation)
workflow_output = dataset_handler.publish_eval_input(self.eval_input)
workflow_output = dataset_handler.publish_eval_input(
self.eval_input, self.eval_config.general.output.workflow_output_step_filter)
with open(workflow_output_file, "w", encoding="utf-8") as f:
# set indent to 2 for pretty printing
f.write(workflow_output)
Expand Down
46 changes: 46 additions & 0 deletions tests/aiq/eval/dataset_handler/test_dataset_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@

from aiq.data_models.dataset_handler import EvalDatasetJsonConfig
from aiq.data_models.dataset_handler import EvalDatasetStructureConfig
from aiq.data_models.intermediate_step import IntermediateStep
from aiq.data_models.intermediate_step import IntermediateStepPayload
from aiq.data_models.intermediate_step import IntermediateStepType
from aiq.eval.dataset_handler.dataset_handler import DatasetHandler
from aiq.eval.evaluator.evaluator_model import EvalInput

Expand Down Expand Up @@ -232,3 +235,46 @@ def test_setup_reps(dataset_handler, mock_input_df, dataset_id_key):

assert len(replicated_df) == len(mock_input_df) * dataset_handler.reps, "Dataset should be replicated correctly"
assert all("_rep" in str(i) for i in replicated_df[dataset_id_key]), "IDs should be suffixed with `_repX`"


@pytest.fixture
def mock_intermediate_steps():
    """Build one mock intermediate step for each event type exercised by the filter tests."""
    # (event type, step name) pairs, in the order the steps are expected to appear.
    event_specs = [
        (IntermediateStepType.LLM_START, "llm_start"),
        (IntermediateStepType.LLM_END, "llm_end"),
        (IntermediateStepType.TOOL_START, "tool_start"),
        (IntermediateStepType.TOOL_END, "tool_end"),
    ]
    return [
        IntermediateStep(payload=IntermediateStepPayload(event_type=event_type, name=name))
        for event_type, name in event_specs
    ]


def test_filter_intermediate_steps(dataset_handler, mock_intermediate_steps):
    """Test that filter_intermediate_steps correctly filters steps based on event types."""
    # Keep only LLM_END, TOOL_START and TOOL_END; LLM_START must be dropped.
    event_filter = [IntermediateStepType.LLM_END, IntermediateStepType.TOOL_START, IntermediateStepType.TOOL_END]

    filtered_steps = dataset_handler.filter_intermediate_steps(mock_intermediate_steps, event_filter)
    event_types = [step["payload"]["event_type"] for step in filtered_steps]

    # The excluded type is gone, every requested type survives.
    assert IntermediateStepType.LLM_START not in event_types, "LLM_START should be filtered out"
    assert IntermediateStepType.LLM_END in event_types, "LLM_END should be included"
    assert IntermediateStepType.TOOL_START in event_types, "TOOL_START should be included"
    assert IntermediateStepType.TOOL_END in event_types, "TOOL_END should be included"

    # Filtering must also preserve the original relative order of the steps.
    assert len(filtered_steps) == 3, "Should have exactly 3 steps after filtering"
    expected_sequence = [
        (IntermediateStepType.LLM_END, "First step should be LLM_END"),
        (IntermediateStepType.TOOL_START, "Second step should be TOOL_START"),
        (IntermediateStepType.TOOL_END, "Third step should be TOOL_END"),
    ]
    for (expected_type, message), step in zip(expected_sequence, filtered_steps):
        assert step["payload"]["event_type"] == expected_type, message