From 79cf74ebcfb193d27b7a70ed88a79969688896dd Mon Sep 17 00:00:00 2001 From: Asutosh Samal Date: Sun, 27 Jul 2025 19:06:37 +0530 Subject: [PATCH] agent eval: multi-turn & refactoring fix conv id handling --- lsc_agent_eval/README.md | 225 ++++--- .../sample_data/agent_goal_eval_example.yaml | 88 ++- .../script/{eval3 => conv3}/cleanup.sh | 0 .../script/{eval3 => conv3}/setup.sh | 0 .../script/{eval4 => conv4}/cleanup.sh | 0 .../script/{eval4 => conv4/eval1}/verify.sh | 0 .../script/{eval4 => conv4}/setup.sh | 0 lsc_agent_eval/src/lsc_agent_eval/__init__.py | 35 +- .../core/agent_goal_eval/__init__.py | 16 + .../core/agent_goal_eval/agent_goal_eval.py | 191 ++++-- .../core/agent_goal_eval/eval_data.py | 192 ++++-- .../core/agent_goal_eval/evaluator.py | 136 ++--- .../core/agent_goal_eval/models.py | 304 +++++++++- .../core/agent_goal_eval/results.py | 83 ++- .../core/agent_goal_eval/script_runner.py | 30 +- .../core/agent_goal_eval/utils.py | 42 ++ .../lsc_agent_eval/core/utils/api_client.py | 14 +- .../lsc_agent_eval/core/utils/exceptions.py | 4 +- .../agent_goal_eval/test_agent_goal_eval.py | 308 +++------- .../core/agent_goal_eval/test_eval_data.py | 569 ++++++++++-------- .../core/agent_goal_eval/test_evaluator.py | 506 ++++++++-------- .../tests/core/agent_goal_eval/test_models.py | 306 ++++++++-- .../core/agent_goal_eval/test_results.py | 559 +++++++---------- .../agent_goal_eval/test_script_runner.py | 50 +- .../tests/core/utils/test_api_client.py | 35 +- .../tests/core/utils/test_exceptions.py | 34 +- 26 files changed, 2162 insertions(+), 1565 deletions(-) rename lsc_agent_eval/sample_data/script/{eval3 => conv3}/cleanup.sh (100%) rename lsc_agent_eval/sample_data/script/{eval3 => conv3}/setup.sh (100%) rename lsc_agent_eval/sample_data/script/{eval4 => conv4}/cleanup.sh (100%) rename lsc_agent_eval/sample_data/script/{eval4 => conv4/eval1}/verify.sh (100%) rename lsc_agent_eval/sample_data/script/{eval4 => conv4}/setup.sh (100%) create mode 100644 lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/utils.py diff --git a/lsc_agent_eval/README.md b/lsc_agent_eval/README.md index 593118c8..76d0de4d 100644 --- a/lsc_agent_eval/README.md +++ b/lsc_agent_eval/README.md @@ -1,16 +1,17 @@ # Lightspeed Agent Evaluation -A standalone package for evaluating agent-based systems, specifically designed for evaluating agent goal achievement. +A framework for evaluating AI agent performance. ## Features - **Agent Goal Evaluation**: Evaluate whether an agent successfully achieves specified goals +- **Multi-turn Evaluation**: Organize evaluations into conversation groups for multi-turn testing - **Multi-type Evaluation**: Support for different evaluation types: - `judge-llm`: LLM-based evaluation using a judge model - `script`: Script-based evaluation using verification scripts (similar to [k8s-bench](https://github.com/GoogleCloudPlatform/kubectl-ai/tree/main/k8s-bench)) - - `sub-string`: Simple substring matching evaluation + - `sub-string`: Simple substring matching evaluation (ALL keywords must be present in response) - **Setup/Cleanup Scripts**: Support for running setup and cleanup scripts before/after evaluation -- **Result Tracking**: Result tracking and CSV output +- **Result Tracking**: Result tracking with CSV output and JSON statistics - **Standalone Package**: Can be installed and used independently of the main lightspeed-core-evaluation package - **LiteLLM Integration**: Unified interface for Judge LLM @@ -45,13 +46,102 @@ pip install -e . 
pdm install
```
 
-## Usage
+## Data Configuration
+
+The evaluation is configured using a YAML file that defines conversations. Each conversation contains one or more evaluations and includes:
+
+- `conversation_group`: Identifier for grouping related evaluations into a conversation
+- `description`: Description of the conversation (Optional)
+- `setup_script`: Setup script to run before the conversation (Optional)
+- `cleanup_script`: Cleanup script to run after the conversation (Optional)
+- `conversation`: List of evaluations in this conversation
+
+Each evaluation within a conversation can include:
+- `eval_id`: Unique identifier for the evaluation
+- `eval_query`: The query/task to send to the agent
+- `eval_type`: Type of evaluation (judge-llm, script, sub-string)
+- `expected_response`: Expected response (for judge-llm evaluation)
+- `expected_keywords`: Keywords to look for (for sub-string evaluation)
+- `eval_verify_script`: Verification script (for script evaluation)
+- `description`: Description of the evaluation (Optional)
+
+Note: `eval_id` values must be unique within a conversation group. Duplicates across different conversation groups are allowed, although a warning is logged for awareness.
+
+### Example Data Configuration
+
+```yaml
+# Multi-turn Conversations
+- conversation_group: conv1
+  description: Basic conversation flow testing cluster operations
+  conversation:
+    - eval_id: eval1
+      eval_query: Hi!
+      eval_type: judge-llm
+      expected_response: Hello! I'm an AI assistant for the Installer.
+      description: Initial greeting to start conversation
+    - eval_id: eval2
+      eval_query: Get me active clusters
+      eval_type: judge-llm
+      expected_response: Active clusters are x1, x2.
+      description: Request for cluster information
+
+- conversation_group: conv2
+  description: Multi-turn conversation with setup/cleanup
+  setup_script: sample_data/script/setup_environment.sh
+  cleanup_script: sample_data/script/cleanup_environment.sh
+  conversation:
+    - eval_id: eval1
+      eval_query: Hi! Can you help me manage pods?
+      eval_type: judge-llm
+      expected_response: Hello! I can help you manage pods.
+      description: Initial greeting
+    - eval_id: eval2
+      eval_query: Create a pod named test-pod
+      eval_type: script
+      eval_verify_script: sample_data/script/verify_pod.sh
+      description: Create pod and verify
+    - eval_id: eval3
+      eval_query: List all pods
+      eval_type: sub-string
+      expected_keywords: ['test-pod']
+      description: Verify pod is listed
+
+# Single-turn Conversations
+- conversation_group: conv3
+  description: Test namespace creation and detection with scripts
+  setup_script: sample_data/script/conv3/setup.sh
+  cleanup_script: sample_data/script/conv3/cleanup.sh
+  conversation:
+    - eval_id: eval1
+      eval_query: is there a openshift-lightspeed namespace ?
+      eval_type: sub-string
+      expected_keywords:
+        - 'yes'
+        - 'lightspeed'
+      description: Check for openshift-lightspeed namespace after setup
+```
+
+The `sample_data/` directory contains example configurations:
+- `agent_goal_eval_example.yaml`: Examples with various evaluation types
+- `script/`: Example setup, cleanup, and verify scripts
+
+## Judge LLM
+
+Judge-llm evaluations currently use LiteLLM.
+
+### Judge LLM - Setup
+The framework expects that access to a third-party inference provider, or a locally served model, is already available; the eval framework does not set this up.
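+
+For example, a minimal shell setup for two of the providers listed below might look like this (the key and URL values are placeholders; adjust them for your environment):
+
+```bash
+# Use OpenAI as the judge provider
+export OPENAI_API_KEY="sk-..."
+
+# Or point LiteLLM at a local Ollama server
+export OLLAMA_API_BASE="http://localhost:11434"
+```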
+ +- **OpenAI**: Set `OPENAI_API_KEY` environment variable +- **Azure OpenAI**: Set `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_ENDPOINT` +- **IBM Watsonx**: Set `WATSONX_API_KEY`, `WATSONX_API_BASE`, `WATSONX_PROJECT_ID` +- **Ollama**: Set `OLLAMA_API_BASE` (for local models) +- **Any Other Provider**: Check [LiteLLM documentation](https://docs.litellm.ai/docs/providers) -### Command Line Interface +## Usage ```bash -# Run agent evaluation with basic configuration -lsc-agent-eval \ +lsc_agent_eval \ --eval_data_yaml agent_goal_eval.yaml \ --agent_endpoint http://localhost:8080 \ --agent_provider watsonx \ @@ -61,8 +151,6 @@ lsc-agent-eval \ --result_dir ./eval_output ``` -### Python API - ```python from lsc_agent_eval import AgentGoalEval @@ -84,44 +172,7 @@ evaluator = AgentGoalEval(args) evaluator.run_evaluation() ``` -## Configuration - -The evaluation is configured using a YAML file that defines test cases. Each test case can include: - -- `eval_id`: Unique identifier for the evaluation -- `eval_query`: The query/task to send to the agent -- `eval_type`: Type of evaluation (judge-llm, script, sub-string) -- `expected_response`: Expected response (for judge-llm evaluation) -- `expected_keywords`: Keywords to look for (for sub-string evaluation) -- `eval_verify_script`: Verification script (for script evaluation) -- `eval_setup_script`: Optional setup script to run before evaluation -- `eval_cleanup_script`: Optional cleanup script to run after evaluation - -### Example YAML Configuration - -```yaml -# data/example_eval.yaml -- eval_id: eval1 - eval_query: "is there a openshift-monitoring namespace?" - eval_type: sub-string - expected_keywords: - - 'yes' - - openshift-monitoring - -- eval_id: eval2 - eval_query: "is there a openshift-monitoring namespace?" - eval_type: judge-llm - expected_response: "there is a openshift-monitoring namespace." - -- eval_id: eval3 - eval_query: "create a namespace called openshift-lightspeed" - eval_setup_script: script/eval3/setup.sh - eval_type: script - eval_verify_script: script/eval3/verify.sh - eval_cleanup_script: script/eval3/cleanup.sh -``` - -## Command Line Arguments +### Key Arguments - `--eval_data_yaml`: Path to the YAML file containing evaluation data - `--agent_endpoint`: Endpoint URL for the agent API (default: ) @@ -133,33 +184,60 @@ The evaluation is configured using a YAML file that defines test cases. Each tes - `--result_dir`: Directory to save evaluation results (default: eval_output/) - `--kubeconfig`: Path to kubeconfig file (if needed for scripts) -## Output +## Evaluation Flow -The evaluation results are saved to a CSV file containing: -- `eval_id`: Evaluation identifier -- `query`: The query sent to the agent -- `response`: The agent's response -- `eval_type`: Type of evaluation performed -- `result`: Result (pass/fail) +### Conversation Processing Order -## Dependencies +1. **Load Configuration**: Parse and validate YAML configuration +2. **Process Conversations**: For each conversation group: + - Run setup script (if provided) + - Run all evaluations sequentially: + - For the first evaluation: Send query without conversation ID, receive new conversation ID from API + - For subsequent evaluations: Use the conversation ID from the first evaluation to maintain context + - Execute evaluation based on eval_type (either sub-string, judge-llm or script) + - Run cleanup script (if provided) +3. 
**Save Results**: Export to CSV and JSON with statistics -This package depends on: -- `pandas`: Data manipulation and analysis -- `httpx`: HTTP client for API calls -- `tqdm`: Progress bars -- `pyyaml`: YAML file processing -- `litellm`: Unified interface to 100+ LLM providers +### Script Execution -## LiteLLM Integration (Judge LLM) +- **Setup Scripts**: Run once before all evaluations in a conversation + - If setup fails, all evaluations in the conversation are marked as ERROR +- **Cleanup Scripts**: Run once after all evaluations in a conversation + - Cleanup failures are logged as warnings (non-critical) + - Always executed regardless of evaluation results +- **Verify Scripts**: Run per individual evaluation for script type evaluations + - Used to verify the agent's action is successful -For judge-llm evaluations, you can use any of the 100+ supported providers: +### Error Handling -- **OpenAI**: Set `OPENAI_API_KEY` environment variable -- **Azure OpenAI**: Set `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_ENDPOINT` -- **IBM Watsonx**: Set `WATSONX_API_KEY`, `WATSONX_API_BASE`, `WATSONX_PROJECT_ID` -- **Ollama**: Set `OLLAMA_API_BASE` (for local models) -- **And many more**: See [LiteLLM documentation](https://docs.litellm.ai/docs/providers) +- **Setup Failure**: Marks all evaluations in conversation as ERROR +- **Cleanup Failure**: Logged as warning, does not affect evaluation results +- **API Errors**: Evaluation marked as Error +- **Evaluation Failure**: Individual evaluation marked as ERROR or FAIL +- **Configuration Errors**: Detailed validation message + +## Output + +The framework generates two types of output: + +### CSV Results (`agent_goal_eval_results_YYYYMMDD_HHMMSS.csv`) + +Contains detailed results with columns: +- `conversation_group`: The conversation group identifier +- `conversation_id`: The conversation ID returned by the Agent API +- `eval_id`: Individual evaluation identifier +- `result`: PASS, FAIL, or ERROR +- `eval_type`: Type of evaluation performed +- `query`: The question/task sent to the agent +- `response`: The agent's response +- `error`: Error message (if any) + +### JSON Statistics (`agent_goal_eval_summary_YYYYMMDD_HHMMSS.json`) + +Result statistics: +- **Overall Summary**: Total evaluations, pass/fail/error counts, success rate +- **By Conversation**: Breakdown of results for each conversation group +- **By Evaluation Type**: Performance metrics for each evaluation type (judge-llm, script, sub-string) ## Development @@ -174,10 +252,15 @@ cd lightspeed-evaluation/lsc_agent_eval pdm install --dev # Run tests -pdm run pytest +pdm run pytest tests --cov=src # Run linting pdm run ruff check +pdm run isort src tests +pdm run black src tests +pdm run mypy src +pdm run pyright src +pdm run pylint src ``` ### Contributing @@ -186,7 +269,7 @@ pdm run ruff check 2. Create a feature branch 3. Make your changes 4. Add tests for new functionality -5. Run the test suite +5. Run tests and lint checks 6. Submit a pull request ## License @@ -195,4 +278,4 @@ This project is licensed under the Apache License 2.0. See the LICENSE file for ## Support -For issues and questions, please use the [GitHub Issues](https://github.com/lightspeed-core/lightspeed-evaluation/issues) tracker. \ No newline at end of file +For issues and questions, please use the [GitHub Issues](https://github.com/lightspeed-core/lightspeed-evaluation/issues) tracker. 
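+
+## Appendix: Example JSON Summary
+
+A sketch of the JSON summary shape described in the Output section above. Field names follow the statistics generated by this package; the counts shown are illustrative values only, not real output:
+
+```json
+{
+  "summary": {
+    "total_evaluations": 4,
+    "total_conversations": 2,
+    "passed": 3,
+    "failed": 1,
+    "errored": 0,
+    "success_rate": 75.0
+  },
+  "by_conversation": {
+    "conv1": {"passed": 2, "failed": 0, "errored": 0, "total": 2, "success_rate": 100.0},
+    "conv2": {"passed": 1, "failed": 1, "errored": 0, "total": 2, "success_rate": 50.0}
+  },
+  "by_eval_type": {
+    "judge-llm": {"passed": 2, "failed": 1, "errored": 0, "total": 3, "success_rate": 66.67},
+    "sub-string": {"passed": 1, "failed": 0, "errored": 0, "total": 1, "success_rate": 100.0}
+  }
+}
+```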
diff --git a/lsc_agent_eval/sample_data/agent_goal_eval_example.yaml b/lsc_agent_eval/sample_data/agent_goal_eval_example.yaml index efed30aa..d346057e 100644 --- a/lsc_agent_eval/sample_data/agent_goal_eval_example.yaml +++ b/lsc_agent_eval/sample_data/agent_goal_eval_example.yaml @@ -1,26 +1,68 @@ -- eval_id: eval1 - eval_query: is there a openshift-monitoring namespace ? - eval_type: sub-string - expected_keywords: - - 'yes' - - openshift-monitoring +- conversation_group: conv1 + description: Test namespace detection using substring matching + conversation: + - eval_id: eval1 + eval_query: is there a openshift-monitoring namespace ? + eval_type: sub-string + expected_keywords: + - 'yes' + - openshift-monitoring + description: Check for openshift-monitoring namespace existence -- eval_id: eval2 - eval_query: is there a openshift-monitoring namespace ? - eval_type: judge-llm - expected_response: there is a openshift-monitoring namespace. +- conversation_group: conv2 + description: Test namespace detection using LLM judge + conversation: + - eval_id: eval1 + eval_query: is there a openshift-monitoring namespace ? + eval_type: judge-llm + expected_response: there is a openshift-monitoring namespace. + description: Verify openshift-monitoring namespace with LLM evaluation -- eval_id: eval3 - eval_query: is there a openshift-lightspeed namespace ? - eval_setup_script: sample_data/script/eval3/setup.sh - eval_type: sub-string - expected_keywords: - - 'yes' - eval_cleanup_script: sample_data/script/eval3/cleanup.sh +- conversation_group: conv3 + description: Test namespace creation and detection with scripts + setup_script: sample_data/script/conv3/setup.sh + cleanup_script: sample_data/script/conv3/cleanup.sh + conversation: + - eval_id: eval1 + eval_query: is there a openshift-lightspeed namespace ? + eval_type: sub-string + expected_keywords: + - 'yes' + description: Check for openshift-lightspeed namespace after setup -- eval_id: eval4 - eval_query: create a namespace called openshift-lightspeed - eval_setup_script: sample_data/script/eval4/setup.sh - eval_type: script - eval_verify_script: sample_data/script/eval4/verify.sh - eval_cleanup_script: sample_data/script/eval4/cleanup.sh +- conversation_group: conv4 + description: Test namespace creation with full script validation + setup_script: sample_data/script/conv4/setup.sh + cleanup_script: sample_data/script/conv4/cleanup.sh + conversation: + - eval_id: eval1 + eval_query: create a namespace called openshift-lightspeed + eval_type: script + eval_verify_script: sample_data/script/conv4/eval1/verify.sh + description: Create namespace and verify with script + +- conversation_group: conv5 + description: Test conversation retention - multi turn success + conversation: + - eval_id: eval1 + eval_query: what is openshift virtualization ? + eval_type: sub-string + expected_keywords: + - virtualization + description: Test first conversation + - eval_id: eval2 + eval_query: what was my previous query ? + eval_type: sub-string + expected_keywords: + - virtualization + description: Test second conversation + +- conversation_group: conv6 + description: Test conversation retention - new conversation + conversation: + - eval_id: eval1 + eval_query: what was my previous query ? 
+ eval_type: sub-string + expected_keywords: + - virtualization + description: new conversation (failure) diff --git a/lsc_agent_eval/sample_data/script/eval3/cleanup.sh b/lsc_agent_eval/sample_data/script/conv3/cleanup.sh similarity index 100% rename from lsc_agent_eval/sample_data/script/eval3/cleanup.sh rename to lsc_agent_eval/sample_data/script/conv3/cleanup.sh diff --git a/lsc_agent_eval/sample_data/script/eval3/setup.sh b/lsc_agent_eval/sample_data/script/conv3/setup.sh similarity index 100% rename from lsc_agent_eval/sample_data/script/eval3/setup.sh rename to lsc_agent_eval/sample_data/script/conv3/setup.sh diff --git a/lsc_agent_eval/sample_data/script/eval4/cleanup.sh b/lsc_agent_eval/sample_data/script/conv4/cleanup.sh similarity index 100% rename from lsc_agent_eval/sample_data/script/eval4/cleanup.sh rename to lsc_agent_eval/sample_data/script/conv4/cleanup.sh diff --git a/lsc_agent_eval/sample_data/script/eval4/verify.sh b/lsc_agent_eval/sample_data/script/conv4/eval1/verify.sh similarity index 100% rename from lsc_agent_eval/sample_data/script/eval4/verify.sh rename to lsc_agent_eval/sample_data/script/conv4/eval1/verify.sh diff --git a/lsc_agent_eval/sample_data/script/eval4/setup.sh b/lsc_agent_eval/sample_data/script/conv4/setup.sh similarity index 100% rename from lsc_agent_eval/sample_data/script/eval4/setup.sh rename to lsc_agent_eval/sample_data/script/conv4/setup.sh diff --git a/lsc_agent_eval/src/lsc_agent_eval/__init__.py b/lsc_agent_eval/src/lsc_agent_eval/__init__.py index a9c203f9..ce9304d0 100644 --- a/lsc_agent_eval/src/lsc_agent_eval/__init__.py +++ b/lsc_agent_eval/src/lsc_agent_eval/__init__.py @@ -1,38 +1,27 @@ """Agent evaluation modules.""" -from .core.agent_goal_eval.agent_goal_eval import AgentGoalEval -from .core.agent_goal_eval.eval_data import AgentGoalEvalDataManager -from .core.agent_goal_eval.evaluator import EvaluationRunner -from .core.agent_goal_eval.models import EvaluationDataConfig, EvaluationResult -from .core.agent_goal_eval.results import ResultsManager -from .core.agent_goal_eval.script_runner import ScriptRunner -from .core.utils.api_client import AgentHttpClient +from .core.agent_goal_eval import AgentGoalEval +from .core.agent_goal_eval.models import ( + ConversationDataConfig, + EvaluationDataConfig, + EvaluationResult, +) from .core.utils.exceptions import ( AgentAPIError, AgentEvaluationError, - ConfigurationError, + EvaluationDataError, JudgeModelError, ScriptExecutionError, ) -from .core.utils.judge import JudgeModelManager __all__ = [ - # Exceptions + "AgentGoalEval", + "EvaluationDataConfig", + "EvaluationResult", + "ConversationDataConfig", "AgentEvaluationError", - "ConfigurationError", + "EvaluationDataError", "AgentAPIError", "ScriptExecutionError", "JudgeModelError", - # Models - "EvaluationResult", - "EvaluationDataConfig", - # Components - "AgentGoalEvalDataManager", - "AgentHttpClient", - "ScriptRunner", - "JudgeModelManager", - "EvaluationRunner", - "ResultsManager", - # Main class - "AgentGoalEval", ] diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/__init__.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/__init__.py index 1218ccd4..36028c4d 100644 --- a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/__init__.py +++ b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/__init__.py @@ -1 +1,17 @@ """Agent goal evaluation modules.""" + +from .agent_goal_eval import AgentGoalEval +from .eval_data import AgentGoalEvalDataManager +from .evaluator import EvaluationRunner +from 
.models import ConversationDataConfig, EvaluationDataConfig, EvaluationResult +from .results import ResultsManager + +__all__ = [ + "AgentGoalEval", + "AgentGoalEvalDataManager", + "EvaluationRunner", + "EvaluationDataConfig", + "EvaluationResult", + "ConversationDataConfig", + "ResultsManager", +] diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/agent_goal_eval.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/agent_goal_eval.py index f6d01725..4aae8428 100644 --- a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/agent_goal_eval.py +++ b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/agent_goal_eval.py @@ -2,15 +2,22 @@ import argparse import logging +from pathlib import Path +from typing import TYPE_CHECKING, Optional from tqdm import tqdm from ..utils.api_client import AgentHttpClient +from ..utils.exceptions import AgentEvaluationError, ScriptExecutionError from ..utils.judge import JudgeModelManager from .eval_data import AgentGoalEvalDataManager from .evaluator import EvaluationRunner -from .models import EvaluationDataConfig, EvaluationResult from .results import ResultsManager +from .script_runner import ScriptRunner +from .utils import create_error_result + +if TYPE_CHECKING: + from .models import ConversationDataConfig, EvaluationDataConfig, EvaluationResult logger = logging.getLogger(__name__) @@ -29,6 +36,9 @@ def _setup_components(self) -> None: # Eval data manager self.data_manager = AgentGoalEvalDataManager(self.eval_args.eval_data_yaml) + # Script runner + self.script_runner = ScriptRunner(getattr(self.eval_args, "kubeconfig", None)) + # Agent HTTP client self.agent_client = AgentHttpClient( self.eval_args.agent_endpoint, self.eval_args.agent_auth_token_file @@ -43,37 +53,38 @@ def _setup_components(self) -> None: # Evaluation runner self.evaluation_runner = EvaluationRunner( - self.agent_client, - self.judge_manager, - kubeconfig=getattr(self.eval_args, "kubeconfig", None), + self.agent_client, self.script_runner, self.judge_manager ) - # Results manager - self.results_manager = ResultsManager(self.eval_args.result_dir) - def run_evaluation(self) -> None: """Run all evaluations and save results.""" try: - eval_data = self.data_manager.get_eval_data() - logger.info("Running %d evaluations", len(eval_data)) + conversations = self.data_manager.get_conversations() + + logger.info( + "Starting Agent Goal Evaluation\n" + "Total: %d evaluations across %d conversations", + self.data_manager.get_eval_count(), + len(conversations), + ) results = [] - pbar = tqdm(eval_data) - for data_config in pbar: - pbar.set_description(f"Running evaluation for {data_config.eval_id}") - result = self.evaluation_runner.run_evaluation( - data_config, - self.eval_args.agent_provider, - self.eval_args.agent_model, + + # Process each conversation for evaluation + for conv_idx, conversation in enumerate(conversations, 1): + print( + f"\nšŸ“‹ Conversation {conv_idx}/{len(conversations)}: " + f"{conversation.conversation_group}" ) - self._print_individual_result(data_config, result, pbar) - results.append(result) + conversation_results = self._process_conversation(conversation) + results.extend(conversation_results) # Save results - self.results_manager.save_results(results) + results_manager = ResultsManager(results) + results_manager.save_results(self.eval_args.result_dir) # Print summary - self._print_summary(results) + self._print_summary(results_manager) except Exception as e: logger.error("Evaluation failed: %s", e) @@ -82,9 +93,104 @@ def 
run_evaluation(self) -> None: # Clean up resources self._cleanup() + def _process_conversation( + self, conversation: "ConversationDataConfig" + ) -> list["EvaluationResult"]: + """Process single conversation group.""" + conversation_group = conversation.conversation_group + evaluations = conversation.conversation + print(f" Evaluations count: {len(evaluations)}") + + # Always start with None - conversation_id will be obtained from first API call + conversation_id = None + + results = [] + + # Run setup script for the conversation + if conversation.setup_script: + try: + self._run_setup_script(conversation.setup_script, conversation_group) + except ScriptExecutionError as e: + # If setup fails, mark all evaluations as ERROR + for eval_data in evaluations: + error_result = create_error_result( + eval_data, f"Setup script failed: {str(e)}", conversation_id + ) + results.append(error_result) + print(f"āŒ Setup script failed for {conversation_group}: {e}") + return results + + # Run evaluations + print(f" Running {len(evaluations)} evaluations...") + evaluation_results = self._run_conversation_evaluations( + evaluations, conversation_group, conversation_id + ) + results.extend(evaluation_results) + + # Run cleanup script for the conversation + if conversation.cleanup_script: + self._run_cleanup_script(conversation.cleanup_script, conversation_group) + + return results + + def _run_setup_script(self, setup_script: Path, conversation_group: str) -> None: + """Run setup script for a conversation.""" + setup_success = self.script_runner.run_script(setup_script) + if not setup_success: + raise ScriptExecutionError("Setup script returned non-zero exit code") + logger.debug("Setup script executed successfully for %s", conversation_group) + + def _run_cleanup_script( + self, cleanup_script: Path, conversation_group: str + ) -> None: + """Run cleanup script for a conversation.""" + try: + cleanup_success = self.script_runner.run_script(cleanup_script) + if cleanup_success: + logger.debug("Cleanup completed successfully") + else: + logger.warning("Cleanup script failed (non-critical)") + except ScriptExecutionError as e: + logger.warning("Cleanup script failed for %s: %s", conversation_group, e) + + def _run_conversation_evaluations( + self, + evaluations: list["EvaluationDataConfig"], + conversation_group: str, + conversation_id: Optional[str], + ) -> list["EvaluationResult"]: + """Run all evaluations for a conversation.""" + results = [] + + with tqdm( + total=len(evaluations), + desc=f"Evaluating {conversation_group}", + ) as pbar: + for eval_data in evaluations: + result = self.evaluation_runner.run_evaluation( + eval_data, + self.eval_args.agent_provider, + self.eval_args.agent_model, + conversation_id, + ) + + # Update conversation_id from API response for subsequent evaluations + if conversation_id is None: + conversation_id = result.conversation_id + print( + f" Received conversation ID from API: {result.conversation_id}" + ) + + self._print_individual_result(eval_data, result, pbar) + results.append(result) + + pbar.update(1) + + return results + @staticmethod def _print_individual_result( - data_config: EvaluationDataConfig, result: EvaluationResult, pbar: tqdm + data_config: "EvaluationDataConfig", result: "EvaluationResult", pbar: tqdm ) -> None: """Print individual result.""" match result.result: @@ -94,7 +200,10 @@ def _print_individual_result( marker = "āŒ" case _: marker = "āš ļø " - pbar.write(f"{marker} {result.eval_id}: {result.result}") + pbar.write( + f"{marker} 
{result.conversation_group}/{result.eval_id} " + f"{result.conversation_id}: {result.result}" + ) if result.result != "PASS": pbar.write(f" Query: {result.query}") @@ -111,25 +220,36 @@ def _print_individual_result( if result.result == "ERROR": pbar.write(f" Error message: {result.error}") - def _print_summary(self, results: list[EvaluationResult]) -> None: + def _print_summary(self, results_manager: ResultsManager) -> None: """Print evaluation summary.""" - total = len(results) - passed = sum(1 for r in results if r.result == "PASS") - failed = sum(1 for r in results if r.result == "FAIL") - errored = sum(1 for r in results if r.result == "ERROR") - success_rate = (passed / total * 100) if total > 0 else 0 + stats = results_manager.get_results_stats() print(f"\n{'='*25}") print("EVALUATION SUMMARY") print(f"{'='*25}") - print(f"Total Evaluations: {total}") - print(f"āœ… Passed: {passed}") - print(f"āŒ Failed: {failed}") - print(f"āš ļø Errored: {errored}") - print(f"Success Rate: {success_rate:.1f}%") + print(f"Total Evaluations: {stats.total_evaluations}") + print(f"āœ… Passed: {stats.passed}") + print(f"āŒ Failed: {stats.failed}") + print(f"āš ļø Errored: {stats.errored}") + print(f"Success Rate: {stats.success_rate:.1f}%") + + # Show conversation breakdown if multiple conversations + if len(stats.by_conversation) > 1: + print("\nSummary by Conversation:") + for conv_group, counts in stats.by_conversation.items(): + print( + f"{conv_group}: {counts['passed']}/{counts['total']} " + f"({counts['success_rate']:.1f}%)" + ) + print(f"{'='*25}\n") - self.result_summary = {"PASS": passed, "FAIL": failed, "ERROR": errored} + self.result_summary = { + "TOTAL": stats.total_evaluations, + "PASS": stats.passed, + "FAIL": stats.failed, + "ERROR": stats.errored, + } def _cleanup(self) -> None: """Clean up resources.""" @@ -141,4 +261,7 @@ def _cleanup(self) -> None: def get_result_summary(self) -> dict[str, int]: """Get result summary.""" + if not self.result_summary: + raise AgentEvaluationError("No results available. 
Run evaluation first.") + return self.result_summary diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/eval_data.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/eval_data.py index e664912c..615f3adf 100644 --- a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/eval_data.py +++ b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/eval_data.py @@ -1,88 +1,170 @@ """Agent Goal Eval data management.""" +import logging from pathlib import Path from typing import Any import yaml +from pydantic import ValidationError -from ..utils.exceptions import ConfigurationError -from .models import EvaluationDataConfig +from ..utils.exceptions import EvaluationDataError +from .models import ConversationDataConfig + +logger = logging.getLogger(__name__) class AgentGoalEvalDataManager: """Processes agent eval data and validation.""" - def __init__(self, eval_data_file: str): - """Initialize configuration manager.""" - self.eval_data_file = Path(eval_data_file) - self.eval_data: list[EvaluationDataConfig] = [] - self._validate_eval_data_file() - self._load_eval_data() - - def _validate_eval_data_file(self) -> None: - """Validate eval data file exists and is readable.""" - if not self.eval_data_file.exists(): - raise ConfigurationError(f"Eval data file not found: {self.eval_data_file}") + def __init__(self, eval_data_file: str) -> None: + """Initialize eval data manager.""" + self.eval_data_file = eval_data_file + self.conversations: list[ConversationDataConfig] = [] - if not self.eval_data_file.is_file(): - raise ConfigurationError( - f"Eval data file path is not a file: {self.eval_data_file}" - ) + self._load_eval_data() + self._log_loaded_data_stats() def _load_eval_data(self) -> None: """Load evaluation data from YAML file.""" try: - with open(self.eval_data_file, "r", encoding="utf-8") as file: - eval_data = yaml.safe_load(file) + eval_data_path = Path(self.eval_data_file).resolve() + logger.info("Loading evaluation data from: %s", str(eval_data_path)) - if not isinstance(eval_data, list): - raise ConfigurationError( - "Eval data file must contain a list of evaluations" + with open(eval_data_path, "r", encoding="utf-8") as file: + raw_data = yaml.safe_load(file) + + if raw_data is None: + raise EvaluationDataError("Eval data file is empty") + if not isinstance(raw_data, list): + raise EvaluationDataError( + f"Eval data file must contain a list of conversations, got {type(raw_data)}" + ) + if not raw_data: + raise EvaluationDataError( + "Eval data file must contain at least one conversation" ) - self.eval_data = [] - for data in eval_data: - self._validate_eval_data(data) - self.eval_data.append(EvaluationDataConfig(**data)) + logger.info("Found %d conversation(s) in YAML file", len(raw_data)) + + # Process each conversation + self._load_conversation_data(raw_data) except yaml.YAMLError as e: - raise ConfigurationError(f"Invalid YAML in eval data file: {e}") from e + raise EvaluationDataError(f"Invalid YAML in eval data file: {e}") from e + except FileNotFoundError as e: + raise EvaluationDataError(f"Eval data file not found: {e}") from e + except EvaluationDataError: + raise except Exception as e: - raise ConfigurationError(f"Error loading eval data file: {e}") from e - - def _validate_eval_data(self, eval_data: dict[str, Any]) -> None: - """Validate a single evaluation data point.""" - required_fields = ["eval_id", "eval_query"] - for field in required_fields: - if field not in eval_data: - raise ConfigurationError( - f"Missing required field '{field}' in evaluation 
data" + raise EvaluationDataError(f"Error loading eval data file: {e}") from e + + def _load_conversation_data(self, raw_data: list[dict[str, Any]]) -> None: + """Load conversation data.""" + logger.info("Processing conversation data...") + + self.conversations = [] + processed_groups = set() + + for idx, conversation_data in enumerate(raw_data, 1): + logger.debug("Processing conversation %d", idx) + + try: + conversation_config = ConversationDataConfig(**conversation_data) + + # Check for duplicate conversation groups + if conversation_config.conversation_group in processed_groups: + raise EvaluationDataError( + "Duplicate conversation_group " + f"'{conversation_config.conversation_group}' found" + ) + processed_groups.add(conversation_config.conversation_group) + + # Store the conversation + self.conversations.append(conversation_config) + + logger.info( + "Loaded conversation '%s' with %d evaluations", + conversation_config.conversation_group, + len(conversation_config.conversation), ) - eval_type = eval_data.get("eval_type", "judge-llm") - if eval_type not in ["judge-llm", "script", "sub-string"]: - raise ConfigurationError(f"Invalid eval_type: {eval_type}") + except ValidationError as e: + error_details = self._format_pydantic_error(e) + conversation_group = conversation_data.get( + "conversation_group", f"conversation_{idx}" + ) + raise EvaluationDataError( + f"Validation error in conversation '{conversation_group}': {error_details}" + ) from e + except EvaluationDataError: + raise + except Exception as e: + raise EvaluationDataError( + f"Error processing conversation {idx}: {e}" + ) from e + + def _format_pydantic_error(self, error: ValidationError) -> str: + """Format Pydantic validation error.""" + errors = [] + for err in error.errors(): + field = " -> ".join(str(loc) for loc in err["loc"]) + message = err["msg"] + errors.append(f"{field}: {message}") + return "; ".join(errors) + + def _log_loaded_data_stats(self) -> None: + """Log statistics about loaded data.""" + if not self.conversations: + raise EvaluationDataError("No valid conversations found in eval data file") + + # Calculate statistics from conversations + eval_types: dict[str, int] = {} + conversation_stats = {} + total_evaluations = 0 + + for conversation in self.conversations: + conv_group = conversation.conversation_group + conversation_stats[conv_group] = len(conversation.conversation) + total_evaluations += len(conversation.conversation) + + for eval_config in conversation.conversation: + eval_types[eval_config.eval_type] = ( + eval_types.get(eval_config.eval_type, 0) + 1 + ) - # Validate type-specific requirements - if eval_type == "judge-llm" and "expected_response" not in eval_data: - raise ConfigurationError( - "eval_type 'judge-llm' requires 'expected_response' field" - ) + if total_evaluations == 0: + raise EvaluationDataError("No valid evaluations found in eval data file") - if eval_type == "sub-string" and "expected_keywords" not in eval_data: - raise ConfigurationError( - "eval_type 'sub-string' requires 'expected_keywords' field" + # Check for duplicate eval_ids across all conversations + all_eval_ids = [] + for conversation in self.conversations: + all_eval_ids.extend( + [eval_config.eval_id for eval_config in conversation.conversation] ) - if eval_type == "script" and "eval_verify_script" not in eval_data: - raise ConfigurationError( - "eval_type 'script' requires 'eval_verify_script' field" + duplicate_ids = [ + eval_id for eval_id in all_eval_ids if all_eval_ids.count(eval_id) > 1 + ] + if 
duplicate_ids: + logger.warning( + "Duplicate eval_id(s) found across conversations: %s", + set(duplicate_ids), ) - def get_eval_data(self) -> list[EvaluationDataConfig]: - """Get all evaluation configurations.""" - return self.eval_data + logger.info("āœ… Data validation complete:") + logger.info(" %d conversations", len(self.conversations)) + logger.info(" %d total evaluations", total_evaluations) + logger.info(" Evaluation types: %s", dict(eval_types)) + + for conv_group, count in conversation_stats.items(): + logger.debug(" %s: %d evaluations", conv_group, count) + + def get_conversations(self) -> list[ConversationDataConfig]: + """Get all conversation configurations.""" + return self.conversations def get_eval_count(self) -> int: - """Get the number of evaluation configurations.""" - return len(self.eval_data) + """Get the total number of evaluation configurations.""" + return sum( + len(conversation.conversation) for conversation in self.conversations + ) diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/evaluator.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/evaluator.py index ad51b92e..428b1fff 100644 --- a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/evaluator.py +++ b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/evaluator.py @@ -1,14 +1,17 @@ """Evaluation runner that orchestrates different evaluation types.""" import logging -from typing import Optional +from typing import TYPE_CHECKING, Optional -from ..utils.api_client import AgentHttpClient from ..utils.exceptions import AgentAPIError, JudgeModelError, ScriptExecutionError -from ..utils.judge import JudgeModelManager from ..utils.prompt import ANSWER_CORRECTNESS_PROMPT -from .models import EvaluationDataConfig, EvaluationResult -from .script_runner import ScriptRunner +from .utils import create_error_result, create_success_result + +if TYPE_CHECKING: + from ..utils.api_client import AgentHttpClient + from ..utils.judge import JudgeModelManager + from .models import EvaluationDataConfig, EvaluationResult + from .script_runner import ScriptRunner logger = logging.getLogger(__name__) @@ -18,96 +21,49 @@ class EvaluationRunner: def __init__( self, - agent_client: AgentHttpClient, - judge_manager: Optional[JudgeModelManager] = None, - kubeconfig: Optional[str] = None, + agent_client: "AgentHttpClient", + script_runner: "ScriptRunner", + judge_manager: Optional["JudgeModelManager"] = None, ): """Initialize evaluation runner.""" self.agent_client = agent_client self.judge_manager = judge_manager - self.kubeconfig = kubeconfig + self.script_runner = script_runner def run_evaluation( - self, data_config: EvaluationDataConfig, agent_provider: str, agent_model: str - ) -> EvaluationResult: + self, + data_config: "EvaluationDataConfig", + agent_provider: str, + agent_model: str, + conversation_id: Optional[str] = None, + ) -> "EvaluationResult": """Run a single evaluation based on configuration.""" try: - # Execute setup script if provided - if data_config.eval_setup_script: - try: - script_runner = ScriptRunner(kubeconfig=self.kubeconfig) - success = script_runner.run_script(data_config.eval_setup_script) - if not success: - raise ScriptExecutionError( - "Setup script returned non-zero exit code" - ) - logger.debug( - "Setup script executed successfully for %s", data_config.eval_id - ) - except ScriptExecutionError as e: - logger.error( - "Setup script failed for %s: %s", data_config.eval_id, e - ) - return EvaluationResult( - eval_id=data_config.eval_id, - 
query=data_config.eval_query, - response="", - eval_type=data_config.eval_type, - result="ERROR", - error=f"Setup script failed: {e}", - ) - - response = self.agent_client.query_agent( - data_config.eval_query, agent_provider, agent_model - ) + # Query the agent + api_input = { + "query": data_config.eval_query, + "provider": agent_provider, + "model": agent_model, + "conversation_id": conversation_id, + } + + response, conversation_id = self.agent_client.query_agent(api_input) - # Evaluate response based on type - success = self._evaluate_response(data_config, response) - - # Execute cleanup script if provided - if data_config.eval_cleanup_script: - try: - cleanup_runner = ScriptRunner(kubeconfig=self.kubeconfig) - cleanup_success = cleanup_runner.run_script( - data_config.eval_cleanup_script - ) - if cleanup_success: - logger.debug( - "Cleanup script executed successfully for %s", - data_config.eval_id, - ) - else: - logger.warning( - "Cleanup script failed for %s", data_config.eval_id - ) - except ScriptExecutionError as e: - logger.warning( - "Cleanup script failed for %s: %s", data_config.eval_id, e - ) - - return EvaluationResult( - eval_id=data_config.eval_id, - query=data_config.eval_query, - response=response, - eval_type=data_config.eval_type, - result="PASS" if success else "FAIL", + # Evaluate agent action based on eval type + success = self._evaluate_agent_action(data_config, response) + + return create_success_result( + data_config, response, success, conversation_id ) except (AgentAPIError, ScriptExecutionError, JudgeModelError) as e: logger.error("Evaluation failed for %s: %s", data_config.eval_id, e) - return EvaluationResult( - eval_id=data_config.eval_id, - query=data_config.eval_query, - response="", - eval_type=data_config.eval_type, - result="ERROR", - error=str(e), - ) + return create_error_result(data_config, str(e), conversation_id) - def _evaluate_response( - self, data_config: EvaluationDataConfig, response: str + def _evaluate_agent_action( + self, data_config: "EvaluationDataConfig", response: str ) -> bool: - """Evaluate response based on configuration type.""" + """Evaluate agent action based on configuration type.""" match data_config.eval_type: case "script": return self._evaluate_script(data_config) @@ -119,27 +75,27 @@ def _evaluate_response( logger.error("Unknown evaluation type: %s", data_config.eval_type) return False - def _evaluate_script(self, data_config: EvaluationDataConfig) -> bool: + def _evaluate_script(self, data_config: "EvaluationDataConfig") -> bool: """Evaluate using script execution.""" if not data_config.eval_verify_script: logger.error("No verify script provided for script evaluation") return False - script_runner = ScriptRunner(kubeconfig=self.kubeconfig) - return script_runner.run_script(data_config.eval_verify_script) + return self.script_runner.run_script(data_config.eval_verify_script) def _evaluate_substring( - self, data_config: EvaluationDataConfig, response: str + self, data_config: "EvaluationDataConfig", response: str ) -> bool: """Evaluate using substring matching.""" if not data_config.expected_keywords: return False response_lower = response.lower() - return any( - keyword.lower() in response_lower - for keyword in data_config.expected_keywords - ) + # All keywords must be present for evaluation to pass + for keyword in data_config.expected_keywords: + if keyword.lower() not in response_lower: + return False + return True def _extract_numeric_result(self, response: Optional[str]) -> int: """Extract numeric result from 
judge response.""" @@ -156,7 +112,7 @@ def _extract_numeric_result(self, response: Optional[str]) -> int: return int(response) def _evaluate_judge_llm( - self, data_config: EvaluationDataConfig, response: str + self, data_config: "EvaluationDataConfig", response: str ) -> bool: """Evaluate using judge LLM.""" if not self.judge_manager: @@ -179,6 +135,6 @@ def _evaluate_judge_llm( result = self._extract_numeric_result(judge_resp) return result == 1 - def get_judge_manager(self) -> Optional[JudgeModelManager]: + def get_judge_manager(self) -> Optional["JudgeModelManager"]: """Get the judge model manager.""" return self.judge_manager diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/models.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/models.py index 65c1140b..568888cd 100644 --- a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/models.py +++ b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/models.py @@ -1,30 +1,292 @@ """Data models for agent evaluation.""" -from dataclasses import dataclass -from typing import Optional +from pathlib import Path +from typing import Any, Callable, Optional, Union +from pydantic import BaseModel, Field, ValidationInfo, field_validator, model_validator -@dataclass -class EvaluationResult: - """Evaluation result data structure.""" +VALID_EVAL_TYPES = ["judge-llm", "script", "sub-string"] +VALID_EVAL_RESULTS = ["PASS", "FAIL", "ERROR"] - eval_id: str - query: str - response: str - eval_type: str - result: str - error: Optional[str] = None +def _validate_eval_type(eval_type: str) -> str: + """Validate evaluation type.""" + if eval_type not in VALID_EVAL_TYPES: + raise ValueError( + f"eval_type must be one of {VALID_EVAL_TYPES}, got '{eval_type}'" + ) + return eval_type -@dataclass -class EvaluationDataConfig: # pylint: disable=too-many-instance-attributes + +def _validate_script_path( + script_file: Optional[Union[str, Path]], script_name: str +) -> Optional[Path]: + """Validate script path exists and convert to absolute Path.""" + if script_file is not None: + if isinstance(script_file, str): + script_file = script_file.strip() + if not script_file: + raise ValueError(f"{script_name} cannot be empty string") + script_file = Path(script_file) + + # Convert to absolute path + script_path = script_file.resolve() + + # Validate file exists + if not script_path.exists(): + raise ValueError(f"{script_name} file not found: {script_path}") + + if not script_path.is_file(): + raise ValueError(f"{script_name} is not a file: {script_path}") + + return script_path + + return None + + +def _calculate_stats_by_category( + results: list["EvaluationResult"], + key_extractor: Callable[["EvaluationResult"], str], +) -> dict[str, dict[str, Union[int, float]]]: + """Calculate statistics grouped by a category from each result.""" + category_stats: dict[str, dict[str, Union[int, float]]] = {} + + for result in results: + category = key_extractor(result) + if category not in category_stats: + category_stats[category] = {"passed": 0, "failed": 0, "errored": 0} + + if result.result == "PASS": + category_stats[category]["passed"] += 1 + elif result.result == "FAIL": + category_stats[category]["failed"] += 1 + elif result.result == "ERROR": + category_stats[category]["errored"] += 1 + + # Calculate success rates + for stats in category_stats.values(): + total = stats["passed"] + stats["failed"] + stats["errored"] + stats["total"] = total + stats["success_rate"] = ( + round((stats["passed"] / total) * 100, 2) if total > 0 else 0.0 + ) + + 
return category_stats + + +class EvaluationDataConfig(BaseModel): """Single evaluation data configuration.""" - eval_id: str - eval_query: str - eval_type: str = "judge-llm" - expected_response: Optional[str] = None - expected_keywords: Optional[list[str]] = None - eval_setup_script: Optional[str] = None - eval_verify_script: Optional[str] = None - eval_cleanup_script: Optional[str] = None + eval_id: str = Field(..., min_length=1, description="Unique evaluation identifier") + eval_query: str = Field(..., min_length=1, description="Query to send to the agent") + eval_type: str = Field( + ..., description="Type of evaluation (judge-llm, sub-string, script)" + ) + expected_response: Optional[str] = Field( + None, min_length=1, description="Expected response for judge-llm" + ) + expected_keywords: Optional[list[str]] = Field( + None, min_length=1, description="List of expected keywords for sub-string" + ) + eval_verify_script: Optional[Path] = Field( + None, description="Script path for script evaluation" + ) + conversation_group: Optional[str] = Field(None, min_length=1) + description: Optional[str] = Field( + None, min_length=1, max_length=500, description="Description of this evaluation" + ) + + @field_validator("eval_type") + @classmethod + def validate_eval_type(cls, v: str) -> str: + """Validate evaluation type.""" + return _validate_eval_type(v) + + @field_validator("expected_keywords") + @classmethod + def validate_keywords(cls, v: Optional[list[str]]) -> Optional[list[str]]: + """Ensure keywords is a list and validate content.""" + if v is not None: + if not isinstance(v, list): + v = [v] + # Remove empty strings and validate + v = [keyword.strip() for keyword in v if keyword and keyword.strip()] + if not v: + raise ValueError("expected_keywords cannot be empty after filtering") + return v + + @field_validator("eval_verify_script") + @classmethod + def validate_script_path(cls, v: Optional[Union[str, Path]]) -> Optional[Path]: + """Validate verify script path exists and convert to absolute Path.""" + return _validate_script_path(v, "eval_verify_script") + + @model_validator(mode="after") + def validate_eval_requirements(self) -> "EvaluationDataConfig": + """Validate eval type specific requirements.""" + if self.eval_type == "judge-llm": + if not self.expected_response: + raise ValueError( + "eval_type 'judge-llm' requires non-empty 'expected_response'" + ) + + elif self.eval_type == "sub-string": + if not self.expected_keywords or len(self.expected_keywords) == 0: + raise ValueError( + "eval_type 'sub-string' requires non-empty 'expected_keywords'" + ) + + elif self.eval_type == "script": + if not self.eval_verify_script: + raise ValueError( + "eval_type 'script' requires non-empty 'eval_verify_script'" + ) + + return self + + +class ConversationDataConfig(BaseModel): + """Configuration for a conversation group.""" + + conversation_group: str = Field( + ..., min_length=1, max_length=100, description="Conversation group identifier" + ) + conversation: list[EvaluationDataConfig] = Field( + ..., min_length=1, description="List of evaluations in this conversation group" + ) + description: Optional[str] = Field( + None, + min_length=1, + max_length=500, + description="Description of this conversation group", + ) + setup_script: Optional[Path] = Field( + None, description="Setup script path for conversation group" + ) + cleanup_script: Optional[Path] = Field( + None, description="Cleanup script path for conversation group" + ) + + @field_validator("conversation_group") + @classmethod + def 
validate_conversation_group(cls, v: str) -> str: + """Validate conversation group name.""" + v = v.strip() + if not v: + raise ValueError("conversation_group cannot be empty") + + return v + + @field_validator("setup_script", "cleanup_script") + @classmethod + def validate_script_path( + cls, v: Optional[Union[str, Path]], info: ValidationInfo + ) -> Optional[Path]: + """Validate script path exists and convert to absolute Path.""" + if info.field_name is None: + raise ValueError("Set a script name for field validator") + + return _validate_script_path(v, info.field_name) + + @model_validator(mode="after") + def validate_conversation_data(self) -> "ConversationDataConfig": + """Validate conversation data consistency.""" + if not self.conversation: + raise ValueError( + f"Conversation '{self.conversation_group}' must have at least one evaluation" + ) + + # Set conversation group for all evaluations + for eval_config in self.conversation: + eval_config.conversation_group = self.conversation_group + + # Check for duplicate eval_ids within conversation + eval_ids = [eval_config.eval_id for eval_config in self.conversation] + duplicates = [eval_id for eval_id in eval_ids if eval_ids.count(eval_id) > 1] + if duplicates: + raise ValueError( + f"Duplicate eval_id(s) in conversation '{self.conversation_group}': {duplicates}" + ) + + return self + + +class EvaluationResult(BaseModel): + """Result of a single evaluation.""" + + eval_id: str = Field(..., min_length=1, description="Evaluation identifier") + query: str = Field(..., min_length=1, description="Query sent to agent") + response: str = Field(..., description="Agent response") + eval_type: str = Field(..., description="Type of evaluation performed") + result: str = Field(..., description="Evaluation result") + conversation_group: Optional[str] = Field(None, description="Conversation group") + conversation_id: Optional[str] = Field(None, description="Conversation ID") + error: Optional[str] = Field(None, description="Error message if any") + + @field_validator("result") + @classmethod + def validate_result(cls, v: str) -> str: + """Validate result is one of the allowed values.""" + if v not in VALID_EVAL_RESULTS: + raise ValueError(f"Result must be one of {VALID_EVAL_RESULTS}, got '{v}'") + return v + + @field_validator("eval_type") + @classmethod + def validate_eval_type(cls, v: str) -> str: + """Validate evaluation type.""" + return _validate_eval_type(v) + + +class EvaluationStats(BaseModel): + """Statistics for evaluation runs.""" + + total_evaluations: int = Field(..., ge=0, description="Total number of evaluations") + total_conversations: int = Field( + ..., ge=0, description="Total number of conversations" + ) + passed: int = Field(..., ge=0, description="Number of passed evaluations") + failed: int = Field(..., ge=0, description="Number of failed evaluations") + errored: int = Field(..., ge=0, description="Number of errored evaluations") + success_rate: float = Field( + ..., ge=0.0, le=100.0, description="Success rate percentage" + ) + by_conversation: dict[str, dict[str, Any]] = Field( + default_factory=dict, description="Statistics by conversation" + ) + by_eval_type: dict[str, dict[str, Any]] = Field( + default_factory=dict, description="Statistics by evaluation type" + ) + + @classmethod + def from_results(cls, results: list[EvaluationResult]) -> "EvaluationStats": + """Create comprehensive statistics from evaluation results.""" + total = len(results) + passed = sum(1 for r in results if r.result == "PASS") + failed = sum(1 for r in 
results if r.result == "FAIL") + errored = sum(1 for r in results if r.result == "ERROR") + success_rate = (passed / total * 100) if total > 0 else 0.0 + + # Count unique conversations + conversations: set[str] = set( + r.conversation_group for r in results if r.conversation_group + ) + + # Calculate statistics by conversation + by_conversation = _calculate_stats_by_category( + results, lambda r: r.conversation_group or "unknown" + ) + + # Calculate statistics by eval_type + by_eval_type = _calculate_stats_by_category(results, lambda r: r.eval_type) + + return cls( + total_evaluations=total, + total_conversations=len(conversations), + passed=passed, + failed=failed, + errored=errored, + success_rate=success_rate, + by_conversation=by_conversation, + by_eval_type=by_eval_type, + ) diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/results.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/results.py index 010cbcd0..8c3c3e28 100644 --- a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/results.py +++ b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/results.py @@ -1,13 +1,14 @@ """Results management for agent evaluation.""" +import json import logging from datetime import datetime from pathlib import Path -from typing import Optional import pandas as pd -from .models import EvaluationResult +from ..utils.exceptions import AgentEvaluationError +from .models import EvaluationResult, EvaluationStats logger = logging.getLogger(__name__) @@ -15,48 +16,76 @@ class ResultsManager: """Manages evaluation results and output.""" - def __init__(self, result_dir: str): + def __init__(self, results: list[EvaluationResult]): """Initialize results manager.""" - self.result_dir = result_dir - self.result_path = Path(result_dir) - - def save_results( - self, - results: list[EvaluationResult], - filename: Optional[str] = None, - ) -> None: - """Save evaluation results to CSV file.""" - # Create directory if it doesn't exist - self.result_path.mkdir(parents=True, exist_ok=True) - - # Generate filename with timestamp if not provided - if filename is None: + self.results = results + + self.results_stats = EvaluationStats.from_results(results) + + def save_results(self, result_dir: str) -> None: + """Save evaluation results/statistics to CSV and JSON files.""" + if not self.results: + logger.warning("No result to save") + return + + try: + output_dir = Path(result_dir) + output_dir.mkdir(parents=True, exist_ok=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - filename = f"agent_goal_eval_results_{timestamp}.csv" + csv_file = output_dir / f"agent_goal_eval_results_{timestamp}.csv" + json_file = output_dir / f"agent_goal_eval_summary_{timestamp}.json" - # Create full file path - file_path = self.result_path / filename + # Save CSV results + self._save_csv_results(csv_file) + # Save summary JSON + self._save_json_summary(json_file) - # Convert results to DataFrame + except Exception as e: + logger.error("Failed to save results: %s", e) + raise AgentEvaluationError(f"Failed to save results: {e}") from e + + def _save_csv_results(self, file_path: Path) -> None: + """Save results to CSV file.""" data = [] - for result in results: + for result in self.results: data.append( { + "conversation_group": result.conversation_group, + "conversation_id": result.conversation_id, "eval_id": result.eval_id, "query": result.query, "response": result.response, "eval_type": result.eval_type, "result": result.result, - "error": result.error or "", + "error": result.error, } ) df = 
pd.DataFrame(data) - # Save to CSV using pandas df.to_csv(file_path, index=False, encoding="utf-8") logger.info("Results saved to %s", file_path) - def get_output_dir(self) -> str: - """Get the output directory path.""" - return str(self.result_path) + def _save_json_summary(self, file_path: Path) -> None: + """Save eval summary to JSON file.""" + statistics = { + "summary": { + "total_evaluations": self.results_stats.total_evaluations, + "total_conversations": self.results_stats.total_conversations, + "passed": self.results_stats.passed, + "failed": self.results_stats.failed, + "errored": self.results_stats.errored, + "success_rate": round(self.results_stats.success_rate, 2), + }, + "by_conversation": self.results_stats.by_conversation, + "by_eval_type": self.results_stats.by_eval_type, + } + + with open(file_path, "w", encoding="utf-8") as f: + json.dump(statistics, f, indent=2, ensure_ascii=False) + logger.info("Summary saved to %s", file_path) + + def get_results_stats(self) -> EvaluationStats: + """Get result stats/summary.""" + return self.results_stats diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/script_runner.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/script_runner.py index f4ea0ad8..b3ba1208 100644 --- a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/script_runner.py +++ b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/script_runner.py @@ -1,10 +1,10 @@ -"""Script execution for evaluation.""" +"""Script execution module for evaluation.""" import logging import os import subprocess from pathlib import Path -from typing import Optional +from typing import Optional, Union from ..utils.exceptions import ScriptExecutionError @@ -25,18 +25,16 @@ def get_environment(self) -> dict: env["KUBECONFIG"] = self.kubeconfig return env - def run_script(self, script_path: str, input_text: Optional[str] = None) -> bool: - """ - Execute a script and return success status. + def run_script(self, script_path: Union[str, Path]) -> bool: + """Execute a script and return success status.""" + if isinstance(script_path, str): + script_path = Path(script_path) + script_path = script_path.resolve() - Path normalization: Relative paths are converted to absolute path. 
- """ - script_file = Path(script_path).resolve() - - if not script_file.exists(): + if not script_path.exists(): raise ScriptExecutionError(f"Script not found: {script_path}") - if not script_file.is_file(): + if not script_path.is_file(): raise ScriptExecutionError(f"Script path is not a file: {script_path}") try: @@ -44,17 +42,13 @@ def run_script(self, script_path: str, input_text: Optional[str] = None) -> bool env = self.get_environment() # Make script executable - script_file.chmod(0o755) - - # Prepare command - cmd = ["bash", str(script_file)] + script_path.chmod(0o755) # Run script - logger.debug("Running script: %s", script_file) + logger.debug("Running script: %s", script_path) result = subprocess.run( - cmd, - input=input_text, + [str(script_path)], text=True, capture_output=True, env=env, diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/utils.py b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/utils.py new file mode 100644 index 00000000..d59e1716 --- /dev/null +++ b/lsc_agent_eval/src/lsc_agent_eval/core/agent_goal_eval/utils.py @@ -0,0 +1,42 @@ +"""Utility functions for evaluation processing.""" + +from typing import Optional + +from .models import EvaluationDataConfig, EvaluationResult + + +def create_error_result( + eval_config: EvaluationDataConfig, + error_message: str, + conversation_id: Optional[str] = None, +) -> EvaluationResult: + """Create a standardized error result.""" + return EvaluationResult( + eval_id=eval_config.eval_id, + query=eval_config.eval_query, + response="", + eval_type=eval_config.eval_type, + result="ERROR", + conversation_group=eval_config.conversation_group, + conversation_id=conversation_id, + error=error_message, + ) + + +def create_success_result( + eval_config: EvaluationDataConfig, + response: str, + success: bool, + conversation_id: Optional[str] = None, +) -> EvaluationResult: + """Create a standardized success/fail result.""" + return EvaluationResult( + eval_id=eval_config.eval_id, + query=eval_config.eval_query, + response=response, + eval_type=eval_config.eval_type, + result="PASS" if success else "FAIL", + conversation_group=eval_config.conversation_group, + conversation_id=conversation_id, + error=None, + ) diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/utils/api_client.py b/lsc_agent_eval/src/lsc_agent_eval/core/utils/api_client.py index d5d6bb55..53645bf4 100644 --- a/lsc_agent_eval/src/lsc_agent_eval/core/utils/api_client.py +++ b/lsc_agent_eval/src/lsc_agent_eval/core/utils/api_client.py @@ -47,18 +47,13 @@ def _read_token_file(self, token_file: str) -> str: raise AgentAPIError(f"Error reading token file: {e}") from e def query_agent( - self, query: str, provider: str, model: str, timeout: int = 300 - ) -> str: + self, api_input: dict[str, str], timeout: int = 300 + ) -> tuple[str, str]: """Query the agent and return response.""" if not self.client: raise AgentAPIError("HTTP client not initialized") try: - api_input = { - "query": query, - "provider": provider, - "model": model, - } response = self.client.post( "/v1/query", json=api_input, @@ -69,8 +64,11 @@ def query_agent( response_data = response.json() if "response" not in response_data: raise AgentAPIError("Agent response missing 'response' field") + agent_response = response_data["response"].strip() - return response_data["response"].strip() + conversation_id = response_data.get("conversation_id", "").strip() + + return agent_response, conversation_id except httpx.TimeoutException as e: raise AgentAPIError(f"Agent query timeout after 
{timeout} seconds") from e diff --git a/lsc_agent_eval/src/lsc_agent_eval/core/utils/exceptions.py b/lsc_agent_eval/src/lsc_agent_eval/core/utils/exceptions.py index 2ba54ff0..869b5e7e 100644 --- a/lsc_agent_eval/src/lsc_agent_eval/core/utils/exceptions.py +++ b/lsc_agent_eval/src/lsc_agent_eval/core/utils/exceptions.py @@ -5,8 +5,8 @@ class AgentEvaluationError(Exception): """Base exception for agent evaluation errors.""" -class ConfigurationError(AgentEvaluationError): - """Configuration-related errors.""" +class EvaluationDataError(AgentEvaluationError): + """Evaluation data loading, parsing, and validation errors.""" class AgentAPIError(AgentEvaluationError): diff --git a/lsc_agent_eval/tests/core/agent_goal_eval/test_agent_goal_eval.py b/lsc_agent_eval/tests/core/agent_goal_eval/test_agent_goal_eval.py index 87bf578e..d0b57f8b 100644 --- a/lsc_agent_eval/tests/core/agent_goal_eval/test_agent_goal_eval.py +++ b/lsc_agent_eval/tests/core/agent_goal_eval/test_agent_goal_eval.py @@ -1,14 +1,17 @@ """Tests for agent goal evaluation orchestrator.""" -from unittest.mock import Mock, patch +from unittest.mock import MagicMock, Mock, patch import pytest from lsc_agent_eval.core.agent_goal_eval.agent_goal_eval import AgentGoalEval from lsc_agent_eval.core.agent_goal_eval.models import ( + ConversationDataConfig, EvaluationDataConfig, EvaluationResult, + EvaluationStats, ) +from lsc_agent_eval.core.utils.exceptions import AgentEvaluationError class TestAgentGoalEval: @@ -30,22 +33,25 @@ def mock_args(self): return args @pytest.fixture - def sample_configs(self): - """Sample evaluation configurations.""" - return [ - EvaluationDataConfig( - eval_id="test_001", - eval_query="What is Kubernetes?", - eval_type="judge-llm", - expected_response="Kubernetes is a container orchestration platform", - ), - EvaluationDataConfig( - eval_id="test_002", - eval_query="Deploy nginx", - eval_type="script", - eval_verify_script="./verify.sh", - ), - ] + def sample_conversation(self): + """Sample conversation data configuration.""" + return ConversationDataConfig( + conversation_group="test_conv", + conversation=[ + EvaluationDataConfig( + eval_id="test_001", + eval_query="What is Openshift?", + eval_type="judge-llm", + expected_response="OpenShift is Red Hat's enterprise Kubernetes platform.", + ), + EvaluationDataConfig( + eval_id="test_002", + eval_query="Deploy nginx", + eval_type="sub-string", + expected_keywords=["nginx", "deployment"], + ), + ], + ) @pytest.fixture def sample_results(self): @@ -57,13 +63,17 @@ def sample_results(self): response="Kubernetes is a container orchestration platform", eval_type="judge-llm", result="PASS", + conversation_group="test_conv", + conversation_id="conv-id-123", ), EvaluationResult( eval_id="test_002", query="Deploy nginx", - response="kubectl create deployment nginx --image=nginx", - eval_type="script", + response="oc create deployment nginx --image=nginx", + eval_type="sub-string", result="PASS", + conversation_group="test_conv", + conversation_id="conv-id-123", ), ] @@ -73,10 +83,10 @@ def sample_results(self): @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentHttpClient") @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.JudgeModelManager") @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner") - @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager") + @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ScriptRunner") def test_init_with_judge_manager( self, - mock_results_manager, + 
mock_script_runner, mock_evaluation_runner, mock_judge_manager, mock_agent_client, @@ -90,22 +100,22 @@ def test_init_with_judge_manager( mock_config_manager.assert_called_once_with("test_data.yaml") mock_agent_client.assert_called_once_with("http://localhost:8080", None) mock_judge_manager.assert_called_once_with("openai", "gpt-4") + mock_script_runner.assert_called_once_with(None) mock_evaluation_runner.assert_called_once_with( mock_agent_client.return_value, + mock_script_runner.return_value, mock_judge_manager.return_value, - kubeconfig=None, ) - mock_results_manager.assert_called_once_with("results/") @patch( "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentGoalEvalDataManager" ) @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentHttpClient") @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner") - @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager") + @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ScriptRunner") def test_init_without_judge_manager( self, - mock_results_manager, + mock_script_runner, mock_evaluation_runner, mock_agent_client, mock_config_manager, @@ -121,8 +131,8 @@ def test_init_without_judge_manager( assert evaluator.judge_manager is None mock_evaluation_runner.assert_called_once_with( mock_agent_client.return_value, + mock_script_runner.return_value, None, - kubeconfig=None, ) @patch( @@ -131,10 +141,10 @@ def test_init_without_judge_manager( @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentHttpClient") @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.JudgeModelManager") @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner") - @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager") + @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ScriptRunner") def test_init_with_kubeconfig( self, - mock_results_manager, + mock_script_runner, mock_evaluation_runner, mock_judge_manager, mock_agent_client, @@ -146,10 +156,11 @@ def test_init_with_kubeconfig( AgentGoalEval(mock_args) + mock_script_runner.assert_called_once_with("~/kubeconfig") mock_evaluation_runner.assert_called_once_with( mock_agent_client.return_value, + mock_script_runner.return_value, mock_judge_manager.return_value, - kubeconfig="~/kubeconfig", ) @patch( @@ -158,23 +169,34 @@ def test_init_with_kubeconfig( @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentHttpClient") @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.JudgeModelManager") @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner") + @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ScriptRunner") @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager") def test_run_evaluation_success( self, mock_results_manager, + mock_script_runner, mock_evaluation_runner, mock_judge_manager, mock_agent_client, mock_config_manager, mock_args, - sample_configs, + sample_conversation, sample_results, ): """Test successful evaluation execution.""" # Setup mocks - mock_config_manager.return_value.get_eval_data.return_value = sample_configs + mock_config_manager.return_value.get_conversations.return_value = [ + sample_conversation + ] + mock_config_manager.return_value.get_eval_count.return_value = 2 mock_evaluation_runner.return_value.run_evaluation.side_effect = sample_results + # Mock results manager + mock_results_mgr_instance = MagicMock() + mock_results_manager.return_value = mock_results_mgr_instance + mock_stats = 
EvaluationStats.from_results(sample_results)
+        mock_results_mgr_instance.get_results_stats.return_value = mock_stats
+
         evaluator = AgentGoalEval(mock_args)
 
         # Capture print output
@@ -185,121 +207,15 @@ def test_run_evaluation_success(
         assert mock_evaluation_runner.return_value.run_evaluation.call_count == 2
 
         # Verify results were saved
-        mock_results_manager.return_value.save_results.assert_called_once_with(
-            sample_results
+        mock_results_mgr_instance.save_results.assert_called_once_with(
+            mock_args.result_dir
         )
 
         # Verify summary was printed
         mock_print.assert_called()
 
-    @patch(
-        "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentGoalEvalDataManager"
-    )
-    @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentHttpClient")
-    @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.JudgeModelManager")
-    @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner")
-    @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager")
-    def test_run_evaluation_with_errors(
-        self,
-        mock_results_manager,
-        mock_evaluation_runner,
-        mock_judge_manager,
-        mock_agent_client,
-        mock_config_manager,
-        mock_args,
-        sample_configs,
-        capsys,
-    ):
-        """Test evaluation execution with errors."""
-        # Setup results with errors
-        results_with_errors = [
-            EvaluationResult(
-                eval_id="test_001",
-                query="What is Kubernetes?",
-                response="Kubernetes is a container orchestration platform",
-                eval_type="judge-llm",
-                result="PASS",
-            ),
-            EvaluationResult(
-                eval_id="test_002",
-                query="Deploy nginx",
-                response="",
-                eval_type="script",
-                result="ERROR",
-                error="Script execution failed",
-            ),
-        ]
-
-        mock_config_manager.return_value.get_eval_data.return_value = sample_configs
-        mock_evaluation_runner.return_value.run_evaluation.side_effect = (
-            results_with_errors
-        )
-
-        evaluator = AgentGoalEval(mock_args)
-
-        evaluator.run_evaluation()
-
-        # Capture stdout/stderr output
-        captured = capsys.readouterr()
-
-        # Verify error messages are printed to stdout
-        assert "✅ test_001: PASS" in captured.out
-        assert "⚠️ test_002: ERROR" in captured.out
-        assert "   Query: Deploy nginx" in captured.out
-        assert "   Evaluation type: script" in captured.out
-        assert "   Response: " in captured.out
-        assert "   Error message: Script execution failed" in captured.out
-
-        # Verify evaluations were run
-        assert mock_evaluation_runner.return_value.run_evaluation.call_count == 2
-
-        # Verify results were saved
-        mock_results_manager.return_value.save_results.assert_called_once_with(
-            results_with_errors
-        )
-
-    @patch(
-        "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentGoalEvalDataManager"
-    )
-    @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentHttpClient")
-    @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.JudgeModelManager")
-    @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner")
-    @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager")
-    def test_run_evaluation_exception(
-        self,
-        mock_results_manager,
-        mock_evaluation_runner,
-        mock_judge_manager,
-        mock_agent_client,
-        mock_config_manager,
-        mock_args,
-    ):
-        """Test evaluation execution with exception."""
-        mock_config_manager.return_value.get_eval_data.side_effect = Exception(
-            "Config error"
-        )
-
-        evaluator = AgentGoalEval(mock_args)
-
-        with patch(
-            "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.logger"
-        ) as mock_logger:
-            with pytest.raises(Exception, match="Config error"):
-                evaluator.run_evaluation()
-
-            # Verify error was 
logged - mock_logger.error.assert_called() - args, kwargs = mock_logger.error.call_args - assert args[0] == "Evaluation failed: %s" - assert str(args[1]) == "Config error" - - def test_print_summary_all_pass(self, mock_args): - """Test print summary with all passing results.""" - results = [ - EvaluationResult("test_001", "query1", "response1", "judge-llm", "PASS"), - EvaluationResult("test_002", "query2", "response2", "script", "PASS"), - ] - + def test_get_result_summary_success(self, mock_args): + """Test result summary with available results.""" with ( patch( "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentGoalEvalDataManager" @@ -313,32 +229,17 @@ def test_print_summary_all_pass(self, mock_args): patch( "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner" ), - patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager"), + patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ScriptRunner"), ): - evaluator = AgentGoalEval(mock_args) + evaluator.result_summary = {"TOTAL": 5, "PASS": 3, "FAIL": 1, "ERROR": 1} - with patch("builtins.print") as mock_print: - evaluator._print_summary(results) - - # Check that summary was printed - print_calls = [call[0][0] for call in mock_print.call_args_list] - summary_text = "\n".join(print_calls) - - assert "Total Evaluations: 2" in summary_text - assert "Passed: 2" in summary_text - assert "Failed: 0" in summary_text - assert "Errored: 0" in summary_text - assert "Success Rate: 100.0%" in summary_text - - def test_print_summary_mixed_results(self, mock_args): - """Test print summary with mixed results.""" - results = [ - EvaluationResult("test_001", "query1", "response1", "judge-llm", "PASS"), - EvaluationResult("test_002", "query2", "response2", "script", "FAIL"), - EvaluationResult("test_003", "query3", "response3", "script", "ERROR"), - ] + result = evaluator.get_result_summary() + assert result == {"TOTAL": 5, "PASS": 3, "FAIL": 1, "ERROR": 1} + + def test_get_result_summary_no_results(self, mock_args): + """Test result summary with no available results.""" with ( patch( "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentGoalEvalDataManager" @@ -352,23 +253,12 @@ def test_print_summary_mixed_results(self, mock_args): patch( "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner" ), - patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager"), + patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ScriptRunner"), ): - evaluator = AgentGoalEval(mock_args) - with patch("builtins.print") as mock_print: - evaluator._print_summary(results) - - # Check that summary was printed - print_calls = [call[0][0] for call in mock_print.call_args_list] - summary_text = "\n".join(print_calls) - - assert "Total Evaluations: 3" in summary_text - assert "Passed: 1" in summary_text - assert "Failed: 1" in summary_text - assert "Errored: 1" in summary_text - assert "Success Rate: 33.3%" in summary_text + with pytest.raises(AgentEvaluationError, match="No results available"): + evaluator.get_result_summary() def test_cleanup_with_client(self, mock_args): """Test cleanup method with client.""" @@ -385,7 +275,7 @@ def test_cleanup_with_client(self, mock_args): patch( "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner" ), - patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager"), + patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ScriptRunner"), ): mock_client = Mock() @@ -412,7 +302,7 @@ def test_cleanup_exception(self, mock_args): 
patch( "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner" ), - patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager"), + patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ScriptRunner"), ): mock_client = Mock() @@ -431,63 +321,3 @@ def test_cleanup_exception(self, mock_args): args, kwargs = mock_logger.warning.call_args assert args[0] == "Error during cleanup: %s" assert str(args[1]) == "Cleanup error" - - @patch( - "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentGoalEvalDataManager" - ) - @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentHttpClient") - @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.JudgeModelManager") - @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner") - @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager") - def test_run_evaluation_cleanup_called( - self, - mock_results_manager, - mock_evaluation_runner, - mock_judge_manager, - mock_agent_client, - mock_config_manager, - mock_args, - sample_configs, - sample_results, - ): - """Test that cleanup is called even on success.""" - mock_config_manager.return_value.get_eval_data.return_value = sample_configs - mock_evaluation_runner.return_value.run_evaluation.side_effect = sample_results - - evaluator = AgentGoalEval(mock_args) - - with patch.object(evaluator, "_cleanup") as mock_cleanup: - evaluator.run_evaluation() - - # Verify cleanup was called - mock_cleanup.assert_called_once() - - @patch( - "lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentGoalEvalDataManager" - ) - @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.AgentHttpClient") - @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.JudgeModelManager") - @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.EvaluationRunner") - @patch("lsc_agent_eval.core.agent_goal_eval.agent_goal_eval.ResultsManager") - def test_run_evaluation_cleanup_called_on_exception( - self, - mock_results_manager, - mock_evaluation_runner, - mock_judge_manager, - mock_agent_client, - mock_config_manager, - mock_args, - ): - """Test that cleanup is called even on exception.""" - mock_config_manager.return_value.get_eval_data.side_effect = Exception( - "Config error" - ) - - evaluator = AgentGoalEval(mock_args) - - with patch.object(evaluator, "_cleanup") as mock_cleanup: - with pytest.raises(Exception): - evaluator.run_evaluation() - - # Verify cleanup was called - mock_cleanup.assert_called_once() diff --git a/lsc_agent_eval/tests/core/agent_goal_eval/test_eval_data.py b/lsc_agent_eval/tests/core/agent_goal_eval/test_eval_data.py index 71f6c291..865e60d1 100644 --- a/lsc_agent_eval/tests/core/agent_goal_eval/test_eval_data.py +++ b/lsc_agent_eval/tests/core/agent_goal_eval/test_eval_data.py @@ -8,112 +8,128 @@ import yaml from lsc_agent_eval.core.agent_goal_eval.eval_data import AgentGoalEvalDataManager -from lsc_agent_eval.core.agent_goal_eval.models import EvaluationDataConfig -from lsc_agent_eval.core.utils.exceptions import ConfigurationError +from lsc_agent_eval.core.agent_goal_eval.models import ( + ConversationDataConfig, + EvaluationDataConfig, +) +from lsc_agent_eval.core.utils.exceptions import EvaluationDataError class TestAgentGoalEvalDataManager: """Test AgentGoalEvalDataManager.""" @pytest.fixture - def valid_eval_data(self): - """Valid evaluation data for testing.""" - return [ - { - "eval_id": "test_001", - "eval_query": "What is Kubernetes?", - "eval_type": "judge-llm", - "expected_response": "Kubernetes is 
a container orchestration platform", - }, - { - "eval_id": "test_002", - "eval_query": "Deploy nginx", - "eval_type": "script", - "eval_verify_script": "./scripts/verify_nginx.sh", - }, - { - "eval_id": "test_003", - "eval_query": "Show pods", - "eval_type": "sub-string", - "expected_keywords": ["pod", "running"], - }, - ] + def valid_conversation_yaml_content(self): + """Valid YAML content with conversation-based structure.""" + return """ +- conversation_group: conv1 + description: Test namespace detection using substring matching + conversation: + - eval_id: eval1 + eval_query: is there a openshift-monitoring namespace ? + eval_type: sub-string + expected_keywords: + - 'yes' + - openshift-monitoring + description: Check for openshift-monitoring namespace existence + +- conversation_group: conv2 + description: Test namespace detection using LLM judge + conversation: + - eval_id: eval1 + eval_query: is there a openshift-lightspeed namespace ? + eval_type: judge-llm + expected_response: there is a openshift-lightspeed namespace. + description: Verify openshift-lightspeed namespace with LLM evaluation +""" @pytest.fixture - def valid_yaml_content(self, valid_eval_data): - """Valid YAML content as string.""" - return yaml.dump(valid_eval_data) - - def test_init_success(self, valid_yaml_content): + def multiturn_conversation_yaml_content(self): + """Valid YAML content with multi-turn conversation.""" + return """ +- conversation_group: conv1 + description: Basic conversation flow testing cluster operations + conversation: + - eval_id: eval1 + eval_query: Hi! + eval_type: judge-llm + expected_response: Hello! I'm an AI assistant for the Assisted Installer. + description: Initial greeting to start conversation + - eval_id: eval2 + eval_query: Get me active clusters + eval_type: judge-llm + expected_response: Active clusters are x1, x2. + description: Request for cluster information + - eval_id: eval3 + eval_query: Thank you + eval_type: judge-llm + expected_response: You're welcome! 
+ description: Closing statement +""" + + def test_init_success(self, valid_conversation_yaml_content): """Test successful initialization.""" with ( - patch("builtins.open", mock_open(read_data=valid_yaml_content)), + patch( + "builtins.open", mock_open(read_data=valid_conversation_yaml_content) + ), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), ): manager = AgentGoalEvalDataManager("test.yaml") - assert len(manager.eval_data) == 3 - assert manager.eval_data_file == Path("test.yaml") - assert isinstance(manager.eval_data[0], EvaluationDataConfig) + assert manager.eval_data_file == "test.yaml" + assert len(manager.conversations) == 2 + assert len(manager.get_conversations()) == 2 + assert manager.get_eval_count() == 2 def test_init_file_not_found(self): """Test initialization with non-existent file.""" - with patch("pathlib.Path.exists", return_value=False): - with pytest.raises(ConfigurationError, match="Eval data file not found"): - AgentGoalEvalDataManager("nonexistent.yaml") + with pytest.raises(EvaluationDataError, match="Eval data file not found"): + AgentGoalEvalDataManager("nonexistent.yaml") - def test_init_path_not_file(self): - """Test initialization when path is not a file.""" - with ( - patch("pathlib.Path.exists", return_value=True), - patch("pathlib.Path.is_file", return_value=False), - ): - - with pytest.raises(ConfigurationError, match="path is not a file"): - AgentGoalEvalDataManager("directory/") + def test_validate_eval_data_file_not_yaml(self): + """Test loading invalid YAML file.""" + invalid_yaml = "invalid: yaml: content: [" - def test_validate_eval_data_file_exists(self): - """Test file validation when file exists.""" with ( + patch("builtins.open", mock_open(read_data=invalid_yaml)), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), - patch("builtins.open", mock_open(read_data="[]")), ): - # Should not raise exception - manager = AgentGoalEvalDataManager("test.yaml") - assert manager.eval_data_file == Path("test.yaml") - - def test_load_eval_data_invalid_yaml(self): - """Test loading invalid YAML content.""" - invalid_yaml = "invalid: yaml: content: [" + with pytest.raises(EvaluationDataError, match="Invalid YAML"): + AgentGoalEvalDataManager("test.yaml") + def test_load_eval_data_file_read_error(self): + """Test loading when file read fails.""" with ( - patch("builtins.open", mock_open(read_data=invalid_yaml)), + patch("builtins.open", side_effect=IOError("Read error")), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), ): - with pytest.raises(ConfigurationError, match="Invalid YAML"): + with pytest.raises( + EvaluationDataError, match="Error loading eval data file" + ): AgentGoalEvalDataManager("test.yaml") def test_load_eval_data_not_list(self): """Test loading YAML that is not a list.""" - yaml_dict = yaml.dump({"key": "value"}) + non_list_yaml = yaml.dump({"not": "a list"}) with ( - patch("builtins.open", mock_open(read_data=yaml_dict)), + patch("builtins.open", mock_open(read_data=non_list_yaml)), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), ): - with pytest.raises(ConfigurationError, match="must contain a list"): + with pytest.raises(EvaluationDataError, match="must contain a list"): AgentGoalEvalDataManager("test.yaml") def test_load_eval_data_empty_list(self): - """Test loading empty evaluation list.""" + """Test loading YAML file with empty list.""" 
empty_yaml = yaml.dump([]) with ( @@ -122,25 +138,25 @@ def test_load_eval_data_empty_list(self): patch("pathlib.Path.is_file", return_value=True), ): - manager = AgentGoalEvalDataManager("test.yaml") - assert len(manager.eval_data) == 0 - - def test_load_eval_data_file_read_error(self): - """Test loading when file read fails.""" - with ( - patch("builtins.open", side_effect=IOError("Read error")), - patch("pathlib.Path.exists", return_value=True), - patch("pathlib.Path.is_file", return_value=True), - ): - with pytest.raises( - ConfigurationError, match="Error loading eval data file" + EvaluationDataError, match="must contain at least one conversation" ): AgentGoalEvalDataManager("test.yaml") - def test_validate_eval_data_missing_eval_id(self): - """Test validation with missing eval_id.""" - invalid_data = [{"eval_query": "test query"}] + def test_validate_conversation_missing_group(self): + """Test validation with missing conversation_group.""" + invalid_data = [ + { + "conversation": [ + { + "eval_id": "test1", + "eval_query": "test query", + "eval_type": "judge-llm", + "expected_response": "test response", + } + ] + } + ] yaml_content = yaml.dump(invalid_data) with ( @@ -149,14 +165,16 @@ def test_validate_eval_data_missing_eval_id(self): patch("pathlib.Path.is_file", return_value=True), ): - with pytest.raises( - ConfigurationError, match="Missing required field 'eval_id'" - ): + with pytest.raises(EvaluationDataError, match=".*Field required.*"): AgentGoalEvalDataManager("test.yaml") - def test_validate_eval_data_missing_eval_query(self): - """Test validation with missing eval_query.""" - invalid_data = [{"eval_id": "test_001"}] + def test_validate_conversation_missing_conversation_list(self): + """Test validation with missing conversation list.""" + invalid_data = [ + { + "conversation_group": "test_conv", + } + ] yaml_content = yaml.dump(invalid_data) with ( @@ -165,18 +183,21 @@ def test_validate_eval_data_missing_eval_query(self): patch("pathlib.Path.is_file", return_value=True), ): - with pytest.raises( - ConfigurationError, match="Missing required field 'eval_query'" - ): + with pytest.raises(EvaluationDataError, match=".*Field required.*"): AgentGoalEvalDataManager("test.yaml") - def test_validate_eval_data_invalid_eval_type(self): - """Test validation with invalid eval_type.""" + def test_validate_eval_missing_eval_id(self): + """Test validation with missing eval_id.""" invalid_data = [ { - "eval_id": "test_001", - "eval_query": "test query", - "eval_type": "invalid_type", + "conversation_group": "test_conv", + "conversation": [ + { + "eval_query": "test query", + "eval_type": "judge-llm", + "expected_response": "test response", + } + ], } ] yaml_content = yaml.dump(invalid_data) @@ -187,18 +208,21 @@ def test_validate_eval_data_invalid_eval_type(self): patch("pathlib.Path.is_file", return_value=True), ): - with pytest.raises( - ConfigurationError, match="Invalid eval_type: invalid_type" - ): + with pytest.raises(EvaluationDataError, match=".*Field required.*"): AgentGoalEvalDataManager("test.yaml") - def test_validate_eval_data_judge_llm_missing_expected_response(self): - """Test validation for judge-llm type missing expected_response.""" + def test_validate_eval_missing_eval_query(self): + """Test validation with missing eval_query.""" invalid_data = [ { - "eval_id": "test_001", - "eval_query": "test query", - "eval_type": "judge-llm", + "conversation_group": "test_conv", + "conversation": [ + { + "eval_id": "test1", + "eval_type": "judge-llm", + "expected_response": "test 
response", + } + ], } ] yaml_content = yaml.dump(invalid_data) @@ -209,18 +233,21 @@ def test_validate_eval_data_judge_llm_missing_expected_response(self): patch("pathlib.Path.is_file", return_value=True), ): - with pytest.raises( - ConfigurationError, match="requires 'expected_response' field" - ): + with pytest.raises(EvaluationDataError, match=".*Field required.*"): AgentGoalEvalDataManager("test.yaml") - def test_validate_eval_data_sub_string_missing_keywords(self): - """Test validation for sub-string type missing expected_keywords.""" + def test_validate_eval_missing_eval_type(self): + """Test validation with missing eval_type.""" invalid_data = [ { - "eval_id": "test_001", - "eval_query": "test query", - "eval_type": "sub-string", + "conversation_group": "test_conv", + "conversation": [ + { + "eval_id": "test1", + "eval_query": "test query", + "expected_response": "test response", + } + ], } ] yaml_content = yaml.dump(invalid_data) @@ -231,15 +258,23 @@ def test_validate_eval_data_sub_string_missing_keywords(self): patch("pathlib.Path.is_file", return_value=True), ): - with pytest.raises( - ConfigurationError, match="requires 'expected_keywords' field" - ): + with pytest.raises(EvaluationDataError, match=".*Field required.*"): AgentGoalEvalDataManager("test.yaml") - def test_validate_eval_data_script_missing_verify_script(self): - """Test validation for script type missing eval_verify_script.""" + def test_validate_eval_invalid_eval_type(self): + """Test validation with invalid eval_type.""" invalid_data = [ - {"eval_id": "test_001", "eval_query": "test query", "eval_type": "script"} + { + "conversation_group": "test_conv", + "conversation": [ + { + "eval_id": "test1", + "eval_query": "test query", + "eval_type": "invalid_type", + "expected_response": "test response", + } + ], + } ] yaml_content = yaml.dump(invalid_data) @@ -250,21 +285,25 @@ def test_validate_eval_data_script_missing_verify_script(self): ): with pytest.raises( - ConfigurationError, match="requires 'eval_verify_script' field" + EvaluationDataError, match=".*eval_type must be one of.*" ): AgentGoalEvalDataManager("test.yaml") - def test_validate_eval_data_default_eval_type(self): - """Test validation with default eval_type (judge-llm).""" - data_with_default_type = [ + def test_validate_judge_llm_missing_expected_response(self): + """Test validation for judge-llm missing expected_response.""" + invalid_data = [ { - "eval_id": "test_001", - "eval_query": "test query", - "expected_response": "test response", - # eval_type not specified, should default to judge-llm + "conversation_group": "test_conv", + "conversation": [ + { + "eval_id": "test1", + "eval_query": "test query", + "eval_type": "judge-llm", + } + ], } ] - yaml_content = yaml.dump(data_with_default_type) + yaml_content = yaml.dump(invalid_data) with ( patch("builtins.open", mock_open(read_data=yaml_content)), @@ -272,205 +311,209 @@ def test_validate_eval_data_default_eval_type(self): patch("pathlib.Path.is_file", return_value=True), ): - manager = AgentGoalEvalDataManager("test.yaml") - assert len(manager.eval_data) == 1 - assert manager.eval_data[0].eval_type == "judge-llm" + with pytest.raises(EvaluationDataError, match=".*expected_response.*"): + AgentGoalEvalDataManager("test.yaml") + + def test_validate_sub_string_missing_keywords(self): + """Test validation for sub-string missing expected_keywords.""" + invalid_data = [ + { + "conversation_group": "test_conv", + "conversation": [ + { + "eval_id": "test1", + "eval_query": "test query", + "eval_type": 
"sub-string", + } + ], + } + ] + yaml_content = yaml.dump(invalid_data) - def test_get_eval_data(self, valid_yaml_content): - """Test get_eval_data method.""" with ( - patch("builtins.open", mock_open(read_data=valid_yaml_content)), + patch("builtins.open", mock_open(read_data=yaml_content)), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), ): - manager = AgentGoalEvalDataManager("test.yaml") - eval_data = manager.get_eval_data() + with pytest.raises(EvaluationDataError, match=".*expected_keywords.*"): + AgentGoalEvalDataManager("test.yaml") - assert isinstance(eval_data, list) - assert len(eval_data) == 3 - assert all(isinstance(item, EvaluationDataConfig) for item in eval_data) - assert eval_data[0].eval_id == "test_001" - assert eval_data[1].eval_id == "test_002" - assert eval_data[2].eval_id == "test_003" + def test_validate_script_missing_verify_script(self): + """Test validation for script missing eval_verify_script.""" + invalid_data = [ + { + "conversation_group": "test_conv", + "conversation": [ + { + "eval_id": "test1", + "eval_query": "test query", + "eval_type": "script", + } + ], + } + ] + yaml_content = yaml.dump(invalid_data) - def test_get_eval_count(self, valid_yaml_content): - """Test get_eval_count method.""" with ( - patch("builtins.open", mock_open(read_data=valid_yaml_content)), + patch("builtins.open", mock_open(read_data=yaml_content)), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), ): - manager = AgentGoalEvalDataManager("test.yaml") - count = manager.get_eval_count() - - assert count == 3 - assert count == len(manager.eval_data) + with pytest.raises(EvaluationDataError, match=".*eval_verify_script.*"): + AgentGoalEvalDataManager("test.yaml") - def test_get_eval_count_empty(self): - """Test get_eval_count with empty data.""" - empty_yaml = yaml.dump([]) + def test_duplicate_conversation_groups(self): + """Test validation with duplicate conversation_group names.""" + invalid_data = [ + { + "conversation_group": "duplicate_group", + "conversation": [ + { + "eval_id": "test1", + "eval_query": "test query 1", + "eval_type": "judge-llm", + "expected_response": "test response 1", + } + ], + }, + { + "conversation_group": "duplicate_group", + "conversation": [ + { + "eval_id": "test2", + "eval_query": "test query 2", + "eval_type": "judge-llm", + "expected_response": "test response 2", + } + ], + }, + ] + yaml_content = yaml.dump(invalid_data) with ( - patch("builtins.open", mock_open(read_data=empty_yaml)), + patch("builtins.open", mock_open(read_data=yaml_content)), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), ): - manager = AgentGoalEvalDataManager("test.yaml") - count = manager.get_eval_count() - - assert count == 0 - - def test_judge_llm_validation_success(self): - """Test successful validation for judge-llm type.""" - judge_llm_data = [ - { - "eval_id": "test_judge", - "eval_query": "What is Docker?", - "eval_type": "judge-llm", - "expected_response": "Docker is a containerization platform", - } - ] - yaml_content = yaml.dump(judge_llm_data) + with pytest.raises( + EvaluationDataError, match="Duplicate conversation_group" + ): + AgentGoalEvalDataManager("test.yaml") + def test_get_conversations(self, valid_conversation_yaml_content): + """Test get conversations method.""" with ( - patch("builtins.open", mock_open(read_data=yaml_content)), + patch( + "builtins.open", 
mock_open(read_data=valid_conversation_yaml_content) + ), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), ): manager = AgentGoalEvalDataManager("test.yaml") - assert len(manager.eval_data) == 1 - assert manager.eval_data[0].eval_type == "judge-llm" - assert ( - manager.eval_data[0].expected_response - == "Docker is a containerization platform" - ) - - def test_script_validation_success(self): - """Test successful validation for script type.""" - script_data = [ - { - "eval_id": "test_script", - "eval_query": "Deploy application", - "eval_type": "script", - "eval_verify_script": "./verify_deployment.sh", - } - ] - yaml_content = yaml.dump(script_data) + conversations = manager.get_conversations() + + assert len(conversations) == 2 + assert isinstance(conversations[0], ConversationDataConfig) + assert conversations[0].conversation_group == "conv1" + assert conversations[1].conversation_group == "conv2" + def test_get_eval_data_via_conversations(self, valid_conversation_yaml_content): + """Test getting evaluation data via conversations.""" with ( - patch("builtins.open", mock_open(read_data=yaml_content)), + patch( + "builtins.open", mock_open(read_data=valid_conversation_yaml_content) + ), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), ): manager = AgentGoalEvalDataManager("test.yaml") - assert len(manager.eval_data) == 1 - assert manager.eval_data[0].eval_type == "script" - assert manager.eval_data[0].eval_verify_script == "./verify_deployment.sh" + conversations = manager.get_conversations() - def test_sub_string_validation_success(self): - """Test successful validation for sub-string type.""" - sub_string_data = [ - { - "eval_id": "test_substring", - "eval_query": "List services", - "eval_type": "sub-string", - "expected_keywords": ["service", "active", "running"], - } - ] - yaml_content = yaml.dump(sub_string_data) + eval_data = [] + for conversation in conversations: + eval_data.extend(conversation.conversation) + assert len(eval_data) == 2 + assert isinstance(eval_data[0], EvaluationDataConfig) + assert eval_data[0].eval_id == "eval1" + assert eval_data[1].eval_id == "eval1" + + def test_get_eval_count(self, valid_conversation_yaml_content): + """Test get_eval_count method.""" with ( - patch("builtins.open", mock_open(read_data=yaml_content)), + patch( + "builtins.open", mock_open(read_data=valid_conversation_yaml_content) + ), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), ): manager = AgentGoalEvalDataManager("test.yaml") - assert len(manager.eval_data) == 1 - assert manager.eval_data[0].eval_type == "sub-string" - assert manager.eval_data[0].expected_keywords == [ - "service", - "active", - "running", - ] - - def test_mixed_eval_types(self): - """Test loading data with mixed evaluation types.""" - mixed_data = [ - { - "eval_id": "judge_test", - "eval_query": "What is Kubernetes?", - "eval_type": "judge-llm", - "expected_response": "Container orchestration", - }, - { - "eval_id": "script_test", - "eval_query": "Deploy nginx", - "eval_type": "script", - "eval_verify_script": "./verify.sh", - }, - { - "eval_id": "substring_test", - "eval_query": "List pods", - "eval_type": "sub-string", - "expected_keywords": ["pod", "running"], - }, - ] - yaml_content = yaml.dump(mixed_data) + count = manager.get_eval_count() + assert count == 2 + + def test_conversation_count_via_conversations( + self, valid_conversation_yaml_content + ): + """Test 
getting conversation count via conversations list.""" with ( - patch("builtins.open", mock_open(read_data=yaml_content)), + patch( + "builtins.open", mock_open(read_data=valid_conversation_yaml_content) + ), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), ): manager = AgentGoalEvalDataManager("test.yaml") - assert len(manager.eval_data) == 3 + count = len(manager.conversations) - types = [item.eval_type for item in manager.eval_data] - assert "judge-llm" in types - assert "script" in types - assert "sub-string" in types - - def test_eval_data_with_optional_fields(self): - """Test evaluation data with optional fields.""" - data_with_optional = [ - { - "eval_id": "test_with_optional", - "eval_query": "Deploy app", - "eval_type": "script", - "eval_verify_script": "./verify.sh", - "eval_setup_script": "./setup.sh", - "eval_cleanup_script": "./cleanup.sh", - } - ] - yaml_content = yaml.dump(data_with_optional) + assert count == 2 + def test_multiturn_conversation_loading(self, multiturn_conversation_yaml_content): + """Test loading multi-turn conversation.""" with ( - patch("builtins.open", mock_open(read_data=yaml_content)), + patch( + "builtins.open", + mock_open(read_data=multiturn_conversation_yaml_content), + ), patch("pathlib.Path.exists", return_value=True), patch("pathlib.Path.is_file", return_value=True), ): manager = AgentGoalEvalDataManager("test.yaml") - assert len(manager.eval_data) == 1 - eval_item = manager.eval_data[0] - assert eval_item.eval_setup_script == "./setup.sh" - assert eval_item.eval_cleanup_script == "./cleanup.sh" + + assert len(manager.conversations) == 1 + assert manager.get_eval_count() == 3 + + conversations = manager.get_conversations() + conv = conversations[0] + assert conv.conversation_group == "conv1" + assert len(conv.conversation) == 3 + assert conv.conversation[0].eval_id == "eval1" + assert conv.conversation[1].eval_id == "eval2" + assert conv.conversation[2].eval_id == "eval3" def test_load_real_yaml_file_integration(self): """Integration test with a real temporary YAML file.""" eval_data = [ { - "eval_id": "integration_test", - "eval_query": "Test query", - "eval_type": "judge-llm", - "expected_response": "Test response", + "conversation_group": "integration_test", + "description": "Integration test conversation", + "conversation": [ + { + "eval_id": "integration_test_eval", + "eval_query": "Test query", + "eval_type": "judge-llm", + "expected_response": "Test response", + "description": "Integration test evaluation", + } + ], } ] @@ -480,7 +523,13 @@ def test_load_real_yaml_file_integration(self): try: manager = AgentGoalEvalDataManager(temp_file_path) - assert len(manager.eval_data) == 1 - assert manager.eval_data[0].eval_id == "integration_test" + + assert len(manager.conversations) == 1 + assert manager.get_eval_count() == 1 + + conversations = manager.get_conversations() + assert conversations[0].conversation_group == "integration_test" + assert conversations[0].description == "Integration test conversation" + finally: Path(temp_file_path).unlink() # Clean up temporary file diff --git a/lsc_agent_eval/tests/core/agent_goal_eval/test_evaluator.py b/lsc_agent_eval/tests/core/agent_goal_eval/test_evaluator.py index a4c1d96b..69159b3a 100644 --- a/lsc_agent_eval/tests/core/agent_goal_eval/test_evaluator.py +++ b/lsc_agent_eval/tests/core/agent_goal_eval/test_evaluator.py @@ -1,6 +1,8 @@ """Tests for evaluation runner.""" -from unittest.mock import Mock, patch +import os +import tempfile +from 
unittest.mock import Mock import pytest @@ -9,6 +11,7 @@ EvaluationDataConfig, EvaluationResult, ) +from lsc_agent_eval.core.agent_goal_eval.script_runner import ScriptRunner from lsc_agent_eval.core.utils.api_client import AgentHttpClient from lsc_agent_eval.core.utils.exceptions import AgentAPIError, ScriptExecutionError from lsc_agent_eval.core.utils.judge import JudgeModelManager @@ -21,9 +24,24 @@ class TestEvaluationRunner: def mock_agent_client(self): """Mock agent client.""" mock_client = Mock(spec=AgentHttpClient) - mock_client.query_agent.return_value = "Test agent response" + + # Mock agent API: return conversation_id from input or generate one + def mock_query_agent(api_input, timeout=300): + return ( + "Test agent response", + api_input.get("conversation_id", "generated-conversation-id"), + ) + + mock_client.query_agent.side_effect = mock_query_agent return mock_client + @pytest.fixture + def mock_script_runner(self): + """Mock script runner.""" + mock_runner = Mock(spec=ScriptRunner) + mock_runner.run_script.return_value = True + return mock_runner + @pytest.fixture def mock_judge_manager(self): """Mock judge manager.""" @@ -36,385 +54,361 @@ def sample_config_judge_llm(self): """Sample judge-llm evaluation configuration.""" return EvaluationDataConfig( eval_id="test_001", - eval_query="What is Kubernetes?", + eval_query="What is Openshift Virtualization?", eval_type="judge-llm", - expected_response="Kubernetes is a container orchestration platform", + expected_response="OpenShift Virtualization is an extension of the OpenShift Container Platform", ) @pytest.fixture - def sample_config_script(self): + def get_test_script_path(self): + """Create a temporary test script file and cleanup.""" + # Setup + with tempfile.NamedTemporaryFile(mode="w", suffix=".sh", delete=False) as f: + f.write('#!/bin/bash\necho "test script"\nexit 0') + script_path = f.name + os.chmod(script_path, 0o755) + + yield script_path + + # Cleanup + os.unlink(script_path) + + @pytest.fixture + def sample_config_script(self, get_test_script_path): """Sample script evaluation configuration.""" return EvaluationDataConfig( eval_id="test_002", eval_query="Deploy nginx", eval_type="script", - eval_verify_script="./verify.sh", + eval_verify_script=get_test_script_path, ) @pytest.fixture def sample_config_substring(self): - """Sample substring evaluation configuration.""" + """Sample sub-string evaluation configuration.""" return EvaluationDataConfig( eval_id="test_003", - eval_query="What is Docker?", + eval_query="What is Podman?", eval_type="sub-string", - expected_keywords=["container", "docker"], + expected_keywords=["container", "podman"], ) - def test_init(self, mock_agent_client, mock_judge_manager): + def test_init(self, mock_agent_client, mock_script_runner, mock_judge_manager): """Test EvaluationRunner initialization.""" runner = EvaluationRunner( - mock_agent_client, mock_judge_manager, kubeconfig="~/kubeconfig" + mock_agent_client, + mock_script_runner, + mock_judge_manager, ) assert runner.agent_client == mock_agent_client + assert runner.script_runner == mock_script_runner assert runner.judge_manager == mock_judge_manager - assert runner.kubeconfig == "~/kubeconfig" - def test_init_without_judge_manager(self, mock_agent_client): + def test_init_without_judge_manager(self, mock_agent_client, mock_script_runner): """Test EvaluationRunner initialization without judge manager.""" - runner = EvaluationRunner(mock_agent_client) + runner = EvaluationRunner(mock_agent_client, mock_script_runner) assert 
runner.agent_client == mock_agent_client + assert runner.script_runner == mock_script_runner assert runner.judge_manager is None - @patch("lsc_agent_eval.core.agent_goal_eval.evaluator.ScriptRunner") def test_run_evaluation_judge_llm_success( self, - mock_script_runner, mock_agent_client, + mock_script_runner, mock_judge_manager, sample_config_judge_llm, ): """Test successful judge-llm evaluation.""" - # Mock agent response - mock_agent_client.query_agent.return_value = ( - "Kubernetes is a container orchestration platform" + runner = EvaluationRunner( + mock_agent_client, mock_script_runner, mock_judge_manager ) - # Mock judge response - mock_judge_manager.evaluate_response.return_value = "1" - - runner = EvaluationRunner(mock_agent_client, mock_judge_manager) - result = runner.run_evaluation(sample_config_judge_llm, "openai", "gpt-4") + result = runner.run_evaluation( + sample_config_judge_llm, + "watsonx", + "ibm/granite-3-3-8b-instruct", + "conv-id-123", + ) assert isinstance(result, EvaluationResult) assert result.eval_id == "test_001" - assert result.result == "PASS" + assert result.query == "What is Openshift Virtualization?" assert result.eval_type == "judge-llm" + assert result.result == "PASS" + assert result.conversation_id == "conv-id-123" assert result.error is None - # Verify agent was queried + # Verify agent was called mock_agent_client.query_agent.assert_called_once_with( - "What is Kubernetes?", "openai", "gpt-4" + { + "query": "What is Openshift Virtualization?", + "provider": "watsonx", + "model": "ibm/granite-3-3-8b-instruct", + "conversation_id": "conv-id-123", + } ) # Verify judge was called mock_judge_manager.evaluate_response.assert_called_once() - @patch("lsc_agent_eval.core.agent_goal_eval.evaluator.ScriptRunner") - def test_run_evaluation_script_success( - self, mock_script_runner_class, mock_agent_client, sample_config_script + def test_run_evaluation_judge_llm_failure( + self, + mock_agent_client, + mock_script_runner, + mock_judge_manager, + sample_config_judge_llm, ): - """Test successful script evaluation.""" - # Mock agent response - mock_agent_client.query_agent.return_value = ( - "kubectl create deployment nginx --image=nginx" - ) + """Test failed judge-llm evaluation.""" + # Mock judge to return 0 (failure) + mock_judge_manager.evaluate_response.return_value = "0" - # Mock script runner instance - mock_script_runner_instance = Mock() - mock_script_runner_instance.run_script.return_value = True - mock_script_runner_class.return_value = mock_script_runner_instance + runner = EvaluationRunner( + mock_agent_client, mock_script_runner, mock_judge_manager + ) - runner = EvaluationRunner(mock_agent_client) - result = runner.run_evaluation(sample_config_script, "openai", "gpt-4") + result = runner.run_evaluation( + sample_config_judge_llm, + "openai", + "gpt-4", + "conv-id-123", + ) - assert isinstance(result, EvaluationResult) - assert result.eval_id == "test_002" - assert result.result == "PASS" - assert result.eval_type == "script" + assert result.result == "FAIL" assert result.error is None - # Verify ScriptRunner was created with the right kubeconfig - mock_script_runner_class.assert_called_with(kubeconfig=None) - # Verify script was executed - mock_script_runner_instance.run_script.assert_called_once_with("./verify.sh") - - @patch("lsc_agent_eval.core.agent_goal_eval.evaluator.ScriptRunner") - def test_run_evaluation_script_failure( - self, mock_script_runner_class, mock_agent_client, sample_config_script + def test_run_evaluation_script_success( + self, 
mock_agent_client, mock_script_runner, sample_config_script ): - """Test script evaluation failure.""" - # Mock agent response - mock_agent_client.query_agent.return_value = ( - "kubectl create deployment nginx --image=nginx" - ) - - # Mock script runner instance returning failure - mock_script_runner_instance = Mock() - mock_script_runner_instance.run_script.return_value = False - mock_script_runner_class.return_value = mock_script_runner_instance + """Test successful script evaluation.""" + runner = EvaluationRunner(mock_agent_client, mock_script_runner) - runner = EvaluationRunner(mock_agent_client) - result = runner.run_evaluation(sample_config_script, "openai", "gpt-4") + result = runner.run_evaluation( + sample_config_script, + "openai", + "gpt-4", + "conv-id-123", + ) assert isinstance(result, EvaluationResult) assert result.eval_id == "test_002" - assert result.result == "FAIL" assert result.eval_type == "script" + assert result.result == "PASS" assert result.error is None - @patch("lsc_agent_eval.core.agent_goal_eval.evaluator.ScriptRunner") - def test_run_evaluation_script_with_kubeconfig( - self, mock_script_runner_class, mock_agent_client, sample_config_script - ): - """Test script evaluation with kubeconfig.""" - # Mock agent response - mock_agent_client.query_agent.return_value = ( - "kubectl create deployment nginx --image=nginx" + # Verify agent was called + mock_agent_client.query_agent.assert_called_once() + + # Verify script was run + mock_script_runner.run_script.assert_called_once_with( + sample_config_script.eval_verify_script ) - # Mock script runner instance - mock_script_runner_instance = Mock() - mock_script_runner_instance.run_script.return_value = True - mock_script_runner_class.return_value = mock_script_runner_instance + def test_run_evaluation_script_failure( + self, mock_agent_client, mock_script_runner, sample_config_script + ): + """Test failed script evaluation.""" + # Mock script to return False (failure) + mock_script_runner.run_script.return_value = False - runner = EvaluationRunner(mock_agent_client, kubeconfig="~/kubeconfig") - result = runner.run_evaluation(sample_config_script, "openai", "gpt-4") + runner = EvaluationRunner(mock_agent_client, mock_script_runner) - assert result.result == "PASS" + result = runner.run_evaluation( + sample_config_script, + "openai", + "gpt-4", + "conv-id-123", + ) - # Verify ScriptRunner was created with kubeconfig - mock_script_runner_class.assert_called_with(kubeconfig="~/kubeconfig") - # Verify script was executed - mock_script_runner_instance.run_script.assert_called_once_with("./verify.sh") + assert result.result == "FAIL" + assert result.error is None - @patch("lsc_agent_eval.core.agent_goal_eval.evaluator.ScriptRunner") def test_run_evaluation_script_execution_error( - self, mock_script_runner_class, mock_agent_client, sample_config_script + self, mock_agent_client, mock_script_runner, sample_config_script ): """Test script evaluation with execution error.""" - # Mock agent response - mock_agent_client.query_agent.return_value = ( - "kubectl create deployment nginx --image=nginx" - ) - - # Mock script runner instance raising error - mock_script_runner_instance = Mock() - mock_script_runner_instance.run_script.side_effect = ScriptExecutionError( + # Mock script to raise exception + mock_script_runner.run_script.side_effect = ScriptExecutionError( "Script failed" ) - mock_script_runner_class.return_value = mock_script_runner_instance - runner = EvaluationRunner(mock_agent_client) - result = 
runner.run_evaluation(sample_config_script, "openai", "gpt-4") + runner = EvaluationRunner(mock_agent_client, mock_script_runner) + + result = runner.run_evaluation( + sample_config_script, + "openai", + "gpt-4", + "conv-id-123", + ) - assert isinstance(result, EvaluationResult) - assert result.eval_id == "test_002" assert result.result == "ERROR" - assert result.error == "Script failed" + assert "Script failed" in result.error def test_run_evaluation_substring_success( - self, mock_agent_client, sample_config_substring + self, mock_agent_client, mock_script_runner, sample_config_substring ): - """Test successful substring evaluation.""" + """Test successful sub-string evaluation.""" + # Mock agent response containing expected keywords - mock_agent_client.query_agent.return_value = "Docker is a container platform" + def mock_query_agent(api_input, timeout=300): + return ( + "Podman is an open-source container engine developed by Red Hat", + api_input.get("conversation_id", "test-conversation-id"), + ) - runner = EvaluationRunner(mock_agent_client) - result = runner.run_evaluation(sample_config_substring, "openai", "gpt-4") + mock_agent_client.query_agent.side_effect = mock_query_agent + + runner = EvaluationRunner(mock_agent_client, mock_script_runner) + + result = runner.run_evaluation( + sample_config_substring, + "openai", + "gpt-4", + "conv-id-123", + ) - assert isinstance(result, EvaluationResult) assert result.eval_id == "test_003" assert result.result == "PASS" assert result.eval_type == "sub-string" + assert result.error is None def test_run_evaluation_substring_failure( - self, mock_agent_client, sample_config_substring + self, mock_agent_client, mock_script_runner, sample_config_substring ): - """Test substring evaluation failure.""" + """Test sub-string evaluation failure.""" + # Mock agent response not containing expected keywords - mock_agent_client.query_agent.return_value = "This is about virtual machines" + def mock_query_agent(api_input, timeout=300): + return ( + "No information available", + api_input.get("conversation_id", "test-conversation-id"), + ) - runner = EvaluationRunner(mock_agent_client) - result = runner.run_evaluation(sample_config_substring, "openai", "gpt-4") + mock_agent_client.query_agent.side_effect = mock_query_agent + + runner = EvaluationRunner(mock_agent_client, mock_script_runner) + + result = runner.run_evaluation( + sample_config_substring, + "openai", + "gpt-4", + None, + ) - assert isinstance(result, EvaluationResult) assert result.eval_id == "test_003" assert result.result == "FAIL" assert result.eval_type == "sub-string" + assert result.error is None - @patch("lsc_agent_eval.core.agent_goal_eval.evaluator.ScriptRunner") - def test_run_evaluation_with_setup_script( - self, mock_script_runner_class, mock_agent_client, mock_judge_manager + def test_run_evaluation_agent_api_error( + self, mock_agent_client, mock_script_runner, sample_config_judge_llm ): - """Test evaluation with setup script.""" - config = EvaluationDataConfig( - eval_id="test_setup", - eval_query="Test query", - eval_type="judge-llm", - expected_response="Test response", - eval_setup_script="./setup.sh", + """Test evaluation with agent API error.""" + # Mock agent client to raise API error + mock_agent_client.query_agent.side_effect = AgentAPIError( + "API connection failed" ) - # Mock script runner instance for setup - mock_script_runner_instance = Mock() - mock_script_runner_instance.run_script.return_value = True - mock_script_runner_class.return_value = 
mock_script_runner_instance - - # Mock agent and judge responses - mock_agent_client.query_agent.return_value = "Test response" - mock_judge_manager.evaluate_response.return_value = "1" + runner = EvaluationRunner(mock_agent_client, mock_script_runner) - runner = EvaluationRunner(mock_agent_client, mock_judge_manager) - result = runner.run_evaluation(config, "openai", "gpt-4") + result = runner.run_evaluation( + sample_config_judge_llm, + "openai", + "gpt-4", + "conv-id-123", + ) - assert result.result == "PASS" - # Verify setup script was called - mock_script_runner_instance.run_script.assert_called_with("./setup.sh") + assert result.eval_id == "test_001" + assert result.result == "ERROR" + assert result.eval_type == "judge-llm" + assert "API connection failed" in result.error - @patch("lsc_agent_eval.core.agent_goal_eval.evaluator.ScriptRunner") - def test_run_evaluation_setup_script_failure( - self, mock_script_runner_class, mock_agent_client, mock_judge_manager + def test_substring_evaluation_logic( + self, mock_agent_client, mock_script_runner, mock_judge_manager ): - """Test evaluation with setup script failure.""" + """Test sub-string evaluation with different keyword combinations.""" + runner = EvaluationRunner( + mock_agent_client, mock_script_runner, mock_judge_manager + ) + config = EvaluationDataConfig( - eval_id="test_setup_fail", + eval_id="substring_test", eval_query="Test query", - eval_type="judge-llm", - expected_response="Test response", - eval_setup_script="./setup.sh", + eval_type="sub-string", + expected_keywords=["keyword1", "keyword2"], ) - # Mock failing setup script execution - mock_script_runner_instance = Mock() - mock_script_runner_instance.run_script.return_value = False - mock_script_runner_class.return_value = mock_script_runner_instance + # Test all keywords present - should PASS + def mock_query_agent_all_keywords(api_input, timeout=300): + return ( + "Response with keyword1 and keyword2", + api_input.get("conversation_id", "test-conversation-id"), + ) - runner = EvaluationRunner(mock_agent_client, mock_judge_manager) - result = runner.run_evaluation(config, "openai", "gpt-4") + mock_agent_client.query_agent.side_effect = mock_query_agent_all_keywords + result = runner.run_evaluation(config, "openai", "gpt-4", "conv-id-123") + assert result.result == "PASS" - assert result.result == "ERROR" - assert "Setup script failed" in result.error + # Test some keywords missing (only one present) - should FAIL + def mock_query_agent_one_keyword(api_input, timeout=300): + return ( + "Response with only keyword1", + api_input.get("conversation_id", "test-conversation-id"), + ) - @patch("lsc_agent_eval.core.agent_goal_eval.evaluator.ScriptRunner") - def test_run_evaluation_with_cleanup_script( - self, mock_script_runner_class, mock_agent_client, mock_judge_manager - ): - """Test evaluation with cleanup script.""" - config = EvaluationDataConfig( - eval_id="test_cleanup", - eval_query="Test query", - eval_type="judge-llm", - expected_response="Test response", - eval_cleanup_script="./cleanup.sh", - ) + mock_agent_client.query_agent.side_effect = mock_query_agent_one_keyword + result = runner.run_evaluation(config, "openai", "gpt-4", "conv-id-123") + assert result.result == "FAIL" - # Mock successful cleanup script execution - mock_script_runner_instance = Mock() - mock_script_runner_instance.run_script.return_value = True - mock_script_runner_class.return_value = mock_script_runner_instance + # Test no keywords present - should FAIL + def 
mock_query_agent_no_keywords(api_input, timeout=300): + return ( + "Response with no matching terms", + api_input.get("conversation_id", "test-conversation-id"), + ) - # Mock agent and judge responses - mock_agent_client.query_agent.return_value = "Test response" - mock_judge_manager.evaluate_response.return_value = "1" + mock_agent_client.query_agent.side_effect = mock_query_agent_no_keywords + result = runner.run_evaluation(config, "openai", "gpt-4", "conv-id-123") + assert result.result == "FAIL" - runner = EvaluationRunner(mock_agent_client, mock_judge_manager) - result = runner.run_evaluation(config, "openai", "gpt-4") + # Test case insensitive matching + def mock_query_agent_case_insensitive(api_input, timeout=300): + return ( + "Response with KEYWORD1 and Keyword2", + api_input.get("conversation_id", "test-conversation-id"), + ) + mock_agent_client.query_agent.side_effect = mock_query_agent_case_insensitive + result = runner.run_evaluation(config, "openai", "gpt-4", "conv-id-123") assert result.result == "PASS" - # Verify cleanup script was called - mock_script_runner_instance.run_script.assert_called_with("./cleanup.sh") - def test_run_evaluation_agent_api_error( - self, mock_agent_client, mock_judge_manager, sample_config_judge_llm + def test_conversation_id_propagation( + self, mock_agent_client, mock_script_runner, mock_judge_manager ): - """Test evaluation with agent API error.""" - # Mock agent API error - mock_agent_client.query_agent.side_effect = AgentAPIError( - "API connection failed" + """Test that conversation ID is properly propagated to results.""" + runner = EvaluationRunner( + mock_agent_client, mock_script_runner, mock_judge_manager ) - runner = EvaluationRunner(mock_agent_client, mock_judge_manager) - result = runner.run_evaluation(sample_config_judge_llm, "openai", "gpt-4") - - assert isinstance(result, EvaluationResult) - assert result.result == "ERROR" - assert "API connection failed" in result.error - - def test_run_evaluation_unknown_type(self, mock_agent_client): - """Test evaluation with unknown evaluation type.""" config = EvaluationDataConfig( - eval_id="test_unknown", + eval_id="conv_id_test", eval_query="Test query", - eval_type="unknown-type", + eval_type="judge-llm", + expected_response="Test response", ) - # Mock agent response - mock_agent_client.query_agent.return_value = "Test response" + test_conv_id = "conv-id-456" + result = runner.run_evaluation(config, "openai", "gpt-4", test_conv_id) - runner = EvaluationRunner(mock_agent_client) - result = runner.run_evaluation(config, "openai", "gpt-4") + assert result.conversation_id == test_conv_id - assert isinstance(result, EvaluationResult) - assert result.result == "FAIL" - - def test_get_judge_manager(self, mock_agent_client, mock_judge_manager): - """Test get_judge_manager method.""" - runner = EvaluationRunner(mock_agent_client, mock_judge_manager) - assert runner.get_judge_manager() == mock_judge_manager - - runner_no_judge = EvaluationRunner(mock_agent_client) - assert runner_no_judge.get_judge_manager() is None - - @patch("lsc_agent_eval.core.agent_goal_eval.evaluator.ScriptRunner") - def test_run_evaluation_judge_llm_failure( - self, - mock_script_runner, - mock_agent_client, - mock_judge_manager, - sample_config_judge_llm, - ): - """Test judge-llm evaluation failure.""" - # Mock agent response - mock_agent_client.query_agent.return_value = "Some incorrect response" - - # Mock judge response indicating failure - mock_judge_manager.evaluate_response.return_value = "0" - - runner = 
EvaluationRunner(mock_agent_client, mock_judge_manager) - result = runner.run_evaluation(sample_config_judge_llm, "openai", "gpt-4") - - assert isinstance(result, EvaluationResult) - assert result.eval_id == "test_001" - assert result.result == "FAIL" - assert result.eval_type == "judge-llm" - assert result.error is None - - @patch("lsc_agent_eval.core.agent_goal_eval.evaluator.ScriptRunner") - def test_run_evaluation_judge_llm_error( - self, - mock_script_runner, - mock_agent_client, - mock_judge_manager, - sample_config_judge_llm, - ): - """Test judge-llm evaluation error.""" - # Mock agent response - mock_agent_client.query_agent.return_value = "Some incorrect response" - - # Mock judge response indicating failure - mock_judge_manager.evaluate_response.return_value = "00" - - runner = EvaluationRunner(mock_agent_client, mock_judge_manager) - result = runner.run_evaluation(sample_config_judge_llm, "openai", "gpt-4") - - assert isinstance(result, EvaluationResult) - assert result.eval_id == "test_001" - assert result.result == "ERROR" - assert result.eval_type == "judge-llm" - assert result.error == ( - "Invalid response from the judge model. " - "Expected value either 0/1. Actual value: 00" + # Verify ID was passed to agent client + mock_agent_client.query_agent.assert_called_once_with( + { + "query": "Test query", + "provider": "openai", + "model": "gpt-4", + "conversation_id": test_conv_id, + } ) diff --git a/lsc_agent_eval/tests/core/agent_goal_eval/test_models.py b/lsc_agent_eval/tests/core/agent_goal_eval/test_models.py index e646d333..18c271f8 100644 --- a/lsc_agent_eval/tests/core/agent_goal_eval/test_models.py +++ b/lsc_agent_eval/tests/core/agent_goal_eval/test_models.py @@ -1,8 +1,16 @@ """Tests for agent evaluation models.""" +from pathlib import Path +from unittest.mock import mock_open, patch + +import pytest +from pydantic import ValidationError + from lsc_agent_eval.core.agent_goal_eval.models import ( + ConversationDataConfig, EvaluationDataConfig, EvaluationResult, + EvaluationStats, ) @@ -57,25 +65,35 @@ def test_evaluation_result_defaults(self): assert result.error is None + def test_evaluation_result_invalid_result_type(self): + """Test EvaluationResult with invalid result type.""" + with pytest.raises(ValidationError) as exc_info: + EvaluationResult( + eval_id="test_004", + query="Test query", + response="Test response", + eval_type="judge-llm", + result="INVALID", + ) -class TestEvaluationDataConfig: - """Test EvaluationDataConfig data class.""" + assert "Result must be one of" in str(exc_info.value) - def test_evaluation_data_config_minimal(self): - """Test creating minimal EvaluationDataConfig.""" - config = EvaluationDataConfig( - eval_id="test_001", - eval_query="What is Kubernetes?", - ) + def test_evaluation_result_invalid_eval_type(self): + """Test EvaluationResult with invalid eval type.""" + with pytest.raises(ValidationError) as exc_info: + EvaluationResult( + eval_id="test_005", + query="Test query", + response="Test response", + eval_type="invalid-type", + result="PASS", + ) - assert config.eval_id == "test_001" - assert config.eval_query == "What is Kubernetes?" 
- assert config.eval_type == "judge-llm" # default - assert config.expected_response is None - assert config.expected_keywords is None - assert config.eval_setup_script is None - assert config.eval_verify_script is None - assert config.eval_cleanup_script is None + assert "eval_type must be one of" in str(exc_info.value) + + +class TestEvaluationDataConfig: + """Test EvaluationDataConfig data class.""" def test_evaluation_data_config_judge_llm(self): """Test EvaluationDataConfig for judge-llm evaluation.""" @@ -91,19 +109,19 @@ def test_evaluation_data_config_judge_llm(self): assert config.eval_type == "judge-llm" assert config.expected_response == "Containers are lightweight virtualization" assert config.expected_keywords is None - assert config.eval_setup_script is None assert config.eval_verify_script is None - assert config.eval_cleanup_script is None + assert config.description is None - def test_evaluation_data_config_script(self): + @patch("builtins.open", mock_open()) + @patch("pathlib.Path.is_file", return_value=True) + @patch("pathlib.Path.exists", return_value=True) + def test_evaluation_data_config_script(self, mock_exists, mock_is_file): """Test EvaluationDataConfig for script evaluation.""" config = EvaluationDataConfig( eval_id="script_test", eval_query="Deploy nginx pod", eval_type="script", - eval_setup_script="./setup.sh", - eval_verify_script="./verify.sh", - eval_cleanup_script="./cleanup.sh", + eval_verify_script="/mock/script/path.sh", ) assert config.eval_id == "script_test" @@ -111,9 +129,11 @@ def test_evaluation_data_config_script(self): assert config.eval_type == "script" assert config.expected_response is None assert config.expected_keywords is None - assert config.eval_setup_script == "./setup.sh" - assert config.eval_verify_script == "./verify.sh" - assert config.eval_cleanup_script == "./cleanup.sh" + assert isinstance(config.eval_verify_script, Path) + + # Verify path validation was called + mock_exists.assert_called() + mock_is_file.assert_called() def test_evaluation_data_config_substring(self): """Test EvaluationDataConfig for sub-string evaluation.""" @@ -129,28 +149,240 @@ def test_evaluation_data_config_substring(self): assert config.eval_type == "sub-string" assert config.expected_response is None assert config.expected_keywords == ["isolation", "portability", "efficiency"] - assert config.eval_setup_script is None assert config.eval_verify_script is None - assert config.eval_cleanup_script is None - def test_evaluation_data_config_all_fields(self): - """Test EvaluationDataConfig with all fields.""" + def test_evaluation_data_config_with_description(self): + """Test EvaluationDataConfig with description.""" config = EvaluationDataConfig( eval_id="full_test", eval_query="What is OpenShift?", eval_type="judge-llm", expected_response="OpenShift is a Kubernetes platform", - expected_keywords=["kubernetes", "platform", "container"], - eval_setup_script="./setup.sh", - eval_verify_script="./verify.sh", - eval_cleanup_script="./cleanup.sh", + description="Test evaluation for OpenShift knowledge", ) assert config.eval_id == "full_test" assert config.eval_query == "What is OpenShift?" 
assert config.eval_type == "judge-llm" assert config.expected_response == "OpenShift is a Kubernetes platform" - assert config.expected_keywords == ["kubernetes", "platform", "container"] - assert config.eval_setup_script == "./setup.sh" - assert config.eval_verify_script == "./verify.sh" - assert config.eval_cleanup_script == "./cleanup.sh" + assert config.description == "Test evaluation for OpenShift knowledge" + assert config.expected_keywords is None + assert config.eval_verify_script is None + + def test_evaluation_data_config_missing_eval_type(self): + """Test EvaluationDataConfig with missing eval_type (should fail).""" + with pytest.raises(ValidationError) as exc_info: + EvaluationDataConfig( + eval_id="test_001", + eval_query="What is Kubernetes?", + ) + + assert "Field required" in str(exc_info.value) + + def test_evaluation_data_config_judge_llm_missing_expected_response(self): + """Test judge-llm evaluation missing expected_response.""" + with pytest.raises(ValidationError) as exc_info: + EvaluationDataConfig( + eval_id="test_judge", + eval_query="Test query", + eval_type="judge-llm", + ) + + assert "requires non-empty 'expected_response'" in str(exc_info.value) + + def test_evaluation_data_config_substring_missing_keywords(self): + """Test sub-string evaluation missing expected_keywords.""" + with pytest.raises(ValidationError) as exc_info: + EvaluationDataConfig( + eval_id="test_substring", + eval_query="Test query", + eval_type="sub-string", + ) + + assert "requires non-empty 'expected_keywords'" in str(exc_info.value) + + def test_evaluation_data_config_script_missing_verify_script(self): + """Test script evaluation missing eval_verify_script.""" + with pytest.raises(ValidationError) as exc_info: + EvaluationDataConfig( + eval_id="test_script", + eval_query="Test query", + eval_type="script", + ) + + assert "requires non-empty 'eval_verify_script'" in str(exc_info.value) + + def test_evaluation_data_config_script_nonexistent_file(self): + """Test script evaluation with non-existent script file.""" + with pytest.raises(ValidationError) as exc_info: + EvaluationDataConfig( + eval_id="test_script", + eval_query="Test query", + eval_type="script", + eval_verify_script="/non/existent/script.sh", + ) + + assert "file not found" in str(exc_info.value) + + +class TestConversationDataConfig: + """Test Conversation data config.""" + + def test_conversation_config_minimal(self): + """Test creating minimal Conversation data config.""" + config = ConversationDataConfig( + conversation_group="test_conv", + conversation=[ + EvaluationDataConfig( + eval_id="test_001", + eval_query="What is Kubernetes?", + eval_type="judge-llm", + expected_response="Kubernetes is a platform", + ) + ], + ) + + assert config.conversation_group == "test_conv" + assert len(config.conversation) == 1 + assert config.conversation[0].eval_id == "test_001" + assert config.description is None + assert config.setup_script is None + assert config.cleanup_script is None + + @patch("builtins.open", mock_open()) + @patch("pathlib.Path.is_file", return_value=True) + @patch("pathlib.Path.exists", return_value=True) + def test_conversation_config_with_scripts(self, mock_exists, mock_is_file): + """Test Conversation data config with setup and cleanup scripts.""" + config = ConversationDataConfig( + conversation_group="test_conv_scripts", + description="Test conversation with scripts", + setup_script="/mock/setup.sh", + cleanup_script="/mock/cleanup.sh", + conversation=[ + EvaluationDataConfig( + eval_id="test_001", + 
eval_query="Test query", + eval_type="judge-llm", + expected_response="Test response", + ) + ], + ) + + assert config.conversation_group == "test_conv_scripts" + assert config.description == "Test conversation with scripts" + assert isinstance(config.setup_script, Path) + assert isinstance(config.cleanup_script, Path) + + def test_conversation_config_empty_group_name(self): + """Test Conversation data config with empty group name.""" + with pytest.raises(ValidationError) as exc_info: + ConversationDataConfig( + conversation_group=" ", # Empty after strip + conversation=[ + EvaluationDataConfig( + eval_id="test_001", + eval_query="Test query", + eval_type="judge-llm", + expected_response="Test response", + ) + ], + ) + + assert "cannot be empty" in str(exc_info.value) + + def test_conversation_config_nonexistent_script(self): + """Test Conversation data config with non-existent script.""" + with pytest.raises(ValidationError) as exc_info: + ConversationDataConfig( + conversation_group="test_conv", + setup_script="/non/existent/setup.sh", + conversation=[ + EvaluationDataConfig( + eval_id="test_001", + eval_query="Test query", + eval_type="judge-llm", + expected_response="Test response", + ) + ], + ) + + assert "file not found" in str(exc_info.value) + + def test_conversation_config_duplicate_eval_ids(self): + """Test Conversation data config with duplicate eval_ids.""" + with pytest.raises(ValidationError) as exc_info: + ConversationDataConfig( + conversation_group="test_conv", + conversation=[ + EvaluationDataConfig( + eval_id="duplicate_id", + eval_query="First query", + eval_type="judge-llm", + expected_response="First response", + ), + EvaluationDataConfig( + eval_id="duplicate_id", + eval_query="Second query", + eval_type="judge-llm", + expected_response="Second response", + ), + ], + ) + + assert "Duplicate eval_id" in str(exc_info.value) + + +class TestEvaluationStats: + """Test Evaluation statistics data class.""" + + def test_evaluation_stats_from_results(self): + """Test Evaluation statistics creation method.""" + results = [ + EvaluationResult( + eval_id="test_001", + query="Query 1", + response="Response 1", + eval_type="judge-llm", + result="PASS", + conversation_group="conv1", + ), + EvaluationResult( + eval_id="test_002", + query="Query 2", + response="Response 2", + eval_type="script", + result="FAIL", + conversation_group="conv1", + ), + EvaluationResult( + eval_id="test_003", + query="Query 3", + response="Response 3", + eval_type="sub-string", + result="PASS", + conversation_group="conv2", + ), + ] + + stats = EvaluationStats.from_results(results) + + assert stats.total_evaluations == 3 + assert stats.total_conversations == 2 + assert stats.passed == 2 + assert stats.failed == 1 + assert stats.errored == 0 + assert abs(stats.success_rate - 66.67) < 0.01 + + # Check stats by conversation + assert "conv1" in stats.by_conversation + assert "conv2" in stats.by_conversation + assert stats.by_conversation["conv1"]["total"] == 2 + assert stats.by_conversation["conv1"]["passed"] == 1 + assert stats.by_conversation["conv2"]["total"] == 1 + assert stats.by_conversation["conv2"]["passed"] == 1 + + # Check stats by eval_type + assert "judge-llm" in stats.by_eval_type + assert "script" in stats.by_eval_type + assert "sub-string" in stats.by_eval_type diff --git a/lsc_agent_eval/tests/core/agent_goal_eval/test_results.py b/lsc_agent_eval/tests/core/agent_goal_eval/test_results.py index cffcac15..a20bf192 100644 --- a/lsc_agent_eval/tests/core/agent_goal_eval/test_results.py +++ 
b/lsc_agent_eval/tests/core/agent_goal_eval/test_results.py @@ -1,389 +1,298 @@ """Tests for results manager.""" -from unittest.mock import Mock, patch +import json +import tempfile +from pathlib import Path +from unittest.mock import mock_open, patch +import pandas as pd import pytest -from lsc_agent_eval.core.agent_goal_eval.models import EvaluationResult +from lsc_agent_eval.core.agent_goal_eval.models import EvaluationResult, EvaluationStats from lsc_agent_eval.core.agent_goal_eval.results import ResultsManager +from lsc_agent_eval.core.utils.exceptions import AgentEvaluationError class TestResultsManager: """Test ResultsManager.""" - def test_init(self): - """Test ResultsManager initialization.""" - manager = ResultsManager("test_results/") - - assert manager.result_dir == "test_results/" - - @patch("pathlib.Path.mkdir") - @patch("pandas.DataFrame.to_csv") - @patch("pandas.DataFrame") - def test_save_results_success(self, mock_dataframe, mock_to_csv, mock_mkdir): - """Test successful results saving.""" - # Setup test data - results = [ + @pytest.fixture + def sample_results(self): + """Sample evaluation results.""" + return [ EvaluationResult( eval_id="test_001", query="What is Kubernetes?", response="Kubernetes is a container orchestration platform", eval_type="judge-llm", result="PASS", + conversation_group="conv1", + conversation_id="conv-id-123", ), EvaluationResult( eval_id="test_002", query="Deploy nginx", - response="kubectl create deployment nginx --image=nginx", + response="oc create deployment nginx --image=nginx", eval_type="script", - result="PASS", + result="FAIL", + conversation_group="conv1", + conversation_id="conv-id-123", ), - ] - - # Setup mocks - mock_df_instance = Mock() - mock_dataframe.return_value = mock_df_instance - - # Run test - manager = ResultsManager("test_results/") - manager.save_results(results) - - # Verify directory creation - mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - - # Verify DataFrame was created with correct data - mock_dataframe.assert_called_once() - call_args = mock_dataframe.call_args[0][0] - assert len(call_args) == 2 - assert call_args[0]["eval_id"] == "test_001" - assert call_args[1]["eval_id"] == "test_002" - - # Verify to_csv was called - mock_df_instance.to_csv.assert_called_once() - - @patch("pathlib.Path.mkdir") - @patch("pandas.DataFrame.to_csv") - @patch("pandas.DataFrame") - def test_save_results_with_error(self, mock_dataframe, mock_to_csv, mock_mkdir): - """Test results saving with error field.""" - # Setup test data with error - results = [ - EvaluationResult( - eval_id="test_001", - query="Test query", - response="", - eval_type="script", - result="ERROR", - error="Script execution failed", - ), - ] - - # Setup mocks - mock_df_instance = Mock() - mock_dataframe.return_value = mock_df_instance - - # Run test - manager = ResultsManager("test_results/") - manager.save_results(results) - - # Verify DataFrame was created with error field - mock_dataframe.assert_called_once() - call_args = mock_dataframe.call_args[0][0] - assert len(call_args) == 1 - assert call_args[0]["error"] == "Script execution failed" - - @patch("pathlib.Path.mkdir") - @patch("pandas.DataFrame.to_csv") - @patch("pandas.DataFrame") - def test_save_results_empty_list(self, mock_dataframe, mock_to_csv, mock_mkdir): - """Test saving empty results list.""" - # Setup mocks - mock_df_instance = Mock() - mock_dataframe.return_value = mock_df_instance - - # Run test - manager = ResultsManager("test_results/") - manager.save_results([]) - - # 
Verify DataFrame was created with empty list - mock_dataframe.assert_called_once_with([]) - mock_df_instance.to_csv.assert_called_once() - - @patch("pathlib.Path.mkdir", side_effect=OSError("Permission denied")) - def test_save_results_mkdir_error(self, mock_mkdir): - """Test results saving with directory creation error.""" - results = [ EvaluationResult( - eval_id="test_001", - query="Test query", - response="Test response", - eval_type="judge-llm", + eval_id="test_003", + query="List pods", + response="pod1, pod2", + eval_type="sub-string", result="PASS", + conversation_group="conv2", + conversation_id="conv-id-456", ), ] - manager = ResultsManager("test_results/") + @pytest.fixture + def empty_results(self): + """Empty results list.""" + return [] - with pytest.raises(OSError, match="Permission denied"): - manager.save_results(results) - - @patch("pathlib.Path.mkdir") - @patch("pandas.DataFrame.to_csv", side_effect=IOError("File write error")) - @patch("pandas.DataFrame") - def test_save_results_file_error(self, mock_dataframe, mock_to_csv, mock_mkdir): - """Test results saving with file write error.""" - results = [ - EvaluationResult( - eval_id="test_001", - query="Test query", - response="Test response", - eval_type="judge-llm", - result="PASS", - ), - ] + def test_init(self, sample_results): + """Test ResultsManager initialization.""" + manager = ResultsManager(sample_results) - # Setup mocks - mock_df_instance = Mock() - mock_dataframe.return_value = mock_df_instance - mock_df_instance.to_csv.side_effect = IOError("File write error") + assert manager.results == sample_results + assert isinstance(manager.results_stats, EvaluationStats) + assert manager.results_stats.total_evaluations == 3 + assert manager.results_stats.passed == 2 + assert manager.results_stats.failed == 1 - manager = ResultsManager("test_results/") + def test_init_empty_results(self, empty_results): + """Test ResultsManager initialization with empty results.""" + manager = ResultsManager(empty_results) - with pytest.raises(IOError, match="File write error"): - manager.save_results(results) + assert manager.results == [] + assert manager.results_stats.total_evaluations == 0 @patch("pathlib.Path.mkdir") @patch("pandas.DataFrame.to_csv") - @patch("pandas.DataFrame") - @patch("lsc_agent_eval.core.agent_goal_eval.results.datetime") - def test_save_results_filename_generation( - self, mock_datetime, mock_dataframe, mock_to_csv, mock_mkdir + @patch("builtins.open", new_callable=mock_open) + def test_save_results_success( + self, mock_file_open, mock_to_csv, mock_mkdir, sample_results ): - """Test CSV filename generation with timestamp.""" - # Setup mock datetime - mock_datetime.now.return_value.strftime.return_value = "20240108_103000" + """Test successful results saving.""" + manager = ResultsManager(sample_results) - # Setup mocks - mock_df_instance = Mock() - mock_dataframe.return_value = mock_df_instance + manager.save_results("test_results/") - results = [ - EvaluationResult( - eval_id="test_001", - query="Test query", - response="Test response", - eval_type="judge-llm", - result="PASS", - ), - ] + # Verify directory creation + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - # Run test - manager = ResultsManager("test_results/") - manager.save_results(results) + # Verify CSV saving + mock_to_csv.assert_called_once() - # Verify to_csv was called with correct path - mock_df_instance.to_csv.assert_called_once() - call_args = mock_df_instance.to_csv.call_args - file_path = call_args[0][0] - assert 
"agent_goal_eval_results_20240108_103000.csv" in str(file_path) + # Verify JSON saving + mock_file_open.assert_called() @patch("pathlib.Path.mkdir") - @patch("pandas.DataFrame.to_csv") - @patch("pandas.DataFrame") - def test_save_results_csv_parameters(self, mock_dataframe, mock_to_csv, mock_mkdir): - """Test CSV parameters are correct.""" - # Setup mocks - mock_df_instance = Mock() - mock_dataframe.return_value = mock_df_instance + @patch("pandas.DataFrame.to_csv", side_effect=Exception("CSV error")) + def test_save_results_csv_error(self, mock_to_csv, mock_mkdir, sample_results): + """Test results saving with CSV error.""" + manager = ResultsManager(sample_results) - results = [ - EvaluationResult( - eval_id="test_001", - query="Test query", - response="Test response", - eval_type="judge-llm", - result="PASS", - ), - ] - - # Run test - manager = ResultsManager("test_results/") - manager.save_results(results) - - # Verify to_csv was called with correct parameters - mock_df_instance.to_csv.assert_called_once() - call_args = mock_df_instance.to_csv.call_args - assert not call_args[1]["index"] - assert call_args[1]["encoding"] == "utf-8" - - @patch("pathlib.Path.mkdir") - @patch("pandas.DataFrame.to_csv") - @patch("pandas.DataFrame") - def test_save_results_data_conversion( - self, mock_dataframe, mock_to_csv, mock_mkdir - ): - """Test EvaluationResult to dict conversion.""" - # Setup mocks - mock_df_instance = Mock() - mock_dataframe.return_value = mock_df_instance + with pytest.raises(AgentEvaluationError, match="Failed to save results"): + manager.save_results("test_results/") + @patch("pathlib.Path.mkdir", side_effect=OSError("Permission denied")) + def test_save_results_mkdir_error(self, mock_mkdir, sample_results): + """Test results saving with directory creation error.""" + manager = ResultsManager(sample_results) + + with pytest.raises(AgentEvaluationError, match="Failed to save results"): + manager.save_results("test_results/") + + def test_csv_data_conversion(self, sample_results): + """Test CSV data conversion.""" + manager = ResultsManager(sample_results) + + data = [] + for result in manager.results: + data.append( + { + "conversation_group": result.conversation_group, + "conversation_id": result.conversation_id, + "eval_id": result.eval_id, + "result": result.result, + "eval_type": result.eval_type, + "query": result.query, + "response": result.response, + "error": result.error, + } + ) + + assert len(data) == 3 + assert data[0]["eval_id"] == "test_001" + assert data[0]["result"] == "PASS" + assert data[1]["result"] == "FAIL" + assert data[2]["eval_type"] == "sub-string" + + def test_get_results_stats(self, sample_results): + """Test get results stats method.""" + manager = ResultsManager(sample_results) + stats = manager.get_results_stats() + + assert isinstance(stats, EvaluationStats) + assert stats.total_evaluations == 3 + assert stats.total_conversations == 2 + assert stats.passed == 2 + assert stats.failed == 1 + assert stats.errored == 0 + assert round(stats.success_rate, 2) == round(2 / 3 * 100, 2) + + # Check conversation breakdown + assert "conv1" in stats.by_conversation + assert "conv2" in stats.by_conversation + assert stats.by_conversation["conv1"]["total"] == 2 + assert stats.by_conversation["conv2"]["total"] == 1 + + # Check eval type breakdown + assert "judge-llm" in stats.by_eval_type + assert "script" in stats.by_eval_type + assert "sub-string" in stats.by_eval_type + + def test_results_with_errors(self): + """Test results with error conditions.""" results = [ 
EvaluationResult( - eval_id="test_001", - query="Test query", - response="Test response", + eval_id="test_error", + query="Failing query", + response="", eval_type="judge-llm", - result="PASS", - error=None, + result="ERROR", + error="API connection failed", + conversation_group="test_conv", + conversation_id="conv-id-789", ), ] - # Run test - manager = ResultsManager("test_results/") - manager.save_results(results) - - # Verify DataFrame was created with correct data - mock_dataframe.assert_called_once() - call_args = mock_dataframe.call_args[0][0] - expected_row = { - "eval_id": "test_001", - "query": "Test query", - "response": "Test response", - "eval_type": "judge-llm", - "result": "PASS", - "error": "", - } - assert call_args[0] == expected_row + manager = ResultsManager(results) + stats = manager.get_results_stats() - @patch("pathlib.Path.mkdir") - @patch("pandas.DataFrame.to_csv") - @patch("pandas.DataFrame") - def test_save_results_multiple_results( - self, mock_dataframe, mock_to_csv, mock_mkdir - ): - """Test saving multiple results.""" - # Setup test data - results = [ - EvaluationResult("test_001", "query1", "response1", "judge-llm", "PASS"), - EvaluationResult("test_002", "query2", "response2", "script", "FAIL"), - EvaluationResult("test_003", "query3", "response3", "sub-string", "PASS"), - ] + assert stats.total_evaluations == 1 + assert stats.passed == 0 + assert stats.failed == 0 + assert stats.errored == 1 + assert stats.success_rate == 0.0 - # Setup mocks - mock_df_instance = Mock() - mock_dataframe.return_value = mock_df_instance - - # Run test - manager = ResultsManager("test_results/") - manager.save_results(results) - - # Verify DataFrame was created with all results - mock_dataframe.assert_called_once() - call_args = mock_dataframe.call_args[0][0] - assert len(call_args) == 3 - - # Verify each result was converted correctly - assert call_args[0]["eval_id"] == "test_001" - assert call_args[1]["eval_id"] == "test_002" - assert call_args[2]["eval_id"] == "test_003" - - def test_result_dir_with_trailing_slash(self): - """Test result directory with trailing slash.""" - manager = ResultsManager("test_results/") - assert manager.result_dir == "test_results/" - - def test_result_dir_without_trailing_slash(self): - """Test result directory without trailing slash.""" - manager = ResultsManager("test_results") - assert manager.result_dir == "test_results" - - @patch("pathlib.Path.mkdir") - @patch("pandas.DataFrame.to_csv") - @patch("pandas.DataFrame") - def test_save_results_encoding(self, mock_dataframe, mock_to_csv, mock_mkdir): - """Test CSV file is saved with UTF-8 encoding.""" + def test_results_mixed_types(self): + """Test results with mixed evaluation types.""" results = [ EvaluationResult( - eval_id="test_001", - query="What is Kubernetes?", - response="Kubernetes is a container orchestration platform", + eval_id="judge_test", + query="Judge query", + response="Judge response", eval_type="judge-llm", result="PASS", + conversation_group="mixed_conv", + conversation_id="conv-id-mixed", ), - ] - - # Setup mocks - mock_df_instance = Mock() - mock_dataframe.return_value = mock_df_instance - - # Run test - manager = ResultsManager("test_results/") - manager.save_results(results) - - # Verify to_csv was called with UTF-8 encoding - call_args = mock_df_instance.to_csv.call_args - assert call_args[1]["encoding"] == "utf-8" - - @patch("pathlib.Path.mkdir") - @patch("pandas.DataFrame.to_csv") - @patch("pandas.DataFrame") - def test_save_results_no_index(self, mock_dataframe, 
mock_to_csv, mock_mkdir): - """Test CSV file index handling.""" - results = [ EvaluationResult( - eval_id="test_001", - query="Test query", - response="Test response", - eval_type="judge-llm", - result="PASS", + eval_id="script_test", + query="Script query", + response="Script response", + eval_type="script", + result="FAIL", + conversation_group="mixed_conv", + conversation_id="conv-id-mixed", ), - ] - - # Setup mocks - mock_df_instance = Mock() - mock_dataframe.return_value = mock_df_instance - - # Run test - manager = ResultsManager("test_results/") - manager.save_results(results) - - # Verify to_csv was called with index=False - call_args = mock_df_instance.to_csv.call_args - assert not call_args[1]["index"] - - @patch("pathlib.Path.mkdir") - @patch("pandas.DataFrame.to_csv") - @patch("pandas.DataFrame") - def test_save_results_none_error_handling( - self, mock_dataframe, mock_to_csv, mock_mkdir - ): - """Test handling of None error values.""" - results = [ EvaluationResult( - eval_id="test_001", - query="Test query", - response="Test response", - eval_type="judge-llm", + eval_id="substring_test", + query="Substring query", + response="Substring response", + eval_type="sub-string", result="PASS", - error=None, + conversation_group="mixed_conv", + conversation_id="conv-id-mixed", ), ] - # Setup mocks - mock_df_instance = Mock() - mock_dataframe.return_value = mock_df_instance - - # Run test - manager = ResultsManager("test_results/") - manager.save_results(results) - - # Verify None error is converted to empty string - mock_dataframe.assert_called_once() - call_args = mock_dataframe.call_args[0][0] - assert call_args[0]["error"] == "" - - def test_get_output_dir(self): - """Test get_output_dir method.""" - manager = ResultsManager("test_results/") - output_dir = manager.get_output_dir() - assert output_dir == str(manager.result_path) + manager = ResultsManager(results) + stats = manager.get_results_stats() + + assert stats.total_evaluations == 3 + assert stats.total_conversations == 1 + assert len(stats.by_eval_type) == 3 + assert stats.by_eval_type["judge-llm"]["passed"] == 1 + assert stats.by_eval_type["script"]["failed"] == 1 + assert stats.by_eval_type["sub-string"]["passed"] == 1 + + def test_json_statistics_structure(self, sample_results): + """Test JSON statistics structure.""" + manager = ResultsManager(sample_results) + stats = manager.get_results_stats() + + # Convert to dict as would be saved to JSON + stats_dict = stats.model_dump() + + assert "total_evaluations" in stats_dict + assert "total_conversations" in stats_dict + assert "passed" in stats_dict + assert "failed" in stats_dict + assert "errored" in stats_dict + assert "success_rate" in stats_dict + assert "by_conversation" in stats_dict + assert "by_eval_type" in stats_dict + + # Verify structure of nested stats + assert isinstance(stats_dict["by_conversation"], dict) + assert isinstance(stats_dict["by_eval_type"], dict) + + def test_filename_generation_format(self, sample_results): + """Test that filename generation follows expected format.""" + manager = ResultsManager(sample_results) + + with patch( + "lsc_agent_eval.core.agent_goal_eval.results.datetime" + ) as mock_datetime: + mock_datetime.now.return_value.strftime.return_value = "20240101_120000" + + with ( + patch.object(manager, "_save_csv_results"), + patch.object(manager, "_save_json_summary"), + patch("pathlib.Path.mkdir"), + ): + + manager.save_results("test_results/") + + # Verify the filename format is called correctly + 
mock_datetime.now.assert_called_once() + mock_datetime.now.return_value.strftime.assert_called_once_with( + "%Y%m%d_%H%M%S" + ) + + def test_integration_with_real_files(self, sample_results): + """Integration test with real temporary files.""" + manager = ResultsManager(sample_results) + + with tempfile.TemporaryDirectory() as temp_dir: + manager.save_results(temp_dir) + + # Check that files were created + result_files = list(Path(temp_dir).glob("agent_goal_eval_results_*.csv")) + summary_files = list(Path(temp_dir).glob("agent_goal_eval_summary_*.json")) + + assert len(result_files) == 1 + assert len(summary_files) == 1 + + # Verify CSV content + csv_data = pd.read_csv(result_files[0]) + assert len(csv_data) == 3 + assert "eval_id" in csv_data.columns + assert "result" in csv_data.columns + assert "conversation_group" in csv_data.columns + + # Verify JSON content + with open(summary_files[0], "r") as f: + json_data = json.load(f) + + assert json_data["summary"]["total_evaluations"] == 3 + assert json_data["summary"]["passed"] == 2 + assert "by_conversation" in json_data + assert "by_eval_type" in json_data diff --git a/lsc_agent_eval/tests/core/agent_goal_eval/test_script_runner.py b/lsc_agent_eval/tests/core/agent_goal_eval/test_script_runner.py index 2c1bee00..a73dff6d 100644 --- a/lsc_agent_eval/tests/core/agent_goal_eval/test_script_runner.py +++ b/lsc_agent_eval/tests/core/agent_goal_eval/test_script_runner.py @@ -40,8 +40,7 @@ def test_run_script_success( mock_exists.assert_called_once() mock_chmod.assert_called_once_with(0o755) mock_subprocess_run.assert_called_once_with( - ["bash", str(Path("test_script.sh").resolve())], - input=None, + [str(Path("test_script.sh").resolve())], text=True, capture_output=True, env=os.environ.copy(), @@ -82,8 +81,7 @@ def test_run_script_with_kubeconfig( expected_env = os.environ.copy() expected_env["KUBECONFIG"] = "./kubeconfig" mock_subprocess_run.assert_called_once_with( - ["bash", str(Path("test_script.sh").resolve())], - input=None, + [str(Path("test_script.sh").resolve())], text=True, capture_output=True, env=expected_env, @@ -186,8 +184,7 @@ def test_run_script_absolute_path( assert result mock_subprocess_run.assert_called_once_with( - ["bash", absolute_path], - input=None, + [absolute_path], text=True, capture_output=True, env=os.environ.copy(), @@ -217,8 +214,7 @@ def test_run_script_relative_path( expected_path = str(Path("scripts/test.sh").resolve()) mock_subprocess_run.assert_called_once_with( - ["bash", expected_path], - input=None, + [expected_path], text=True, capture_output=True, env=os.environ.copy(), @@ -248,8 +244,7 @@ def test_run_script_environment_preservation( # Verify environment includes test variable expected_env = os.environ.copy() mock_subprocess_run.assert_called_once_with( - ["bash", str(Path("test_script.sh").resolve())], - input=None, + [str(Path("test_script.sh").resolve())], text=True, capture_output=True, env=expected_env, @@ -278,8 +273,7 @@ def test_run_script_kubeconfig_absolute_path( expected_env = os.environ.copy() expected_env["KUBECONFIG"] = kubeconfig_path mock_subprocess_run.assert_called_once_with( - ["bash", str(Path("test_script.sh").resolve())], - input=None, + [str(Path("test_script.sh").resolve())], text=True, capture_output=True, env=expected_env, @@ -306,8 +300,7 @@ def test_run_script_no_kubeconfig( assert result mock_subprocess_run.assert_called_once_with( - ["bash", str(Path("test_script.sh").resolve())], - input=None, + [str(Path("test_script.sh").resolve())], text=True, capture_output=True, 
env=os.environ.copy(), @@ -337,35 +330,6 @@ def test_run_script_capture_output( assert result # Note: Instance method returns boolean, not the result object - @patch("subprocess.run") - @patch("pathlib.Path.is_file") - @patch("pathlib.Path.exists") - @patch("pathlib.Path.chmod") - def test_run_script_with_input_text( - self, mock_chmod, mock_exists, mock_is_file, mock_subprocess_run - ): - """Test script execution with input text.""" - mock_exists.return_value = True - mock_is_file.return_value = True - mock_result = Mock() - mock_result.returncode = 0 - mock_subprocess_run.return_value = mock_result - - input_text = "test input" - runner = ScriptRunner() - result = runner.run_script("test_script.sh", input_text=input_text) - - assert result - mock_subprocess_run.assert_called_once_with( - ["bash", str(Path("test_script.sh").resolve())], - input=input_text, - text=True, - capture_output=True, - env=os.environ.copy(), - timeout=300, - check=False, - ) - def test_script_runner_init(self): """Test ScriptRunner initialization.""" runner = ScriptRunner() diff --git a/lsc_agent_eval/tests/core/utils/test_api_client.py b/lsc_agent_eval/tests/core/utils/test_api_client.py index 8fed7817..345d15e5 100644 --- a/lsc_agent_eval/tests/core/utils/test_api_client.py +++ b/lsc_agent_eval/tests/core/utils/test_api_client.py @@ -67,7 +67,11 @@ def test_query_agent_success(self): """Test successful agent query.""" # Mock HTTP response mock_response = Mock() - mock_response.json.return_value = {"response": "Test agent response"} + response_text = "OpenShift Virtualization is an extension of the OpenShift Container Platform" + mock_response.json.return_value = { + "response": response_text, + "conversation_id": "conv-id-123", + } mock_response.raise_for_status.return_value = None # Mock HTTP client @@ -77,16 +81,18 @@ def test_query_agent_success(self): with patch("httpx.Client", return_value=mock_client): client = AgentHttpClient("http://localhost:8080") - result = client.query_agent("What is Kubernetes?", "openai", "gpt-4") + api_input = { + "query": "What is Openshift Virtualization?", + "provider": "watsonx", + "model": "ibm/granite-3-3-8b-instruct", + } + result_response, result_conversation_id = client.query_agent(api_input) - assert result == "Test agent response" + assert result_response == response_text + assert result_conversation_id == "conv-id-123" mock_client.post.assert_called_once_with( "/v1/query", - json={ - "query": "What is Kubernetes?", - "provider": "openai", - "model": "gpt-4", - }, + json=api_input, timeout=300, ) @@ -106,8 +112,9 @@ def test_query_agent_http_error(self): with patch("httpx.Client", return_value=mock_client): client = AgentHttpClient("http://localhost:8080") + api_input = {"query": "Test query", "provider": "openai", "model": "gpt-4"} with pytest.raises(AgentAPIError, match="Agent API error: 500"): - client.query_agent("Test query", "openai", "gpt-4") + client.query_agent(api_input) def test_query_agent_timeout(self): """Test agent query with timeout.""" @@ -118,8 +125,13 @@ def test_query_agent_timeout(self): with patch("httpx.Client", return_value=mock_client): client = AgentHttpClient("http://localhost:8080") + api_input = { + "query": "Test query", + "provider": "agent_provider", + "model": "agent_model", + } with pytest.raises(AgentAPIError, match="Agent query timeout"): - client.query_agent("Test query", "openai", "gpt-4") + client.query_agent(api_input) def test_query_agent_missing_response_field(self): """Test agent query with missing response field.""" @@ -135,10 
+147,11 @@ def test_query_agent_missing_response_field(self): with patch("httpx.Client", return_value=mock_client): client = AgentHttpClient("http://localhost:8080") + api_input = {"query": "Test query", "provider": "openai", "model": "gpt-4"} with pytest.raises( AgentAPIError, match="Agent response missing 'response' field" ): - client.query_agent("Test query", "openai", "gpt-4") + client.query_agent(api_input) def test_query_agent_client_not_initialized(self): """Test agent query when client is not initialized.""" diff --git a/lsc_agent_eval/tests/core/utils/test_exceptions.py b/lsc_agent_eval/tests/core/utils/test_exceptions.py index 94152bea..4c63e9f5 100644 --- a/lsc_agent_eval/tests/core/utils/test_exceptions.py +++ b/lsc_agent_eval/tests/core/utils/test_exceptions.py @@ -3,7 +3,7 @@ from lsc_agent_eval.core.utils.exceptions import ( AgentAPIError, AgentEvaluationError, - ConfigurationError, + EvaluationDataError, JudgeModelError, ScriptExecutionError, ) @@ -25,20 +25,20 @@ def test_agent_evaluation_error_inheritance(self): assert isinstance(error, Exception) -class TestConfigurationError: - """Test ConfigurationError.""" +class TestEvaluationDataError: + """Test EvaluationDataError.""" - def test_configuration_error_creation(self): - """Test creating ConfigurationError.""" - error = ConfigurationError("Invalid configuration") + def test_evaluation_data_error_creation(self): + """Test creating EvaluationDataError.""" + error = EvaluationDataError("Invalid configuration") assert str(error) == "Invalid configuration" - assert isinstance(error, ConfigurationError) + assert isinstance(error, EvaluationDataError) assert isinstance(error, AgentEvaluationError) - def test_configuration_error_inheritance(self): - """Test ConfigurationError inheritance.""" - error = ConfigurationError("Config error") - assert isinstance(error, ConfigurationError) + def test_evaluation_data_error_inheritance(self): + """Test EvaluationDataError inheritance.""" + error = EvaluationDataError("Config error") + assert isinstance(error, EvaluationDataError) assert isinstance(error, AgentEvaluationError) assert isinstance(error, Exception) @@ -103,7 +103,7 @@ class TestExceptionHierarchy: def test_all_exceptions_inherit_from_base(self): """Test that all custom exceptions inherit from AgentEvaluationError.""" exceptions = [ - ConfigurationError("config error"), + EvaluationDataError("config error"), AgentAPIError("api error"), ScriptExecutionError("script error"), JudgeModelError("judge error"), @@ -112,13 +112,3 @@ def test_all_exceptions_inherit_from_base(self): for exc in exceptions: assert isinstance(exc, AgentEvaluationError) assert isinstance(exc, Exception) - - def test_exception_with_none_message(self): - """Test exceptions with None message.""" - error = AgentEvaluationError(None) - assert str(error) == "None" - - def test_exception_with_empty_message(self): - """Test exceptions with empty message.""" - error = AgentEvaluationError("") - assert str(error) == ""
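
The sub-string checks exercised in `test_substring_evaluation_logic` boil down to a case-insensitive all-keywords match: the evaluation passes only when every entry of `expected_keywords` occurs in the agent response. A minimal sketch of that logic, assuming nothing beyond what the tests assert (the function name is illustrative, not the package source):

```python
def substring_eval(response: str, expected_keywords: list[str]) -> bool:
    """Return True only if ALL keywords occur in the response (case-insensitive)."""
    lowered = response.lower()
    return all(keyword.lower() in lowered for keyword in expected_keywords)


# Mirrors the cases in test_substring_evaluation_logic
assert substring_eval("Response with KEYWORD1 and Keyword2", ["keyword1", "keyword2"])
assert not substring_eval("Response with only keyword1", ["keyword1", "keyword2"])
```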
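
For the conversation-ID handling this patch introduces, the tests pin down two shapes: the payload sent to `/v1/query` (`query`, `provider`, `model`, `conversation_id`) and the `(response, conversation_id)` tuple now returned by `query_agent`. The sketch below shows how a runner could wire those together; `_FakeAgentClient` and `run_evaluation_sketch` are stand-in names for illustration, not the real `AgentHttpClient` or `EvaluationRunner` internals.

```python
from types import SimpleNamespace


class _FakeAgentClient:
    """Stand-in for the agent client: returns (response, conversation_id) like the mocked client."""

    def query_agent(self, api_input: dict, timeout: int = 300):
        # Echo the caller's conversation ID, or pretend the server assigned a new one.
        return "stub agent response", api_input.get("conversation_id") or "new-conv-id"


def run_evaluation_sketch(agent_client, config, provider, model, conversation_id):
    """Build the query payload, call the agent, and keep the conversation ID with the result."""
    api_input = {
        "query": config.eval_query,
        "provider": provider,
        "model": model,
        "conversation_id": conversation_id,
    }
    response, returned_conv_id = agent_client.query_agent(api_input)
    return {
        "eval_id": config.eval_id,
        "query": config.eval_query,
        "response": response,
        "conversation_id": returned_conv_id,
    }


# Usage mirroring test_conversation_id_propagation
config = SimpleNamespace(eval_id="conv_id_test", eval_query="Test query")
result = run_evaluation_sketch(_FakeAgentClient(), config, "openai", "gpt-4", "conv-id-456")
assert result["conversation_id"] == "conv-id-456"
```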
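
The statistics asserted in `test_evaluation_stats_from_results` and `test_get_results_stats` can be reproduced with a plain aggregation over the result rows: overall counts, a percentage success rate, and per-conversation / per-eval-type breakdowns. The function below is an assumption about the shape of `EvaluationStats.from_results`, not a copy of `models.py`; it works on plain dicts to stay self-contained.

```python
from collections import defaultdict


def _bucket():
    return {"total": 0, "passed": 0, "failed": 0, "errored": 0}


def stats_from_results(results):
    """Aggregate PASS/FAIL/ERROR counts overall, per conversation group, and per eval type."""
    outcome_key = {"PASS": "passed", "FAIL": "failed", "ERROR": "errored"}
    totals = {"passed": 0, "failed": 0, "errored": 0}
    by_conversation = defaultdict(_bucket)
    by_eval_type = defaultdict(_bucket)

    for row in results:
        key = outcome_key[row["result"]]
        totals[key] += 1
        for name, table in ((row["conversation_group"], by_conversation), (row["eval_type"], by_eval_type)):
            table[name]["total"] += 1
            table[name][key] += 1

    total = len(results)
    return {
        "total_evaluations": total,
        "total_conversations": len(by_conversation),
        **totals,
        "success_rate": (totals["passed"] / total * 100) if total else 0.0,
        "by_conversation": dict(by_conversation),
        "by_eval_type": dict(by_eval_type),
    }


# Same mix as the test fixtures: 2 PASS, 1 FAIL across two conversation groups
rows = [
    {"result": "PASS", "conversation_group": "conv1", "eval_type": "judge-llm"},
    {"result": "FAIL", "conversation_group": "conv1", "eval_type": "script"},
    {"result": "PASS", "conversation_group": "conv2", "eval_type": "sub-string"},
]
stats = stats_from_results(rows)
assert stats["total_conversations"] == 2 and round(stats["success_rate"], 2) == 66.67
```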
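
Finally, the results tests glob for `agent_goal_eval_results_<timestamp>.csv` and `agent_goal_eval_summary_<timestamp>.json` and expect the JSON to carry a `summary` block plus `by_conversation` / `by_eval_type` breakdowns. The writer below is illustrative only, under those assumptions: `save_results_sketch` is a made-up helper (not `ResultsManager`'s actual method), and `stats` is expected to have the shape produced by the aggregation sketch above.

```python
import json
from datetime import datetime
from pathlib import Path

import pandas as pd


def save_results_sketch(result_dir: str, rows: list[dict], stats: dict) -> None:
    """Write a timestamped CSV of per-evaluation rows and a JSON summary of the stats."""
    out = Path(result_dir)
    out.mkdir(parents=True, exist_ok=True)
    tag = datetime.now().strftime("%Y%m%d_%H%M%S")  # format checked in test_filename_generation_format

    # One CSV row per evaluation result
    pd.DataFrame(rows).to_csv(out / f"agent_goal_eval_results_{tag}.csv", index=False, encoding="utf-8")

    # JSON layout matching what test_integration_with_real_files reads back
    summary = {
        "summary": {
            k: stats[k]
            for k in ("total_evaluations", "total_conversations", "passed", "failed", "errored", "success_rate")
        },
        "by_conversation": stats["by_conversation"],
        "by_eval_type": stats["by_eval_type"],
    }
    with open(out / f"agent_goal_eval_summary_{tag}.json", "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)
```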