diff --git a/.gitignore b/.gitignore
index d6ac9e1c..3eed4326 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,6 @@
 ocm_token.txt
 .venv
 .python-version
+
+# Evaluation output folder
+eval_output*/
diff --git a/test/evals/README.md b/test/evals/README.md
index 96168244..64afe5d4 100644
--- a/test/evals/README.md
+++ b/test/evals/README.md
@@ -1,7 +1,7 @@
 # Agent Task Completion Evaluation
 Evaluation mechanism to validate Agent task completion (e2e)
-- Supports `script` (similar to [k8s-bench](https://github.com/GoogleCloudPlatform/kubectl-ai/tree/main/k8s-bench)), `sub-string` and `judge-llm` based evaluation.
-- Refer [eval data setup](https://github.com/asamal4/lightspeed-evaluation/blob/agent-goal-eval/agent_eval/data/agent_goal_eval.yaml)
+- Refer [LCORE-Eval repo](https://github.com/lightspeed-core/lightspeed-evaluation/tree/main/lsc_agent_eval) for setup.
+- Supports `sub-string`, `judge-llm` and `script` based evaluation.
 - Currently it is single-turn evaluation process.
 
 ## Prerequisites
@@ -16,64 +16,3 @@ pip install git+https://github.com/lightspeed-core/lightspeed-evaluation.git#sub
 ## Running tests
 
 `make test-eval` runs the tests.
-
-Example output:
-
-```
-Refreshing OCM token...
-Running agent evaluation tests...
-2025-07-21 09:18:39,195 - lsc_agent_eval.core.utils.judge - INFO - Setting up LiteLLM for gemini/gemini-2.5-flash
-2025-07-21 09:18:39,195 - lsc_agent_eval.core.utils.judge - WARNING - Using generic provider format for gemini
-Running 4 evaluation(s)...
-==================================================
-[1/4] Running: basic_introduction
-2025-07-21 09:18:40,039 - lsc_agent_eval.core.utils.api_client - INFO - Agent response >
-Hello! I'm an AI assistant for the Assisted Installer. I can help you create OpenShift clusters, list available versions, get cluster information, and more. What would you like to do today?
-✅ basic_introduction: PASSED
-[2/4] Running: basic_cluster_request
-2025-07-21 09:18:46,006 - lsc_agent_eval.core.utils.api_client - INFO - Agent response >
-I can help with that. What would you like to name your cluster? What OpenShift version do you want to install? What is the base domain for your cluster? Will this be a single-node cluster (True/False)?
-✅ basic_cluster_request: PASSED
-[3/4] Running: list_versions
-2025-07-21 09:18:52,458 - lsc_agent_eval.core.utils.api_client - INFO - Agent response >
-Here are the available OpenShift versions and their support levels:
-
-**Production:**
-* 4.19.3 (default)
-* 4.19.3-multi
-* 4.18.19
-* 4.18.19-multi
-
-**Maintenance:**
-* 4.17.35
-* 4.17.35-multi
-* 4.16.43
-* 4.16.43-multi
-* 4.15.54
-* 4.15.54-multi
-
-**Extended Support:**
-* 4.14.51
-* 4.14.51-multi
-* 4.12.71
-
-**Beta:**
-* 4.20.0-ec.4
-* 4.20.0-ec.4-multi
-
-**End-of-Life:**
-* 4.11.59
-* 4.10.67
-* 4.9.17
-✅ list_versions: PASSED
-[4/4] Running: available_operators
-2025-07-21 09:18:58,051 - lsc_agent_eval.core.utils.api_client - INFO - Agent response >
-There are two operator bundles available:
-
-* **Virtualization**: Run virtual machines alongside containers on one platform. This bundle includes operators like `mtv`, `node-healthcheck`, `nmstate`, `node-maintenance`, `kube-descheduler`, `cnv`, `self-node-remediation`, and `fence-agents-remediation`.
-* **OpenShift AI**: Train, serve, monitor and manage AI/ML models and applications using GPUs. This bundle includes operators like `openshift-ai`, `amd-gpu`, `node-feature-discovery`, `pipelines`, `servicemesh`, `authorino`, `kmm`, `odf`, `serverless`, and `nvidia-gpu`.
-✅ available_operators: PASSED
-==================================================
-FINAL RESULTS: 4/4 passed
-🎉 All evaluations passed!
-```
diff --git a/test/evals/eval.py b/test/evals/eval.py
index 06f5c271..56b23645 100644
--- a/test/evals/eval.py
+++ b/test/evals/eval.py
@@ -1,88 +1,44 @@
-import sys
-import logging
+"""Agent goal evaluation."""
+
 import argparse
+import logging
+import sys
+
 from lsc_agent_eval import AgentGoalEval
 
 # Configure logging to show all messages from agent_eval library
 logging.basicConfig(
     level=logging.WARNING,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.StreamHandler(sys.stdout)
-    ]
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    handlers=[logging.StreamHandler(sys.stdout)],
 )
 
 # Enable specific loggers we want to see
-logging.getLogger('lsc_agent_eval').setLevel(logging.INFO)
-
-def print_test_result(result, config):
-    """Print test result in human readable format."""
-    if result.result == "PASS":
-        print(f"✅ {result.eval_id}: PASSED")
-    else:
-        print(f"❌ {result.eval_id}: {result.result}")
-        print(f"  Evaluation Type: {result.eval_type}")
-        print(f"  Query: {result.query}")
-        print(f"  Response: {result.response}")
+logging.getLogger("lsc_agent_eval").setLevel(logging.INFO)
 
-    # Show expected values based on eval type
-    if config.eval_type == "sub-string" and config.expected_key_words:
-        print(f"  Expected Keywords: {config.expected_key_words}")
-    elif config.eval_type == "judge-llm" and config.expected_response:
-        print(f"  Expected Response: {config.expected_response}")
-    elif config.eval_type == "script" and config.eval_verify_script:
-        print(f"  Verification Script: {config.eval_verify_script}")
-
-    if result.error:
-        print(f"  Error: {result.error}")
-    print()
 # Create proper Namespace object for AgentGoalEval
 args = argparse.Namespace()
-args.eval_data_yaml = 'eval_data.yaml'
-args.agent_endpoint = 'http://localhost:8090'
-args.agent_provider = 'gemini'
-args.agent_model = 'gemini/gemini-2.5-flash'
+args.eval_data_yaml = "eval_data.yaml"
+args.agent_endpoint = "http://localhost:8090"
+args.agent_provider = "gemini"
+args.agent_model = "gemini/gemini-2.5-flash"
 
 # Set up judge model for LLM evaluation
-args.judge_provider = 'gemini'
-args.judge_model = 'gemini-2.5-flash'
-args.agent_auth_token_file = 'ocm_token.txt'
-args.result_dir = 'results'
+args.judge_provider = "gemini"
+args.judge_model = "gemini-2.5-flash"
+args.agent_auth_token_file = "ocm_token.txt"
+args.result_dir = "eval_output"
 
 evaluator = AgentGoalEval(args)
-configs = evaluator.data_manager.get_eval_data()
-
-print(f"Running {len(configs)} evaluation(s)...")
-print("=" * 50)
-
-passed = 0
-failed = 0
-
-for i, config in enumerate(configs, 1):
-    print(f"[{i}/{len(configs)}] Running: {config.eval_id}")
-
-    result = evaluator.evaluation_runner.run_evaluation(
-        config, args.agent_provider, args.agent_model
-    )
-
-    # Count results as we go
-    if result.result == "PASS":
-        passed += 1
-    else:
-        failed += 1
-
-    # Print result immediately
-    print_test_result(result, config)
-
-# Print final summary
-print("=" * 50)
-total = len(configs)
-
-print(f"FINAL RESULTS: {passed}/{total} passed")
-
-if failed > 0:
-    print(f"❌ {failed} evaluation(s) failed!")
+# Run Evaluation
+evaluator.run_evaluation()
+# Get result summary
+result_summary = evaluator.get_result_summary()
+
+failed_evals_count = result_summary["FAIL"] + result_summary["ERROR"]
+if failed_evals_count:
+    print(f"❌ {failed_evals_count} evaluation(s) failed!")
     sys.exit(1)
-else:
-    print("🎉 All evaluations passed!")
-    sys.exit(0)
+
+print("🎉 All evaluations passed!")
+sys.exit(0)