.gitignore (3 additions, 0 deletions)
@@ -5,3 +5,6 @@
 ocm_token.txt
 .venv
 .python-version
+
+# Evaluation output folder
+eval_output*/
test/evals/README.md (2 additions, 63 deletions)
@@ -1,7 +1,7 @@
 # Agent Task Completion Evaluation
 Evaluation mechanism to validate Agent task completion (e2e)
-- Supports `script` (similar to [k8s-bench](https://github.com/GoogleCloudPlatform/kubectl-ai/tree/main/k8s-bench)), `sub-string` and `judge-llm` based evaluation.
-- Refer [eval data setup](https://github.com/asamal4/lightspeed-evaluation/blob/agent-goal-eval/agent_eval/data/agent_goal_eval.yaml)
+- Refer [LCORE-Eval repo](https://github.com/lightspeed-core/lightspeed-evaluation/tree/main/lsc_agent_eval) for setup.
+- Supports `sub-string`, `judge-llm` and `script` based evaluation.
 - Currently it is single-turn evaluation process.
 
 ## Prerequisites
@@ -16,64 +16,3 @@ pip install git+https://github.com/lightspeed-core/lightspeed-evaluation.git#sub
 ## Running tests
 
 `make test-eval` runs the tests.
-
-Example output:
-
-```
-Refreshing OCM token...
-Running agent evaluation tests...
-2025-07-21 09:18:39,195 - lsc_agent_eval.core.utils.judge - INFO - Setting up LiteLLM for gemini/gemini-2.5-flash
-2025-07-21 09:18:39,195 - lsc_agent_eval.core.utils.judge - WARNING - Using generic provider format for gemini
-Running 4 evaluation(s)...
-==================================================
-[1/4] Running: basic_introduction
-2025-07-21 09:18:40,039 - lsc_agent_eval.core.utils.api_client - INFO - Agent response >
-Hello! I'm an AI assistant for the Assisted Installer. I can help you create OpenShift clusters, list available versions, get cluster information, and more. What would you like to do today?
-✅ basic_introduction: PASSED
-[2/4] Running: basic_cluster_request
-2025-07-21 09:18:46,006 - lsc_agent_eval.core.utils.api_client - INFO - Agent response >
-I can help with that. What would you like to name your cluster? What OpenShift version do you want to install? What is the base domain for your cluster? Will this be a single-node cluster (True/False)?
-✅ basic_cluster_request: PASSED
-[3/4] Running: list_versions
-2025-07-21 09:18:52,458 - lsc_agent_eval.core.utils.api_client - INFO - Agent response >
-Here are the available OpenShift versions and their support levels:
-
-**Production:**
-* 4.19.3 (default)
-* 4.19.3-multi
-* 4.18.19
-* 4.18.19-multi
-
-**Maintenance:**
-* 4.17.35
-* 4.17.35-multi
-* 4.16.43
-* 4.16.43-multi
-* 4.15.54
-* 4.15.54-multi
-
-**Extended Support:**
-* 4.14.51
-* 4.14.51-multi
-* 4.12.71
-
-**Beta:**
-* 4.20.0-ec.4
-* 4.20.0-ec.4-multi
-
-**End-of-Life:**
-* 4.11.59
-* 4.10.67
-* 4.9.17
-✅ list_versions: PASSED
-[4/4] Running: available_operators
-2025-07-21 09:18:58,051 - lsc_agent_eval.core.utils.api_client - INFO - Agent response >
-There are two operator bundles available:
-
-* **Virtualization**: Run virtual machines alongside containers on one platform. This bundle includes operators like `mtv`, `node-healthcheck`, `nmstate`, `node-maintenance`, `kube-descheduler`, `cnv`, `self-node-remediation`, and `fence-agents-remediation`.
-* **OpenShift AI**: Train, serve, monitor and manage AI/ML models and applications using GPUs. This bundle includes operators like `openshift-ai`, `amd-gpu`, `node-feature-discovery`, `pipelines`, `servicemesh`, `authorino`, `kmm`, `odf`, `serverless`, and `nvidia-gpu`.
-✅ available_operators: PASSED
-==================================================
-FINAL RESULTS: 4/4 passed
-🎉 All evaluations passed!
-```
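
The `make test-eval` target drives these evaluations through `test/evals/eval.py`, which is rewritten below. For orientation, here is a minimal sketch of that flow; the endpoint, model names, and file names are simply the values this PR uses, not requirements of `lsc_agent_eval`:

```python
import argparse
import sys

from lsc_agent_eval import AgentGoalEval

# Namespace mirroring the CLI-style arguments that eval.py hands to the harness.
args = argparse.Namespace()
args.eval_data_yaml = "eval_data.yaml"           # sub-string / judge-llm / script cases
args.agent_endpoint = "http://localhost:8090"    # agent service under test
args.agent_provider = "gemini"
args.agent_model = "gemini/gemini-2.5-flash"
args.judge_provider = "gemini"                   # judge model used by judge-llm evals
args.judge_model = "gemini-2.5-flash"
args.agent_auth_token_file = "ocm_token.txt"
args.result_dir = "eval_output"                  # git-ignored via the new eval_output*/ pattern

evaluator = AgentGoalEval(args)
evaluator.run_evaluation()

# get_result_summary() returns per-status counts; fail the run on any FAIL or ERROR.
summary = evaluator.get_result_summary()
sys.exit(1 if summary["FAIL"] + summary["ERROR"] else 0)
```

The rewritten `eval.py` in the next file follows exactly this pattern.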
test/evals/eval.py (27 additions, 71 deletions)
@@ -1,88 +1,44 @@
-import sys
-import logging
+"""Agent goal evaluation."""
+
 import argparse
+import logging
+import sys
 
 from lsc_agent_eval import AgentGoalEval
 
 # Configure logging to show all messages from agent_eval library
 logging.basicConfig(
     level=logging.WARNING,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.StreamHandler(sys.stdout)
-    ]
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    handlers=[logging.StreamHandler(sys.stdout)],
 )
 
 # Enable specific loggers we want to see
-logging.getLogger('lsc_agent_eval').setLevel(logging.INFO)
-
-def print_test_result(result, config):
-    """Print test result in human readable format."""
-    if result.result == "PASS":
-        print(f"✅ {result.eval_id}: PASSED")
-    else:
-        print(f"❌ {result.eval_id}: {result.result}")
-        print(f" Evaluation Type: {result.eval_type}")
-        print(f" Query: {result.query}")
-        print(f" Response: {result.response}")
+logging.getLogger("lsc_agent_eval").setLevel(logging.INFO)
 
-        # Show expected values based on eval type
-        if config.eval_type == "sub-string" and config.expected_key_words:
-            print(f" Expected Keywords: {config.expected_key_words}")
-        elif config.eval_type == "judge-llm" and config.expected_response:
-            print(f" Expected Response: {config.expected_response}")
-        elif config.eval_type == "script" and config.eval_verify_script:
-            print(f" Verification Script: {config.eval_verify_script}")
-
-        if result.error:
-            print(f" Error: {result.error}")
-        print()
-
 # Create proper Namespace object for AgentGoalEval
 args = argparse.Namespace()
-args.eval_data_yaml = 'eval_data.yaml'
-args.agent_endpoint = 'http://localhost:8090'
-args.agent_provider = 'gemini'
-args.agent_model = 'gemini/gemini-2.5-flash'
+args.eval_data_yaml = "eval_data.yaml"
+args.agent_endpoint = "http://localhost:8090"
+args.agent_provider = "gemini"
+args.agent_model = "gemini/gemini-2.5-flash"
 # Set up judge model for LLM evaluation
-args.judge_provider = 'gemini'
-args.judge_model = 'gemini-2.5-flash'
-args.agent_auth_token_file = 'ocm_token.txt'
-args.result_dir = 'results'
+args.judge_provider = "gemini"
+args.judge_model = "gemini-2.5-flash"
+args.agent_auth_token_file = "ocm_token.txt"
+args.result_dir = "eval_output"
 
 evaluator = AgentGoalEval(args)
-configs = evaluator.data_manager.get_eval_data()
-
-print(f"Running {len(configs)} evaluation(s)...")
-print("=" * 50)
-
-passed = 0
-failed = 0
-
-for i, config in enumerate(configs, 1):
-    print(f"[{i}/{len(configs)}] Running: {config.eval_id}")
-
-    result = evaluator.evaluation_runner.run_evaluation(
-        config, args.agent_provider, args.agent_model
-    )
-
-    # Count results as we go
-    if result.result == "PASS":
-        passed += 1
-    else:
-        failed += 1
-
-    # Print result immediately
-    print_test_result(result, config)
-
-# Print final summary
-print("=" * 50)
-total = len(configs)
-
-print(f"FINAL RESULTS: {passed}/{total} passed")
-
-if failed > 0:
-    print(f"❌ {failed} evaluation(s) failed!")
+# Run Evaluation
+evaluator.run_evaluation()
+# Get result summary
+result_summary = evaluator.get_result_summary()
+
+failed_evals_count = result_summary["FAIL"] + result_summary["ERROR"]
+if failed_evals_count:
+    print(f"❌ {failed_evals_count} evaluation(s) failed!")
     sys.exit(1)
-else:
-    print("🎉 All evaluations passed!")
-    sys.exit(0)
+
+print("🎉 All evaluations passed!")
+sys.exit(0)
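
The refactor delegates per-evaluation reporting to the library and keeps only the exit-code logic in the script. If the removed `FINAL RESULTS: x/y passed` line is still wanted, it could be rebuilt from the same summary dict; a sketch, under the assumption (not shown in this diff) that `get_result_summary()` also exposes a `"PASS"` count alongside the confirmed `"FAIL"` and `"ERROR"` keys:

```python
from lsc_agent_eval import AgentGoalEval


def print_final_results(evaluator: AgentGoalEval) -> None:
    """Recreate the removed 'FINAL RESULTS: x/y passed' line from the summary dict."""
    summary = evaluator.get_result_summary()
    passed = summary.get("PASS", 0)  # assumed key; only "FAIL" and "ERROR" appear in this diff
    total = passed + summary["FAIL"] + summary["ERROR"]
    print(f"FINAL RESULTS: {passed}/{total} passed")
```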