4 changes: 4 additions & 0 deletions .gitignore
@@ -1,2 +1,6 @@
/.env
/config/lightspeed-stack.yaml
.vscode
ocm_token.txt
.venv
.python-version
11 changes: 9 additions & 2 deletions Makefile
@@ -4,7 +4,7 @@
.PHONY: all \
build-images \
build-inspector build-assisted-mcp build-lightspeed-stack build-lightspeed-plus-llama-stack build-ui \
generate run resume stop rm logs query query-interactive mcphost help
generate run resume stop rm logs query query-interactive mcphost test-eval help

all: help ## Show help information

@@ -68,6 +68,12 @@ mcphost: ## Attach to mcphost
@echo "Attaching to mcphost..."
./scripts/mcphost.sh

test-eval: ## Run agent evaluation tests
@echo "Refreshing OCM token..."
@. utils/ocm-token.sh && get_ocm_token && echo "$$OCM_TOKEN" > test/evals/ocm_token.txt
@echo "Running agent evaluation tests..."
@cd test/evals && python eval.py

help: ## Show this help message
@echo "Available targets:"
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-30s\033[0m %s\n", $$1, $$2}'
@@ -77,4 +83,5 @@ help: ## Show this help message
@echo " make run"
@echo " make logs"
@echo " make query"
@echo " make query-interactive"
@echo " make query-interactive"
@echo " make test-eval"
42 changes: 0 additions & 42 deletions scripts/eval.py

This file was deleted.

53 changes: 0 additions & 53 deletions test/evals/AGENT_E2E_EVAL.md

This file was deleted.

79 changes: 79 additions & 0 deletions test/evals/README.md
@@ -0,0 +1,79 @@
# Agent Task Completion Evaluation
An evaluation mechanism to validate Agent task completion end to end (e2e).
- Supports `script` (similar to [k8s-bench](https://github.com/GoogleCloudPlatform/kubectl-ai/tree/main/k8s-bench)), `sub-string`, and `judge-llm` based evaluation.
- Refer to the [eval data setup](https://github.com/asamal4/lightspeed-evaluation/blob/agent-goal-eval/agent_eval/data/agent_goal_eval.yaml) for the expected data format.
- Currently this is a single-turn evaluation process.

## Prerequisites
- **Python**: Version 3.11.1 to 3.12.9
- **Assisted Chat API**: Must be running (`make build-images run`)
- Install the lightspeed-core **agent e2e eval** package:
```bash
pip install git+https://github.com/lightspeed-core/lightspeed-evaluation.git#subdirectory=lsc_agent_eval
```
- The `GEMINI_API_KEY` environment variable must be set

Comment on lines +7 to +15

💡 Verification agent

🧩 Analysis chain

Verify Python version requirements and repository availability.

The Python version range (3.11.1 to 3.12.9) seems very specific. Consider whether this range is actually required or if it could be more inclusive. Also, ensure the referenced GitHub repository and subdirectory path are correct and accessible.


🏁 Script executed:

#!/bin/bash
# Verify the GitHub repository and subdirectory exist
curl -s -o /dev/null -w "%{http_code}" https://github.com/lightspeed-core/lightspeed-evaluation/tree/main/lsc_agent_eval

# Check if the repository is accessible
gh repo view lightspeed-core/lightspeed-evaluation --json name,visibility,isPrivate

Length of output: 284


🏁 Script executed:

#!/bin/bash
set -e

echo "Searching for python_requires in project files..."
rg -n "python_requires" .
rg -n "requires-python" .

echo -e "\nSearching GitHub Actions workflows for Python version matrix..."
rg -n "python-version" -g "*.yml" .
rg -n "matrix.*python" -g "*.yml" .

Length of output: 143


Ensure Python version constraints are enforced and docs are accurate

It looks like the codebase doesn’t declare any Python version bounds in its packaging metadata (no python_requires or requires-python in setup.py/pyproject.toml) nor in its CI workflows, yet the README pins “Python 3.11.1 to 3.12.9”. Please confirm the actual supported Python versions and either:

  • Add a python_requires=">=3.11.1,<3.13" (or whatever range you support) to your packaging config
  • Update the README to match the true compatibility tested in CI
  • (Optional) Add those versions to your GitHub Actions matrix to enforce coverage (a sketch follows below)

The GitHub repo and subdirectory path (lightspeed-core/lightspeed-evaluation#main/lsc_agent_eval) are public and valid.
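If the team wants the optional CI coverage, a minimal sketch of such a matrix is shown below. The workflow file name, job layout, and action versions are assumptions for illustration, not part of this PR:

```yaml
# Hypothetical workflow snippet (e.g. .github/workflows/eval.yml), not part of this PR
jobs:
  eval:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Versions assumed from the README's stated 3.11.1–3.12.9 range
        python-version: ["3.11", "3.12"]
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - run: python --version  # placeholder; a real job would install deps and run the evals
```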

🤖 Prompt for AI Agents
In test/evals/README.md around lines 7 to 15, the README specifies Python
version 3.11.1 to 3.12.9 but the packaging metadata and CI workflows do not
enforce this constraint. Verify the actual supported Python versions, then add a
python_requires field with the correct version range to setup.py or
pyproject.toml, update the README to reflect the accurate supported versions,
and optionally include these versions in the GitHub Actions matrix to ensure CI
coverage.

## Running tests

`make test-eval` runs the tests.

Example output:

```
Refreshing OCM token...
Running agent evaluation tests...
2025-07-21 09:18:39,195 - lsc_agent_eval.core.utils.judge - INFO - Setting up LiteLLM for gemini/gemini-2.5-flash
2025-07-21 09:18:39,195 - lsc_agent_eval.core.utils.judge - WARNING - Using generic provider format for gemini
Running 4 evaluation(s)...
==================================================
[1/4] Running: basic_introduction
2025-07-21 09:18:40,039 - lsc_agent_eval.core.utils.api_client - INFO - Agent response >
Hello! I'm an AI assistant for the Assisted Installer. I can help you create OpenShift clusters, list available versions, get cluster information, and more. What would you like to do today?
✅ basic_introduction: PASSED
[2/4] Running: basic_cluster_request
2025-07-21 09:18:46,006 - lsc_agent_eval.core.utils.api_client - INFO - Agent response >
I can help with that. What would you like to name your cluster? What OpenShift version do you want to install? What is the base domain for your cluster? Will this be a single-node cluster (True/False)?
✅ basic_cluster_request: PASSED
[3/4] Running: list_versions
2025-07-21 09:18:52,458 - lsc_agent_eval.core.utils.api_client - INFO - Agent response >
Here are the available OpenShift versions and their support levels:

**Production:**
* 4.19.3 (default)
* 4.19.3-multi
* 4.18.19
* 4.18.19-multi

**Maintenance:**
* 4.17.35
* 4.17.35-multi
* 4.16.43
* 4.16.43-multi
* 4.15.54
* 4.15.54-multi

**Extended Support:**
* 4.14.51
* 4.14.51-multi
* 4.12.71

**Beta:**
* 4.20.0-ec.4
* 4.20.0-ec.4-multi

**End-of-Life:**
* 4.11.59
* 4.10.67
* 4.9.17
✅ list_versions: PASSED
[4/4] Running: available_operators
2025-07-21 09:18:58,051 - lsc_agent_eval.core.utils.api_client - INFO - Agent response >
There are two operator bundles available:

* **Virtualization**: Run virtual machines alongside containers on one platform. This bundle includes operators like `mtv`, `node-healthcheck`, `nmstate`, `node-maintenance`, `kube-descheduler`, `cnv`, `self-node-remediation`, and `fence-agents-remediation`.
* **OpenShift AI**: Train, serve, monitor and manage AI/ML models and applications using GPUs. This bundle includes operators like `openshift-ai`, `amd-gpu`, `node-feature-discovery`, `pipelines`, `servicemesh`, `authorino`, `kmm`, `odf`, `serverless`, and `nvidia-gpu`.
✅ available_operators: PASSED
==================================================
FINAL RESULTS: 4/4 passed
🎉 All evaluations passed!
```
74 changes: 0 additions & 74 deletions test/evals/basic_evals.yaml

This file was deleted.

88 changes: 88 additions & 0 deletions test/evals/eval.py
@@ -0,0 +1,88 @@
import sys
import logging
import argparse
from lsc_agent_eval import AgentGoalEval

# Configure logging to show all messages from agent_eval library
logging.basicConfig(
    level=logging.WARNING,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)

# Enable specific loggers we want to see
logging.getLogger('lsc_agent_eval').setLevel(logging.INFO)

def print_test_result(result, config):
    """Print test result in human readable format."""
    if result.result == "PASS":
        print(f"✅ {result.eval_id}: PASSED")
    else:
        print(f"❌ {result.eval_id}: {result.result}")
        print(f" Evaluation Type: {result.eval_type}")
        print(f" Query: {result.query}")
        print(f" Response: {result.response}")

        # Show expected values based on eval type
        if config.eval_type == "sub-string" and config.expected_key_words:
            print(f" Expected Keywords: {config.expected_key_words}")
        elif config.eval_type == "judge-llm" and config.expected_response:
            print(f" Expected Response: {config.expected_response}")
        elif config.eval_type == "script" and config.eval_verify_script:
            print(f" Verification Script: {config.eval_verify_script}")

        if result.error:
            print(f" Error: {result.error}")
        print()

# Create proper Namespace object for AgentGoalEval
args = argparse.Namespace()
args.eval_data_yaml = 'eval_data.yaml'
args.agent_endpoint = 'http://localhost:8090'
args.agent_provider = 'gemini'
args.agent_model = 'gemini/gemini-2.5-flash'
# Set up judge model for LLM evaluation
args.judge_provider = 'gemini'
args.judge_model = 'gemini-2.5-flash'
args.agent_auth_token_file = 'ocm_token.txt'
args.result_dir = 'results'

evaluator = AgentGoalEval(args)
configs = evaluator.data_manager.get_eval_data()

print(f"Running {len(configs)} evaluation(s)...")
print("=" * 50)

passed = 0
failed = 0

for i, config in enumerate(configs, 1):
    print(f"[{i}/{len(configs)}] Running: {config.eval_id}")

    result = evaluator.evaluation_runner.run_evaluation(
        config, args.agent_provider, args.agent_model
    )

    # Count results as we go
    if result.result == "PASS":
        passed += 1
    else:
        failed += 1

    # Print result immediately
    print_test_result(result, config)

# Print final summary
print("=" * 50)
total = len(configs)

print(f"FINAL RESULTS: {passed}/{total} passed")

if failed > 0:
    print(f"❌ {failed} evaluation(s) failed!")
    sys.exit(1)
else:
    print("🎉 All evaluations passed!")
    sys.exit(0)
19 changes: 19 additions & 0 deletions test/evals/eval_data.yaml
@@ -0,0 +1,19 @@
- eval_id: basic_introduction
eval_query: Hi!
eval_type: judge-llm
expected_response: "Hello! I'm the Assisted Installer, your guide for OpenShift cluster installation. How can I help you today?"

- eval_id: basic_cluster_request
eval_query: I want to install an OCP cluster
eval_type: judge-llm
expected_response: "Great, I can help you with that. To create a cluster, I'll need some information from you. First, what would you like to call your cluster? And what base domain would you like to use? And finally, what OpenShift version would you like to install?"

- eval_id: list_versions
eval_query: List the available OpenShift versions
eval_type: judge-llm
expected_response: "There are several versions of OpenShift available. The most recent production version is 4.19, 4.20 pre release versions are available as well as several previous verrsions."

- eval_id: available_operators


Nit: The naming is a bit confusing to me, since these are Operator bundles rather than operators.

This happened to me also when I asked a question to the service, so I understand the naming was chosen like that?

Collaborator Author


> This happened to me also when I asked a question to the service, so I understand the naming was chosen like that?

What happened, exactly?


I will try to reproduce it and update here later.
IIRC, I asked about individual operators and the response was about operator bundles, but without calling them "bundles".

It can also be confusing because for the two bundles, we have similarly named operators.

eval_query: What operators are available?
eval_type: judge-llm
expected_response: "The operators that can be installed onto clusters are OpenShift AI and OpenShift Virtualization."