diff --git a/.gitignore b/.gitignore
index 23cf4fcd..121a4627 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,6 @@
 /.env
 /config/lightspeed-stack.yaml
+.vscode
+ocm_token.txt
+.venv
+.python-version
diff --git a/Makefile b/Makefile
index e4f79fa4..d3352da8 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@
 .PHONY: all \
 	build-images \
 	build-inspector build-assisted-mcp build-lightspeed-stack build-lightspeed-plus-llama-stack build-ui \
-	generate run resume stop rm logs query query-interactive mcphost help
+	generate run resume stop rm logs query query-interactive mcphost test-eval help
 
 all: help ## Show help information
 
@@ -68,6 +68,12 @@ mcphost: ## Attach to mcphost
 	@echo "Attaching to mcphost..."
 	./scripts/mcphost.sh
 
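+# test-eval writes a fresh OCM token to test/evals/ocm_token.txt, which is the
+# file eval.py reads via its agent_auth_token_file setting.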
+test-eval: ## Run agent evaluation tests
+	@echo "Refreshing OCM token..."
+	@. utils/ocm-token.sh && get_ocm_token && echo "$$OCM_TOKEN" > test/evals/ocm_token.txt
+	@echo "Running agent evaluation tests..."
+	@cd test/evals && python eval.py
+
 help: ## Show this help message
 	@echo "Available targets:"
 	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "  \033[36m%-30s\033[0m %s\n", $$1, $$2}'
@@ -77,4 +83,5 @@ help: ## Show this help message
 	@echo "  make run"
 	@echo "  make logs"
 	@echo "  make query"
-	@echo "  make query-interactive"
+	@echo "  make query-interactive"
+	@echo "  make test-eval"
diff --git a/scripts/eval.py b/scripts/eval.py
deleted file mode 100755
index 5f941823..00000000
--- a/scripts/eval.py
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/usr/bin/env python3
-
-import yaml
-
-# Rough placeholder script for future evals / scoring
-
-# def load_evals(file_path):
-#     with open(file_path, 'r') as file:
-#         evals = yaml.safe_load(file)
-#     return evals
-
-# def run_evaluation(prompt, expected):
-#     # TODO: Run scoring function or something? Check for MCP tool calls?
-#     pass
-
-# def chat(prompts):
-#     for prompt in prompts:
-#         prompt = eval.get('prompt')
-#         expected = eval.get('expected')
-
-#         if prompt is not None and expected is not None:
-#             result = run_evaluation(prompt, expected)
-#             results.append(result)
-#         else:
-#             raise ValueError("Each eval must have 'prompt' and 'expected' fields")
-
-# def execute_evaluations(evaluations):
-#     results = []
-#     for eval in evaluations:
-#         prompts = eval.get('prompts', [])
-#         chat(prompts)
-
-#     print("Evaluation Results:")
-#     for result in results:
-#         print(result)
-
-# if __name__ == "__main__":
-#     evals = load_evals('test/evals/basic_evals.yaml')
-#     execute_evaluations(evals["evaluations"])
-
-
diff --git a/test/evals/AGENT_E2E_EVAL.md b/test/evals/AGENT_E2E_EVAL.md
deleted file mode 100644
index 0783ce77..00000000
--- a/test/evals/AGENT_E2E_EVAL.md
+++ /dev/null
@@ -1,53 +0,0 @@
-# Agent Task Completion Evaluation
-Evaluation mechanism to validate Agent task completion (e2e)
-- Supports `script` (similar to [k8s-bench](https://github.com/GoogleCloudPlatform/kubectl-ai/tree/main/k8s-bench)), `sub-string` and `judge-llm` based evaluation.
-- Refer [eval data setup](https://github.com/asamal4/lightspeed-evaluation/blob/agent-goal-eval/agent_eval/data/agent_goal_eval.yaml)
-- Currently it is single-turn evaluation process.
-
-## Prerequisites
-- **Python**: Version 3.11.1 to 3.12.9
-- **Assisted Chat API**: Must be running
-- Install lightspeed-core **agent e2e eval**
-```bash
-python -m pip install git+https://github.com/asamal4/lightspeed-evaluation.git@agent-goal-eval#subdirectory=agent_eval
-```
-- Add `OCM Token` to a text file (Ex: ocm_token.txt)
-- Create **eval data yaml** file (Ex: eval_data.yaml) [reference](https://github.com/asamal4/lightspeed-evaluation/blob/agent-goal-eval/agent_eval/data/agent_goal_eval.yaml)
-- Refer [Eval README](https://github.com/asamal4/lightspeed-evaluation/blob/agent-goal-eval/agent_eval/README.md) for **judge model** setup
-
-## Sample Code
-```python
-from agent_eval import AgentGoalEval  # TODO: will change the package name
-
-# Create Eval config/args (Alternatively Namespace can be used)
-class EvalArgs:
-    def __init__(self):
-        self.eval_data_yaml = 'eval_data.yaml'
-        self.agent_endpoint = 'http://localhost:8090'
-        self.agent_provider = 'gemini'
-        self.agent_model = 'gemini/gemini-2.5-flash'
-        self.judge_provider = None
-        self.judge_model = None
-        self.agent_auth_token_file = 'ocm_token.txt'  # TODO: will move to env variable.
-        self.result_dir = 'results/'
-args = EvalArgs()
-
-# Run evaluation
-evaluator = AgentGoalEval(args)
-evaluator.get_eval_result()
-```
-
-### Result
-- Test summary is stored in **agent_goal_eval_results.csv**
-- Console output
-```text
-=======================
-EVALUATION SUMMARY
-=======================
-Total Evaluations: 4
-Passed: 2
-Failed: 1
-Errored: 1
-Success Rate: 50.0%
-=======================
-```
diff --git a/test/evals/README.md b/test/evals/README.md
new file mode 100644
index 00000000..96168244
--- /dev/null
+++ b/test/evals/README.md
@@ -0,0 +1,79 @@
+# Agent Task Completion Evaluation
+An end-to-end (e2e) evaluation mechanism for validating agent task completion.
+- Supports `script`-based (similar to [k8s-bench](https://github.com/GoogleCloudPlatform/kubectl-ai/tree/main/k8s-bench)), `sub-string`, and `judge-llm` evaluation.
+- Refer to the [eval data setup](https://github.com/asamal4/lightspeed-evaluation/blob/agent-goal-eval/agent_eval/data/agent_goal_eval.yaml) for the data format.
+- Currently this is a single-turn evaluation process.
+
+## Prerequisites
+- **Python**: Version 3.11.1 to 3.12.9
+- **Assisted Chat API**: Must be running (`make build-images run`)
+- Install the lightspeed-core **agent e2e eval** package:
+```bash
+pip install git+https://github.com/lightspeed-core/lightspeed-evaluation.git#subdirectory=lsc_agent_eval
+```
+- The `GEMINI_API_KEY` environment variable must be set (see the sketch below)
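+
+For example, a minimal shell setup (illustrative; `get_ocm_token` is the same helper `make test-eval` sources, and the API key value is a placeholder):
+```bash
+export GEMINI_API_KEY=<your-gemini-api-key>
+. utils/ocm-token.sh && get_ocm_token && echo "$OCM_TOKEN" > test/evals/ocm_token.txt
+```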
+
+## Running tests
+
+`make test-eval` runs the tests.
+
+Example output:
+
+```
+Refreshing OCM token...
+Running agent evaluation tests...
+2025-07-21 09:18:39,195 - lsc_agent_eval.core.utils.judge - INFO - Setting up LiteLLM for gemini/gemini-2.5-flash
+2025-07-21 09:18:39,195 - lsc_agent_eval.core.utils.judge - WARNING - Using generic provider format for gemini
+Running 4 evaluation(s)...
+==================================================
+[1/4] Running: basic_introduction
+2025-07-21 09:18:40,039 - lsc_agent_eval.core.utils.api_client - INFO - Agent response >
+Hello! I'm an AI assistant for the Assisted Installer. I can help you create OpenShift clusters, list available versions, get cluster information, and more. What would you like to do today?
+✅ basic_introduction: PASSED
+[2/4] Running: basic_cluster_request
+2025-07-21 09:18:46,006 - lsc_agent_eval.core.utils.api_client - INFO - Agent response >
+I can help with that. What would you like to name your cluster? What OpenShift version do you want to install? What is the base domain for your cluster? Will this be a single-node cluster (True/False)?
+✅ basic_cluster_request: PASSED
+[3/4] Running: list_versions
+2025-07-21 09:18:52,458 - lsc_agent_eval.core.utils.api_client - INFO - Agent response >
+Here are the available OpenShift versions and their support levels:
+
+**Production:**
+*   4.19.3 (default)
+*   4.19.3-multi
+*   4.18.19
+*   4.18.19-multi
+
+**Maintenance:**
+*   4.17.35
+*   4.17.35-multi
+*   4.16.43
+*   4.16.43-multi
+*   4.15.54
+*   4.15.54-multi
+
+**Extended Support:**
+*   4.14.51
+*   4.14.51-multi
+*   4.12.71
+
+**Beta:**
+*   4.20.0-ec.4
+*   4.20.0-ec.4-multi
+
+**End-of-Life:**
+*   4.11.59
+*   4.10.67
+*   4.9.17
+✅ list_versions: PASSED
+[4/4] Running: available_operators
+2025-07-21 09:18:58,051 - lsc_agent_eval.core.utils.api_client - INFO - Agent response >
+There are two operator bundles available:
+
+*   **Virtualization**: Run virtual machines alongside containers on one platform. This bundle includes operators like `mtv`, `node-healthcheck`, `nmstate`, `node-maintenance`, `kube-descheduler`, `cnv`, `self-node-remediation`, and `fence-agents-remediation`.
+*   **OpenShift AI**: Train, serve, monitor and manage AI/ML models and applications using GPUs. This bundle includes operators like `openshift-ai`, `amd-gpu`, `node-feature-discovery`, `pipelines`, `servicemesh`, `authorino`, `kmm`, `odf`, `serverless`, and `nvidia-gpu`.
+✅ available_operators: PASSED
+==================================================
+FINAL RESULTS: 4/4 passed
+🎉 All evaluations passed!
+```
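+
+## Eval data format
+
+The checked-in `eval_data.yaml` uses only `judge-llm` checks. Judging by the config fields that `eval.py` reports for the other modes (`expected_key_words`, `eval_verify_script`), entries for those would look roughly like the sketch below; the ids, values, and verify script are hypothetical, so consult the eval data reference linked above for the exact schema.
+
+```yaml
+# Sketch only: field names taken from eval.py's result printer, values illustrative.
+- eval_id: versions_contain_keyword
+  eval_query: List the available OpenShift versions
+  eval_type: sub-string
+  expected_key_words:
+    - "4.19"
+
+- eval_id: cluster_actually_created
+  eval_query: Create a single-node cluster named foo
+  eval_type: script
+  eval_verify_script: verify_cluster.sh  # hypothetical helper script
+```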
diff --git a/test/evals/basic_evals.yaml b/test/evals/basic_evals.yaml
deleted file mode 100644
index e9b8a4ad..00000000
--- a/test/evals/basic_evals.yaml
+++ /dev/null
@@ -1,74 +0,0 @@
-description: |
-  This file contains a set of evaluations for the Assisted Installer Chat. Each evaluation consists of a description and a set of prompts with approximate expected responses. The purpose is to test the chat's ability to assist users in installing OpenShift clusters, providing information about available versions, and listing available operators.
-evaluations:
-  - description: |
-      Basic interaction / introduction
-    prompts:
-      - prompt: |
-          Hi!
-        expected_response: |
-          Hello! I am Assisted Installer Chat, created by Red Hat. How can I help you today?
-  - description: |
-      Responds correctly to the most basic request - installing a cluster
-    prompts:
-      - prompt: |
-          I want to install an OCP cluster
-        expected_response: |
-          I can help you create an OpenShift cluster. I need some more information first. What would you like the cluster to be named? What is the base domain for the cluster? What OpenShift version do you want to install? Do you want to create a single-node cluster?
-  - description: |
-      Responds correctly a user interested in the available versions
-    prompts:
-      - prompt: |
-          List versions
-        expected_response: "Here is a list of available OpenShift versions: 4.10.67, 4.11.59, 4.12.71, 4.14.51, 4.14.51-multi, 4.15.53, 4.15.53-multi, 4.16.42, 4.16.42-multi, 4.17.34, 4.17.34-multi, 4.18.18, 4.18.18-multi, 4.19.1, 4.19.1-multi, 4.20.0-ec.3, 4.20.0-ec.3-multi, 4.9.17. \n"
-  - description: |
-      A more deep installation conversation, happy flow, user provides all the details.
-    prompts:
-      - prompt: |
-          I want to install an OCP cluster
-        expected_response: |
-          Hello! I am Assisted Installer Chat, created by Red Hat. How can I help you today? To create an OpenShift
-          cluster, I need a few more details. Could you please provide the following:
-
-          • The desired cluster name
-          • The base domain for the cluster
-          • The OpenShift version you'd like to install
-          • Whether you want a single-node cluster or a multi-node cluster
-      - prompt: |
-          The name should be foo, the domain: foo.bar and the version should be the default version
-        expected_response: "OK. The default version is 4.19.1. Is this a single node cluster?"
-        expected_tool_calls:
-          - name: assisted__list_versions
-            args: {}
-      - prompt: |
-          Single
-        expected_response: |
-          I've created a single-node OpenShift cluster named 'foo' with base domain 'foo.bar' using version 4.19.1. The cluster ID is a797a26d-2727-4109-ac29-64fc07e21b9a and the InfraEnv ID is 14de509e-729e-43f0-8b54-0cd27d98b6d0.
-        expected_tool_calls:
-          - name: assisted__create_cluster
-            args:
-              name: foo
-              base_domain: foo.bar
-              version: 4.19.1
-              single_node: true
-      - prompt: "I want the discovery ISO URL"
-        expected_response: |
-          OK. I have the information for the InfraEnv ID: c2221a86-d570-4701-b07a-3bfcf2583a31
-
-          The discovery ISO URL is: https://api.openshift.com/api/assisted-images/bytoken/eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3NTE5MDQ4MDAsInN1YiI6ImMyMjIxYTg2LWQ1NzAtNDcwMS1iMDdhLTNiZmNmMjU4M2EzMSJ9.DsuulrNOR9t0CmnmvBb8SVHRNo8hJ40YhX9dG0flkZQ/4.19/x86_64/minimal.iso
-        expected_tool_calls:
-          - name: assisted__infraenv_info
-            args:
-              infra_env_id: 14de509e-729e-43f0-8b54-0cd27d98b6d0
-  - description: |
-      Ask for available operators
-    prompts:
-      - prompt: |
-          What operators are available?
-        expected_response: |
-          Hello! I am Assisted Installer Chat, created by Red Hat. How can I help you today?
-
-          The available operator bundles are: Virtualization: Run virtual machines alongside containers on one platform. Operators included are kube-descheduler, mtv, nmstate, node-maintenance, fence-agents-remediation, cnv, node-healthcheck, self-node-remediation. OpenShift AI: Train, serve, monitor and manage AI/ML models and applications using GPUs. Operators included are node-feature-discovery, pipelines, serverless, odf, authorino, kmm, servicemesh, openshift-ai, nvidia-gpu, amd-gpu.
-        expected_tool_calls:
-          - name: assisted__list_operator_bundles
-            args: {}
diff --git a/test/evals/eval.py b/test/evals/eval.py
new file mode 100644
index 00000000..06f5c271
--- /dev/null
+++ b/test/evals/eval.py
@@ -0,0 +1,88 @@
+import sys
+import logging
+import argparse
+from lsc_agent_eval import AgentGoalEval
+
+# Configure logging: WARNING and above globally; the eval library is raised to INFO below
+logging.basicConfig(
+    level=logging.WARNING,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+
+# Enable the specific logger we want to see
+logging.getLogger('lsc_agent_eval').setLevel(logging.INFO)
+
+def print_test_result(result, config):
+    """Print test result in human readable format."""
+    if result.result == "PASS":
+        print(f"✅ {result.eval_id}: PASSED")
+    else:
+        print(f"❌ {result.eval_id}: {result.result}")
+        print(f"   Evaluation Type: {result.eval_type}")
+        print(f"   Query: {result.query}")
+        print(f"   Response: {result.response}")
+
+        # Show expected values based on eval type
+        if config.eval_type == "sub-string" and config.expected_key_words:
+            print(f"   Expected Keywords: {config.expected_key_words}")
+        elif config.eval_type == "judge-llm" and config.expected_response:
+            print(f"   Expected Response: {config.expected_response}")
+        elif config.eval_type == "script" and config.eval_verify_script:
+            print(f"   Verification Script: {config.eval_verify_script}")
+
+        if result.error:
+            print(f"   Error: {result.error}")
+        print()
+
+# Create proper Namespace object for AgentGoalEval
+args = argparse.Namespace()
+args.eval_data_yaml = 'eval_data.yaml'
+args.agent_endpoint = 'http://localhost:8090'
+args.agent_provider = 'gemini'
+args.agent_model = 'gemini/gemini-2.5-flash'
+# Set up judge model for LLM evaluation
+args.judge_provider = 'gemini'
+args.judge_model = 'gemini-2.5-flash'
+args.agent_auth_token_file = 'ocm_token.txt'
+args.result_dir = 'results'
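+# Note: ocm_token.txt is written by `make test-eval` before this script runs, and
+# the endpoint points at the locally running Assisted Chat API.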
+
+evaluator = AgentGoalEval(args)
+configs = evaluator.data_manager.get_eval_data()
+
+print(f"Running {len(configs)} evaluation(s)...")
+print("=" * 50)
+
+passed = 0
+failed = 0
+
+for i, config in enumerate(configs, 1):
+    print(f"[{i}/{len(configs)}] Running: {config.eval_id}")
+
+    result = evaluator.evaluation_runner.run_evaluation(
+        config, args.agent_provider, args.agent_model
+    )
+
+    # Count results as we go
+    if result.result == "PASS":
+        passed += 1
+    else:
+        failed += 1
+
+    # Print result immediately
+    print_test_result(result, config)
+
+# Print final summary
+print("=" * 50)
+total = len(configs)
+
+print(f"FINAL RESULTS: {passed}/{total} passed")
+
+if failed > 0:
+    print(f"❌ {failed} evaluation(s) failed!")
+    sys.exit(1)
+else:
+    print("🎉 All evaluations passed!")
+    sys.exit(0)
diff --git a/test/evals/eval_data.yaml b/test/evals/eval_data.yaml
new file mode 100644
index 00000000..0b97ae89
--- /dev/null
+++ b/test/evals/eval_data.yaml
@@ -0,0 +1,19 @@
+- eval_id: basic_introduction
+  eval_query: Hi!
+  eval_type: judge-llm
+  expected_response: "Hello! I'm the Assisted Installer, your guide for OpenShift cluster installation. How can I help you today?"
+
+- eval_id: basic_cluster_request
+  eval_query: I want to install an OCP cluster
+  eval_type: judge-llm
+  expected_response: "Great, I can help you with that. To create a cluster, I'll need some information from you. First, what would you like to call your cluster? And what base domain would you like to use? And finally, what OpenShift version would you like to install?"
+
+- eval_id: list_versions
+  eval_query: List the available OpenShift versions
+  eval_type: judge-llm
+  expected_response: "There are several versions of OpenShift available. The most recent production version is 4.19; 4.20 pre-release versions are available, as well as several previous versions."
+
+- eval_id: available_operators
+  eval_query: What operators are available?
+  eval_type: judge-llm
+  expected_response: "The operators that can be installed onto clusters are OpenShift AI and OpenShift Virtualization."
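+
+# With eval_type judge-llm, expected_response serves as a semantic reference for the
+# judge model rather than an exact-match string.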