diff --git a/test/evals/eval.py b/test/evals/eval.py index 3001aa5..896fcf4 100644 --- a/test/evals/eval.py +++ b/test/evals/eval.py @@ -88,7 +88,7 @@ def parse_args(): result_summary = evaluator.get_result_summary() failed_evals_count = result_summary["FAIL"] + result_summary["ERROR"] -if failed_evals_count > 2: +if failed_evals_count: print(f"❌ {failed_evals_count} evaluation(s) failed!") sys.exit(1) diff --git a/test/evals/eval_data.yaml b/test/evals/eval_data.yaml index 5254a30..acc81c8 100644 --- a/test/evals/eval_data.yaml +++ b/test/evals/eval_data.yaml @@ -40,8 +40,8 @@ conversation: - eval_id: static_networking_support eval_query: Create a cluster with static networking - eval_types: [response_eval:sub-string] - expected_keywords: ["I do not support creating clusters with static networking", "assisted-installer web-based wizard"] + eval_types: [response_eval:accuracy] + expected_response: "I do not support creating clusters with static networking, however, you can use the assisted-installer web-based wizard to configure a cluster with static networking." - conversation_group: sno_requirements_conv conversation: @@ -88,7 +88,7 @@ conversation: - eval_id: create_eval_test_multinode eval_types: [tool_eval, response_eval:accuracy, response_eval:sub-string] - eval_query: Create a multi-node cluster named 'eval-test-multinode-uniq-cluster-name' with OpenShift 4.18.22 and domain test.local + eval_query: Create a multi-node cluster named 'eval-test-multinode-uniq-cluster-name' with OpenShift 4.18.22 and domain test.local. expected_tool_calls: - - tool_name: create_cluster arguments: @@ -96,8 +96,8 @@ version: "4\\.18\\.22" base_domain: "test\\.local" single_node: "(?i:false)" - cpu_architecture: None - ssh_public_key: None + cpu_architecture: "x86_64" + ssh_public_key: "" expected_keywords: ["eval-test-multinode-uniq-cluster-name", "ID", "Discovery ISO", "cluster"] expected_response: I have created a cluster with name eval-test-multinode-uniq-cluster-name. Next, you'll need to download the Discovery ISO, then boot your hosts with it. Would you like me to get the Discovery ISO download URL? - eval_id: set_ssh_key_eval_test_ssh @@ -118,12 +118,8 @@ cluster_id: "[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}" expected_keywords: ["Discovery ISO"] - eval_id: host_booted_but_not_discovered - eval_query: I booted the hosts - eval_types: [tool_eval, response_eval:accuracy] - expected_tool_calls: - - - tool_name: cluster_info - arguments: - cluster_id: "[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}" + eval_query: I booted the hosts for the cluster you just created + eval_types: [response_eval:accuracy] expected_response: "hosts hasn't been discovered yet." - conversation_group: list_clusters_conv @@ -178,19 +174,3 @@ expected_response: I can help you install OpenShift on-premise using the Assisted Installer, either on bare metal servers or virtual machines (e.g., vSphere, KVM, libvirt). I do not support public cloud platforms like AWS, Azure, or GCP. description: Ensure the assistant concisely states supported on-prem baremetal/VM scope and references Assisted Installer -- conversation_group: cluster_id_from_name - conversation: - - eval_id: create_single_node_cluser - eval_query: Create a multi-node cluster named 'eval-test2-uniq-cluster-name' with OpenShift 4.18.22 and domain test.local. I do not have an SSH key to provide. - eval_types: [response_eval:accuracy, response_eval:sub-string] - expected_keywords: ["eval-test2-uniq-cluster-name", "ID", "Discovery ISO", "download", "cluster"] - expected_response: I have created a cluster with name eval-test-uniq-cluster-name. Next, you'll need to download the Discovery ISO, then boot your hosts with it. Would you like me to get the Discovery ISO download URL? - - eval_id: cluster_name_tool_call - eval_query: Show me information on cluster eval-test2-uniq-cluster-name - eval_types: [tool_eval, response_eval:sub-string] - expected_tool_calls: - - - tool_name: cluster_info - arguments: - cluster_id: "[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}" - expected_keywords: ["cluster", "eval-test2-uniq-cluster-name", "test.local", "4.18.22"] - description: Test handling requesting a cluster by name