diff --git a/sdk/ai/azure-ai-projects/assets.json b/sdk/ai/azure-ai-projects/assets.json index cf8a8fe386d2..24777c3b7249 100644 --- a/sdk/ai/azure-ai-projects/assets.json +++ b/sdk/ai/azure-ai-projects/assets.json @@ -2,5 +2,5 @@ "AssetsRepo": "Azure/azure-sdk-assets", "AssetsRepoPrefixPath": "python", "TagPrefix": "python/ai/azure-ai-projects", - "Tag": "python/ai/azure-ai-projects_7cddb7d06f" + "Tag": "python/ai/azure-ai-projects_212aab4d9b" } diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/README.md b/sdk/ai/azure-ai-projects/samples/evaluations/README.md new file mode 100644 index 000000000000..a5294316b515 --- /dev/null +++ b/sdk/ai/azure-ai-projects/samples/evaluations/README.md @@ -0,0 +1,105 @@ +# Azure AI Projects - Evaluation Samples + +This folder contains samples demonstrating how to use Azure AI Foundry's evaluation capabilities with the `azure-ai-projects` SDK. + +## Prerequisites + +Before running any sample: + +```bash +pip install "azure-ai-projects>=2.0.0b1" python-dotenv +``` + +Set these environment variables: +- `AZURE_AI_PROJECT_ENDPOINT` - Your Azure AI Project endpoint (e.g., `https://.services.ai.azure.com/api/projects/`) +- `AZURE_AI_MODEL_DEPLOYMENT_NAME` - The model deployment name (e.g., `gpt-4o-mini`) + +## Sample Index + +### Getting Started + +| Sample | Description | +|--------|-------------| +| [sample_evaluations_builtin_with_inline_data.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_builtin_with_inline_data.py) | Basic evaluation with built-in evaluators using inline data | +| [sample_eval_catalog.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_catalog.py) | Browse and use evaluators from the evaluation catalog | + +### Agent Evaluation + +| Sample | Description | +|--------|-------------| +| [sample_agent_evaluation.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_evaluation.py) | Evaluate an agent's responses | +| [sample_agent_response_evaluation.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_response_evaluation.py) | Evaluate agent response quality | +| [sample_agent_response_evaluation_with_function_tool.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_response_evaluation_with_function_tool.py) | Evaluate agent with function tools | +| [sample_model_evaluation.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_model_evaluation.py) | Evaluate model responses directly | + +### Evaluator Types + +| Sample | Description | +|--------|-------------| +| [sample_evaluations_graders.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_graders.py) | OpenAI graders: label_model, text_similarity, string_check, score_model | +| [sample_evaluations_ai_assisted.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_ai_assisted.py) | AI-assisted evaluators: Similarity, ROUGE, METEOR, GLEU, F1, BLEU | +| [sample_eval_catalog_code_based_evaluators.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_catalog_code_based_evaluators.py) | Code-based 
evaluators from the catalog | +| [sample_eval_catalog_prompt_based_evaluators.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_catalog_prompt_based_evaluators.py) | Prompt-based evaluators from the catalog | + +### Insights & Analysis + +| Sample | Description | +|--------|-------------| +| [sample_evaluation_compare_insight.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluation_compare_insight.py) | Compare evaluation runs and generate insights | +| [sample_evaluation_cluster_insight.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluation_cluster_insight.py) | Generate cluster insights from evaluation runs | + +### Red Team Evaluations + +| Sample | Description | +|--------|-------------| +| [sample_redteam_evaluations.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_redteam_evaluations.py) | Security and safety evaluations using red team techniques | + +### Agentic Evaluators + +Located in the [agentic_evaluators](https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/ai/azure-ai-projects/samples/evaluations/agentic_evaluators) subfolder: + +| Sample | Description | +|--------|-------------| +| [sample_coherence.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/agentic_evaluators/sample_coherence.py) | Evaluate response coherence | +| [sample_fluency.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/agentic_evaluators/sample_fluency.py) | Evaluate response fluency | +| [sample_groundedness.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/agentic_evaluators/sample_groundedness.py) | Evaluate response groundedness | +| [sample_relevance.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/agentic_evaluators/sample_relevance.py) | Evaluate response relevance | +| [sample_intent_resolution.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/agentic_evaluators/sample_intent_resolution.py) | Evaluate intent resolution | +| [sample_response_completeness.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/agentic_evaluators/sample_response_completeness.py) | Evaluate response completeness | +| [sample_task_adherence.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/agentic_evaluators/sample_task_adherence.py) | Evaluate task adherence | +| [sample_task_completion.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/agentic_evaluators/sample_task_completion.py) | Evaluate task completion | +| [sample_task_navigation_efficiency.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/agentic_evaluators/sample_task_navigation_efficiency.py) | Evaluate navigation efficiency | +| [sample_tool_call_accuracy.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/agentic_evaluators/sample_tool_call_accuracy.py) | Evaluate tool call accuracy | +| 
[sample_tool_call_success.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/agentic_evaluators/sample_tool_call_success.py) | Evaluate tool call success | +| [sample_tool_input_accuracy.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/agentic_evaluators/sample_tool_input_accuracy.py) | Evaluate tool input accuracy | +| [sample_tool_output_utilization.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/agentic_evaluators/sample_tool_output_utilization.py) | Evaluate tool output utilization | +| [sample_tool_selection.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/agentic_evaluators/sample_tool_selection.py) | Evaluate tool selection | +| [sample_generic_agentic_evaluator](https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/ai/azure-ai-projects/samples/evaluations/agentic_evaluators/sample_generic_agentic_evaluator) | Generic agentic evaluator example | + +### Advanced Samples + +These samples require additional setup or Azure services: + +| Sample | Description | Requirements | +|--------|-------------|--------------| +| [sample_evaluations_builtin_with_dataset_id.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_builtin_with_dataset_id.py) | Use uploaded dataset for evaluation | Azure Blob Storage | +| [sample_evaluations_builtin_with_traces.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_builtin_with_traces.py) | Evaluate against Application Insights traces | Azure Application Insights | +| [sample_scheduled_evaluations.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_scheduled_evaluations.py) | Schedule recurring evaluations | RBAC setup | +| [sample_continuous_evaluation_rule.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_continuous_evaluation_rule.py) | Set up continuous evaluation rules | Manual RBAC in Azure Portal | +| [sample_evaluations_score_model_grader_with_image.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_score_model_grader_with_image.py) | Evaluate with image data | Image file | +| [sample_evaluations_builtin_with_inline_data_oai.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_builtin_with_inline_data_oai.py) | Use OpenAI client directly | OpenAI SDK | + +## Running a Sample + +```bash +# Set environment variables +export AZURE_AI_PROJECT_ENDPOINT="https://your-account.services.ai.azure.com/api/projects/your-project" +export AZURE_AI_MODEL_DEPLOYMENT_NAME="gpt-4o-mini" + +# Run a sample +python sample_evaluations_builtin_with_inline_data.py +``` + +## Learn More + +- [Azure AI Foundry Documentation](https://learn.microsoft.com/azure/ai-studio/) diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluation_cluster_insight.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluation_cluster_insight.py index 98c18dd97d84..d0ff775ca3bf 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluation_cluster_insight.py +++ 
b/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluation_cluster_insight.py @@ -29,8 +29,6 @@ import os import time -import json -import tempfile from typing import Union from pprint import pprint from dotenv import load_dotenv @@ -39,7 +37,11 @@ from azure.identity import DefaultAzureCredential from azure.ai.projects import AIProjectClient from openai.types.eval_create_params import DataSourceConfigCustom, TestingCriterionLabelModel -from openai.types.evals.create_eval_jsonl_run_data_source_param import CreateEvalJSONLRunDataSourceParam, SourceFileID +from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent, +) from openai.types.evals.run_create_response import RunCreateResponse from openai.types.evals.run_retrieve_response import RunRetrieveResponse @@ -85,36 +87,23 @@ ) print(f"Evaluation created (id: {eval_object.id}, name: {eval_object.name})") - # Create and upload JSONL data as a dataset - eval_data = [ - {"item": {"query": "I love programming!"}}, - {"item": {"query": "I hate bugs."}}, - {"item": {"query": "The weather is nice today."}}, - {"item": {"query": "This is the worst movie ever."}}, - {"item": {"query": "Python is an amazing language."}}, - ] - - with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f: - for item in eval_data: - f.write(json.dumps(item) + "\n") - temp_file_path = f.name - - dataset = project_client.datasets.upload_file( - name="sentiment-eval-data", - version=str(int(time.time())), - file_path=temp_file_path, - ) - os.unlink(temp_file_path) - print(f"Dataset created (id: {dataset.id}, name: {dataset.name}, version: {dataset.version})") - - if not dataset.id: - raise ValueError("Dataset ID is None") - - # Create an eval run using the uploaded dataset + # Create an eval run using inline data eval_run: Union[RunCreateResponse, RunRetrieveResponse] = openai_client.evals.runs.create( eval_id=eval_object.id, - name="Eval Run", - data_source=CreateEvalJSONLRunDataSourceParam(source=SourceFileID(id=dataset.id, type="file_id"), type="jsonl"), + name="Eval Run with Inline Data", + data_source=CreateEvalJSONLRunDataSourceParam( + type="jsonl", + source=SourceFileContent( + type="file_content", + content=[ + SourceFileContentContent(item={"query": "I love programming!"}), + SourceFileContentContent(item={"query": "I hate bugs."}), + SourceFileContentContent(item={"query": "The weather is nice today."}), + SourceFileContentContent(item={"query": "This is the worst movie ever."}), + SourceFileContentContent(item={"query": "Python is an amazing language."}), + ], + ), + ), ) print(f"Evaluation run created (id: {eval_run.id})") @@ -142,7 +131,7 @@ print(f"Started insight generation (id: {clusterInsight.id})") while clusterInsight.state not in [OperationState.SUCCEEDED, OperationState.FAILED]: - print(f"Waiting for insight to be generated...") + print("Waiting for insight to be generated...") clusterInsight = project_client.insights.get(id=clusterInsight.id) print(f"Insight status: {clusterInsight.state}") time.sleep(5) @@ -150,12 +139,11 @@ if clusterInsight.state == OperationState.SUCCEEDED: print("\n✓ Cluster insights generated successfully!") pprint(clusterInsight) + else: + print("\n✗ Cluster insight generation failed.") else: print("\n✗ Evaluation run failed. 
Cannot generate cluster insights.") - project_client.datasets.delete(name=dataset.name, version=dataset.version) - print("Dataset deleted") - openai_client.evals.delete(eval_id=eval_object.id) print("Evaluation deleted") diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_ai_assisted.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_ai_assisted.py index e14beec50fd5..4312245c1b12 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_ai_assisted.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_ai_assisted.py @@ -7,7 +7,8 @@ """ DESCRIPTION: Given an AIProjectClient, this sample demonstrates how to use the synchronous - `openai.evals.*` methods to create, get and list evaluation and and eval runs. + `openai.evals.*` methods to create, get and list evaluation and eval runs + with AI-assisted evaluators (Similarity, ROUGE, METEOR, GLEU, F1, BLEU). USAGE: python sample_evaluations_ai_assisted.py @@ -20,39 +21,27 @@ 1) AZURE_AI_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your Microsoft Foundry project. It has the form: https://.services.ai.azure.com/api/projects/. 2) AZURE_AI_MODEL_DEPLOYMENT_NAME - Required. The name of the model deployment to use for evaluation. - 3) DATASET_NAME - Optional. The name of the Dataset to create and use in this sample. - 4) DATASET_VERSION - Optional. The version of the Dataset to create and use in this sample. - 5) DATA_FOLDER - Optional. The folder path where the data files for upload are located. """ import os from azure.identity import DefaultAzureCredential from azure.ai.projects import AIProjectClient -from azure.ai.projects.models import ( - DatasetVersion, -) - import time from pprint import pprint -from openai.types.evals.create_eval_jsonl_run_data_source_param import CreateEvalJSONLRunDataSourceParam, SourceFileID +from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent, +) from openai.types.eval_create_params import DataSourceConfigCustom from dotenv import load_dotenv -from datetime import datetime load_dotenv() endpoint = os.environ["AZURE_AI_PROJECT_ENDPOINT"] - model_deployment_name = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "") -dataset_name = os.environ.get("DATASET_NAME", "") -dataset_version = os.environ.get("DATASET_VERSION", "1") - -# Construct the paths to the data folder and data file used in this sample -script_dir = os.path.dirname(os.path.abspath(__file__)) -data_folder = os.environ.get("DATA_FOLDER", os.path.join(script_dir, "data_folder")) -data_file = os.path.join(data_folder, "sample_data_evaluation.jsonl") with ( DefaultAzureCredential() as credential, @@ -60,14 +49,6 @@ project_client.get_openai_client() as client, ): - print("Upload a single file and create a new Dataset to reference the file.") - dataset: DatasetVersion = project_client.datasets.upload_file( - name=dataset_name or f"eval-data-{datetime.utcnow().strftime('%Y-%m-%d_%H%M%S_UTC')}", - version=dataset_version, - file_path=data_file, - ) - pprint(dataset) - data_source_config = DataSourceConfigCustom( { "type": "custom", @@ -133,9 +114,9 @@ }, ] - print("Creating evaluation") + print("Creating evaluation with AI-assisted evaluators") eval_object = client.evals.create( - name="ai assisted evaluators test", + name="AI assisted evaluators test", data_source_config=data_source_config, testing_criteria=testing_criteria, # type: 
ignore ) @@ -146,13 +127,42 @@ print("Evaluation Response:") pprint(eval_object_response) - print("Creating evaluation run") + print("Creating evaluation run with inline data") eval_run_object = client.evals.runs.create( eval_id=eval_object.id, - name="dataset", - metadata={"team": "eval-exp", "scenario": "notifications-v1"}, + name="inline_data_ai_assisted_run", + metadata={"team": "eval-exp", "scenario": "ai-assisted-inline-v1"}, data_source=CreateEvalJSONLRunDataSourceParam( - source=SourceFileID(id=dataset.id or "", type="file_id"), type="jsonl" + type="jsonl", + source=SourceFileContent( + type="file_content", + content=[ + SourceFileContentContent( + item={ + "response": "The capital of France is Paris, which is also known as the City of Light.", + "ground_truth": "Paris is the capital of France.", + } + ), + SourceFileContentContent( + item={ + "response": "Python is a high-level programming language known for its simplicity and readability.", + "ground_truth": "Python is a popular programming language that is easy to learn.", + } + ), + SourceFileContentContent( + item={ + "response": "Machine learning is a subset of artificial intelligence that enables systems to learn from data.", + "ground_truth": "Machine learning allows computers to learn from data without being explicitly programmed.", + } + ), + SourceFileContentContent( + item={ + "response": "The sun rises in the east and sets in the west due to Earth's rotation.", + "ground_truth": "The sun appears to rise in the east and set in the west because of Earth's rotation.", + } + ), + ], + ), ), ) print(f"Eval Run created") @@ -174,8 +184,5 @@ time.sleep(5) print("Waiting for evaluation run to complete...") - project_client.datasets.delete(name=dataset.name, version=dataset.version) - print("Dataset deleted") - client.evals.delete(eval_id=eval_object.id) print("Evaluation deleted") diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_graders.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_graders.py index 806d734330fc..d8c89f72666c 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_graders.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_graders.py @@ -7,7 +7,8 @@ """ DESCRIPTION: Given an AIProjectClient, this sample demonstrates how to use the synchronous - `openai.evals.*` methods to create, get and list evaluation and and eval runs. + `openai.evals.*` methods to create, get and list evaluation and eval runs + using various grader types (label_model, text_similarity, string_check, score_model). USAGE: python sample_evaluations_graders.py @@ -20,38 +21,27 @@ 1) AZURE_AI_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your Microsoft Foundry project. It has the form: https://.services.ai.azure.com/api/projects/. 2) AZURE_AI_MODEL_DEPLOYMENT_NAME - Required. The name of the model deployment to use for evaluation. - 3) DATASET_NAME - Optional. The name of the Dataset to create and use in this sample. - 4) DATASET_VERSION - Optional. The version of the Dataset to create and use in this sample. - 5) DATA_FOLDER - Optional. The folder path where the data files for upload are located. 
""" import os from azure.identity import DefaultAzureCredential from azure.ai.projects import AIProjectClient -from azure.ai.projects.models import ( - DatasetVersion, -) import time from pprint import pprint -from openai.types.evals.create_eval_jsonl_run_data_source_param import CreateEvalJSONLRunDataSourceParam, SourceFileID +from openai.types.evals.create_eval_jsonl_run_data_source_param import ( + CreateEvalJSONLRunDataSourceParam, + SourceFileContent, + SourceFileContentContent, +) from openai.types.eval_create_params import DataSourceConfigCustom from dotenv import load_dotenv -from datetime import datetime load_dotenv() endpoint = os.environ["AZURE_AI_PROJECT_ENDPOINT"] - model_deployment_name = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "") -dataset_name = os.environ.get("DATASET_NAME", "") -dataset_version = os.environ.get("DATASET_VERSION", "1") - -# Construct the paths to the data folder and data file used in this sample -script_dir = os.path.dirname(os.path.abspath(__file__)) -data_folder = os.environ.get("DATA_FOLDER", os.path.join(script_dir, "data_folder")) -data_file = os.path.join(data_folder, "sample_data_evaluation.jsonl") with ( DefaultAzureCredential() as credential, @@ -59,14 +49,6 @@ project_client.get_openai_client() as client, ): - print("Upload a single file and create a new Dataset to reference the file.") - dataset: DatasetVersion = project_client.datasets.upload_file( - name=dataset_name or f"eval-data-{datetime.utcnow().strftime('%Y-%m-%d_%H%M%S_UTC')}", - version=dataset_version, - file_path=data_file, - ) - pprint(dataset) - data_source_config = DataSourceConfigCustom( { "type": "custom", @@ -130,7 +112,7 @@ }, ] - print("Creating evaluation") + print("Creating evaluation with graders") eval_object = client.evals.create( name="OpenAI graders test", data_source_config=data_source_config, @@ -143,13 +125,50 @@ print("Evaluation Response:") pprint(eval_object_response) - print("Creating Eval Run") + print("Creating Eval Run with inline data") eval_run_object = client.evals.runs.create( eval_id=eval_object.id, - name="dataset", - metadata={"team": "eval-exp", "scenario": "notifications-v1"}, + name="inline_data_graders_run", + metadata={"team": "eval-exp", "scenario": "graders-inline-v1"}, data_source=CreateEvalJSONLRunDataSourceParam( - source=SourceFileID(id=dataset.id or "", type="file_id"), type="jsonl" + type="jsonl", + source=SourceFileContent( + type="file_content", + content=[ + SourceFileContentContent( + item={ + "query": "I love this product! 
It works great.", + "context": "Product review context", + "ground_truth": "The product is excellent and performs well.", + "response": "The product is amazing and works perfectly.", + } + ), + SourceFileContentContent( + item={ + "query": "The weather is cloudy today.", + "context": "Weather observation", + "ground_truth": "Today's weather is overcast.", + "response": "The sky is covered with clouds today.", + } + ), + SourceFileContentContent( + item={ + "query": "What is the capital of France?", + "context": "Geography question about European capitals", + "ground_truth": "Paris", + "response": "The capital of France is Paris.", + } + ), + SourceFileContentContent( + item={ + "query": "Explain quantum computing", + "context": "Complex scientific concept explanation", + "ground_truth": "Quantum computing uses quantum mechanics principles", + "response": "Quantum computing leverages quantum mechanical phenomena like superposition and entanglement to process information.", + } + ), + ], + ), ), ) print(f"Eval Run created (id: {eval_run_object.id}, name: {eval_run_object.name})") @@ -166,13 +185,9 @@ output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id)) pprint(output_items) print(f"Eval Run Report URL: {run.report_url}") - break time.sleep(5) print("Waiting for eval run to complete...") - project_client.datasets.delete(name=dataset.name, version=dataset.version) - print("Dataset deleted") - client.evals.delete(eval_id=eval_object.id) print("Evaluation deleted") diff --git a/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py b/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py index acb3c5c2fd04..d3370ffebb19 100644 --- a/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py +++ b/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py @@ -45,9 +45,9 @@ class TestSamplesEvaluations(AzureRecordedTestCase): """ Tests for evaluation samples. - Included samples (25): + Included samples (28): - Main evaluation samples (10): + Main evaluation samples (13): - sample_agent_evaluation.py - sample_model_evaluation.py - sample_agent_response_evaluation.py @@ -58,6 +58,9 @@ class TestSamplesEvaluations(AzureRecordedTestCase): - sample_eval_catalog_prompt_based_evaluators.py - sample_evaluation_compare_insight.py - sample_redteam_evaluations.py + - sample_evaluations_graders.py (OpenAI graders: label_model, text_similarity, string_check, score_model) + - sample_evaluations_ai_assisted.py (AI-assisted evaluators: Similarity, ROUGE, METEOR, GLEU, F1, BLEU) + - sample_evaluation_cluster_insight.py (cluster insights generation) Agentic evaluator samples (15): - sample_coherence.py @@ -81,11 +84,8 @@ class TestSamplesEvaluations(AzureRecordedTestCase): Blob Storage / Dataset Upload (incompatible with test proxy playback): - sample_evaluations_builtin_with_dataset_id.py: Uploads data to Azure Blob Storage before creating the evaluation. - - sample_evaluations_ai_assisted.py: Creates a Dataset with file upload. - - sample_evaluations_graders.py: Creates a Dataset with file upload. - sample_evaluations_score_model_grader_with_image.py: Uses image data which may involve file upload. - - sample_evaluation_cluster_insight.py: Creates a Dataset with file upload. 
Authentication incompatibility (mock credentials don't work): - sample_evaluations_builtin_with_inline_data_oai.py: Uses OpenAI client directly with @@ -120,6 +120,9 @@ class TestSamplesEvaluations(AzureRecordedTestCase): "sample_evaluation_compare_insight.py", "sample_agent_response_evaluation_with_function_tool.py", "sample_redteam_evaluations.py", + "sample_evaluations_graders.py", + "sample_evaluations_ai_assisted.py", + "sample_evaluation_cluster_insight.py", ], ), )
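For reference, the inline-data pattern these samples share can be reduced to a short sketch: rows are passed directly as `file_content` on the eval run instead of uploading a JSONL dataset first. The snippet below is illustrative only; it assumes the `AZURE_AI_PROJECT_ENDPOINT` and `AZURE_AI_MODEL_DEPLOYMENT_NAME` environment variables described in the README, and the grader configuration, item schema, and eval/run names are hypothetical placeholders rather than values taken from the samples above.

```python
# Minimal sketch of the inline-data evaluation flow used by these samples.
# Assumes AZURE_AI_PROJECT_ENDPOINT and AZURE_AI_MODEL_DEPLOYMENT_NAME are set;
# the grader, item schema, and names below are illustrative placeholders.
import os
import time

from azure.identity import DefaultAzureCredential
from azure.ai.projects import AIProjectClient
from openai.types.eval_create_params import DataSourceConfigCustom, TestingCriterionLabelModel
from openai.types.evals.create_eval_jsonl_run_data_source_param import (
    CreateEvalJSONLRunDataSourceParam,
    SourceFileContent,
    SourceFileContentContent,
)

endpoint = os.environ["AZURE_AI_PROJECT_ENDPOINT"]
model_deployment_name = os.environ["AZURE_AI_MODEL_DEPLOYMENT_NAME"]

with (
    DefaultAzureCredential() as credential,
    AIProjectClient(endpoint=endpoint, credential=credential) as project_client,
    project_client.get_openai_client() as client,
):
    # Describe the shape of each inline data item (assumed schema for this sketch).
    data_source_config = DataSourceConfigCustom(
        type="custom",
        item_schema={
            "type": "object",
            "properties": {"query": {"type": "string"}},
            "required": ["query"],
        },
    )

    # A single label_model grader; the prompt and label sets are illustrative.
    grader = TestingCriterionLabelModel(
        type="label_model",
        name="sentiment",
        model=model_deployment_name,
        input=[
            {"role": "developer", "content": "Classify the sentiment of the user text as positive, neutral, or negative."},
            {"role": "user", "content": "{{ item.query }}"},
        ],
        passing_labels=["positive", "neutral"],
        labels=["positive", "neutral", "negative"],
    )

    eval_object = client.evals.create(
        name="inline data sketch",
        data_source_config=data_source_config,
        testing_criteria=[grader],
    )

    # Pass the rows directly as file_content instead of uploading a JSONL dataset.
    run = client.evals.runs.create(
        eval_id=eval_object.id,
        name="inline data run",
        data_source=CreateEvalJSONLRunDataSourceParam(
            type="jsonl",
            source=SourceFileContent(
                type="file_content",
                content=[
                    SourceFileContentContent(item={"query": "I love programming!"}),
                    SourceFileContentContent(item={"query": "I hate bugs."}),
                ],
            ),
        ),
    )

    # Poll until the run finishes, then clean up the evaluation.
    while run.status in ("queued", "in_progress"):
        time.sleep(5)
        run = client.evals.runs.retrieve(run.id, eval_id=eval_object.id)
    print(f"Run {run.status}: {run.report_url}")

    client.evals.delete(eval_id=eval_object.id)
```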