diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py index 2c9daebb8b41..d71d5b1fe7c6 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py @@ -45,6 +45,7 @@ from ._aoai.label_grader import AzureOpenAILabelGrader from ._aoai.string_check_grader import AzureOpenAIStringCheckGrader from ._aoai.text_similarity_grader import AzureOpenAITextSimilarityGrader +from ._aoai.score_model_grader import AzureOpenAIScoreModelGrader _patch_all = [] @@ -99,6 +100,7 @@ "AzureOpenAILabelGrader", "AzureOpenAIStringCheckGrader", "AzureOpenAITextSimilarityGrader", + "AzureOpenAIScoreModelGrader", ] __all__.extend([p for p in _patch_all if p not in __all__]) \ No newline at end of file diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/__init__.py index f5fa183c8914..ffab8bf38247 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/__init__.py @@ -4,7 +4,9 @@ from .aoai_grader import AzureOpenAIGrader +from .score_model_grader import AzureOpenAIScoreModelGrader __all__ = [ "AzureOpenAIGrader", + "AzureOpenAIScoreModelGrader", ] \ No newline at end of file diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/aoai_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/aoai_grader.py index 820644c9bc6b..95ef77544cee 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/aoai_grader.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/aoai_grader.py @@ -35,7 +35,7 @@ class AzureOpenAIGrader(): """ - id = "aoai://general" + id = "azureai://built-in/evaluators/azure-openai/custom_grader" def __init__(self, *, model_config : Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], grader_config: Dict[str, Any], **kwargs: Any): self._model_config = model_config diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/label_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/label_grader.py index 6d4752830c68..338584c0ae57 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/label_grader.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/label_grader.py @@ -42,7 +42,7 @@ class AzureOpenAILabelGrader(AzureOpenAIGrader): """ - id = "aoai://label_model" + id = "azureai://built-in/evaluators/azure-openai/label_grader" def __init__( self, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/score_model_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/score_model_grader.py new file mode 100644 index 000000000000..ec35e5d1372a --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/score_model_grader.py @@ -0,0 +1,104 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# --------------------------------------------------------- +from typing import Any, Dict, Union, List, Optional + +from azure.ai.evaluation._model_configurations import ( + AzureOpenAIModelConfiguration, + OpenAIModelConfiguration +) +from openai.types.graders import ScoreModelGrader +from azure.ai.evaluation._common._experimental import experimental + +from .aoai_grader import AzureOpenAIGrader + + +@experimental +class AzureOpenAIScoreModelGrader(AzureOpenAIGrader): + """ + Wrapper class for OpenAI's score model graders. + + Enables continuous scoring evaluation with custom prompts and flexible + conversation-style inputs. Supports configurable score ranges and + pass thresholds for binary classification. + + Supplying a ScoreModelGrader to the `evaluate` method will cause an + asynchronous request to evaluate the grader via the OpenAI API. The + results of the evaluation will then be merged into the standard + evaluation results. + + :param model_config: The model configuration to use for the grader. + :type model_config: Union[ + ~azure.ai.evaluation.AzureOpenAIModelConfiguration, + ~azure.ai.evaluation.OpenAIModelConfiguration + ] + :param input: The input messages for the grader. List of conversation + messages with role and content. + :type input: List[Dict[str, str]] + :param model: The model to use for the evaluation. + :type model: str + :param name: The name of the grader. + :type name: str + :param range: The range of the score. Defaults to [0, 1]. + :type range: Optional[List[float]] + :param pass_threshold: Score threshold for pass/fail classification. + Defaults to midpoint of range. + :type pass_threshold: Optional[float] + :param sampling_params: The sampling parameters for the model. + :type sampling_params: Optional[Dict[str, Any]] + :param kwargs: Additional keyword arguments to pass to the grader. 
+ :type kwargs: Any + """ + + id = "azureai://built-in/evaluators/azure-openai/scorer_grader" + + def __init__( + self, + *, + model_config: Union[ + AzureOpenAIModelConfiguration, OpenAIModelConfiguration + ], + input: List[Dict[str, str]], + model: str, + name: str, + range: Optional[List[float]] = None, + pass_threshold: Optional[float] = None, + sampling_params: Optional[Dict[str, Any]] = None, + **kwargs: Any + ): + # Validate range and pass_threshold + if range is not None: + if len(range) != 2 or range[0] >= range[1]: + raise ValueError("range must be a list of two numbers [min, max] where min < max") + else: + range = [0.0, 1.0] # Default range + + if pass_threshold is not None: + if range and (pass_threshold < range[0] or pass_threshold > range[1]): + raise ValueError(f"pass_threshold {pass_threshold} must be within range {range}") + else: + pass_threshold = (range[0] + range[1]) / 2 # Default to midpoint + + # Store pass_threshold as instance attribute + self.pass_threshold = pass_threshold + + # Create OpenAI ScoreModelGrader instance + grader_kwargs = { + "input": input, + "model": model, + "name": name, + "type": "score_model" + } + + if range is not None: + grader_kwargs["range"] = range + if sampling_params is not None: + grader_kwargs["sampling_params"] = sampling_params + + grader = ScoreModelGrader(**grader_kwargs) + + super().__init__( + model_config=model_config, + grader_config=grader, + **kwargs + ) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/string_check_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/string_check_grader.py index 627c53ed3497..ba3b056569fb 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/string_check_grader.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/string_check_grader.py @@ -38,7 +38,7 @@ class AzureOpenAIStringCheckGrader(AzureOpenAIGrader): """ - id = "aoai://string_check" + id = "azureai://built-in/evaluators/azure-openai/string_check_grader" def __init__( self, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/text_similarity_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/text_similarity_grader.py index 9289f3fd2538..06b7facab7e2 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/text_similarity_grader.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/text_similarity_grader.py @@ -52,7 +52,7 @@ class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader): """ - id = "aoai://text_similarity" + id = "azureai://built-in/evaluators/azure-openai/text_similarity_grader" def __init__( self, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py index fe2a8effc0e9..249af27c83ea 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py @@ -316,12 +316,14 @@ def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]: AzureOpenAILabelGrader, AzureOpenAIStringCheckGrader, AzureOpenAITextSimilarityGrader, + AzureOpenAIScoreModelGrader, ) id_map = { AzureOpenAIGrader.id: AzureOpenAIGrader, AzureOpenAILabelGrader.id: AzureOpenAILabelGrader, AzureOpenAIStringCheckGrader.id: AzureOpenAIStringCheckGrader, AzureOpenAITextSimilarityGrader.id: AzureOpenAITextSimilarityGrader, + AzureOpenAIScoreModelGrader.id: 
AzureOpenAIScoreModelGrader, } for key in id_map.keys(): diff --git a/sdk/evaluation/azure-ai-evaluation/samples/aoai_score_model_grader_sample.py b/sdk/evaluation/azure-ai-evaluation/samples/aoai_score_model_grader_sample.py new file mode 100644 index 000000000000..b760a2ba3bd9 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/samples/aoai_score_model_grader_sample.py @@ -0,0 +1,470 @@ +#!/usr/bin/env python3 + +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- + +""" +Sample demonstrating the use of AzureOpenAIScoreModelGrader for continuous +scoring evaluation. + +This sample shows how to: +1. Configure an Azure OpenAI model for grading +2. Create a score model grader with custom prompts +3. Run evaluation using the evaluate() method +4. Interpret continuous scoring results + +Prerequisites: +- Azure OpenAI resource with API key and endpoint +- Model deployment (e.g., gpt-4, gpt-4o-mini) +- Sample conversation data in JSONL format +- Environment variables configured in .env file +""" + +import json +import os +from dotenv import load_dotenv +import pandas as pd +from azure.ai.evaluation import evaluate, AzureOpenAIScoreModelGrader +from azure.ai.evaluation import AzureOpenAIModelConfiguration + +# Load environment variables +load_dotenv() + + +def create_sample_data() -> str: + """Create sample conversation data for testing.""" + sample_conversations = [ + { + "conversation": { + "messages": [ + { + "content": "How can I improve my Python coding skills?", + "role": "user" + }, + { + "content": ( + "Here are some effective ways to improve your " + "Python skills: 1) Practice coding daily with " + "platforms like LeetCode or HackerRank, 2) Work " + "on real projects that interest you, 3) Read " + "other people's code on GitHub, 4) Join Python " + "communities and forums, 5) Take online courses " + "or tutorials. The key is consistent practice and " + "building projects that challenge you." + ), + "role": "assistant" + } + ] + }, + "expected_quality": "high", + "context": "Programming advice request" + }, + { + "conversation": { + "messages": [ + { + "content": "What is Python?", + "role": "user" + }, + { + "content": "Python is a programming language.", + "role": "assistant" + } + ] + }, + "expected_quality": "low", + "context": "Basic definition request" + }, + { + "conversation": { + "messages": [ + { + "content": ( + "Can you explain machine learning concepts " + "for a beginner?" + ), + "role": "user" + }, + { + "content": ( + "Machine learning is a subset of artificial " + "intelligence where computers learn patterns from " + "data without being explicitly programmed. Think " + "of it like teaching a child to recognize animals " + "- you show them many pictures of cats and dogs, " + "and eventually they learn to distinguish between " + "them. In ML, we feed algorithms lots of data so " + "they can make predictions or decisions on new, " + "unseen data. Common types include supervised " + "learning (learning from labeled examples), " + "unsupervised learning (finding hidden patterns), " + "and reinforcement learning (learning through " + "trial and error)." 
+ ), + "role": "assistant" + } + ] + }, + "expected_quality": "high", + "context": "Educational explanation request" + }, + { + "conversation": { + "messages": [ + { + "content": "Help me debug this code", + "role": "user" + }, + { + "content": "Sure, please share your code.", + "role": "assistant" + } + ] + }, + "expected_quality": "medium", + "context": "Debug assistance request" + }, + { + "conversation": { + "messages": [ + { + "content": ( + "What are the best practices for writing " + "clean Python code?" + ), + "role": "user" + }, + { + "content": ( + "Here are key best practices for writing clean " + "Python code: 1) Follow PEP 8 style guidelines " + "for consistent formatting, 2) Use descriptive " + "variable and function names that explain their " + "purpose, 3) Write docstrings for functions and " + "classes, 4) Keep functions small and focused on " + "a single task, 5) Use type hints to improve code " + "clarity, 6) Handle exceptions appropriately, " + "7) Write unit tests for your code, 8) Use " + "virtual environments for dependency management, " + "9) Comment complex logic but avoid obvious " + "comments, 10) Refactor code regularly to improve " + "readability and maintainability." + ), + "role": "assistant" + } + ] + }, + "expected_quality": "high", + "context": "Best practices inquiry" + } + ] + + # Create JSONL file + filename = "sample_conversations.jsonl" + with open(filename, 'w') as f: + for conv in sample_conversations: + f.write(json.dumps(conv) + '\n') + + print(f"Created sample data file: {filename}") + return filename + + +def demonstrate_score_model_grader(): + """Demonstrate the AzureOpenAIScoreModelGrader usage with real credentials.""" + + # Create sample data + data_file = create_sample_data() + + print("=== Azure OpenAI Score Model Grader Demo ===\n") + + try: + # 1. Configure Azure OpenAI model using environment variables + model_config = AzureOpenAIModelConfiguration( + azure_endpoint=os.environ.get("4o_mini_target_endpoint"), + api_key=os.environ.get("4o_mini_target_endpoint_key"), + azure_deployment=os.environ.get( + "4o_mini_target_endpoint_deployment_name" + ), + api_version="2024-12-01-preview" + ) + + print("βœ… Model configuration loaded successfully") + + # 2. Create conversation quality grader + conversation_quality_grader = AzureOpenAIScoreModelGrader( + model_config=model_config, + name="Conversation Quality Assessment", + model="gpt-4o-mini", + input=[ + { + "role": "system", + "content": ( + "You are an expert conversation quality evaluator. " + "Assess the quality of AI assistant responses based on " + "helpfulness, completeness, accuracy, and " + "appropriateness. Return a score between 0.0 (very " + "poor) and 1.0 (excellent)." + ) + }, + { + "role": "user", + "content": ( + "Evaluate this conversation:\n" + "Context: {{ item.context }}\n" + "Messages: {{ item.conversation }}\n\n" + "Provide a quality score from 0.0 to 1.0." + ) + } + ], + range=[0.0, 1.0], + sampling_params={ + "temperature": 0.0 + } + ) + + print("βœ… Conversation quality grader created successfully") + + # 3. Run evaluation with the score model grader + print("\nπŸš€ Running evaluation with score model grader...") + + result = evaluate( + data=data_file, + evaluators={ + "conversation_quality": conversation_quality_grader + } + ) + + # 4. 
Display results + print("\n=== Evaluation Results ===") + print(f"Total samples evaluated: {len(result['rows'])}") + + # Show metrics + print("\n=== Metrics Summary ===") + for metric_name, metric_value in result['metrics'].items(): + print(f"{metric_name}: {metric_value:.3f}") + + # Show detailed results + print("\n=== Sample Results ===") + df = pd.DataFrame(result['rows']) + + for i, row in df.head(3).iterrows(): + print(f"\nSample {i+1}:") + print(f" Context: {row.get('context', 'N/A')}") + + # Show grader results + for col in df.columns: + if col.startswith('outputs.'): + grader_name = col.split('.')[1] + if 'score' in col: + print(f" {grader_name} Score: {row[col]:.3f}") + elif 'passed' in col: + print(f" {grader_name} Passed: {row[col]}") + + print("\nβœ… Evaluation completed successfully!") + + except Exception as e: + print(f"\n❌ Error during evaluation: {str(e)}") + print("\nFalling back to demonstration mode...") + demonstrate_configuration_only() + + # Clean up + if os.path.exists(data_file): + os.remove(data_file) + print(f"\n🧹 Cleaned up temporary file: {data_file}") + + +def demonstrate_configuration_only(): + """Demonstrate grader configuration without running actual evaluation.""" + + try: + # Create sample data + data_file = create_sample_data() + + print("πŸ“ Testing grader configuration...") + + # Configure with placeholder values for testing + model_config = AzureOpenAIModelConfiguration( + azure_endpoint="https://test-endpoint.openai.azure.com/", + api_key="test-key", + azure_deployment="gpt-4o-mini", + api_version="2024-12-01-preview" + ) + + # Create a simple grader to test + test_grader = AzureOpenAIScoreModelGrader( + model_config=model_config, + name="Test Quality Grader", + model="gpt-4o-mini", + input=[ + { + "role": "system", + "content": "You are a test evaluator." + }, + { + "role": "user", + "content": "Rate this: {{ data.conversation }}" + } + ] + ) + + print("βœ… Grader creation successful!") + print(f" - Grader ID: {test_grader.id}") + print(f" - Grader name: {test_grader._grader_config.name}") + print(f" - Grader model: {test_grader._grader_config.model}") + print(f" - Input messages: {len(test_grader._grader_config.input)}") + + print("\n🚧 Implementation Status:") + print(" - Sample data created: βœ…") + print(" - AzureOpenAIScoreModelGrader class: βœ… (implemented)") + print(" - Integration with evaluate(): βœ… (ready for testing)") + print("\nπŸ“– Ready for use!") + print(" Configure with real API credentials to run evaluations") + + # Clean up + if os.path.exists(data_file): + os.remove(data_file) + + except Exception as e: + print(f"❌ Error testing implementation: {e}") + print("\n🚧 Implementation Status:") + print(" - Sample data created: βœ…") + print(" - AzureOpenAIScoreModelGrader class: ❌ (error)") + print(" - Integration with evaluate(): ❌ (needs fixing)") + + +def demonstrate_different_grader_types(): + """Show examples of different score model grader configurations.""" + + print("\n=== Different Score Model Grader Examples ===\n") + + examples = [ + { + "name": "Helpfulness Grader", + "description": ( + "Evaluates how helpful the AI response is to the user" + ), + "config": { + "name": "Helpfulness Assessment", + "input": [ + { + "role": "system", + "content": ( + "Rate how helpful this AI response is in " + "addressing the user's needs." 
+ ) + }, + { + "role": "user", + "content": ( + "User Question: {{ data.question }}\n" + "AI Response: {{ data.response }}\n\n" + "Helpfulness score (0.0-1.0):" + ) + } + ], + "range": [0.0, 1.0], + "pass_threshold": 0.6 + } + }, + { + "name": "Factual Accuracy Grader", + "description": ( + "Checks factual accuracy against reference information" + ), + "config": { + "name": "Factual Accuracy Check", + "input": [ + { + "role": "system", + "content": ( + "You are a fact-checker. Compare the AI response " + "with reference information and rate accuracy." + ) + }, + { + "role": "user", + "content": ( + "Reference: {{ data.reference }}\n" + "AI Response: {{ data.response }}\n\n" + "Accuracy score (0.0-1.0):" + ) + } + ], + "range": [0.0, 1.0], + "pass_threshold": 0.8 + } + }, + { + "name": "Clarity Grader", + "description": ( + "Evaluates how clear and understandable the response is" + ), + "config": { + "name": "Response Clarity", + "input": [ + { + "role": "developer", + "content": ( + "Evaluate the clarity and understandability " + "of this AI response." + ) + }, + { + "role": "user", + "content": ( + "Response: {{ data.response }}\n\n" + "Clarity score (0.0-1.0, where 1.0 is perfectly " + "clear):" + ) + } + ], + "range": [0.0, 1.0], + "pass_threshold": 0.7, + "sampling_params": { + "temperature": 0.1, + "max_tokens": 150 + } + } + } + ] + + for example in examples: + print(f"🎯 {example['name']}") + print(f" Description: {example['description']}") + print(" Configuration:") + config = example['config'] + print(f" - Name: {config['name']}") + print(f" - Input Messages: {len(config['input'])} messages") + print(f" - Range: {config['range']}") + print(f" - Pass Threshold: {config['pass_threshold']}") + if 'sampling_params' in config: + print(f" - Sampling Params: {config['sampling_params']}") + print() + + +if __name__ == "__main__": + print("πŸš€ Starting Azure OpenAI Score Model Grader Demo\n") + + # Check if environment variables are set + required_vars = [ + "4o_mini_target_endpoint", + "4o_mini_target_endpoint_key", + "4o_mini_target_endpoint_deployment_name" + ] + + missing_vars = [var for var in required_vars if not os.environ.get(var)] + + if missing_vars: + print("⚠️ Missing environment variables:") + for var in missing_vars: + print(f" - {var}") + print("\nRunning in demonstration mode...\n") + demonstrate_configuration_only() + else: + print("βœ… All environment variables found") + demonstrate_score_model_grader() + + demonstrate_different_grader_types() + + print("\nπŸŽ‰ Demo completed!") diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/score_model_test_data.jsonl b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/score_model_test_data.jsonl new file mode 100644 index 000000000000..158412dfd6ea --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/score_model_test_data.jsonl @@ -0,0 +1,3 @@ +{"conversation": {"messages": [{"role": "user", "content": "How can I improve my Python coding skills?"}, {"role": "assistant", "content": "Here are some effective ways to improve your Python skills: 1) Practice coding daily with platforms like LeetCode or HackerRank, 2) Work on real projects that interest you, 3) Read other people's code on GitHub, 4) Join Python communities and forums, 5) Take online courses or tutorials."}]}, "context": "Programming advice", "expected_quality": "high"} +{"conversation": {"messages": [{"role": "user", "content": "What is Python?"}, {"role": "assistant", "content": "Python is a programming language."}]}, 
"context": "Basic definition", "expected_quality": "low"} +{"conversation": {"messages": [{"role": "user", "content": "Can you explain machine learning for a beginner?"}, {"role": "assistant", "content": "Machine learning is a subset of artificial intelligence where computers learn patterns from data without being explicitly programmed. Think of it like teaching a child to recognize animals - you show them many pictures of cats and dogs, and eventually they learn to distinguish between them."}]}, "context": "Educational explanation", "expected_quality": "high"} diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_score_model_grader.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_score_model_grader.py new file mode 100644 index 000000000000..e674e6fda724 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_score_model_grader.py @@ -0,0 +1,1160 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ + +""" +Comprehensive unit tests for AzureOpenAIScoreModelGrader. +This test suite covers initialization scenarios, edge cases, validation logic, +error handling, registry integration, and usage patterns. +""" + +import pytest +from unittest.mock import patch, AsyncMock + +from azure.ai.evaluation import AzureOpenAIModelConfiguration +from azure.ai.evaluation._aoai.score_model_grader import ( + AzureOpenAIScoreModelGrader +) +from azure.ai.evaluation._evaluate._evaluate_aoai import ( + _split_evaluators_and_grader_configs, + _convert_remote_eval_params_to_grader +) + + +@pytest.fixture +def mock_aoai_model_config(): + """Mock Azure OpenAI model configuration for testing.""" + return AzureOpenAIModelConfiguration( + azure_deployment="test-deployment", + azure_endpoint="https://test-endpoint.openai.azure.com/", + api_key="test-api-key", + api_version="2024-12-01-preview", + ) + + +@pytest.fixture +def basic_score_grader_config(): + """Basic configuration for score model grader.""" + return { + "name": "Test Score Grader", + "model": "gpt-4o-mini", + "input": [ + { + "role": "system", + "content": "You are a test evaluator. Rate from 0.0 to 1.0." 
+            },
+            {
+                "role": "user",
+                "content": "Rate this conversation: {{ item.conversation }}"
+            }
+        ],
+        "range": [0.0, 1.0],
+        "pass_threshold": 0.5,
+        "sampling_params": {
+            "temperature": 0.0,
+            "max_tokens": 100
+        }
+    }
+
+
+@pytest.mark.unittest
+class TestAzureOpenAIScoreModelGrader:
+    """Test suite for AzureOpenAIScoreModelGrader."""
+
+    def test_grader_initialization_valid_config(
+        self, mock_aoai_model_config, basic_score_grader_config
+    ):
+        """Test successful grader initialization with valid configuration."""
+        grader = AzureOpenAIScoreModelGrader(
+            model_config=mock_aoai_model_config,
+            **basic_score_grader_config
+        )
+
+        assert grader is not None
+        assert grader.id == "azureai://built-in/evaluators/azure-openai/scorer_grader"
+        assert grader._model_config == mock_aoai_model_config
+        assert grader._grader_config.name == "Test Score Grader"
+        assert grader._grader_config.model == "gpt-4o-mini"
+        assert grader._grader_config.range == [0.0, 1.0]
+        assert grader.pass_threshold == 0.5
+
+    def test_grader_initialization_minimal_config(self, mock_aoai_model_config):
+        """Test grader initialization with minimal required configuration."""
+        minimal_config = {
+            "name": "Minimal Grader",
+            "model": "gpt-4",
+            "input": [
+                {"role": "user", "content": "Rate this: {{ item.data }}"}
+            ]
+        }
+
+        grader = AzureOpenAIScoreModelGrader(
+            model_config=mock_aoai_model_config,
+            **minimal_config
+        )
+
+        assert grader is not None
+        assert grader._grader_config.name == "Minimal Grader"
+        assert grader._grader_config.range == [0.0, 1.0]  # Default range
+        assert grader.pass_threshold == 0.5  # Default threshold
+
+    def test_grader_initialization_missing_model_config(
+        self, basic_score_grader_config
+    ):
+        """Test that grader initialization fails without model config."""
+        with pytest.raises(TypeError):
+            AzureOpenAIScoreModelGrader(**basic_score_grader_config)
+
+    def test_grader_initialization_invalid_model_config(
+        self, basic_score_grader_config
+    ):
+        """Test grader initialization with invalid model config."""
+        bad_model_config = AzureOpenAIModelConfiguration(
+            azure_deployment="test-deployment",
+            azure_endpoint="https://test-endpoint.openai.azure.com/",
+            # Missing api_key
+        )
+
+        with pytest.raises(Exception) as excinfo:
+            AzureOpenAIScoreModelGrader(
+                model_config=bad_model_config,
+                **basic_score_grader_config
+            )
+
+        assert "api_key" in str(excinfo.value)
+
+    def test_grader_initialization_missing_required_fields(
+        self, mock_aoai_model_config
+    ):
+        """Test grader initialization fails with missing required fields."""
+        # Missing name
+        with pytest.raises(TypeError):
+            AzureOpenAIScoreModelGrader(
+                model_config=mock_aoai_model_config,
+                model="gpt-4",
+                input=[{"role": "user", "content": "test"}]
+            )
+
+        # Missing model
+        with pytest.raises(TypeError):
+            AzureOpenAIScoreModelGrader(
+                model_config=mock_aoai_model_config,
+                name="Test",
+                input=[{"role": "user", "content": "test"}]
+            )
+
+        # Missing input
+        with pytest.raises(TypeError):
+            AzureOpenAIScoreModelGrader(
+                model_config=mock_aoai_model_config,
+                name="Test",
+                model="gpt-4"
+            )
+
+    def test_grader_initialization_invalid_range(self, mock_aoai_model_config):
+        """Test grader initialization with invalid range values."""
+        config = {
+            "name": "Test Grader",
+            "model": "gpt-4",
+            "input": [{"role": "user", "content": "test"}],
+            "range": [1.0, 0.0]  # Invalid: min > max
+        }
+
+        with pytest.raises(ValueError) as excinfo:
+            AzureOpenAIScoreModelGrader(
+                model_config=mock_aoai_model_config,
+                **config
+            )
+
+        assert "range" in str(excinfo.value).lower()
+
+    def
test_grader_initialization_invalid_threshold( + self, mock_aoai_model_config + ): + """Test grader initialization with invalid pass threshold.""" + config = { + "name": "Test Grader", + "model": "gpt-4", + "input": [{"role": "user", "content": "test"}], + "range": [0.0, 1.0], + "pass_threshold": 1.5 # Outside range + } + + with pytest.raises(ValueError) as excinfo: + AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + **config + ) + + assert "pass_threshold" in str(excinfo.value).lower() + + def test_grader_validation_bypass(self, basic_score_grader_config): + """Test that validation can be bypassed for testing purposes.""" + bad_model_config = AzureOpenAIModelConfiguration( + azure_deployment="test-deployment", + azure_endpoint="https://test-endpoint.openai.azure.com/", + # Missing api_key + ) + + # Should not raise exception when validate=False + grader = AzureOpenAIScoreModelGrader( + model_config=bad_model_config, + validate=False, + **basic_score_grader_config + ) + + assert grader is not None + + def test_grader_registry_integration( + self, mock_aoai_model_config, basic_score_grader_config + ): + """Test that score model grader integrates with the grader registry.""" + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + **basic_score_grader_config + ) + + # Test grader conversion + init_params = { + "model_config": mock_aoai_model_config, + **basic_score_grader_config + } + + converted_grader = _convert_remote_eval_params_to_grader( + AzureOpenAIScoreModelGrader.id, + init_params=init_params + ) + + assert isinstance(converted_grader, AzureOpenAIScoreModelGrader) + assert converted_grader._model_config == mock_aoai_model_config + + def test_grader_split_recognition( + self, mock_aoai_model_config, basic_score_grader_config + ): + """Test that score model grader is correctly recognized as AOAI grader.""" + from azure.ai.evaluation import F1ScoreEvaluator + + built_in_eval = F1ScoreEvaluator() + custom_eval = lambda x: x + score_grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + **basic_score_grader_config + ) + + evaluators = { + "f1_score": built_in_eval, + "custom_eval": custom_eval, + "score_grader": score_grader + } + + just_evaluators, aoai_graders = _split_evaluators_and_grader_configs( + evaluators + ) + + assert len(just_evaluators) == 2 + assert len(aoai_graders) == 1 + assert "f1_score" in just_evaluators + assert "custom_eval" in just_evaluators + assert "score_grader" in aoai_graders + + def test_grader_config_properties( + self, mock_aoai_model_config, basic_score_grader_config + ): + """Test that grader configuration properties are accessible.""" + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + **basic_score_grader_config + ) + + config = grader._grader_config + + assert config.name == "Test Score Grader" + assert config.model == "gpt-4o-mini" + assert len(config.input) == 2 + assert config.input[0].role == "system" + assert config.input[1].role == "user" + assert config.range == [0.0, 1.0] + assert config.sampling_params["temperature"] == 0.0 + assert config.sampling_params["max_tokens"] == 100 + assert grader.pass_threshold == 0.5 + + def test_different_score_ranges(self, mock_aoai_model_config): + """Test grader with different score ranges.""" + # Test 1-5 scale + config_1_to_5 = { + "name": "1-5 Scale Grader", + "model": "gpt-4", + "input": [{"role": "user", "content": "Rate 1-5: {{ item.text }}"}], + "range": [1.0, 5.0], + "pass_threshold": 3.0 + } + + grader = 
AzureOpenAIScoreModelGrader(
+            model_config=mock_aoai_model_config,
+            **config_1_to_5
+        )
+
+        assert grader._grader_config.range == [1.0, 5.0]
+        assert grader.pass_threshold == 3.0
+
+        # Test 0-10 scale with default threshold
+        config_0_to_10 = {
+            "name": "0-10 Scale Grader",
+            "model": "gpt-4",
+            "input": [{"role": "user", "content": "Rate 0-10: {{ item.text }}"}],
+            "range": [0.0, 10.0]
+            # No pass_threshold specified - should default to 5.0 (midpoint)
+        }
+
+        grader = AzureOpenAIScoreModelGrader(
+            model_config=mock_aoai_model_config,
+            **config_0_to_10
+        )
+
+        assert grader._grader_config.range == [0.0, 10.0]
+        assert grader.pass_threshold == 5.0  # Midpoint default
+
+    def test_grader_id_property(
+        self, mock_aoai_model_config, basic_score_grader_config
+    ):
+        """Test that grader has correct ID."""
+        grader = AzureOpenAIScoreModelGrader(
+            model_config=mock_aoai_model_config,
+            **basic_score_grader_config
+        )
+
+        assert grader.id == "azureai://built-in/evaluators/azure-openai/scorer_grader"
+        assert AzureOpenAIScoreModelGrader.id == "azureai://built-in/evaluators/azure-openai/scorer_grader"
+
+    @patch('azure.ai.evaluation._aoai.score_model_grader.AzureOpenAIGrader.get_client')
+    def test_grader_with_mocked_client(
+        self, mock_get_client, mock_aoai_model_config, basic_score_grader_config
+    ):
+        """Test grader creation and basic properties with mocked client."""
+        # Mock the client to avoid actual API calls
+        mock_client = AsyncMock()
+        mock_get_client.return_value = mock_client
+
+        grader = AzureOpenAIScoreModelGrader(
+            model_config=mock_aoai_model_config,
+            **basic_score_grader_config
+        )
+
+        assert grader is not None
+        assert grader.id == "azureai://built-in/evaluators/azure-openai/scorer_grader"
+        assert hasattr(grader, 'pass_threshold')
+        assert grader.pass_threshold == 0.5
+
+
+@pytest.mark.unittest
+class TestScoreModelGraderUsagePatterns:
+    """Test common usage patterns for score model grader."""
+
+    def test_conversation_quality_pattern(self, mock_aoai_model_config):
+        """Test conversation quality grading pattern."""
+        config = {
+            "name": "Conversation Quality",
+            "model": "gpt-4o-mini",
+            "input": [
+                {
+                    "role": "system",
+                    "content": (
+                        "Assess conversation quality based on helpfulness, "
+                        "accuracy, and completeness."
+                    )
+                },
+                {
+                    "role": "user",
+                    "content": (
+                        "Context: {{ item.context }}\n"
+                        "Conversation: {{ item.conversation }}\n"
+                        "Rate quality (0.0-1.0):"
+                    )
+                }
+            ],
+            "range": [0.0, 1.0],
+            "pass_threshold": 0.7
+        }
+
+        grader = AzureOpenAIScoreModelGrader(
+            model_config=mock_aoai_model_config,
+            **config
+        )
+
+        assert grader._grader_config.name == "Conversation Quality"
+        assert grader.pass_threshold == 0.7
+
+    def test_helpfulness_scoring_pattern(self, mock_aoai_model_config):
+        """Test helpfulness scoring pattern."""
+        config = {
+            "name": "Helpfulness Score",
+            "model": "gpt-4",
+            "input": [
+                {
+                    "role": "system",
+                    "content": (
+                        "Rate how helpful the AI response is to "
+                        "the user's question."
+ ) + }, + { + "role": "user", + "content": ( + "Question: {{ item.question }}\n" + "Response: {{ item.response }}\n" + "Helpfulness (0-10):" + ) + } + ], + "range": [0.0, 10.0], + "pass_threshold": 6.0, + "sampling_params": {"temperature": 0.0} + } + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + **config + ) + + assert grader._grader_config.range == [0.0, 10.0] + assert grader.pass_threshold == 6.0 + + +@pytest.mark.unittest +class TestScoreModelGraderIntegration: + """Test integration with evaluation framework.""" + + def test_grader_in_evaluators_dict( + self, mock_aoai_model_config, basic_score_grader_config + ): + """Test using score grader in evaluators dictionary.""" + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + **basic_score_grader_config + ) + + # Test that grader can be used in evaluators dict + evaluators = {"quality_score": grader} + + # Verify grader separation works + just_evaluators, aoai_graders = _split_evaluators_and_grader_configs( + evaluators + ) + assert len(just_evaluators) == 0 + assert len(aoai_graders) == 1 + assert "quality_score" in aoai_graders + + def test_multiple_graders_recognition(self, mock_aoai_model_config): + """Test multiple score graders in evaluation.""" + quality_grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Quality Assessment", + model="gpt-4o-mini", + input=[{ + "role": "user", + "content": "Rate quality: {{ item.conversation }}" + }], + range=[0.0, 1.0] + ) + + helpfulness_grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Helpfulness Assessment", + model="gpt-4o-mini", + input=[{ + "role": "user", + "content": "Rate helpfulness: {{ item.conversation }}" + }], + range=[0.0, 1.0] + ) + + evaluators = { + "quality": quality_grader, + "helpfulness": helpfulness_grader + } + + # Test grader recognition + just_evaluators, aoai_graders = _split_evaluators_and_grader_configs( + evaluators + ) + + assert len(just_evaluators) == 0 + assert len(aoai_graders) == 2 + assert "quality" in aoai_graders + assert "helpfulness" in aoai_graders + + def test_mixed_evaluator_types(self, mock_aoai_model_config): + """Test mixing score graders with built-in evaluators.""" + from azure.ai.evaluation import F1ScoreEvaluator + + f1_evaluator = F1ScoreEvaluator() + score_grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Custom Score", + model="gpt-4", + input=[{"role": "user", "content": "Rate: {{ item.data }}"}] + ) + + evaluators = { + "f1_score": f1_evaluator, + "custom_score": score_grader + } + + just_evaluators, aoai_graders = _split_evaluators_and_grader_configs( + evaluators + ) + + assert len(just_evaluators) == 1 + assert len(aoai_graders) == 1 + assert "f1_score" in just_evaluators + assert "custom_score" in aoai_graders + + def test_grader_conversion_error_handling(self, mock_aoai_model_config): + """Test error handling in grader conversion.""" + init_params = { + "model_config": mock_aoai_model_config, + "name": "Test", + "model": "gpt-4", + "input": [{"role": "user", "content": "test"}] + } + + # Test invalid grader ID + with pytest.raises(Exception) as excinfo: + _convert_remote_eval_params_to_grader( + "invalid_id", init_params=init_params + ) + + assert "not recognized" in str(excinfo.value) + + # Test successful conversion + grader = _convert_remote_eval_params_to_grader( + AzureOpenAIScoreModelGrader.id, + init_params=init_params + ) + + assert isinstance(grader, 
AzureOpenAIScoreModelGrader) + + +@pytest.mark.unittest +class TestAzureOpenAIScoreModelGraderEdgeCases: + """Comprehensive edge case testing for AzureOpenAIScoreModelGrader.""" + + def test_grader_with_empty_input(self, mock_aoai_model_config): + """Test grader creation with empty input list.""" + # Empty input should be allowed - validation happens at runtime + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Empty Input", + model="gpt-4", + input=[] + ) + assert grader is not None + assert len(grader._grader_config.input) == 0 + + def test_grader_with_none_values(self, mock_aoai_model_config): + """Test grader creation with None values for optional fields.""" + # Test with None sampling_params + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="None Values Test", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + sampling_params=None + ) + assert grader is not None + + # Test with None range - should use default + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="None Range Test", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + range=None + ) + assert grader._grader_config.range == [0.0, 1.0] + + def test_grader_with_extreme_ranges(self, mock_aoai_model_config): + """Test grader with extreme score ranges.""" + # Very large range + grader_large = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Large Range", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + range=[-1000.0, 1000.0], + pass_threshold=0.0 + ) + assert grader_large._grader_config.range == [-1000.0, 1000.0] + assert grader_large.pass_threshold == 0.0 + + # Very small range + grader_small = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Small Range", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + range=[0.0, 0.1], + pass_threshold=0.05 + ) + assert grader_small._grader_config.range == [0.0, 0.1] + assert grader_small.pass_threshold == 0.05 + + def test_grader_with_negative_ranges(self, mock_aoai_model_config): + """Test grader with negative score ranges.""" + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Negative Range", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + range=[-10.0, -1.0], + pass_threshold=-5.0 + ) + assert grader._grader_config.range == [-10.0, -1.0] + assert grader.pass_threshold == -5.0 + + def test_grader_boundary_threshold_values(self, mock_aoai_model_config): + """Test grader with boundary threshold values.""" + # Threshold at minimum + grader_min = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Min Threshold", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + range=[0.0, 10.0], + pass_threshold=0.0 + ) + assert grader_min.pass_threshold == 0.0 + + # Threshold at maximum + grader_max = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Max Threshold", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + range=[0.0, 10.0], + pass_threshold=10.0 + ) + assert grader_max.pass_threshold == 10.0 + + def test_grader_with_invalid_input_structures( + self, mock_aoai_model_config + ): + """Test grader with invalid input message structures.""" + # Missing role + with pytest.raises((TypeError, ValueError, KeyError)): + AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Missing Role", + model="gpt-4", + input=[{"content": "test"}] 
+ ) + + # Missing content + with pytest.raises((TypeError, ValueError, KeyError)): + AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Missing Content", + model="gpt-4", + input=[{"role": "user"}] + ) + + # Invalid role + with pytest.raises((TypeError, ValueError)): + AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Invalid Role", + model="gpt-4", + input=[{"role": "invalid", "content": "test"}], + validate=True + ) + + def test_grader_with_complex_sampling_params(self, mock_aoai_model_config): + """Test grader with various sampling parameter combinations.""" + complex_params = { + "temperature": 0.7, + "max_tokens": 150, + "top_p": 0.9, + "frequency_penalty": 0.1, + "presence_penalty": 0.1, + "stop": ["END", "STOP"] + } + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Complex Params", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + sampling_params=complex_params + ) + + assert grader._grader_config.sampling_params == complex_params + + def test_grader_with_unicode_content(self, mock_aoai_model_config): + """Test grader with Unicode and special characters in content.""" + unicode_content = "ζ΅‹θ―• 🌟 Γ©mojis and spΓ©ciΓ₯l characters Γ±" + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Unicode Test", + model="gpt-4", + input=[{ + "role": "user", + "content": f"Evaluate: {unicode_content} - {{{{ item.text }}}}" + }] + ) + + assert unicode_content in grader._grader_config.input[0].content + + def test_grader_with_very_long_content(self, mock_aoai_model_config): + """Test grader with very long input content.""" + long_content = "Very long content " * 1000 # ~18KB + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Long Content", + model="gpt-4", + input=[{"role": "user", "content": long_content}] + ) + + assert len(grader._grader_config.input[0].content) > 10000 + + def test_grader_invalid_type_parameters(self, mock_aoai_model_config): + """Test grader with wrong parameter types.""" + # String range instead of list + with pytest.raises((TypeError, ValueError)): + AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="String Range", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + range="0-10" + ) + + # String threshold instead of number + with pytest.raises((TypeError, ValueError)): + AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="String Threshold", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + pass_threshold="5.0" + ) + + # Invalid input type + with pytest.raises((TypeError, ValueError)): + AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="String Input", + model="gpt-4", + input="This should be a list" + ) + + def test_grader_with_floating_point_precision( + self, mock_aoai_model_config + ): + """Test grader with high precision floating point values.""" + precise_range = [0.0000001, 0.9999999] + precise_threshold = 0.5000001 + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Precise Values", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + range=precise_range, + pass_threshold=precise_threshold + ) + + assert grader._grader_config.range == precise_range + assert grader.pass_threshold == precise_threshold + + def test_grader_with_zero_range(self, mock_aoai_model_config): + """Test grader with zero-width range.""" + with 
pytest.raises(ValueError): + AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Zero Range", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + range=[5.0, 5.0] # Same min and max + ) + + def test_grader_with_inf_nan_values(self, mock_aoai_model_config): + """Test grader with infinity and NaN values.""" + # These values should be allowed at initialization + # but may fail at runtime + grader_inf = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Infinity Range", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + range=[0.0, float('inf')], + validate=False + ) + assert grader_inf is not None + + # Test with NaN - should be allowed at init + grader_nan = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="NaN Threshold", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + pass_threshold=float('nan'), + validate=False + ) + assert grader_nan is not None + + +@pytest.mark.unittest +class TestAzureOpenAIScoreModelGraderTemplateEdgeCases: + """Test edge cases related to template processing.""" + + def test_grader_with_complex_templates(self, mock_aoai_model_config): + """Test grader with complex template structures.""" + complex_template = """ + Context: {{ item.context }} + Question: {{ item.question }} + Response: {{ item.response }} + {% if item.additional_info %} + Additional: {{ item.additional_info }} + {% endif %} + Rate the response quality (0-10): + """ + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Complex Template", + model="gpt-4", + input=[{"role": "user", "content": complex_template}] + ) + + assert "item.context" in grader._grader_config.input[0].content + assert "{% if" in grader._grader_config.input[0].content + + def test_grader_with_nested_templates(self, mock_aoai_model_config): + """Test grader with nested template variables.""" + nested_template = ( + "{{ item.conversation[0].message }} vs " + "{{ item.conversation[1].message }}" + ) + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Nested Template", + model="gpt-4", + input=[{"role": "user", "content": nested_template}] + ) + + assert "conversation[0]" in grader._grader_config.input[0].content + + def test_grader_with_malformed_templates(self, mock_aoai_model_config): + """Test grader with malformed template syntax.""" + # Missing closing brace + malformed_template = "Rate this: {{ item.text" + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Malformed Template", + model="gpt-4", + input=[{"role": "user", "content": malformed_template}] + ) + + # Should still create grader (template validation happens at runtime) + assert grader is not None + + +@pytest.mark.unittest +class TestAzureOpenAIScoreModelGraderConfigurationEdgeCases: + """Test edge cases in model configuration.""" + + def test_grader_with_different_api_versions(self, mock_aoai_model_config): + """Test grader with different API versions.""" + old_config = AzureOpenAIModelConfiguration( + azure_deployment="test-deployment", + azure_endpoint="https://test-endpoint.openai.azure.com/", + api_key="test-api-key", + api_version="2023-05-15" # Older version + ) + + grader = AzureOpenAIScoreModelGrader( + model_config=old_config, + name="Old API Version", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + validate=False + ) + + # Model config gets converted to dict internally + assert grader._model_config["api_version"] == 
"2023-05-15" + + def test_grader_with_various_endpoints(self, mock_aoai_model_config): + """Test grader with different endpoint formats.""" + configs = [ + ("https://test.openai.azure.com/", True), + ("https://test.openai.azure.com", True), # No trailing slash + # HTTP (should work with validate=False) + ("http://localhost:8080/", False), + ("https://custom-domain.com/", False), # Custom domain + ] + + for endpoint, should_validate in configs: + config = AzureOpenAIModelConfiguration( + azure_deployment="test-deployment", + azure_endpoint=endpoint, + api_key="test-api-key", + api_version="2024-12-01-preview" + ) + + grader = AzureOpenAIScoreModelGrader( + model_config=config, + name="Endpoint Test", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + validate=False + ) + + # Model config gets converted to dict internally + assert grader._model_config["azure_endpoint"] == endpoint + + def test_grader_with_empty_credentials(self): + """Test grader with empty/invalid credentials.""" + # Should raise EvaluationException as expected + from azure.ai.evaluation._exceptions import EvaluationException + + with pytest.raises(EvaluationException): + config = AzureOpenAIModelConfiguration( + azure_deployment="", + azure_endpoint="", + api_key="", + api_version="" + ) + AzureOpenAIScoreModelGrader( + model_config=config, + name="Empty Creds", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + validate=True + ) + + def test_grader_with_very_long_names(self, mock_aoai_model_config): + """Test grader with very long names and model names.""" + long_name = "A" * 1000 + long_model = "gpt-4-very-long-model-name-" + "x" * 100 + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name=long_name, + model=long_model, + input=[{"role": "user", "content": "test"}] + ) + + assert grader._grader_config.name == long_name + assert grader._grader_config.model == long_model + + +@pytest.mark.unittest +class TestAzureOpenAIScoreModelGraderRegistryEdgeCases: + """Test edge cases in grader registry integration.""" + + def test_registry_with_duplicate_grader_names( + self, mock_aoai_model_config + ): + """Test registry behavior with duplicate grader names.""" + grader1 = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Duplicate Name", + model="gpt-4", + input=[{"role": "user", "content": "test1"}] + ) + + grader2 = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Duplicate Name", + model="gpt-4o", + input=[{"role": "user", "content": "test2"}] + ) + + evaluators = { + "grader_a": grader1, + "grader_b": grader2 + } + + just_evaluators, aoai_graders = _split_evaluators_and_grader_configs( + evaluators + ) + + assert len(aoai_graders) == 2 + assert "grader_a" in aoai_graders + assert "grader_b" in aoai_graders + + def test_registry_with_none_evaluators(self): + """Test registry behavior with None evaluators.""" + evaluators = { + "valid_grader": None, + "another_none": None + } + + # Should handle None values gracefully + just_evaluators, aoai_graders = _split_evaluators_and_grader_configs( + evaluators + ) + + # All None values should be in just_evaluators + assert len(just_evaluators) == 2 + assert len(aoai_graders) == 0 + + def test_registry_conversion_with_invalid_params(self): + """Test grader conversion with invalid initialization parameters.""" + # Missing required parameter + invalid_params = { + "name": "Test", + # Missing model and input + } + + with pytest.raises(Exception): + 
_convert_remote_eval_params_to_grader( + AzureOpenAIScoreModelGrader.id, + init_params=invalid_params + ) + + def test_registry_conversion_with_extra_params( + self, mock_aoai_model_config + ): + """Test grader conversion with extra unknown parameters.""" + params_with_extra = { + "model_config": mock_aoai_model_config, + "name": "Extra Params", + "model": "gpt-4", + "input": [{"role": "user", "content": "test"}], + "unknown_param": "should_be_ignored", + "another_extra": 42 + } + + # Should succeed and ignore extra params + grader = _convert_remote_eval_params_to_grader( + AzureOpenAIScoreModelGrader.id, + init_params=params_with_extra + ) + + assert isinstance(grader, AzureOpenAIScoreModelGrader) + assert grader._grader_config.name == "Extra Params" + + +@pytest.mark.unittest +class TestAzureOpenAIScoreModelGraderPerformanceEdgeCases: + """Test edge cases related to performance and resource usage.""" + + def test_grader_with_many_input_messages(self, mock_aoai_model_config): + """Test grader with large number of input messages.""" + many_messages = [] + for i in range(100): + many_messages.append({ + "role": "user" if i % 2 == 0 else "assistant", + "content": f"Message {i}: {{{{ item.data_{i} }}}}" + }) + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Many Messages", + model="gpt-4", + input=many_messages + ) + + assert len(grader._grader_config.input) == 100 + + def test_grader_creation_performance(self, mock_aoai_model_config): + """Test creating many graders doesn't cause memory issues.""" + graders = [] + + for i in range(50): # Create 50 graders + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name=f"Grader {i}", + model="gpt-4", + input=[{"role": "user", "content": f"Test {i}"}], + validate=False + ) + graders.append(grader) + + assert len(graders) == 50 + # Check that all graders are unique instances + assert len(set(id(g) for g in graders)) == 50 + + +@pytest.mark.unittest +class TestAzureOpenAIScoreModelGraderCompatibility: + """Test compatibility with different SDK components.""" + + def test_grader_with_different_evaluator_types( + self, mock_aoai_model_config + ): + """Test grader compatibility with various evaluator types.""" + try: + from azure.ai.evaluation import F1ScoreEvaluator + + f1_eval = F1ScoreEvaluator() + + score_grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Compatibility Test", + model="gpt-4", + input=[{"role": "user", "content": "test"}] + ) + + def custom_eval(x): + return {"score": 0.5} + + evaluators = { + "f1": f1_eval, + "custom": custom_eval, + "score_grader": score_grader + } + + just_evaluators, aoai_graders = ( + _split_evaluators_and_grader_configs(evaluators) + ) + + assert len(just_evaluators) >= 2 # f1 and custom + assert len(aoai_graders) == 1 + assert "score_grader" in aoai_graders + + except ImportError: + # Skip if evaluators not available + pytest.skip("Built-in evaluators not available") + + def test_grader_string_representation(self, mock_aoai_model_config): + """Test string representation of grader for debugging.""" + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="String Repr Test", + model="gpt-4", + input=[{"role": "user", "content": "test"}] + ) + + # Should have meaningful string representation + grader_str = str(grader) + assert ("AzureOpenAIScoreModelGrader" in grader_str or + "String Repr Test" in grader_str) + + @patch( + 'azure.ai.evaluation._aoai.score_model_grader.' 
+        'AzureOpenAIGrader.get_client'
+    )
+    def test_grader_with_client_initialization_error(
+        self, mock_get_client, mock_aoai_model_config
+    ):
+        """Test grader behavior when client initialization fails."""
+        mock_get_client.side_effect = Exception("Client initialization failed")
+
+        # Should still create grader object (client is created lazily)
+        grader = AzureOpenAIScoreModelGrader(
+            model_config=mock_aoai_model_config,
+            name="Client Error Test",
+            model="gpt-4",
+            input=[{"role": "user", "content": "test"}],
+            validate=False
+        )
+
+        assert grader is not None
+        assert grader.id == "azureai://built-in/evaluators/azure-openai/scorer_grader"
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_save_eval.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_save_eval.py
index a80ba79b8726..eeef523053b2 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_save_eval.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_save_eval.py
@@ -39,6 +39,7 @@ class TestSaveEval:
         "RedTeamOutput",
         "AzureOpenAIGrader",
         "AzureOpenAILabelGrader",
+        "AzureOpenAIScoreModelGrader",
         "AzureOpenAIStringCheckGrader",
         "AzureOpenAITextSimilarityGrader"
     ])
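
For quick reference, a minimal usage sketch of the grader introduced in this diff, assuming only the constructor and evaluate() integration shown above; the endpoint, key, deployment, and data path below are placeholders, not values from this change.

from azure.ai.evaluation import (
    evaluate,
    AzureOpenAIModelConfiguration,
    AzureOpenAIScoreModelGrader,
)

# Placeholder Azure OpenAI resource details; substitute real values.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com/",
    api_key="<api-key>",
    azure_deployment="gpt-4o-mini",
    api_version="2024-12-01-preview",
)

# Continuous 0.0-1.0 score; pass_threshold defaults to the midpoint of range.
grader = AzureOpenAIScoreModelGrader(
    model_config=model_config,
    name="Response quality",
    model="gpt-4o-mini",
    input=[
        {"role": "system", "content": "Rate the response from 0.0 to 1.0."},
        {"role": "user", "content": "Conversation: {{ item.conversation }}"},
    ],
    range=[0.0, 1.0],
)

# The grader is supplied to evaluate() like any other evaluator; its results
# are merged into the standard evaluation output.
result = evaluate(
    data="conversations.jsonl",  # placeholder JSONL; rows are referenced as {{ item.* }}
    evaluators={"conversation_quality": grader},
)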