From e8899873c911869086c230c9c31332e49ce02627 Mon Sep 17 00:00:00 2001 From: Nagkumar Arkalgud Date: Mon, 16 Jun 2025 15:28:15 -0700 Subject: [PATCH 1/5] feat: Add AzureOpenAIScoreModelGrader for continuous scoring evaluation - Implement AzureOpenAIScoreModelGrader in _aoai/score_model_grader.py - Update module exports in _aoai/__init__.py and __init__.py - Register grader in _evaluate/_evaluate_aoai.py grader registry - Add comprehensive sample script with real credentials support - Include integration plan documentation - Support conversation-style input, score ranges, and sampling parameters - Handle template variables using {{ item.field }} syntax - Provide fallback demo mode for configuration testing --- .../azure/ai/evaluation/__init__.py | 2 + .../azure/ai/evaluation/_aoai/__init__.py | 2 + .../ai/evaluation/_aoai/score_model_grader.py | 84 +++ .../ai/evaluation/_evaluate/_evaluate_aoai.py | 2 + .../integrating_aoai_score_model_grader.md | 608 ++++++++++++++++++ .../samples/aoai_score_model_grader_sample.py | 470 ++++++++++++++ 6 files changed, 1168 insertions(+) create mode 100644 sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/score_model_grader.py create mode 100644 sdk/evaluation/azure-ai-evaluation/integrating_aoai_score_model_grader.md create mode 100644 sdk/evaluation/azure-ai-evaluation/samples/aoai_score_model_grader_sample.py diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py index 2c9daebb8b41..d71d5b1fe7c6 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py @@ -45,6 +45,7 @@ from ._aoai.label_grader import AzureOpenAILabelGrader from ._aoai.string_check_grader import AzureOpenAIStringCheckGrader from ._aoai.text_similarity_grader import AzureOpenAITextSimilarityGrader +from ._aoai.score_model_grader import AzureOpenAIScoreModelGrader _patch_all = [] @@ -99,6 +100,7 @@ "AzureOpenAILabelGrader", "AzureOpenAIStringCheckGrader", "AzureOpenAITextSimilarityGrader", + "AzureOpenAIScoreModelGrader", ] __all__.extend([p for p in _patch_all if p not in __all__]) \ No newline at end of file diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/__init__.py index f5fa183c8914..ffab8bf38247 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/__init__.py @@ -4,7 +4,9 @@ from .aoai_grader import AzureOpenAIGrader +from .score_model_grader import AzureOpenAIScoreModelGrader __all__ = [ "AzureOpenAIGrader", + "AzureOpenAIScoreModelGrader", ] \ No newline at end of file diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/score_model_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/score_model_grader.py new file mode 100644 index 000000000000..22480a9a4f80 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/score_model_grader.py @@ -0,0 +1,84 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# --------------------------------------------------------- +from typing import Any, Dict, Union, List, Optional + +from azure.ai.evaluation._model_configurations import ( + AzureOpenAIModelConfiguration, + OpenAIModelConfiguration +) +from openai.types.graders import ScoreModelGrader +from azure.ai.evaluation._common._experimental import experimental + +from .aoai_grader import AzureOpenAIGrader + + +@experimental +class AzureOpenAIScoreModelGrader(AzureOpenAIGrader): + """ + Wrapper class for OpenAI's score model graders. + + Enables continuous scoring evaluation with custom prompts and flexible + conversation-style inputs. Supports configurable score ranges and + pass thresholds for binary classification. + + Supplying a ScoreModelGrader to the `evaluate` method will cause an + asynchronous request to evaluate the grader via the OpenAI API. The + results of the evaluation will then be merged into the standard + evaluation results. + + :param model_config: The model configuration to use for the grader. + :type model_config: Union[ + ~azure.ai.evaluation.AzureOpenAIModelConfiguration, + ~azure.ai.evaluation.OpenAIModelConfiguration + ] + :param input: The input messages for the grader. List of conversation + messages with role and content. + :type input: List[Dict[str, str]] + :param model: The model to use for the evaluation. + :type model: str + :param name: The name of the grader. + :type name: str + :param range: The range of the score. Defaults to [0, 1]. + :type range: Optional[List[float]] + :param sampling_params: The sampling parameters for the model. + :type sampling_params: Optional[Dict[str, Any]] + :param kwargs: Additional keyword arguments to pass to the grader. + :type kwargs: Any + """ + + id = "aoai://score_model" + + def __init__( + self, + *, + model_config: Union[ + AzureOpenAIModelConfiguration, OpenAIModelConfiguration + ], + input: List[Dict[str, str]], + model: str, + name: str, + range: Optional[List[float]] = None, + sampling_params: Optional[Dict[str, Any]] = None, + **kwargs: Any + ): + # Create OpenAI ScoreModelGrader instance + grader_kwargs = { + "input": input, + "model": model, + "name": name, + "type": "score_model" + } + + if range is not None: + grader_kwargs["range"] = range + if sampling_params is not None: + grader_kwargs["sampling_params"] = sampling_params + + grader = ScoreModelGrader(**grader_kwargs) + + super().__init__( + model_config=model_config, + grader_config=grader, + **kwargs + ) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py index fe2a8effc0e9..249af27c83ea 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py @@ -316,12 +316,14 @@ def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]: AzureOpenAILabelGrader, AzureOpenAIStringCheckGrader, AzureOpenAITextSimilarityGrader, + AzureOpenAIScoreModelGrader, ) id_map = { AzureOpenAIGrader.id: AzureOpenAIGrader, AzureOpenAILabelGrader.id: AzureOpenAILabelGrader, AzureOpenAIStringCheckGrader.id: AzureOpenAIStringCheckGrader, AzureOpenAITextSimilarityGrader.id: AzureOpenAITextSimilarityGrader, + AzureOpenAIScoreModelGrader.id: AzureOpenAIScoreModelGrader, } for key in id_map.keys(): diff --git a/sdk/evaluation/azure-ai-evaluation/integrating_aoai_score_model_grader.md 
b/sdk/evaluation/azure-ai-evaluation/integrating_aoai_score_model_grader.md new file mode 100644 index 000000000000..3d79813ec011 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/integrating_aoai_score_model_grader.md @@ -0,0 +1,608 @@ +# Integrating Azure OpenAI Score Model Grader + +## Overview + +This document provides a comprehensive plan for integrating the `AzureOpenAIScoreModelGrader` into the Azure AI Evaluation SDK. The Score Model Grader enables continuous scoring (0.0-1.0) with custom prompts, complementing the existing categorical and binary graders. + +## Current State Analysis + +### Existing AOAI Grader Architecture + +The Azure AI Evaluation SDK currently supports three types of AOAI graders: + +1. **`AzureOpenAIGrader`** (Base Class) + - Located: `azure/ai/evaluation/_aoai/aoai_grader.py` + - Handles model configuration and OpenAI client creation + - Validates API keys and endpoints + - Supports both Azure OpenAI and OpenAI configurations + +2. **`AzureOpenAILabelGrader`** + - Located: `azure/ai/evaluation/_aoai/label_grader.py` + - Wraps OpenAI's `LabelModelGrader` + - Supports classification with predefined labels + - Uses `pass_threshold` for binary pass/fail results + +3. **`AzureOpenAIStringCheckGrader`** + - Located: `azure/ai/evaluation/_aoai/string_check_grader.py` + - Wraps OpenAI's `StringCheckGrader` + - Supports string operations: `eq`, `ne`, `like`, `ilike` + - Binary pass/fail results only + +4. **`AzureOpenAITextSimilarityGrader`** + - Located: `azure/ai/evaluation/_aoai/text_similarity_grader.py` + - Wraps OpenAI's `TextSimilarityGrader` + - Supports various similarity metrics (BLEU, ROUGE, cosine, etc.) + - Uses `pass_threshold` for binary classification + +### Integration Points + +**Evaluation Pipeline:** +- `_evaluate_aoai.py` - Orchestrates AOAI evaluations +- `_split_evaluators_and_grader_configs()` - Separates AOAI graders from regular evaluators +- `_get_grader_class()` - Maps grader IDs to implementation classes +- Main `__init__.py` - Exports public API classes + +**Key Functions:** +- `_begin_aoai_evaluation()` - Starts AOAI evaluation runs +- `_get_evaluation_run_results()` - Retrieves and formats results +- `_convert_remote_eval_params_to_grader()` - Creates grader instances from config + +## Missing Component: Score Model Grader + +### OpenAI SDK ScoreModelGrader Structure + +Based on analysis of the OpenAI SDK, the `ScoreModelGrader` has the following structure: + +```python +class ScoreModelGrader(BaseModel): + input: List[Input] # Conversation-style messages + model: str # Grading model (e.g., "gpt-4") + name: str # Grader name + type: Literal["score_model"] # Always "score_model" + range: Optional[List[float]] # Score range, defaults to [0, 1] + sampling_params: Optional[object] # Model parameters + +class Input(BaseModel): + content: str # Message content with templates + role: Literal["user", "assistant", "system", "developer"] + type: Optional[Literal["message"]] = None +``` + +### Key Capabilities + +**Continuous Scoring:** +- Returns floating-point scores (typically 0.0-1.0) +- Configurable score ranges (e.g., [0, 5], [0, 100]) +- Pass/fail threshold for binary classification + +**Flexible Prompting:** +- Multi-message conversations (system, user, assistant, developer roles) +- Template string support for dynamic content injection +- Custom evaluation criteria and instructions + +**Model Configuration:** +- Supports any OpenAI-compatible model +- Configurable sampling parameters (temperature, max_tokens, etc.) 
+- Independent model selection for grading vs. evaluation + +## Implementation Plan + +### Phase 1: Core Implementation + +#### Step 1.1: Create AzureOpenAIScoreModelGrader Class + +**File:** `azure/ai/evaluation/_aoai/score_model_grader.py` + +```python +from typing import Any, Dict, Union, List, Optional +from typing_extensions import Literal + +from azure.ai.evaluation._model_configurations import ( + AzureOpenAIModelConfiguration, + OpenAIModelConfiguration +) +from openai.types.graders import ScoreModelGrader +from azure.ai.evaluation._common._experimental import experimental + +from .aoai_grader import AzureOpenAIGrader + + +@experimental +class AzureOpenAIScoreModelGrader(AzureOpenAIGrader): + """ + Wrapper class for OpenAI's score model graders. + + Enables continuous scoring evaluation with custom prompts and flexible + conversation-style inputs. Supports configurable score ranges and + pass thresholds for binary classification. + """ + + id = "aoai://score_model" + + def __init__( + self, + *, + model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], + input: List[Dict[str, str]], + model: str, + name: str, + range: Optional[List[float]] = None, + sampling_params: Optional[Dict[str, Any]] = None, + **kwargs: Any + ): + # Create OpenAI ScoreModelGrader instance + grader_kwargs = { + "input": input, + "model": model, + "name": name, + "type": "score_model" + } + + if range is not None: + grader_kwargs["range"] = range + if sampling_params is not None: + grader_kwargs["sampling_params"] = sampling_params + + grader = ScoreModelGrader(**grader_kwargs) + + super().__init__( + model_config=model_config, + grader_config=grader, + **kwargs + ) +``` + +**Key Implementation Details:** +- Inherits from `AzureOpenAIGrader` for consistent model handling +- Wraps OpenAI's `ScoreModelGrader` following existing patterns +- Supports all ScoreModelGrader parameters +- Uses experimental decorator for preview functionality + +#### Step 1.2: Update Module Exports + +**File:** `azure/ai/evaluation/_aoai/__init__.py` + +```python +from .aoai_grader import AzureOpenAIGrader +from .score_model_grader import AzureOpenAIScoreModelGrader # Add this import + +__all__ = [ + "AzureOpenAIGrader", + "AzureOpenAIScoreModelGrader", # Add this export +] +``` + +**File:** `azure/ai/evaluation/__init__.py` + +```python +# Add import +from ._aoai.score_model_grader import AzureOpenAIScoreModelGrader + +# Add to exports (around line 40) +``` + +#### Step 1.3: Update Grader Registry + +**File:** `azure/ai/evaluation/_evaluate/_evaluate_aoai.py` + +Update the `_get_grader_class()` function: + +```python +def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]: + """Given a model ID, return the class of the corresponding grader wrapper.""" + + from azure.ai.evaluation import ( + AzureOpenAIGrader, + AzureOpenAILabelGrader, + AzureOpenAIStringCheckGrader, + AzureOpenAITextSimilarityGrader, + AzureOpenAIScoreModelGrader, # Add this import + ) + + id_map = { + AzureOpenAIGrader.id: AzureOpenAIGrader, + AzureOpenAILabelGrader.id: AzureOpenAILabelGrader, + AzureOpenAIStringCheckGrader.id: AzureOpenAIStringCheckGrader, + AzureOpenAITextSimilarityGrader.id: AzureOpenAITextSimilarityGrader, + AzureOpenAIScoreModelGrader.id: AzureOpenAIScoreModelGrader, # Add this + } + + # ... 
rest of function unchanged +``` + +### Phase 2: Testing and Validation + +#### Step 2.1: Create Unit Tests + +**File:** `tests/unittests/test_aoai_score_model_grader.py` + +```python +import pytest +from azure.ai.evaluation import AzureOpenAIScoreModelGrader +from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration + + +class TestAzureOpenAIScoreModelGrader: + + def test_initialization_with_required_params(self): + """Test basic initialization with required parameters.""" + model_config = AzureOpenAIModelConfiguration( + azure_endpoint="https://test.openai.azure.com/", + api_key="test-key", + azure_deployment="gpt-4" + ) + + grader = AzureOpenAIScoreModelGrader( + model_config=model_config, + input=[ + {"role": "system", "content": "You are a helpful evaluator."}, + {"role": "user", "content": "Rate this: {{ data.text }}"} + ], + model="gpt-4", + name="Test Grader" + ) + + assert grader._grader_config.name == "Test Grader" + assert grader._grader_config.model == "gpt-4" + assert grader._grader_config.type == "score_model" + + def test_initialization_with_optional_params(self): + """Test initialization with optional parameters.""" + # Test with range and sampling_params + # ... implementation + + def test_client_creation(self): + """Test that the grader can create appropriate OpenAI client.""" + # ... implementation +``` + +#### Step 2.2: Create Integration Tests + +**File:** `tests/e2etests/test_aoai_score_model_integration.py` + +```python +import pytest +from azure.ai.evaluation import evaluate, AzureOpenAIScoreModelGrader +from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration + + +@pytest.mark.skipif("not config.getoption('--live')") +class TestAzureOpenAIScoreModelIntegration: + + def test_evaluate_with_score_model_grader(self): + """Test end-to-end evaluation with score model grader.""" + # Create test data + # Configure grader + # Run evaluation + # Validate results format + pass +``` + +#### Step 2.3: Update Sample and Documentation + +**Sample File:** `samples/aoai_score_model_grader_sample.py` (already created) + +**Documentation Updates:** +- Update README.md with new grader information +- Add API documentation +- Update migration guide if needed + +### Phase 3: Advanced Features and Optimizations + +#### Step 3.1: Enhanced Template Support + +**Template Variables:** +- `{{ data.field_name }}` - Access input data fields +- `{{ outputs.evaluator_name }}` - Access other evaluator outputs +- Custom template functions for data formatting + +**Implementation:** +- Validate template syntax during initialization +- Provide clear error messages for template issues +- Support nested field access (e.g., `{{ data.conversation.messages[0] }}`) + +#### Step 3.2: Result Processing Enhancements + +**Score Parsing:** +- Handle different score formats (JSON, plain text, structured) +- Extract reasoning/explanation from model responses +- Validate score ranges and handle edge cases + +**Metrics Calculation:** +- Mean, median, standard deviation of scores +- Distribution analysis +- Correlation with other evaluators + +#### Step 3.3: Performance Optimizations + +**Batch Processing:** +- Group evaluations by model configuration +- Optimize API calls and reduce latency +- Implement retry logic with exponential backoff + +**Caching:** +- Cache grader configurations +- Store evaluation results for reuse +- Implement result invalidation strategies + +## Usage Examples + +### Basic Conversation Quality Assessment + +```python +from 
azure.ai.evaluation import evaluate, AzureOpenAIScoreModelGrader +from azure.ai.evaluation import AzureOpenAIModelConfiguration + +# Configure model +model_config = AzureOpenAIModelConfiguration( + azure_endpoint="https://your-endpoint.openai.azure.com/", + api_key="your-api-key", + azure_deployment="gpt-4" +) + +# Create grader +quality_grader = AzureOpenAIScoreModelGrader( + model_config=model_config, + name="Conversation Quality", + model="gpt-4", + input=[ + { + "role": "system", + "content": "Rate conversation quality from 0.0 to 1.0" + }, + { + "role": "user", + "content": "Conversation: {{ data.conversation }}\\nScore:" + } + ], + range=[0.0, 1.0], + pass_threshold=0.7 +) + +# Run evaluation +results = evaluate( + data="conversations.jsonl", + evaluators={"quality": quality_grader} +) +``` + +### Multi-Criteria Evaluation + +```python +# Multiple score model graders for comprehensive evaluation +evaluators = { + "helpfulness": AzureOpenAIScoreModelGrader( + model_config=model_config, + name="Helpfulness", + model="gpt-4", + input=[...], # Helpfulness-specific prompts + pass_threshold=0.6 + ), + "accuracy": AzureOpenAIScoreModelGrader( + model_config=model_config, + name="Factual Accuracy", + model="gpt-4", + input=[...], # Accuracy-specific prompts + pass_threshold=0.8 + ), + "clarity": AzureOpenAIScoreModelGrader( + model_config=model_config, + name="Response Clarity", + model="gpt-4", + input=[...], # Clarity-specific prompts + pass_threshold=0.7 + ) +} + +results = evaluate(data="data.jsonl", evaluators=evaluators) +``` + +### Custom Score Ranges + +```python +# 1-5 star rating system +star_rating_grader = AzureOpenAIScoreModelGrader( + model_config=model_config, + name="Star Rating", + model="gpt-4", + input=[ + { + "role": "system", + "content": "Rate from 1 (worst) to 5 (best) stars" + }, + { + "role": "user", + "content": "Review: {{ data.review }}\\nStars (1-5):" + } + ], + range=[1.0, 5.0], + pass_threshold=3.0 # 3+ stars = pass +) +``` + +## Error Handling and Edge Cases + +### Common Error Scenarios + +1. **Invalid Template Syntax** + - Missing closing braces + - Invalid field references + - Nested template errors + +2. **Model Configuration Issues** + - Invalid API keys or endpoints + - Unsupported model names + - Network connectivity problems + +3. **Score Parsing Failures** + - Non-numeric responses + - Scores outside specified range + - Malformed JSON responses + +4. 
**Evaluation Pipeline Errors** + - Data formatting issues + - Column mapping conflicts + - Resource quota limits + +### Error Handling Implementation + +```python +class AzureOpenAIScoreModelGrader(AzureOpenAIGrader): + + def _validate_grader_config(self) -> None: + """Enhanced validation for score model graders.""" + super()._validate_grader_config() + + # Validate input messages + if not self._grader_config.input: + raise EvaluationException("Score model grader requires input messages") + + # Validate model name + if not self._grader_config.model: + raise EvaluationException("Score model grader requires model name") + + # Validate score range + if self._grader_config.range: + if len(self._grader_config.range) != 2: + raise EvaluationException("Score range must contain exactly 2 values") + if self._grader_config.range[0] >= self._grader_config.range[1]: + raise EvaluationException("Invalid score range: min must be < max") + + # Validate template syntax in input messages + self._validate_template_syntax() + + def _validate_template_syntax(self) -> None: + """Validate template strings in input messages.""" + for msg in self._grader_config.input: + content = msg.get("content", "") + # Check for balanced braces, valid field references, etc. + # ... implementation +``` + +## Testing Strategy + +### Unit Testing + +**Test Categories:** +1. **Initialization Tests** - Parameter validation, error handling +2. **Configuration Tests** - Model config validation, client creation +3. **Template Tests** - Template syntax validation, variable substitution +4. **Serialization Tests** - JSON serialization/deserialization + +### Integration Testing + +**Test Scenarios:** +1. **End-to-End Evaluation** - Complete evaluation pipeline +2. **Multi-Grader Scenarios** - Multiple graders with different configs +3. **Error Recovery** - Network failures, API errors, malformed responses +4. 
**Performance Testing** - Large datasets, concurrent evaluations + +### Manual Testing Checklist + +- [ ] Basic grader initialization with minimal parameters +- [ ] Advanced grader configuration with all optional parameters +- [ ] Template variable substitution with various data formats +- [ ] Score range validation and threshold behavior +- [ ] Integration with existing evaluation pipeline +- [ ] Error handling for common failure scenarios +- [ ] Performance with large datasets (100+ samples) +- [ ] Concurrent evaluation with multiple graders + +## Deployment Considerations + +### Backward Compatibility + +- All changes are additive - no breaking changes to existing APIs +- New grader is marked as experimental initially +- Existing evaluations continue to work unchanged + +### Documentation Updates + +- Update API reference documentation +- Add usage examples to README +- Create migration guide for users +- Update troubleshooting guide + +### Monitoring and Observability + +- Add telemetry for score model grader usage +- Monitor evaluation success/failure rates +- Track performance metrics (latency, throughput) +- Log template parsing errors and API failures + +## Success Criteria + +### Functional Requirements + +- [ ] `AzureOpenAIScoreModelGrader` class implemented and tested +- [ ] Integration with existing evaluation pipeline +- [ ] Support for all OpenAI ScoreModelGrader features +- [ ] Comprehensive error handling and validation +- [ ] Template variable substitution working correctly + +### Quality Requirements + +- [ ] Unit test coverage > 90% +- [ ] Integration tests passing +- [ ] Documentation complete and accurate +- [ ] Sample code working and demonstrative +- [ ] Performance acceptable (< 2x overhead vs. direct OpenAI calls) + +### User Experience Requirements + +- [ ] Consistent API with existing graders +- [ ] Clear error messages for common issues +- [ ] Comprehensive examples and documentation +- [ ] Smooth migration path from direct OpenAI usage + +## Timeline Estimate + +**Phase 1 - Core Implementation:** 3-5 days +- Day 1: Implement `AzureOpenAIScoreModelGrader` class +- Day 2: Update exports and grader registry +- Day 3: Create unit tests +- Day 4-5: Integration testing and bug fixes + +**Phase 2 - Testing and Documentation:** 2-3 days +- Day 1: Comprehensive testing +- Day 2: Documentation updates +- Day 3: Sample refinement and validation + +**Phase 3 - Advanced Features:** 1-2 days (optional) +- Enhanced template support +- Performance optimizations +- Additional error handling + +**Total Estimated Time:** 6-10 days + +## Risk Assessment + +### High Risk +- **OpenAI SDK Changes** - Risk of breaking changes in grader interfaces +- **Template Complexity** - Complex template syntax could be error-prone + +### Medium Risk +- **Performance Impact** - Additional API calls could affect evaluation speed +- **Error Handling Gaps** - Unforeseen edge cases in score parsing + +### Low Risk +- **Integration Issues** - Existing pipeline is well-established +- **Backward Compatibility** - Additive changes only + +### Mitigation Strategies +- Pin OpenAI SDK version to avoid breaking changes +- Implement comprehensive test suite with edge cases +- Add performance monitoring and optimization +- Create detailed error handling documentation + +## Conclusion + +This implementation plan provides a comprehensive roadmap for integrating the `AzureOpenAIScoreModelGrader` into the Azure AI Evaluation SDK. 
The approach leverages existing infrastructure while adding powerful continuous scoring capabilities that complement the current categorical and binary graders. + +The phased implementation approach ensures: +- **Reliability** - Thorough testing at each stage +- **Maintainability** - Consistent with existing patterns +- **Usability** - Clear API and comprehensive documentation +- **Extensibility** - Foundation for future enhancements + +Following this plan will result in a robust, well-tested implementation that provides users with flexible, powerful evaluation capabilities while maintaining the high quality and reliability standards of the Azure AI Evaluation SDK. diff --git a/sdk/evaluation/azure-ai-evaluation/samples/aoai_score_model_grader_sample.py b/sdk/evaluation/azure-ai-evaluation/samples/aoai_score_model_grader_sample.py new file mode 100644 index 000000000000..b760a2ba3bd9 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/samples/aoai_score_model_grader_sample.py @@ -0,0 +1,470 @@ +#!/usr/bin/env python3 + +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- + +""" +Sample demonstrating the use of AzureOpenAIScoreModelGrader for continuous +scoring evaluation. + +This sample shows how to: +1. Configure an Azure OpenAI model for grading +2. Create a score model grader with custom prompts +3. Run evaluation using the evaluate() method +4. Interpret continuous scoring results + +Prerequisites: +- Azure OpenAI resource with API key and endpoint +- Model deployment (e.g., gpt-4, gpt-4o-mini) +- Sample conversation data in JSONL format +- Environment variables configured in .env file +""" + +import json +import os +from dotenv import load_dotenv +import pandas as pd +from azure.ai.evaluation import evaluate, AzureOpenAIScoreModelGrader +from azure.ai.evaluation import AzureOpenAIModelConfiguration + +# Load environment variables +load_dotenv() + + +def create_sample_data() -> str: + """Create sample conversation data for testing.""" + sample_conversations = [ + { + "conversation": { + "messages": [ + { + "content": "How can I improve my Python coding skills?", + "role": "user" + }, + { + "content": ( + "Here are some effective ways to improve your " + "Python skills: 1) Practice coding daily with " + "platforms like LeetCode or HackerRank, 2) Work " + "on real projects that interest you, 3) Read " + "other people's code on GitHub, 4) Join Python " + "communities and forums, 5) Take online courses " + "or tutorials. The key is consistent practice and " + "building projects that challenge you." + ), + "role": "assistant" + } + ] + }, + "expected_quality": "high", + "context": "Programming advice request" + }, + { + "conversation": { + "messages": [ + { + "content": "What is Python?", + "role": "user" + }, + { + "content": "Python is a programming language.", + "role": "assistant" + } + ] + }, + "expected_quality": "low", + "context": "Basic definition request" + }, + { + "conversation": { + "messages": [ + { + "content": ( + "Can you explain machine learning concepts " + "for a beginner?" + ), + "role": "user" + }, + { + "content": ( + "Machine learning is a subset of artificial " + "intelligence where computers learn patterns from " + "data without being explicitly programmed. Think " + "of it like teaching a child to recognize animals " + "- you show them many pictures of cats and dogs, " + "and eventually they learn to distinguish between " + "them. 
In ML, we feed algorithms lots of data so " + "they can make predictions or decisions on new, " + "unseen data. Common types include supervised " + "learning (learning from labeled examples), " + "unsupervised learning (finding hidden patterns), " + "and reinforcement learning (learning through " + "trial and error)." + ), + "role": "assistant" + } + ] + }, + "expected_quality": "high", + "context": "Educational explanation request" + }, + { + "conversation": { + "messages": [ + { + "content": "Help me debug this code", + "role": "user" + }, + { + "content": "Sure, please share your code.", + "role": "assistant" + } + ] + }, + "expected_quality": "medium", + "context": "Debug assistance request" + }, + { + "conversation": { + "messages": [ + { + "content": ( + "What are the best practices for writing " + "clean Python code?" + ), + "role": "user" + }, + { + "content": ( + "Here are key best practices for writing clean " + "Python code: 1) Follow PEP 8 style guidelines " + "for consistent formatting, 2) Use descriptive " + "variable and function names that explain their " + "purpose, 3) Write docstrings for functions and " + "classes, 4) Keep functions small and focused on " + "a single task, 5) Use type hints to improve code " + "clarity, 6) Handle exceptions appropriately, " + "7) Write unit tests for your code, 8) Use " + "virtual environments for dependency management, " + "9) Comment complex logic but avoid obvious " + "comments, 10) Refactor code regularly to improve " + "readability and maintainability." + ), + "role": "assistant" + } + ] + }, + "expected_quality": "high", + "context": "Best practices inquiry" + } + ] + + # Create JSONL file + filename = "sample_conversations.jsonl" + with open(filename, 'w') as f: + for conv in sample_conversations: + f.write(json.dumps(conv) + '\n') + + print(f"Created sample data file: {filename}") + return filename + + +def demonstrate_score_model_grader(): + """Demonstrate the AzureOpenAIScoreModelGrader usage with real credentials.""" + + # Create sample data + data_file = create_sample_data() + + print("=== Azure OpenAI Score Model Grader Demo ===\n") + + try: + # 1. Configure Azure OpenAI model using environment variables + model_config = AzureOpenAIModelConfiguration( + azure_endpoint=os.environ.get("4o_mini_target_endpoint"), + api_key=os.environ.get("4o_mini_target_endpoint_key"), + azure_deployment=os.environ.get( + "4o_mini_target_endpoint_deployment_name" + ), + api_version="2024-12-01-preview" + ) + + print("βœ… Model configuration loaded successfully") + + # 2. Create conversation quality grader + conversation_quality_grader = AzureOpenAIScoreModelGrader( + model_config=model_config, + name="Conversation Quality Assessment", + model="gpt-4o-mini", + input=[ + { + "role": "system", + "content": ( + "You are an expert conversation quality evaluator. " + "Assess the quality of AI assistant responses based on " + "helpfulness, completeness, accuracy, and " + "appropriateness. Return a score between 0.0 (very " + "poor) and 1.0 (excellent)." + ) + }, + { + "role": "user", + "content": ( + "Evaluate this conversation:\n" + "Context: {{ item.context }}\n" + "Messages: {{ item.conversation }}\n\n" + "Provide a quality score from 0.0 to 1.0." + ) + } + ], + range=[0.0, 1.0], + sampling_params={ + "temperature": 0.0 + } + ) + + print("βœ… Conversation quality grader created successfully") + + # 3. 
Run evaluation with the score model grader + print("\nπŸš€ Running evaluation with score model grader...") + + result = evaluate( + data=data_file, + evaluators={ + "conversation_quality": conversation_quality_grader + } + ) + + # 4. Display results + print("\n=== Evaluation Results ===") + print(f"Total samples evaluated: {len(result['rows'])}") + + # Show metrics + print("\n=== Metrics Summary ===") + for metric_name, metric_value in result['metrics'].items(): + print(f"{metric_name}: {metric_value:.3f}") + + # Show detailed results + print("\n=== Sample Results ===") + df = pd.DataFrame(result['rows']) + + for i, row in df.head(3).iterrows(): + print(f"\nSample {i+1}:") + print(f" Context: {row.get('context', 'N/A')}") + + # Show grader results + for col in df.columns: + if col.startswith('outputs.'): + grader_name = col.split('.')[1] + if 'score' in col: + print(f" {grader_name} Score: {row[col]:.3f}") + elif 'passed' in col: + print(f" {grader_name} Passed: {row[col]}") + + print("\nβœ… Evaluation completed successfully!") + + except Exception as e: + print(f"\n❌ Error during evaluation: {str(e)}") + print("\nFalling back to demonstration mode...") + demonstrate_configuration_only() + + # Clean up + if os.path.exists(data_file): + os.remove(data_file) + print(f"\n🧹 Cleaned up temporary file: {data_file}") + + +def demonstrate_configuration_only(): + """Demonstrate grader configuration without running actual evaluation.""" + + try: + # Create sample data + data_file = create_sample_data() + + print("πŸ“ Testing grader configuration...") + + # Configure with placeholder values for testing + model_config = AzureOpenAIModelConfiguration( + azure_endpoint="https://test-endpoint.openai.azure.com/", + api_key="test-key", + azure_deployment="gpt-4o-mini", + api_version="2024-12-01-preview" + ) + + # Create a simple grader to test + test_grader = AzureOpenAIScoreModelGrader( + model_config=model_config, + name="Test Quality Grader", + model="gpt-4o-mini", + input=[ + { + "role": "system", + "content": "You are a test evaluator." 
+ }, + { + "role": "user", + "content": "Rate this: {{ data.conversation }}" + } + ] + ) + + print("βœ… Grader creation successful!") + print(f" - Grader ID: {test_grader.id}") + print(f" - Grader name: {test_grader._grader_config.name}") + print(f" - Grader model: {test_grader._grader_config.model}") + print(f" - Input messages: {len(test_grader._grader_config.input)}") + + print("\n🚧 Implementation Status:") + print(" - Sample data created: βœ…") + print(" - AzureOpenAIScoreModelGrader class: βœ… (implemented)") + print(" - Integration with evaluate(): βœ… (ready for testing)") + print("\nπŸ“– Ready for use!") + print(" Configure with real API credentials to run evaluations") + + # Clean up + if os.path.exists(data_file): + os.remove(data_file) + + except Exception as e: + print(f"❌ Error testing implementation: {e}") + print("\n🚧 Implementation Status:") + print(" - Sample data created: βœ…") + print(" - AzureOpenAIScoreModelGrader class: ❌ (error)") + print(" - Integration with evaluate(): ❌ (needs fixing)") + + +def demonstrate_different_grader_types(): + """Show examples of different score model grader configurations.""" + + print("\n=== Different Score Model Grader Examples ===\n") + + examples = [ + { + "name": "Helpfulness Grader", + "description": ( + "Evaluates how helpful the AI response is to the user" + ), + "config": { + "name": "Helpfulness Assessment", + "input": [ + { + "role": "system", + "content": ( + "Rate how helpful this AI response is in " + "addressing the user's needs." + ) + }, + { + "role": "user", + "content": ( + "User Question: {{ data.question }}\n" + "AI Response: {{ data.response }}\n\n" + "Helpfulness score (0.0-1.0):" + ) + } + ], + "range": [0.0, 1.0], + "pass_threshold": 0.6 + } + }, + { + "name": "Factual Accuracy Grader", + "description": ( + "Checks factual accuracy against reference information" + ), + "config": { + "name": "Factual Accuracy Check", + "input": [ + { + "role": "system", + "content": ( + "You are a fact-checker. Compare the AI response " + "with reference information and rate accuracy." + ) + }, + { + "role": "user", + "content": ( + "Reference: {{ data.reference }}\n" + "AI Response: {{ data.response }}\n\n" + "Accuracy score (0.0-1.0):" + ) + } + ], + "range": [0.0, 1.0], + "pass_threshold": 0.8 + } + }, + { + "name": "Clarity Grader", + "description": ( + "Evaluates how clear and understandable the response is" + ), + "config": { + "name": "Response Clarity", + "input": [ + { + "role": "developer", + "content": ( + "Evaluate the clarity and understandability " + "of this AI response." 
+ ) + }, + { + "role": "user", + "content": ( + "Response: {{ data.response }}\n\n" + "Clarity score (0.0-1.0, where 1.0 is perfectly " + "clear):" + ) + } + ], + "range": [0.0, 1.0], + "pass_threshold": 0.7, + "sampling_params": { + "temperature": 0.1, + "max_tokens": 150 + } + } + } + ] + + for example in examples: + print(f"🎯 {example['name']}") + print(f" Description: {example['description']}") + print(" Configuration:") + config = example['config'] + print(f" - Name: {config['name']}") + print(f" - Input Messages: {len(config['input'])} messages") + print(f" - Range: {config['range']}") + print(f" - Pass Threshold: {config['pass_threshold']}") + if 'sampling_params' in config: + print(f" - Sampling Params: {config['sampling_params']}") + print() + + +if __name__ == "__main__": + print("πŸš€ Starting Azure OpenAI Score Model Grader Demo\n") + + # Check if environment variables are set + required_vars = [ + "4o_mini_target_endpoint", + "4o_mini_target_endpoint_key", + "4o_mini_target_endpoint_deployment_name" + ] + + missing_vars = [var for var in required_vars if not os.environ.get(var)] + + if missing_vars: + print("⚠️ Missing environment variables:") + for var in missing_vars: + print(f" - {var}") + print("\nRunning in demonstration mode...\n") + demonstrate_configuration_only() + else: + print("βœ… All environment variables found") + demonstrate_score_model_grader() + + demonstrate_different_grader_types() + + print("\nπŸŽ‰ Demo completed!") From dd2824fe944b7e012661ceee48e6713abfb454e6 Mon Sep 17 00:00:00 2001 From: Nagkumar Arkalgud Date: Tue, 17 Jun 2025 09:10:26 -0700 Subject: [PATCH 2/5] Add tests --- .../ai/evaluation/_aoai/score_model_grader.py | 20 + .../data/score_model_test_data.jsonl | 3 + .../unittests/test_aoai_score_model_grader.py | 1160 +++++++++++++++++ 3 files changed, 1183 insertions(+) create mode 100644 sdk/evaluation/azure-ai-evaluation/tests/unittests/data/score_model_test_data.jsonl create mode 100644 sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_score_model_grader.py diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/score_model_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/score_model_grader.py index 22480a9a4f80..88077f57f9fa 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/score_model_grader.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/score_model_grader.py @@ -41,6 +41,9 @@ class AzureOpenAIScoreModelGrader(AzureOpenAIGrader): :type name: str :param range: The range of the score. Defaults to [0, 1]. :type range: Optional[List[float]] + :param pass_threshold: Score threshold for pass/fail classification. + Defaults to midpoint of range. + :type pass_threshold: Optional[float] :param sampling_params: The sampling parameters for the model. :type sampling_params: Optional[Dict[str, Any]] :param kwargs: Additional keyword arguments to pass to the grader. 
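
Note on the docstring addition above: `pass_threshold` is optional and, when omitted, falls back to the midpoint of `range`. The snippet below is an illustrative sketch only (not part of this diff); it assumes the constructor signature added in this patch and uses placeholder credentials.

```python
# Illustrative sketch, not part of the patch; placeholder endpoint/key for construction only.
from azure.ai.evaluation import (
    AzureOpenAIModelConfiguration,
    AzureOpenAIScoreModelGrader,
)

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://example.openai.azure.com/",
    api_key="placeholder-key",
    azure_deployment="gpt-4o-mini",
)

grader = AzureOpenAIScoreModelGrader(
    model_config=model_config,
    name="Quality",
    model="gpt-4o-mini",
    input=[{"role": "user", "content": "Rate 0-10: {{ item.text }}"}],
    range=[0.0, 10.0],  # no pass_threshold supplied
)

# With no explicit pass_threshold, the grader stores the midpoint of the range.
assert grader.pass_threshold == 5.0
```
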
@@ -59,9 +62,26 @@ def __init__( model: str, name: str, range: Optional[List[float]] = None, + pass_threshold: Optional[float] = None, sampling_params: Optional[Dict[str, Any]] = None, **kwargs: Any ): + # Validate range and pass_threshold + if range is not None: + if len(range) != 2 or range[0] >= range[1]: + raise ValueError("range must be a list of two numbers [min, max] where min < max") + else: + range = [0.0, 1.0] # Default range + + if pass_threshold is not None: + if range and (pass_threshold < range[0] or pass_threshold > range[1]): + raise ValueError(f"pass_threshold {pass_threshold} must be within range {range}") + else: + pass_threshold = (range[0] + range[1]) / 2 # Default to midpoint + + # Store pass_threshold as instance attribute + self.pass_threshold = pass_threshold + # Create OpenAI ScoreModelGrader instance grader_kwargs = { "input": input, diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/score_model_test_data.jsonl b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/score_model_test_data.jsonl new file mode 100644 index 000000000000..158412dfd6ea --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/score_model_test_data.jsonl @@ -0,0 +1,3 @@ +{"conversation": {"messages": [{"role": "user", "content": "How can I improve my Python coding skills?"}, {"role": "assistant", "content": "Here are some effective ways to improve your Python skills: 1) Practice coding daily with platforms like LeetCode or HackerRank, 2) Work on real projects that interest you, 3) Read other people's code on GitHub, 4) Join Python communities and forums, 5) Take online courses or tutorials."}]}, "context": "Programming advice", "expected_quality": "high"} +{"conversation": {"messages": [{"role": "user", "content": "What is Python?"}, {"role": "assistant", "content": "Python is a programming language."}]}, "context": "Basic definition", "expected_quality": "low"} +{"conversation": {"messages": [{"role": "user", "content": "Can you explain machine learning for a beginner?"}, {"role": "assistant", "content": "Machine learning is a subset of artificial intelligence where computers learn patterns from data without being explicitly programmed. Think of it like teaching a child to recognize animals - you show them many pictures of cats and dogs, and eventually they learn to distinguish between them."}]}, "context": "Educational explanation", "expected_quality": "high"} diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_score_model_grader.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_score_model_grader.py new file mode 100644 index 000000000000..e674e6fda724 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_score_model_grader.py @@ -0,0 +1,1160 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ + +""" +Comprehensive unit tests for AzureOpenAIScoreModelGrader. +This test suite covers initialization scenarios, edge cases, validation logic, +error handling, registry integration, and usage patterns. 
+""" + +import pytest +from unittest.mock import patch, AsyncMock + +from azure.ai.evaluation import AzureOpenAIModelConfiguration +from azure.ai.evaluation._aoai.score_model_grader import ( + AzureOpenAIScoreModelGrader +) +from azure.ai.evaluation._evaluate._evaluate_aoai import ( + _split_evaluators_and_grader_configs, + _convert_remote_eval_params_to_grader +) + + +@pytest.fixture +def mock_aoai_model_config(): + """Mock Azure OpenAI model configuration for testing.""" + return AzureOpenAIModelConfiguration( + azure_deployment="test-deployment", + azure_endpoint="https://test-endpoint.openai.azure.com/", + api_key="test-api-key", + api_version="2024-12-01-preview", + ) + + +@pytest.fixture +def basic_score_grader_config(): + """Basic configuration for score model grader.""" + return { + "name": "Test Score Grader", + "model": "gpt-4o-mini", + "input": [ + { + "role": "system", + "content": "You are a test evaluator. Rate from 0.0 to 1.0." + }, + { + "role": "user", + "content": "Rate this conversation: {{ item.conversation }}" + } + ], + "range": [0.0, 1.0], + "pass_threshold": 0.5, + "sampling_params": { + "temperature": 0.0, + "max_tokens": 100 + } + } + + +@pytest.mark.unittest +class TestAzureOpenAIScoreModelGrader: + """Test suite for AzureOpenAIScoreModelGrader.""" + + def test_grader_initialization_valid_config( + self, mock_aoai_model_config, basic_score_grader_config + ): + """Test successful grader initialization with valid configuration.""" + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + **basic_score_grader_config + ) + + assert grader is not None + assert grader.id == "aoai://score_model" + assert grader._model_config == mock_aoai_model_config + assert grader._grader_config.name == "Test Score Grader" + assert grader._grader_config.model == "gpt-4o-mini" + assert grader._grader_config.range == [0.0, 1.0] + assert grader.pass_threshold == 0.5 + + def test_grader_initialization_minimal_config(self, mock_aoai_model_config): + """Test grader initialization with minimal required configuration.""" + minimal_config = { + "name": "Minimal Grader", + "model": "gpt-4", + "input": [ + {"role": "user", "content": "Rate this: {{ item.data }}"} + ] + } + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + **minimal_config + ) + + assert grader is not None + assert grader._grader_config.name == "Minimal Grader" + assert grader._grader_config.range == [0.0, 1.0] # Default range + assert grader.pass_threshold == 0.5 # Default threshold + + def test_grader_initialization_missing_model_config( + self, basic_score_grader_config + ): + """Test that grader initialization fails without model config.""" + with pytest.raises(TypeError): + AzureOpenAIScoreModelGrader(**basic_score_grader_config) + + def test_grader_initialization_invalid_model_config( + self, basic_score_grader_config + ): + """Test grader initialization with invalid model config.""" + bad_model_config = AzureOpenAIModelConfiguration( + azure_deployment="test-deployment", + azure_endpoint="https://test-endpoint.openai.azure.com/", + # Missing api_key + ) + + with pytest.raises(Exception) as excinfo: + AzureOpenAIScoreModelGrader( + model_config=bad_model_config, + **basic_score_grader_config + ) + + assert "api_key" in str(excinfo.value) + + def test_grader_initialization_missing_required_fields( + self, mock_aoai_model_config + ): + """Test grader initialization fails with missing required fields.""" + # Missing name + with pytest.raises(TypeError): + 
AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + model="gpt-4", + input=[{"role": "user", "content": "test"}] + ) + + # Missing model + with pytest.raises(TypeError): + AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Test", + input=[{"role": "user", "content": "test"}] + ) + + # Missing input + with pytest.raises(TypeError): + AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Test", + model="gpt-4" + ) + + def test_grader_initialization_invalid_range(self, mock_aoai_model_config): + """Test grader initialization with invalid range values.""" + config = { + "name": "Test Grader", + "model": "gpt-4", + "input": [{"role": "user", "content": "test"}], + "range": [1.0, 0.0] # Invalid: min > max + } + + with pytest.raises(ValueError) as excinfo: + AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + **config + ) + + assert "range" in str(excinfo.value).lower() + + def test_grader_initialization_invalid_threshold( + self, mock_aoai_model_config + ): + """Test grader initialization with invalid pass threshold.""" + config = { + "name": "Test Grader", + "model": "gpt-4", + "input": [{"role": "user", "content": "test"}], + "range": [0.0, 1.0], + "pass_threshold": 1.5 # Outside range + } + + with pytest.raises(ValueError) as excinfo: + AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + **config + ) + + assert "pass_threshold" in str(excinfo.value).lower() + + def test_grader_validation_bypass(self, basic_score_grader_config): + """Test that validation can be bypassed for testing purposes.""" + bad_model_config = AzureOpenAIModelConfiguration( + azure_deployment="test-deployment", + azure_endpoint="https://test-endpoint.openai.azure.com/", + # Missing api_key + ) + + # Should not raise exception when validate=False + grader = AzureOpenAIScoreModelGrader( + model_config=bad_model_config, + validate=False, + **basic_score_grader_config + ) + + assert grader is not None + + def test_grader_registry_integration( + self, mock_aoai_model_config, basic_score_grader_config + ): + """Test that score model grader integrates with the grader registry.""" + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + **basic_score_grader_config + ) + + # Test grader conversion + init_params = { + "model_config": mock_aoai_model_config, + **basic_score_grader_config + } + + converted_grader = _convert_remote_eval_params_to_grader( + AzureOpenAIScoreModelGrader.id, + init_params=init_params + ) + + assert isinstance(converted_grader, AzureOpenAIScoreModelGrader) + assert converted_grader._model_config == mock_aoai_model_config + + def test_grader_split_recognition( + self, mock_aoai_model_config, basic_score_grader_config + ): + """Test that score model grader is correctly recognized as AOAI grader.""" + from azure.ai.evaluation import F1ScoreEvaluator + + built_in_eval = F1ScoreEvaluator() + custom_eval = lambda x: x + score_grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + **basic_score_grader_config + ) + + evaluators = { + "f1_score": built_in_eval, + "custom_eval": custom_eval, + "score_grader": score_grader + } + + just_evaluators, aoai_graders = _split_evaluators_and_grader_configs( + evaluators + ) + + assert len(just_evaluators) == 2 + assert len(aoai_graders) == 1 + assert "f1_score" in just_evaluators + assert "custom_eval" in just_evaluators + assert "score_grader" in aoai_graders + + def test_grader_config_properties( + self, 
mock_aoai_model_config, basic_score_grader_config + ): + """Test that grader configuration properties are accessible.""" + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + **basic_score_grader_config + ) + + config = grader._grader_config + + assert config.name == "Test Score Grader" + assert config.model == "gpt-4o-mini" + assert len(config.input) == 2 + assert config.input[0].role == "system" + assert config.input[1].role == "user" + assert config.range == [0.0, 1.0] + assert config.sampling_params["temperature"] == 0.0 + assert config.sampling_params["max_tokens"] == 100 + assert grader.pass_threshold == 0.5 + + def test_different_score_ranges(self, mock_aoai_model_config): + """Test grader with different score ranges.""" + # Test 1-5 scale + config_1_to_5 = { + "name": "1-5 Scale Grader", + "model": "gpt-4", + "input": [{"role": "user", "content": "Rate 1-5: {{ item.text }}"}], + "range": [1.0, 5.0], + "pass_threshold": 3.0 + } + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + **config_1_to_5 + ) + + assert grader._grader_config.range == [1.0, 5.0] + assert grader.pass_threshold == 3.0 + + # Test 0-10 scale with default threshold + config_0_to_10 = { + "name": "0-10 Scale Grader", + "model": "gpt-4", + "input": [{"role": "user", "content": "Rate 0-10: {{ item.text }}"}], + "range": [0.0, 10.0] + # No pass_threshold specified - should default to 5.0 (midpoint) + } + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + **config_0_to_10 + ) + + assert grader._grader_config.range == [0.0, 10.0] + assert grader.pass_threshold == 5.0 # Midpoint default + + def test_grader_id_property( + self, mock_aoai_model_config, basic_score_grader_config + ): + """Test that grader has correct ID.""" + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + **basic_score_grader_config + ) + + assert grader.id == "aoai://score_model" + assert AzureOpenAIScoreModelGrader.id == "aoai://score_model" + + @patch('azure.ai.evaluation._aoai.score_model_grader.AzureOpenAIGrader.get_client') + def test_grader_with_mocked_client( + self, mock_get_client, mock_aoai_model_config, basic_score_grader_config + ): + """Test grader creation and basic properties with mocked client.""" + # Mock the client to avoid actual API calls + mock_client = AsyncMock() + mock_get_client.return_value = mock_client + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + **basic_score_grader_config + ) + + assert grader is not None + assert grader.id == "aoai://score_model" + assert hasattr(grader, 'pass_threshold') + assert grader.pass_threshold == 0.5 + + +@pytest.mark.unittest +class TestScoreModelGraderUsagePatterns: + """Test common usage patterns for score model grader.""" + + def test_conversation_quality_pattern(self, mock_aoai_model_config): + """Test conversation quality grading pattern.""" + config = { + "name": "Conversation Quality", + "model": "gpt-4o-mini", + "input": [ + { + "role": "system", + "content": ( + "Assess conversation quality based on helpfulness, " + "accuracy, and completeness." 
+ ) + }, + { + "role": "user", + "content": ( + "Context: {{ item.context }}\n" + "Conversation: {{ item.conversation }}\n" + "Rate quality (0.0-1.0):" + ) + } + ], + "range": [0.0, 1.0], + "pass_threshold": 0.7 + } + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + **config + ) + + assert grader._grader_config.name == "Conversation Quality" + assert grader.pass_threshold == 0.7 + + def test_helpfulness_scoring_pattern(self, mock_aoai_model_config): + """Test helpfulness scoring pattern.""" + config = { + "name": "Helpfulness Score", + "model": "gpt-4", + "input": [ + { + "role": "system", + "content": ( + "Rate how helpful the AI response is to " + "the user's question." + ) + }, + { + "role": "user", + "content": ( + "Question: {{ item.question }}\n" + "Response: {{ item.response }}\n" + "Helpfulness (0-10):" + ) + } + ], + "range": [0.0, 10.0], + "pass_threshold": 6.0, + "sampling_params": {"temperature": 0.0} + } + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + **config + ) + + assert grader._grader_config.range == [0.0, 10.0] + assert grader.pass_threshold == 6.0 + + +@pytest.mark.unittest +class TestScoreModelGraderIntegration: + """Test integration with evaluation framework.""" + + def test_grader_in_evaluators_dict( + self, mock_aoai_model_config, basic_score_grader_config + ): + """Test using score grader in evaluators dictionary.""" + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + **basic_score_grader_config + ) + + # Test that grader can be used in evaluators dict + evaluators = {"quality_score": grader} + + # Verify grader separation works + just_evaluators, aoai_graders = _split_evaluators_and_grader_configs( + evaluators + ) + assert len(just_evaluators) == 0 + assert len(aoai_graders) == 1 + assert "quality_score" in aoai_graders + + def test_multiple_graders_recognition(self, mock_aoai_model_config): + """Test multiple score graders in evaluation.""" + quality_grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Quality Assessment", + model="gpt-4o-mini", + input=[{ + "role": "user", + "content": "Rate quality: {{ item.conversation }}" + }], + range=[0.0, 1.0] + ) + + helpfulness_grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Helpfulness Assessment", + model="gpt-4o-mini", + input=[{ + "role": "user", + "content": "Rate helpfulness: {{ item.conversation }}" + }], + range=[0.0, 1.0] + ) + + evaluators = { + "quality": quality_grader, + "helpfulness": helpfulness_grader + } + + # Test grader recognition + just_evaluators, aoai_graders = _split_evaluators_and_grader_configs( + evaluators + ) + + assert len(just_evaluators) == 0 + assert len(aoai_graders) == 2 + assert "quality" in aoai_graders + assert "helpfulness" in aoai_graders + + def test_mixed_evaluator_types(self, mock_aoai_model_config): + """Test mixing score graders with built-in evaluators.""" + from azure.ai.evaluation import F1ScoreEvaluator + + f1_evaluator = F1ScoreEvaluator() + score_grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Custom Score", + model="gpt-4", + input=[{"role": "user", "content": "Rate: {{ item.data }}"}] + ) + + evaluators = { + "f1_score": f1_evaluator, + "custom_score": score_grader + } + + just_evaluators, aoai_graders = _split_evaluators_and_grader_configs( + evaluators + ) + + assert len(just_evaluators) == 1 + assert len(aoai_graders) == 1 + assert "f1_score" in just_evaluators + 
assert "custom_score" in aoai_graders + + def test_grader_conversion_error_handling(self, mock_aoai_model_config): + """Test error handling in grader conversion.""" + init_params = { + "model_config": mock_aoai_model_config, + "name": "Test", + "model": "gpt-4", + "input": [{"role": "user", "content": "test"}] + } + + # Test invalid grader ID + with pytest.raises(Exception) as excinfo: + _convert_remote_eval_params_to_grader( + "invalid_id", init_params=init_params + ) + + assert "not recognized" in str(excinfo.value) + + # Test successful conversion + grader = _convert_remote_eval_params_to_grader( + AzureOpenAIScoreModelGrader.id, + init_params=init_params + ) + + assert isinstance(grader, AzureOpenAIScoreModelGrader) + + +@pytest.mark.unittest +class TestAzureOpenAIScoreModelGraderEdgeCases: + """Comprehensive edge case testing for AzureOpenAIScoreModelGrader.""" + + def test_grader_with_empty_input(self, mock_aoai_model_config): + """Test grader creation with empty input list.""" + # Empty input should be allowed - validation happens at runtime + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Empty Input", + model="gpt-4", + input=[] + ) + assert grader is not None + assert len(grader._grader_config.input) == 0 + + def test_grader_with_none_values(self, mock_aoai_model_config): + """Test grader creation with None values for optional fields.""" + # Test with None sampling_params + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="None Values Test", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + sampling_params=None + ) + assert grader is not None + + # Test with None range - should use default + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="None Range Test", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + range=None + ) + assert grader._grader_config.range == [0.0, 1.0] + + def test_grader_with_extreme_ranges(self, mock_aoai_model_config): + """Test grader with extreme score ranges.""" + # Very large range + grader_large = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Large Range", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + range=[-1000.0, 1000.0], + pass_threshold=0.0 + ) + assert grader_large._grader_config.range == [-1000.0, 1000.0] + assert grader_large.pass_threshold == 0.0 + + # Very small range + grader_small = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Small Range", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + range=[0.0, 0.1], + pass_threshold=0.05 + ) + assert grader_small._grader_config.range == [0.0, 0.1] + assert grader_small.pass_threshold == 0.05 + + def test_grader_with_negative_ranges(self, mock_aoai_model_config): + """Test grader with negative score ranges.""" + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Negative Range", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + range=[-10.0, -1.0], + pass_threshold=-5.0 + ) + assert grader._grader_config.range == [-10.0, -1.0] + assert grader.pass_threshold == -5.0 + + def test_grader_boundary_threshold_values(self, mock_aoai_model_config): + """Test grader with boundary threshold values.""" + # Threshold at minimum + grader_min = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Min Threshold", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + range=[0.0, 10.0], + 
pass_threshold=0.0 + ) + assert grader_min.pass_threshold == 0.0 + + # Threshold at maximum + grader_max = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Max Threshold", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + range=[0.0, 10.0], + pass_threshold=10.0 + ) + assert grader_max.pass_threshold == 10.0 + + def test_grader_with_invalid_input_structures( + self, mock_aoai_model_config + ): + """Test grader with invalid input message structures.""" + # Missing role + with pytest.raises((TypeError, ValueError, KeyError)): + AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Missing Role", + model="gpt-4", + input=[{"content": "test"}] + ) + + # Missing content + with pytest.raises((TypeError, ValueError, KeyError)): + AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Missing Content", + model="gpt-4", + input=[{"role": "user"}] + ) + + # Invalid role + with pytest.raises((TypeError, ValueError)): + AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Invalid Role", + model="gpt-4", + input=[{"role": "invalid", "content": "test"}], + validate=True + ) + + def test_grader_with_complex_sampling_params(self, mock_aoai_model_config): + """Test grader with various sampling parameter combinations.""" + complex_params = { + "temperature": 0.7, + "max_tokens": 150, + "top_p": 0.9, + "frequency_penalty": 0.1, + "presence_penalty": 0.1, + "stop": ["END", "STOP"] + } + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Complex Params", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + sampling_params=complex_params + ) + + assert grader._grader_config.sampling_params == complex_params + + def test_grader_with_unicode_content(self, mock_aoai_model_config): + """Test grader with Unicode and special characters in content.""" + unicode_content = "ζ΅‹θ―• 🌟 Γ©mojis and spΓ©ciΓ₯l characters Γ±" + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Unicode Test", + model="gpt-4", + input=[{ + "role": "user", + "content": f"Evaluate: {unicode_content} - {{{{ item.text }}}}" + }] + ) + + assert unicode_content in grader._grader_config.input[0].content + + def test_grader_with_very_long_content(self, mock_aoai_model_config): + """Test grader with very long input content.""" + long_content = "Very long content " * 1000 # ~18KB + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Long Content", + model="gpt-4", + input=[{"role": "user", "content": long_content}] + ) + + assert len(grader._grader_config.input[0].content) > 10000 + + def test_grader_invalid_type_parameters(self, mock_aoai_model_config): + """Test grader with wrong parameter types.""" + # String range instead of list + with pytest.raises((TypeError, ValueError)): + AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="String Range", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + range="0-10" + ) + + # String threshold instead of number + with pytest.raises((TypeError, ValueError)): + AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="String Threshold", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + pass_threshold="5.0" + ) + + # Invalid input type + with pytest.raises((TypeError, ValueError)): + AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="String Input", + model="gpt-4", + input="This should be a 
list" + ) + + def test_grader_with_floating_point_precision( + self, mock_aoai_model_config + ): + """Test grader with high precision floating point values.""" + precise_range = [0.0000001, 0.9999999] + precise_threshold = 0.5000001 + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Precise Values", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + range=precise_range, + pass_threshold=precise_threshold + ) + + assert grader._grader_config.range == precise_range + assert grader.pass_threshold == precise_threshold + + def test_grader_with_zero_range(self, mock_aoai_model_config): + """Test grader with zero-width range.""" + with pytest.raises(ValueError): + AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Zero Range", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + range=[5.0, 5.0] # Same min and max + ) + + def test_grader_with_inf_nan_values(self, mock_aoai_model_config): + """Test grader with infinity and NaN values.""" + # These values should be allowed at initialization + # but may fail at runtime + grader_inf = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Infinity Range", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + range=[0.0, float('inf')], + validate=False + ) + assert grader_inf is not None + + # Test with NaN - should be allowed at init + grader_nan = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="NaN Threshold", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + pass_threshold=float('nan'), + validate=False + ) + assert grader_nan is not None + + +@pytest.mark.unittest +class TestAzureOpenAIScoreModelGraderTemplateEdgeCases: + """Test edge cases related to template processing.""" + + def test_grader_with_complex_templates(self, mock_aoai_model_config): + """Test grader with complex template structures.""" + complex_template = """ + Context: {{ item.context }} + Question: {{ item.question }} + Response: {{ item.response }} + {% if item.additional_info %} + Additional: {{ item.additional_info }} + {% endif %} + Rate the response quality (0-10): + """ + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Complex Template", + model="gpt-4", + input=[{"role": "user", "content": complex_template}] + ) + + assert "item.context" in grader._grader_config.input[0].content + assert "{% if" in grader._grader_config.input[0].content + + def test_grader_with_nested_templates(self, mock_aoai_model_config): + """Test grader with nested template variables.""" + nested_template = ( + "{{ item.conversation[0].message }} vs " + "{{ item.conversation[1].message }}" + ) + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Nested Template", + model="gpt-4", + input=[{"role": "user", "content": nested_template}] + ) + + assert "conversation[0]" in grader._grader_config.input[0].content + + def test_grader_with_malformed_templates(self, mock_aoai_model_config): + """Test grader with malformed template syntax.""" + # Missing closing brace + malformed_template = "Rate this: {{ item.text" + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Malformed Template", + model="gpt-4", + input=[{"role": "user", "content": malformed_template}] + ) + + # Should still create grader (template validation happens at runtime) + assert grader is not None + + +@pytest.mark.unittest +class 
TestAzureOpenAIScoreModelGraderConfigurationEdgeCases: + """Test edge cases in model configuration.""" + + def test_grader_with_different_api_versions(self, mock_aoai_model_config): + """Test grader with different API versions.""" + old_config = AzureOpenAIModelConfiguration( + azure_deployment="test-deployment", + azure_endpoint="https://test-endpoint.openai.azure.com/", + api_key="test-api-key", + api_version="2023-05-15" # Older version + ) + + grader = AzureOpenAIScoreModelGrader( + model_config=old_config, + name="Old API Version", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + validate=False + ) + + # Model config gets converted to dict internally + assert grader._model_config["api_version"] == "2023-05-15" + + def test_grader_with_various_endpoints(self, mock_aoai_model_config): + """Test grader with different endpoint formats.""" + configs = [ + ("https://test.openai.azure.com/", True), + ("https://test.openai.azure.com", True), # No trailing slash + # HTTP (should work with validate=False) + ("http://localhost:8080/", False), + ("https://custom-domain.com/", False), # Custom domain + ] + + for endpoint, should_validate in configs: + config = AzureOpenAIModelConfiguration( + azure_deployment="test-deployment", + azure_endpoint=endpoint, + api_key="test-api-key", + api_version="2024-12-01-preview" + ) + + grader = AzureOpenAIScoreModelGrader( + model_config=config, + name="Endpoint Test", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + validate=False + ) + + # Model config gets converted to dict internally + assert grader._model_config["azure_endpoint"] == endpoint + + def test_grader_with_empty_credentials(self): + """Test grader with empty/invalid credentials.""" + # Should raise EvaluationException as expected + from azure.ai.evaluation._exceptions import EvaluationException + + with pytest.raises(EvaluationException): + config = AzureOpenAIModelConfiguration( + azure_deployment="", + azure_endpoint="", + api_key="", + api_version="" + ) + AzureOpenAIScoreModelGrader( + model_config=config, + name="Empty Creds", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + validate=True + ) + + def test_grader_with_very_long_names(self, mock_aoai_model_config): + """Test grader with very long names and model names.""" + long_name = "A" * 1000 + long_model = "gpt-4-very-long-model-name-" + "x" * 100 + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name=long_name, + model=long_model, + input=[{"role": "user", "content": "test"}] + ) + + assert grader._grader_config.name == long_name + assert grader._grader_config.model == long_model + + +@pytest.mark.unittest +class TestAzureOpenAIScoreModelGraderRegistryEdgeCases: + """Test edge cases in grader registry integration.""" + + def test_registry_with_duplicate_grader_names( + self, mock_aoai_model_config + ): + """Test registry behavior with duplicate grader names.""" + grader1 = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Duplicate Name", + model="gpt-4", + input=[{"role": "user", "content": "test1"}] + ) + + grader2 = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Duplicate Name", + model="gpt-4o", + input=[{"role": "user", "content": "test2"}] + ) + + evaluators = { + "grader_a": grader1, + "grader_b": grader2 + } + + just_evaluators, aoai_graders = _split_evaluators_and_grader_configs( + evaluators + ) + + assert len(aoai_graders) == 2 + assert "grader_a" in aoai_graders + assert "grader_b" in 
aoai_graders + + def test_registry_with_none_evaluators(self): + """Test registry behavior with None evaluators.""" + evaluators = { + "valid_grader": None, + "another_none": None + } + + # Should handle None values gracefully + just_evaluators, aoai_graders = _split_evaluators_and_grader_configs( + evaluators + ) + + # All None values should be in just_evaluators + assert len(just_evaluators) == 2 + assert len(aoai_graders) == 0 + + def test_registry_conversion_with_invalid_params(self): + """Test grader conversion with invalid initialization parameters.""" + # Missing required parameter + invalid_params = { + "name": "Test", + # Missing model and input + } + + with pytest.raises(Exception): + _convert_remote_eval_params_to_grader( + AzureOpenAIScoreModelGrader.id, + init_params=invalid_params + ) + + def test_registry_conversion_with_extra_params( + self, mock_aoai_model_config + ): + """Test grader conversion with extra unknown parameters.""" + params_with_extra = { + "model_config": mock_aoai_model_config, + "name": "Extra Params", + "model": "gpt-4", + "input": [{"role": "user", "content": "test"}], + "unknown_param": "should_be_ignored", + "another_extra": 42 + } + + # Should succeed and ignore extra params + grader = _convert_remote_eval_params_to_grader( + AzureOpenAIScoreModelGrader.id, + init_params=params_with_extra + ) + + assert isinstance(grader, AzureOpenAIScoreModelGrader) + assert grader._grader_config.name == "Extra Params" + + +@pytest.mark.unittest +class TestAzureOpenAIScoreModelGraderPerformanceEdgeCases: + """Test edge cases related to performance and resource usage.""" + + def test_grader_with_many_input_messages(self, mock_aoai_model_config): + """Test grader with large number of input messages.""" + many_messages = [] + for i in range(100): + many_messages.append({ + "role": "user" if i % 2 == 0 else "assistant", + "content": f"Message {i}: {{{{ item.data_{i} }}}}" + }) + + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Many Messages", + model="gpt-4", + input=many_messages + ) + + assert len(grader._grader_config.input) == 100 + + def test_grader_creation_performance(self, mock_aoai_model_config): + """Test creating many graders doesn't cause memory issues.""" + graders = [] + + for i in range(50): # Create 50 graders + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name=f"Grader {i}", + model="gpt-4", + input=[{"role": "user", "content": f"Test {i}"}], + validate=False + ) + graders.append(grader) + + assert len(graders) == 50 + # Check that all graders are unique instances + assert len(set(id(g) for g in graders)) == 50 + + +@pytest.mark.unittest +class TestAzureOpenAIScoreModelGraderCompatibility: + """Test compatibility with different SDK components.""" + + def test_grader_with_different_evaluator_types( + self, mock_aoai_model_config + ): + """Test grader compatibility with various evaluator types.""" + try: + from azure.ai.evaluation import F1ScoreEvaluator + + f1_eval = F1ScoreEvaluator() + + score_grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Compatibility Test", + model="gpt-4", + input=[{"role": "user", "content": "test"}] + ) + + def custom_eval(x): + return {"score": 0.5} + + evaluators = { + "f1": f1_eval, + "custom": custom_eval, + "score_grader": score_grader + } + + just_evaluators, aoai_graders = ( + _split_evaluators_and_grader_configs(evaluators) + ) + + assert len(just_evaluators) >= 2 # f1 and custom + assert len(aoai_graders) == 1 + 
assert "score_grader" in aoai_graders + + except ImportError: + # Skip if evaluators not available + pytest.skip("Built-in evaluators not available") + + def test_grader_string_representation(self, mock_aoai_model_config): + """Test string representation of grader for debugging.""" + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="String Repr Test", + model="gpt-4", + input=[{"role": "user", "content": "test"}] + ) + + # Should have meaningful string representation + grader_str = str(grader) + assert ("AzureOpenAIScoreModelGrader" in grader_str or + "String Repr Test" in grader_str) + + @patch( + 'azure.ai.evaluation._aoai.score_model_grader.' + 'AzureOpenAIGrader.get_client' + ) + def test_grader_with_client_initialization_error( + self, mock_get_client, mock_aoai_model_config + ): + """Test grader behavior when client initialization fails.""" + mock_get_client.side_effect = Exception("Client initialization failed") + + # Should still create grader object (client is created lazily) + grader = AzureOpenAIScoreModelGrader( + model_config=mock_aoai_model_config, + name="Client Error Test", + model="gpt-4", + input=[{"role": "user", "content": "test"}], + validate=False + ) + + assert grader is not None + assert grader.id == "aoai://score_model" From bae2e76021f46f735ebdb615568407a98ee91dd5 Mon Sep 17 00:00:00 2001 From: Nagkumar Arkalgud Date: Tue, 17 Jun 2025 13:53:56 -0700 Subject: [PATCH 3/5] Removed the plan md --- .../integrating_aoai_score_model_grader.md | 608 ------------------ 1 file changed, 608 deletions(-) delete mode 100644 sdk/evaluation/azure-ai-evaluation/integrating_aoai_score_model_grader.md diff --git a/sdk/evaluation/azure-ai-evaluation/integrating_aoai_score_model_grader.md b/sdk/evaluation/azure-ai-evaluation/integrating_aoai_score_model_grader.md deleted file mode 100644 index 3d79813ec011..000000000000 --- a/sdk/evaluation/azure-ai-evaluation/integrating_aoai_score_model_grader.md +++ /dev/null @@ -1,608 +0,0 @@ -# Integrating Azure OpenAI Score Model Grader - -## Overview - -This document provides a comprehensive plan for integrating the `AzureOpenAIScoreModelGrader` into the Azure AI Evaluation SDK. The Score Model Grader enables continuous scoring (0.0-1.0) with custom prompts, complementing the existing categorical and binary graders. - -## Current State Analysis - -### Existing AOAI Grader Architecture - -The Azure AI Evaluation SDK currently supports three types of AOAI graders: - -1. **`AzureOpenAIGrader`** (Base Class) - - Located: `azure/ai/evaluation/_aoai/aoai_grader.py` - - Handles model configuration and OpenAI client creation - - Validates API keys and endpoints - - Supports both Azure OpenAI and OpenAI configurations - -2. **`AzureOpenAILabelGrader`** - - Located: `azure/ai/evaluation/_aoai/label_grader.py` - - Wraps OpenAI's `LabelModelGrader` - - Supports classification with predefined labels - - Uses `pass_threshold` for binary pass/fail results - -3. **`AzureOpenAIStringCheckGrader`** - - Located: `azure/ai/evaluation/_aoai/string_check_grader.py` - - Wraps OpenAI's `StringCheckGrader` - - Supports string operations: `eq`, `ne`, `like`, `ilike` - - Binary pass/fail results only - -4. **`AzureOpenAITextSimilarityGrader`** - - Located: `azure/ai/evaluation/_aoai/text_similarity_grader.py` - - Wraps OpenAI's `TextSimilarityGrader` - - Supports various similarity metrics (BLEU, ROUGE, cosine, etc.) 
- - Uses `pass_threshold` for binary classification - -### Integration Points - -**Evaluation Pipeline:** -- `_evaluate_aoai.py` - Orchestrates AOAI evaluations -- `_split_evaluators_and_grader_configs()` - Separates AOAI graders from regular evaluators -- `_get_grader_class()` - Maps grader IDs to implementation classes -- Main `__init__.py` - Exports public API classes - -**Key Functions:** -- `_begin_aoai_evaluation()` - Starts AOAI evaluation runs -- `_get_evaluation_run_results()` - Retrieves and formats results -- `_convert_remote_eval_params_to_grader()` - Creates grader instances from config - -## Missing Component: Score Model Grader - -### OpenAI SDK ScoreModelGrader Structure - -Based on analysis of the OpenAI SDK, the `ScoreModelGrader` has the following structure: - -```python -class ScoreModelGrader(BaseModel): - input: List[Input] # Conversation-style messages - model: str # Grading model (e.g., "gpt-4") - name: str # Grader name - type: Literal["score_model"] # Always "score_model" - range: Optional[List[float]] # Score range, defaults to [0, 1] - sampling_params: Optional[object] # Model parameters - -class Input(BaseModel): - content: str # Message content with templates - role: Literal["user", "assistant", "system", "developer"] - type: Optional[Literal["message"]] = None -``` - -### Key Capabilities - -**Continuous Scoring:** -- Returns floating-point scores (typically 0.0-1.0) -- Configurable score ranges (e.g., [0, 5], [0, 100]) -- Pass/fail threshold for binary classification - -**Flexible Prompting:** -- Multi-message conversations (system, user, assistant, developer roles) -- Template string support for dynamic content injection -- Custom evaluation criteria and instructions - -**Model Configuration:** -- Supports any OpenAI-compatible model -- Configurable sampling parameters (temperature, max_tokens, etc.) -- Independent model selection for grading vs. evaluation - -## Implementation Plan - -### Phase 1: Core Implementation - -#### Step 1.1: Create AzureOpenAIScoreModelGrader Class - -**File:** `azure/ai/evaluation/_aoai/score_model_grader.py` - -```python -from typing import Any, Dict, Union, List, Optional -from typing_extensions import Literal - -from azure.ai.evaluation._model_configurations import ( - AzureOpenAIModelConfiguration, - OpenAIModelConfiguration -) -from openai.types.graders import ScoreModelGrader -from azure.ai.evaluation._common._experimental import experimental - -from .aoai_grader import AzureOpenAIGrader - - -@experimental -class AzureOpenAIScoreModelGrader(AzureOpenAIGrader): - """ - Wrapper class for OpenAI's score model graders. - - Enables continuous scoring evaluation with custom prompts and flexible - conversation-style inputs. Supports configurable score ranges and - pass thresholds for binary classification. 
- """ - - id = "aoai://score_model" - - def __init__( - self, - *, - model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], - input: List[Dict[str, str]], - model: str, - name: str, - range: Optional[List[float]] = None, - sampling_params: Optional[Dict[str, Any]] = None, - **kwargs: Any - ): - # Create OpenAI ScoreModelGrader instance - grader_kwargs = { - "input": input, - "model": model, - "name": name, - "type": "score_model" - } - - if range is not None: - grader_kwargs["range"] = range - if sampling_params is not None: - grader_kwargs["sampling_params"] = sampling_params - - grader = ScoreModelGrader(**grader_kwargs) - - super().__init__( - model_config=model_config, - grader_config=grader, - **kwargs - ) -``` - -**Key Implementation Details:** -- Inherits from `AzureOpenAIGrader` for consistent model handling -- Wraps OpenAI's `ScoreModelGrader` following existing patterns -- Supports all ScoreModelGrader parameters -- Uses experimental decorator for preview functionality - -#### Step 1.2: Update Module Exports - -**File:** `azure/ai/evaluation/_aoai/__init__.py` - -```python -from .aoai_grader import AzureOpenAIGrader -from .score_model_grader import AzureOpenAIScoreModelGrader # Add this import - -__all__ = [ - "AzureOpenAIGrader", - "AzureOpenAIScoreModelGrader", # Add this export -] -``` - -**File:** `azure/ai/evaluation/__init__.py` - -```python -# Add import -from ._aoai.score_model_grader import AzureOpenAIScoreModelGrader - -# Add to exports (around line 40) -``` - -#### Step 1.3: Update Grader Registry - -**File:** `azure/ai/evaluation/_evaluate/_evaluate_aoai.py` - -Update the `_get_grader_class()` function: - -```python -def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]: - """Given a model ID, return the class of the corresponding grader wrapper.""" - - from azure.ai.evaluation import ( - AzureOpenAIGrader, - AzureOpenAILabelGrader, - AzureOpenAIStringCheckGrader, - AzureOpenAITextSimilarityGrader, - AzureOpenAIScoreModelGrader, # Add this import - ) - - id_map = { - AzureOpenAIGrader.id: AzureOpenAIGrader, - AzureOpenAILabelGrader.id: AzureOpenAILabelGrader, - AzureOpenAIStringCheckGrader.id: AzureOpenAIStringCheckGrader, - AzureOpenAITextSimilarityGrader.id: AzureOpenAITextSimilarityGrader, - AzureOpenAIScoreModelGrader.id: AzureOpenAIScoreModelGrader, # Add this - } - - # ... 
rest of function unchanged -``` - -### Phase 2: Testing and Validation - -#### Step 2.1: Create Unit Tests - -**File:** `tests/unittests/test_aoai_score_model_grader.py` - -```python -import pytest -from azure.ai.evaluation import AzureOpenAIScoreModelGrader -from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration - - -class TestAzureOpenAIScoreModelGrader: - - def test_initialization_with_required_params(self): - """Test basic initialization with required parameters.""" - model_config = AzureOpenAIModelConfiguration( - azure_endpoint="https://test.openai.azure.com/", - api_key="test-key", - azure_deployment="gpt-4" - ) - - grader = AzureOpenAIScoreModelGrader( - model_config=model_config, - input=[ - {"role": "system", "content": "You are a helpful evaluator."}, - {"role": "user", "content": "Rate this: {{ data.text }}"} - ], - model="gpt-4", - name="Test Grader" - ) - - assert grader._grader_config.name == "Test Grader" - assert grader._grader_config.model == "gpt-4" - assert grader._grader_config.type == "score_model" - - def test_initialization_with_optional_params(self): - """Test initialization with optional parameters.""" - # Test with range and sampling_params - # ... implementation - - def test_client_creation(self): - """Test that the grader can create appropriate OpenAI client.""" - # ... implementation -``` - -#### Step 2.2: Create Integration Tests - -**File:** `tests/e2etests/test_aoai_score_model_integration.py` - -```python -import pytest -from azure.ai.evaluation import evaluate, AzureOpenAIScoreModelGrader -from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration - - -@pytest.mark.skipif("not config.getoption('--live')") -class TestAzureOpenAIScoreModelIntegration: - - def test_evaluate_with_score_model_grader(self): - """Test end-to-end evaluation with score model grader.""" - # Create test data - # Configure grader - # Run evaluation - # Validate results format - pass -``` - -#### Step 2.3: Update Sample and Documentation - -**Sample File:** `samples/aoai_score_model_grader_sample.py` (already created) - -**Documentation Updates:** -- Update README.md with new grader information -- Add API documentation -- Update migration guide if needed - -### Phase 3: Advanced Features and Optimizations - -#### Step 3.1: Enhanced Template Support - -**Template Variables:** -- `{{ data.field_name }}` - Access input data fields -- `{{ outputs.evaluator_name }}` - Access other evaluator outputs -- Custom template functions for data formatting - -**Implementation:** -- Validate template syntax during initialization -- Provide clear error messages for template issues -- Support nested field access (e.g., `{{ data.conversation.messages[0] }}`) - -#### Step 3.2: Result Processing Enhancements - -**Score Parsing:** -- Handle different score formats (JSON, plain text, structured) -- Extract reasoning/explanation from model responses -- Validate score ranges and handle edge cases - -**Metrics Calculation:** -- Mean, median, standard deviation of scores -- Distribution analysis -- Correlation with other evaluators - -#### Step 3.3: Performance Optimizations - -**Batch Processing:** -- Group evaluations by model configuration -- Optimize API calls and reduce latency -- Implement retry logic with exponential backoff - -**Caching:** -- Cache grader configurations -- Store evaluation results for reuse -- Implement result invalidation strategies - -## Usage Examples - -### Basic Conversation Quality Assessment - -```python -from 
azure.ai.evaluation import evaluate, AzureOpenAIScoreModelGrader -from azure.ai.evaluation import AzureOpenAIModelConfiguration - -# Configure model -model_config = AzureOpenAIModelConfiguration( - azure_endpoint="https://your-endpoint.openai.azure.com/", - api_key="your-api-key", - azure_deployment="gpt-4" -) - -# Create grader -quality_grader = AzureOpenAIScoreModelGrader( - model_config=model_config, - name="Conversation Quality", - model="gpt-4", - input=[ - { - "role": "system", - "content": "Rate conversation quality from 0.0 to 1.0" - }, - { - "role": "user", - "content": "Conversation: {{ data.conversation }}\\nScore:" - } - ], - range=[0.0, 1.0], - pass_threshold=0.7 -) - -# Run evaluation -results = evaluate( - data="conversations.jsonl", - evaluators={"quality": quality_grader} -) -``` - -### Multi-Criteria Evaluation - -```python -# Multiple score model graders for comprehensive evaluation -evaluators = { - "helpfulness": AzureOpenAIScoreModelGrader( - model_config=model_config, - name="Helpfulness", - model="gpt-4", - input=[...], # Helpfulness-specific prompts - pass_threshold=0.6 - ), - "accuracy": AzureOpenAIScoreModelGrader( - model_config=model_config, - name="Factual Accuracy", - model="gpt-4", - input=[...], # Accuracy-specific prompts - pass_threshold=0.8 - ), - "clarity": AzureOpenAIScoreModelGrader( - model_config=model_config, - name="Response Clarity", - model="gpt-4", - input=[...], # Clarity-specific prompts - pass_threshold=0.7 - ) -} - -results = evaluate(data="data.jsonl", evaluators=evaluators) -``` - -### Custom Score Ranges - -```python -# 1-5 star rating system -star_rating_grader = AzureOpenAIScoreModelGrader( - model_config=model_config, - name="Star Rating", - model="gpt-4", - input=[ - { - "role": "system", - "content": "Rate from 1 (worst) to 5 (best) stars" - }, - { - "role": "user", - "content": "Review: {{ data.review }}\\nStars (1-5):" - } - ], - range=[1.0, 5.0], - pass_threshold=3.0 # 3+ stars = pass -) -``` - -## Error Handling and Edge Cases - -### Common Error Scenarios - -1. **Invalid Template Syntax** - - Missing closing braces - - Invalid field references - - Nested template errors - -2. **Model Configuration Issues** - - Invalid API keys or endpoints - - Unsupported model names - - Network connectivity problems - -3. **Score Parsing Failures** - - Non-numeric responses - - Scores outside specified range - - Malformed JSON responses - -4. 
**Evaluation Pipeline Errors** - - Data formatting issues - - Column mapping conflicts - - Resource quota limits - -### Error Handling Implementation - -```python -class AzureOpenAIScoreModelGrader(AzureOpenAIGrader): - - def _validate_grader_config(self) -> None: - """Enhanced validation for score model graders.""" - super()._validate_grader_config() - - # Validate input messages - if not self._grader_config.input: - raise EvaluationException("Score model grader requires input messages") - - # Validate model name - if not self._grader_config.model: - raise EvaluationException("Score model grader requires model name") - - # Validate score range - if self._grader_config.range: - if len(self._grader_config.range) != 2: - raise EvaluationException("Score range must contain exactly 2 values") - if self._grader_config.range[0] >= self._grader_config.range[1]: - raise EvaluationException("Invalid score range: min must be < max") - - # Validate template syntax in input messages - self._validate_template_syntax() - - def _validate_template_syntax(self) -> None: - """Validate template strings in input messages.""" - for msg in self._grader_config.input: - content = msg.get("content", "") - # Check for balanced braces, valid field references, etc. - # ... implementation -``` - -## Testing Strategy - -### Unit Testing - -**Test Categories:** -1. **Initialization Tests** - Parameter validation, error handling -2. **Configuration Tests** - Model config validation, client creation -3. **Template Tests** - Template syntax validation, variable substitution -4. **Serialization Tests** - JSON serialization/deserialization - -### Integration Testing - -**Test Scenarios:** -1. **End-to-End Evaluation** - Complete evaluation pipeline -2. **Multi-Grader Scenarios** - Multiple graders with different configs -3. **Error Recovery** - Network failures, API errors, malformed responses -4. 
**Performance Testing** - Large datasets, concurrent evaluations - -### Manual Testing Checklist - -- [ ] Basic grader initialization with minimal parameters -- [ ] Advanced grader configuration with all optional parameters -- [ ] Template variable substitution with various data formats -- [ ] Score range validation and threshold behavior -- [ ] Integration with existing evaluation pipeline -- [ ] Error handling for common failure scenarios -- [ ] Performance with large datasets (100+ samples) -- [ ] Concurrent evaluation with multiple graders - -## Deployment Considerations - -### Backward Compatibility - -- All changes are additive - no breaking changes to existing APIs -- New grader is marked as experimental initially -- Existing evaluations continue to work unchanged - -### Documentation Updates - -- Update API reference documentation -- Add usage examples to README -- Create migration guide for users -- Update troubleshooting guide - -### Monitoring and Observability - -- Add telemetry for score model grader usage -- Monitor evaluation success/failure rates -- Track performance metrics (latency, throughput) -- Log template parsing errors and API failures - -## Success Criteria - -### Functional Requirements - -- [ ] `AzureOpenAIScoreModelGrader` class implemented and tested -- [ ] Integration with existing evaluation pipeline -- [ ] Support for all OpenAI ScoreModelGrader features -- [ ] Comprehensive error handling and validation -- [ ] Template variable substitution working correctly - -### Quality Requirements - -- [ ] Unit test coverage > 90% -- [ ] Integration tests passing -- [ ] Documentation complete and accurate -- [ ] Sample code working and demonstrative -- [ ] Performance acceptable (< 2x overhead vs. direct OpenAI calls) - -### User Experience Requirements - -- [ ] Consistent API with existing graders -- [ ] Clear error messages for common issues -- [ ] Comprehensive examples and documentation -- [ ] Smooth migration path from direct OpenAI usage - -## Timeline Estimate - -**Phase 1 - Core Implementation:** 3-5 days -- Day 1: Implement `AzureOpenAIScoreModelGrader` class -- Day 2: Update exports and grader registry -- Day 3: Create unit tests -- Day 4-5: Integration testing and bug fixes - -**Phase 2 - Testing and Documentation:** 2-3 days -- Day 1: Comprehensive testing -- Day 2: Documentation updates -- Day 3: Sample refinement and validation - -**Phase 3 - Advanced Features:** 1-2 days (optional) -- Enhanced template support -- Performance optimizations -- Additional error handling - -**Total Estimated Time:** 6-10 days - -## Risk Assessment - -### High Risk -- **OpenAI SDK Changes** - Risk of breaking changes in grader interfaces -- **Template Complexity** - Complex template syntax could be error-prone - -### Medium Risk -- **Performance Impact** - Additional API calls could affect evaluation speed -- **Error Handling Gaps** - Unforeseen edge cases in score parsing - -### Low Risk -- **Integration Issues** - Existing pipeline is well-established -- **Backward Compatibility** - Additive changes only - -### Mitigation Strategies -- Pin OpenAI SDK version to avoid breaking changes -- Implement comprehensive test suite with edge cases -- Add performance monitoring and optimization -- Create detailed error handling documentation - -## Conclusion - -This implementation plan provides a comprehensive roadmap for integrating the `AzureOpenAIScoreModelGrader` into the Azure AI Evaluation SDK. 
The approach leverages existing infrastructure while adding powerful continuous scoring capabilities that complement the current categorical and binary graders. - -The phased implementation approach ensures: -- **Reliability** - Thorough testing at each stage -- **Maintainability** - Consistent with existing patterns -- **Usability** - Clear API and comprehensive documentation -- **Extensibility** - Foundation for future enhancements - -Following this plan will result in a robust, well-tested implementation that provides users with flexible, powerful evaluation capabilities while maintaining the high quality and reliability standards of the Azure AI Evaluation SDK. From e86d1de2cea0512c6063c1f1f58ba24c11c9017c Mon Sep 17 00:00:00 2001 From: Nagkumar Arkalgud Date: Mon, 23 Jun 2025 10:17:52 -0700 Subject: [PATCH 4/5] Add evaluator to exceptions for save eval e2e test --- .../azure-ai-evaluation/tests/unittests/test_save_eval.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_save_eval.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_save_eval.py index a80ba79b8726..eeef523053b2 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_save_eval.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_save_eval.py @@ -39,6 +39,7 @@ class TestSaveEval: "RedTeamOutput", "AzureOpenAIGrader", "AzureOpenAILabelGrader", + "AzureOpenAIScoreModelGrader", "AzureOpenAIStringCheckGrader", "AzureOpenAITextSimilarityGrader" ]) From 25b3e92c0a01a667951827e8eddc3cc07e00c940 Mon Sep 17 00:00:00 2001 From: Mike Shi Date: Tue, 24 Jun 2025 03:17:07 -0700 Subject: [PATCH 5/5] Replace AOAI evaluator ids with newer version to maintain consistency --- .../azure/ai/evaluation/_aoai/aoai_grader.py | 2 +- .../azure/ai/evaluation/_aoai/label_grader.py | 2 +- .../azure/ai/evaluation/_aoai/score_model_grader.py | 2 +- .../azure/ai/evaluation/_aoai/string_check_grader.py | 2 +- .../azure/ai/evaluation/_aoai/text_similarity_grader.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/aoai_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/aoai_grader.py index 820644c9bc6b..95ef77544cee 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/aoai_grader.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/aoai_grader.py @@ -35,7 +35,7 @@ class AzureOpenAIGrader(): """ - id = "aoai://general" + id = "azureai://built-in/evaluators/azure-openai/custom_grader" def __init__(self, *, model_config : Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration], grader_config: Dict[str, Any], **kwargs: Any): self._model_config = model_config diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/label_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/label_grader.py index 6d4752830c68..338584c0ae57 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/label_grader.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/label_grader.py @@ -42,7 +42,7 @@ class AzureOpenAILabelGrader(AzureOpenAIGrader): """ - id = "aoai://label_model" + id = "azureai://built-in/evaluators/azure-openai/label_grader" def __init__( self, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/score_model_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/score_model_grader.py index 88077f57f9fa..ec35e5d1372a 100644 
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/score_model_grader.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/score_model_grader.py @@ -50,7 +50,7 @@ class AzureOpenAIScoreModelGrader(AzureOpenAIGrader): :type kwargs: Any """ - id = "aoai://score_model" + id = "azureai://built-in/evaluators/azure-openai/scorer_grader" def __init__( self, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/string_check_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/string_check_grader.py index 627c53ed3497..ba3b056569fb 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/string_check_grader.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/string_check_grader.py @@ -38,7 +38,7 @@ class AzureOpenAIStringCheckGrader(AzureOpenAIGrader): """ - id = "aoai://string_check" + id = "azureai://built-in/evaluators/azure-openai/string_check_grader" def __init__( self, diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/text_similarity_grader.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/text_similarity_grader.py index 9289f3fd2538..06b7facab7e2 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/text_similarity_grader.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/text_similarity_grader.py @@ -52,7 +52,7 @@ class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader): """ - id = "aoai://text_similarity" + id = "azureai://built-in/evaluators/azure-openai/text_similarity_grader" def __init__( self,
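
For a quick end-to-end check of this series, the sketch below (not part of any patch) builds an `AzureOpenAIScoreModelGrader` with the conversation-quality prompt style from the removed plan, confirms the grader now resolves under the updated `azureai://built-in/evaluators/azure-openai/scorer_grader` id from PATCH 5/5, and passes it to `evaluate`. The endpoint, API key, deployment, and `conversations.jsonl` path are placeholders; running it requires real Azure OpenAI credentials.

```python
# Minimal usage sketch for the score model grader introduced in this series.
# Endpoint, key, deployment, and the dataset path are placeholders.
from azure.ai.evaluation import (
    evaluate,
    AzureOpenAIScoreModelGrader,
    AzureOpenAIModelConfiguration,
)

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://your-endpoint.openai.azure.com/",  # placeholder
    api_key="your-api-key",                                     # placeholder
    azure_deployment="gpt-4o-mini",                             # placeholder
    api_version="2024-12-01-preview",
)

quality_grader = AzureOpenAIScoreModelGrader(
    model_config=model_config,
    name="Conversation Quality",
    model="gpt-4o-mini",
    input=[
        {
            "role": "system",
            "content": "Rate conversation quality from 0.0 to 1.0.",
        },
        {
            "role": "user",
            "content": "Conversation: {{ item.conversation }}\nScore:",
        },
    ],
    range=[0.0, 1.0],
    pass_threshold=0.7,
)

# After PATCH 5/5, the grader registers under the new id scheme used by
# _get_grader_class() instead of the earlier "aoai://score_model" value.
assert quality_grader.id == "azureai://built-in/evaluators/azure-openai/scorer_grader"

# The grader is supplied alongside (or instead of) regular evaluators; the
# evaluation pipeline splits it out and runs it through the AOAI grader path.
results = evaluate(
    data="conversations.jsonl",  # placeholder dataset with an "conversation" column
    evaluators={"quality": quality_grader},
)
print(results)
```

Because the grader participates in the same `evaluators` dictionary as built-in evaluators, it can be mixed with, for example, `F1ScoreEvaluator` exactly as the unit tests in this series exercise via `_split_evaluators_and_grader_configs`.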