diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
index bbed43df792e..9758c0ab4632 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
@@ -1052,6 +1052,10 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     result_df_dict = results_df.to_dict("records")
     result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
+    if eval_run_info_list:
+        result["oai_eval_run_ids"] = [
+            {"eval_group_id": info["eval_group_id"], "eval_run_id": info["eval_run_id"]} for info in eval_run_info_list
+        ]
 
     # _add_aoai_structured_results_to_results(result, LOGGER, kwargs.get("eval_meta_data"))
     eval_id: Optional[str] = kwargs.get("_eval_id")
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_model_configurations.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_model_configurations.py
index 1dc5615363f0..916dec2984e4 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_model_configurations.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_model_configurations.py
@@ -139,6 +139,7 @@ class Conversation(TypedDict):
 
 class EvaluationResult(TypedDict):
     metrics: Dict
+    oai_eval_run_ids: NotRequired[List[Dict[str, str]]]
     studio_url: NotRequired[str]
     rows: List[Dict]
     _evaluation_results_list: List[Dict]
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_utils/formatting_utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_utils/formatting_utils.py
index 32a75bd4057e..5e3fcfedb115 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_utils/formatting_utils.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_utils/formatting_utils.py
@@ -52,7 +52,7 @@ def get_strategy_name(attack_strategy: Union[AttackStrategy, List[AttackStrategy
 
 
 def get_flattened_attack_strategies(
-    attack_strategies: List[Union[AttackStrategy, List[AttackStrategy]]]
+    attack_strategies: List[Union[AttackStrategy, List[AttackStrategy]]],
 ) -> List[Union[AttackStrategy, List[AttackStrategy]]]:
     """Flatten complex attack strategies into individual strategies.
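For orientation, a minimal sketch of how a caller could consume the new `oai_eval_run_ids` field on `EvaluationResult`. The grader construction mirrors the unit test below; the `model_config` and data path are placeholder assumptions, not part of this patch:

```python
# Minimal sketch (hypothetical names/paths): reading the run ids added above.
# "oai_eval_run_ids" is NotRequired, so use .get() — it is only present when
# AOAI grader runs were actually started (eval_run_info_list was non-empty).
from azure.ai.evaluation import AzureOpenAILabelGrader, evaluate

grader = AzureOpenAILabelGrader(
    model_config=model_config,  # assumed: an AzureOpenAIModelConfiguration built elsewhere
    input=[{"content": "{{item.query}}", "role": "user"}],
    labels=["positive", "negative", "neutral"],
    passing_labels=["neutral"],
    model="gpt-4o-2024-11-20",
    name="labelgrader",
)

result = evaluate(data="data.jsonl", evaluators={"g": grader})  # assumed data file

for ref in result.get("oai_eval_run_ids", []):
    print(ref["eval_group_id"], ref["eval_run_id"])
```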
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py
index efed102a1350..18cc2ddc89a3 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_simulator.py
@@ -426,7 +426,7 @@ def _parse_prompty_response(self, *, response: str) -> Dict[str, Any]:
         try:
             if isinstance(response, str):
                 response = response.replace("\u2019", "'").replace("\u2018", "'")
-                response = response.replace("\u201C", '"').replace("\u201D", '"')
+                response = response.replace("\u201c", '"').replace("\u201d", '"')
                 # Replace None with null
                 response = response.replace("None", "null")
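The escape-case change above is purely stylistic (likely a formatter normalization): hex digits in Python `\u` escapes are case-insensitive, so both spellings denote the same smart-quote characters. A quick sanity check:

```python
# "\u201C" and "\u201c" both denote U+201C (left double quotation mark);
# escape hex digits are case-insensitive, so runtime behavior is unchanged.
assert "\u201C" == "\u201c"
assert "\u201D" == "\u201d"

s = "\u201chello\u201d"
print(s.replace("\u201c", '"').replace("\u201d", '"'))  # -> "hello"
```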
diff --git a/sdk/evaluation/azure-ai-evaluation/samples/aoai_score_model_grader_sample.py b/sdk/evaluation/azure-ai-evaluation/samples/aoai_score_model_grader_sample.py
index 6590d5754580..c5cfc5ae421c 100644
--- a/sdk/evaluation/azure-ai-evaluation/samples/aoai_score_model_grader_sample.py
+++ b/sdk/evaluation/azure-ai-evaluation/samples/aoai_score_model_grader_sample.py
@@ -24,7 +24,7 @@
        - AZURE_AI_PROJECT_ENDPOINT
     2. Hub-based project (legacy):
        - AZURE_SUBSCRIPTION_ID
-       - AZURE_RESOURCE_GROUP_NAME 
+       - AZURE_RESOURCE_GROUP_NAME
        - AZURE_PROJECT_NAME
 """
diff --git a/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_safety_evaluation.py b/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_safety_evaluation.py
index b6fb7fb55396..06ed112a504f 100644
--- a/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_safety_evaluation.py
+++ b/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_safety_evaluation.py
@@ -9,7 +9,7 @@
 """
 DESCRIPTION:
     These samples demonstrate usage of _SafetyEvaluation class with various _SafetyEvaluator instances.
-    
+
 USAGE:
     python evaluation_samples_safety_evaluation.py
diff --git a/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_simulate.py b/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_simulate.py
index df82a6bd2cd1..870019541de2 100644
--- a/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_simulate.py
+++ b/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_simulate.py
@@ -10,7 +10,7 @@
 """
 DESCRIPTION:
     These samples demonstrate usage of various classes and methods used to perform simulation in the azure-ai-evaluation library.
-    
+
 USAGE:
     python evaluation_samples_simulate.py
diff --git a/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_threshold.py b/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_threshold.py
index 80cf780fd18e..67bf31ddee9e 100644
--- a/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_threshold.py
+++ b/sdk/evaluation/azure-ai-evaluation/samples/evaluation_samples_threshold.py
@@ -10,7 +10,7 @@
 """
 DESCRIPTION:
     These samples demonstrate usage of various classes and methods used to perform evaluation with thresholds in the azure-ai-evaluation library.
-    
+
 USAGE:
     python evaluation_samples_threshold.py
diff --git a/sdk/evaluation/azure-ai-evaluation/samples/red_team_samples.py b/sdk/evaluation/azure-ai-evaluation/samples/red_team_samples.py
index 3550617c0e2d..30e2eb9d5022 100644
--- a/sdk/evaluation/azure-ai-evaluation/samples/red_team_samples.py
+++ b/sdk/evaluation/azure-ai-evaluation/samples/red_team_samples.py
@@ -8,7 +8,7 @@
 DESCRIPTION:
     These samples demonstrate usage of various classes and methods used in Red Team functionality
     within the azure-ai-evaluation library.
-    
+
 USAGE:
     python red_team_samples.py
diff --git a/sdk/evaluation/azure-ai-evaluation/samples/score_model_multimodal/aoai_score_model_grader_sample_audio.py b/sdk/evaluation/azure-ai-evaluation/samples/score_model_multimodal/aoai_score_model_grader_sample_audio.py
index 651c302b21ac..6b6139d8e3ae 100644
--- a/sdk/evaluation/azure-ai-evaluation/samples/score_model_multimodal/aoai_score_model_grader_sample_audio.py
+++ b/sdk/evaluation/azure-ai-evaluation/samples/score_model_multimodal/aoai_score_model_grader_sample_audio.py
@@ -24,7 +24,7 @@
        - AZURE_AI_PROJECT_ENDPOINT
     2. Hub-based project (legacy):
        - AZURE_SUBSCRIPTION_ID
-       - AZURE_RESOURCE_GROUP_NAME 
+       - AZURE_RESOURCE_GROUP_NAME
        - AZURE_PROJECT_NAME
 """
diff --git a/sdk/evaluation/azure-ai-evaluation/samples/score_model_multimodal/aoai_score_model_grader_sample_audio_file.py b/sdk/evaluation/azure-ai-evaluation/samples/score_model_multimodal/aoai_score_model_grader_sample_audio_file.py
index 66810052c7ac..91b45a255805 100644
--- a/sdk/evaluation/azure-ai-evaluation/samples/score_model_multimodal/aoai_score_model_grader_sample_audio_file.py
+++ b/sdk/evaluation/azure-ai-evaluation/samples/score_model_multimodal/aoai_score_model_grader_sample_audio_file.py
@@ -24,7 +24,7 @@
        - AZURE_AI_PROJECT_ENDPOINT
     2. Hub-based project (legacy):
        - AZURE_SUBSCRIPTION_ID
-       - AZURE_RESOURCE_GROUP_NAME 
+       - AZURE_RESOURCE_GROUP_NAME
        - AZURE_PROJECT_NAME
 """
diff --git a/sdk/evaluation/azure-ai-evaluation/samples/score_model_multimodal/aoai_score_model_grader_sample_image.py b/sdk/evaluation/azure-ai-evaluation/samples/score_model_multimodal/aoai_score_model_grader_sample_image.py
index 7cc04c445545..4e370005ef9e 100644
--- a/sdk/evaluation/azure-ai-evaluation/samples/score_model_multimodal/aoai_score_model_grader_sample_image.py
+++ b/sdk/evaluation/azure-ai-evaluation/samples/score_model_multimodal/aoai_score_model_grader_sample_image.py
@@ -24,7 +24,7 @@
        - AZURE_AI_PROJECT_ENDPOINT
     2. Hub-based project (legacy):
        - AZURE_SUBSCRIPTION_ID
-       - AZURE_RESOURCE_GROUP_NAME 
+       - AZURE_RESOURCE_GROUP_NAME
        - AZURE_PROJECT_NAME
 """
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
index 65c05e31509e..7a5d53449a1c 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
@@ -1345,6 +1345,62 @@ def run_test():
         assert len(empty_converted["_evaluation_results_list"]) == 0
         assert empty_converted["_evaluation_summary"]["result_counts"]["total"] == 0
 
+    @patch(
+        "azure.ai.evaluation._evaluate._evaluate._map_names_to_builtins",
+        return_value={},
+    )
+    @patch("azure.ai.evaluation._evaluate._evaluate._get_evaluation_run_results")
+    @patch("azure.ai.evaluation._evaluate._evaluate._begin_aoai_evaluation")
+    @patch("azure.ai.evaluation._evaluate._evaluate._preprocess_data")
+    def test_evaluate_returns_oai_eval_run_ids(
+        self,
+        mock_preprocess,
+        mock_begin,
+        mock_get_results,
+        _,
+        mock_model_config,
+    ):
+        df = pd.DataFrame([{"query": "hi"}])
+        grader = AzureOpenAILabelGrader(
+            model_config=mock_model_config,
+            input=[{"content": "{{item.query}}", "role": "user"}],
+            labels=["positive", "negative", "neutral"],
+            passing_labels=["neutral"],
+            model="gpt-4o-2024-11-20",
+            name="labelgrader",
+        )
+        mock_preprocess.return_value = {
+            "column_mapping": {},
+            "evaluators": {},
+            "graders": {"g": grader},
+            "input_data_df": df,
+            "target_run": None,
+            "batch_run_client": None,
+            "batch_run_data": None,
+        }
+        mock_begin.return_value = [
+            {
+                "client": None,
+                "eval_group_id": "grp1",
+                "eval_run_id": "run1",
+                "grader_name_map": {},
+                "expected_rows": len(df),
+            }
+        ]
+        mock_get_results.return_value = (
+            pd.DataFrame([{"outputs.g.score": 1}]),
+            {"g.pass_rate": 1.0},
+        )
+
+        result = evaluate(
+            evaluators={"g": grader},
+            data="dummy_path",
+            azure_ai_project=None,
+        )
+
+        assert "oai_eval_run_ids" in result
+        assert result["oai_eval_run_ids"] == [{"eval_group_id": "grp1", "eval_run_id": "run1"}]
+
 
 @pytest.mark.unittest
 class TestTagsInLoggingFunctions:
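The point of surfacing these ids is that callers can correlate the returned rows with the underlying AOAI eval runs and inspect them later. A rough sketch of such a follow-up lookup, assuming a recent `openai` package that exposes the Evals API and an Azure OpenAI client configured from environment variables (the client setup and the exact `retrieve` signature here are assumptions, not part of this patch):

```python
# Hypothetical follow-up: fetch each AOAI eval run referenced by the result.
# Assumes AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_API_KEY / OPENAI_API_VERSION are
# set, and that the installed openai package exposes client.evals.runs.retrieve.
from openai import AzureOpenAI

client = AzureOpenAI()

for ref in result.get("oai_eval_run_ids", []):
    run = client.evals.runs.retrieve(ref["eval_run_id"], eval_id=ref["eval_group_id"])
    print(run.id, run.status)
```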