diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py index 027f02639fe7..3feee814586e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py @@ -89,6 +89,80 @@ class TokenScope(str, enum.Enum): COGNITIVE_SERVICES_MANAGEMENT = "https://ai.azure.com/.default" AZURE_ML = "https://ml.azure.com/.default" +class _EvaluatorMetricMapping: + """ + Static mapping of evaluator names to their metric names, based on assets.json. + The 'builtin.' prefix is removed from the evaluator name keys. + """ + EVALUATOR_NAME_METRICS_MAPPINGS = { + "bleu_score": ["bleu"], + "coherence": ["coherence"], + "document_retrieval": [ + "ndcg@3", "xdcg@3", "fidelity", "top1_relevance", "top3_max_relevance", + "holes", "holes_ratio", "total_retrieved_documents", "total_ground_truth_documents" + ], + "f1_score": ["f1_score"], + "fluency": ["fluency"], + "gleu_score": ["gleu"], + "meteor_score": ["meteor"], + "relevance": ["relevance"], + "response_completeness": ["response_completeness"], + "rouge_score": ["rouge_precision", "rouge_recall", "rouge_f1_score"], + "groundedness_pro": ["groundedness_pro"], + "similarity": ["similarity"], + "intent_resolution": ["intent_resolution"], + "retrieval": ["retrieval"], + "task_adherence": ["task_adherence"], + "tool_call_accuracy": ["tool_call_accuracy"], + "groundedness": ["groundedness"], + "code_vulnerability": ["code_vulnerability"], + "eci": ["eci"], + "protected_material": ["protected_material"], + "ungrounded_attributes": ["ungrounded_attributes"], + "indirect_attack": [ + "xpia", "xpia_manipulated_content", "xpia_intrusion", "xpia_information_gathering" + ], + "label_grader": ["label_model"], + "string_check_grader": ["string_check"], + "text_similarity_grader": ["similarity"], + "score_model_grader": ["score_model"], + "sexual": ["sexual"], + "self_harm": ["self_harm"], + "violence": ["violence"], + "hate_unfairness": ["hate_unfairness"], + } + + EVAL_CLASS_NAME_MAP = { + "BleuScoreEvaluator": "bleu_score", + "CodeVulnerabilityEvaluator": "code_vulnerability", + "CoherenceEvaluator": "coherence", + "ContentSafetyEvaluator": "content_safety", + "DocumentRetrievalEvaluator": "document_retrieval", + "ECIEvaluator": "eci", + "F1ScoreEvaluator": "f1_score", + "FluencyEvaluator": "fluency", + "GleuScoreEvaluator": "gleu_score", + "GroundednessEvaluator": "groundedness", + "GroundednessProEvaluator": "groundedness_pro", + "HateUnfairnessEvaluator": "hate_unfairness", + "IndirectAttackEvaluator": "indirect_attack", + "IntentResolutionEvaluator": "intent_resolution", + "MeteorScoreEvaluator": "meteor_score", + "ProtectedMaterialEvaluator": "protected_material", + "QAEvaluator": "qa", + "RelevanceEvaluator": "relevance", + "ResponseCompletenessEvaluator": "response_completeness", + "RetrievalEvaluator": "retrieval", + "RougeScoreEvaluator": "rouge_score", + "SelfHarmEvaluator": "self_harm", + "SexualEvaluator": "sexual", + "SimilarityEvaluator": "similarity", + "TaskAdherenceEvaluator": "task_adherence", + "TaskCompletionEvaluator": "task_completion", + "ToolCallAccuracyEvaluator": "tool_call_accuracy", + "UngroundedAttributesEvaluator": "ungrounded_attributes", + "ViolenceEvaluator": "violence", + } DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json" diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py 
b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py index 07a1bd271390..7319f77cc52d 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py @@ -39,6 +39,7 @@ BINARY_AGGREGATE_SUFFIX, DEFAULT_OAI_EVAL_RUN_NAME, EVALUATION_EVENT_NAME, + _EvaluatorMetricMapping ) from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig, AppInsightsConfig from .._user_agent import UserAgentSingleton @@ -55,8 +56,7 @@ _trace_destination_from_project_scope, _write_output, DataLoaderFactory, - _log_metrics_and_instance_results_onedp, - _convert_results_to_aoai_evaluation_results + _log_metrics_and_instance_results_onedp ) from ._batch_run.batch_clients import BatchClient, BatchClientRun @@ -995,8 +995,10 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url} # type: ignore # _add_aoai_structured_results_to_results(result, LOGGER, kwargs.get("eval_meta_data")) + eval_id: Optional[str] = kwargs.get("eval_id") + eval_run_id: Optional[str] = kwargs.get("eval_run_id") eval_meta_data: Optional[Dict[str, Any]] = kwargs.get("eval_meta_data") - _convert_results_to_aoai_evaluation_results(result, LOGGER, eval_meta_data, eval_run_summary_dict) + _convert_results_to_aoai_evaluation_results(result, LOGGER, eval_id, eval_run_id, evaluators_and_graders, eval_run_summary_dict, eval_meta_data) if app_insights_configuration := kwargs.get("app_insights_configuration"): emit_eval_result_events_to_app_insights(app_insights_configuration, result["evaluation_results_list"]) @@ -1577,3 +1579,377 @@ def _turn_error_logs_into_exception(log_path: str) -> None: category=ErrorCategory.FAILED_EXECUTION, blame=ErrorBlame.UNKNOWN, ) + + +def _convert_results_to_aoai_evaluation_results( + results: EvaluationResult, + logger: logging.Logger, + eval_id: Optional[str] = None, + eval_run_id: Optional[str] = None, + evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]] = None, + eval_run_summary: Optional[Dict[str, Any]] = None, + eval_meta_data: Optional[Dict[str, Any]] = None +) -> None: + """ + Convert evaluation results to AOAI evaluation results format. + + Each row of input results.rows looks like: + {"inputs.query":"What is the capital of France?","inputs.context":"France is in Europe", + "inputs.generated_response":"Paris is the capital of France.","inputs.ground_truth":"Paris is the capital of France.", + "outputs.F1_score.f1_score":1.0,"outputs.F1_score.f1_result":"pass","outputs.F1_score.f1_threshold":0.5} + + Convert each row into new RunOutputItem object with results array. 
+ + :param results: The evaluation results to convert + :type results: EvaluationResult + :param logger: Logger instance + :type logger: logging.Logger + :param eval_id: The evaluation group id stamped on each converted output item + :type eval_id: Optional[str] + :param eval_run_id: The evaluation run id stamped on each converted output item + :type eval_run_id: Optional[str] + :param evaluators: Mapping of testing criteria name to the evaluator or grader that produced it + :type evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]] + :param eval_run_summary: Optional per-criteria batch run summary, used to surface error details + :type eval_run_summary: Optional[Dict[str, Any]] + :param eval_meta_data: Optional evaluation metadata containing the testing_criteria definitions + :type eval_meta_data: Optional[Dict[str, Any]] + :return: None; the results dict is augmented in place with "evaluation_results_list" and "evaluation_summary" + :rtype: None + """ + + if evaluators is None: + return + + # Get the testing_criteria_name and testing_criteria_type from evaluators + testing_criteria_name_types_metrics: Optional[Dict[str, Any]] = {} + criteria_name_types_from_meta: Optional[Dict[str, str]] = {} + if eval_meta_data and "testing_criteria" in eval_meta_data: + testing_criteria_list: Optional[List[Dict[str, Any]]] = eval_meta_data.get("testing_criteria") + if testing_criteria_list is not None: + for criteria in testing_criteria_list: + criteria_name = criteria.get("name") + criteria_type = criteria.get("type") + if criteria_name is not None and criteria_type is not None: + criteria_name_types_from_meta[criteria_name] = criteria + + for criteria_name, evaluator in evaluators.items(): + criteria_type = None + metrics = [] + if criteria_name in criteria_name_types_from_meta: + criteria_type = criteria_name_types_from_meta[criteria_name].get("type", None) + evaluator_name = criteria_name_types_from_meta[criteria_name].get("evaluator_name", None) + if evaluator_name: + metrics_mapped = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(evaluator_name, []) + if metrics_mapped and len(metrics_mapped) > 0: + metrics.extend(metrics_mapped) + else: + metrics.append(criteria_name) + elif isinstance(evaluator, AzureOpenAIGrader): + criteria_type = evaluator._type # pylint: disable=protected-access + metrics.append(criteria_name) + elif isinstance(evaluator, Callable): + criteria_type = "azure_ai_evaluator" + evaluator_class_name = evaluator.__class__.__name__ + eval_name = _EvaluatorMetricMapping.EVAL_CLASS_NAME_MAP.get(evaluator_class_name, None) + if eval_name: + metrics_mapped = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(eval_name, []) + if metrics_mapped and len(metrics_mapped) > 0: + metrics.extend(metrics_mapped) + else: + metrics.append(criteria_name) + else: + criteria_type = "unknown" + metrics.append(criteria_name) + testing_criteria_name_types_metrics[criteria_name] = { + "type": criteria_type, + "metrics": metrics + } + + created_time = int(time.time()) + converted_rows = [] + + for row_idx, row in enumerate(results.get("rows", [])): + # Group outputs by test criteria name + criteria_groups = {criteria: {} for criteria in testing_criteria_name_types_metrics.keys()} + input_groups = {} + top_sample = {} + for key, value in row.items(): + if key.startswith("outputs."): + # Parse key: outputs.<criteria_name>.<metric_name>
+ parts = key.split(".", 2) # Split into max 3 parts: ['outputs', '', ''] + if len(parts) >= 3: + criteria_name = parts[1] + metric_name = parts[2] + + if criteria_name not in criteria_groups: + criteria_groups[criteria_name] = {} + + criteria_groups[criteria_name][metric_name] = value + elif key.startswith("inputs."): + input_key = key.replace('inputs.', '') + if input_key not in input_groups: + input_groups[input_key] = value + + # Convert each criteria group to RunOutputItem result + run_output_results = [] + for criteria_name, metrics in criteria_groups.items(): + # Extract metrics for this criteria + expected_metrics = testing_criteria_name_types_metrics.get(criteria_name, {}).get("metrics", []) + result_per_metric = {} + # Find score - look for various score patterns + for metric_key, metric_value in metrics.items(): + if metric_key.endswith("_score") or metric_key == "score": + metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics) + if metric not in result_per_metric: + result_per_metric[metric] = { "score": metric_value } + else: + result_per_metric[metric]["score"] = metric_value + elif metric_key.endswith("_result") or metric_key == "result" or metric_key.endswith("_label"): + metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics) + label = metric_value + passed = True if (str(metric_value).lower() == 'pass' or str(metric_value).lower() == 'true') else False + if metric not in result_per_metric: + result_per_metric[metric] = { + "label": label, + "passed": passed + } + else: + result_per_metric[metric]["label"] = metric_value + result_per_metric[metric]["passed"] = passed + elif metric_key.endswith("_reason") or metric_key == "reason": + metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics) + if metric not in result_per_metric: + result_per_metric[metric] = { "reason": metric_value } + else: + result_per_metric[metric]["reason"] = metric_value + elif metric_key.endswith("_threshold") or metric_key == "threshold": + metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics) + if metric not in result_per_metric: + result_per_metric[metric] = { "threshold": metric_value } + else: + result_per_metric[metric]["threshold"] = metric_value + elif metric_key == "sample": + metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics) + if metric not in result_per_metric: + result_per_metric[metric] = { "sample": metric_value } + else: + result_per_metric[metric]["sample"] = metric_value + elif not any(metric_key.endswith(suffix) for suffix in ["_result", "_reason", "_threshold"]): + metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics) + # If no score found yet and this doesn't match other patterns, use as score + if metric_key==metric and metric not in result_per_metric: + result_per_metric[metric] = { "score": metric_value } + elif metric_key==metric and result_per_metric[metric].get("score", None) is None: + result_per_metric[metric]["score"] = metric_value + + for metric, metric_values in result_per_metric.items(): + score = metric_values.get("score", None) + label = metric_values.get("label", None) + reason = metric_values.get("reason", None) + threshold = metric_values.get("threshold", None) + passed = metric_values.get("passed", None) + sample = metric_values.get("sample", None) + + # Create result object for this criteria + result_obj = { + "type": testing_criteria_name_types_metrics.get(criteria_name, {}).get("type", "azure_ai_evaluator"), 
+ "name": criteria_name, # Use criteria name as name + "metric": metric if metric is not None else criteria_name # Use criteria name as metric + } + # Add optional fields + result_obj["score"] = score + result_obj["label"] = label + result_obj["reason"] = reason + result_obj["threshold"] = threshold + result_obj["passed"] = passed + + if sample is not None: + result_obj["sample"] = sample + top_sample = sample # Save top sample for the row + elif (eval_run_summary and criteria_name in eval_run_summary + and isinstance(eval_run_summary[criteria_name], dict) + and "error_code" in eval_run_summary[criteria_name]): + error_info = { + "code": eval_run_summary[criteria_name].get("error_code", None), + "message": eval_run_summary[criteria_name].get("error_message", None), + } if eval_run_summary[criteria_name].get("error_code", None) is not None else None + sample = { + "error": error_info + } if error_info is not None else None + result_obj["sample"] = sample + + if (eval_run_summary and criteria_name in eval_run_summary + and isinstance(eval_run_summary[criteria_name], dict) + and "error_code" in eval_run_summary[criteria_name]): + error_info = { + "code": eval_run_summary[criteria_name].get("error_code", None), + "message": eval_run_summary[criteria_name].get("error_message", None), + } if eval_run_summary[criteria_name].get("error_code", None) is not None else None + sample = { + "error": error_info + } if error_info is not None else None + # Create result object for this criteria + metrics = testing_criteria_name_types_metrics.get(criteria_name, {}).get("metrics", []) + for metric in metrics: + result_obj = { + "type": testing_criteria_name_types_metrics.get(criteria_name, {}).get("type", "azure_ai_evaluator"), + "name": criteria_name, # Use criteria name as name + "metric": metric if metric is not None else criteria_name, # Use criteria name as metric + "score": None, + "label": None, + "reason": None, + "threshold": None, + "passed": None, + "sample": sample + } + + run_output_results.append(result_obj) + + # Create RunOutputItem structure + run_output_item = { + "object": "eval.run.output_item", + "id": f"{row_idx+1}", + "run_id": eval_run_id, + "eval_id": eval_id, + "created_at": created_time, + "datasource_item_id": row_idx, + "datasource_item": input_groups, + "results": run_output_results, + "status": "completed" if len(run_output_results) > 0 else "error" + } + + run_output_item["sample"] = top_sample + + converted_rows.append(run_output_item) + + # Create converted results maintaining the same structure + results["evaluation_results_list"] = converted_rows + logger.info(f"Converted {len(converted_rows)} rows to AOAI evaluation format, eval_id: {eval_id}, eval_run_id: {eval_run_id}") + # Calculate summary statistics + evaluation_summary = _calculate_aoai_evaluation_summary(converted_rows, logger) + results["evaluation_summary"] = evaluation_summary + logger.info(f"Summary statistics calculated for {len(converted_rows)} rows, eval_id: {eval_id}, eval_run_id: {eval_run_id}") + +def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metric_list: List[str]) -> str: + """ + Get the metric name from the testing criteria and metric key. 
+ + :param testing_criteria_name: The name of the testing criteria + :type testing_criteria_name: str + :param metric_key: The metric key to look for + :type metric_key: str + :param metric_list: List of expected metrics for the testing criteria + :type metric_list: List[str] + :return: The metric name if found, otherwise the testing criteria name + :rtype: str + """ + metric = None + for expected_metric in metric_list: + if metric_key.startswith(expected_metric): + metric = expected_metric + break + if metric is None: + metric = testing_criteria_name + return metric + +def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logger) -> Dict[str, Any]: + """ + Calculate summary statistics for AOAI evaluation results. + + :param aoai_results: List of AOAI result objects (run_output_items) + :type aoai_results: list + :param logger: Logger instance + :type logger: logging.Logger + :return: Summary statistics dictionary + :rtype: Dict[str, Any] + """ + # Calculate result counts based on aoaiResults + result_counts = { + "total": 0, + "errored": 0, + "failed": 0, + "passed": 0 + } + + # Count results by status and calculate per model usage + model_usage_stats = {} # Dictionary to aggregate usage by model + result_counts_stats = {} # Dictionary to aggregate pass/fail counts by testing criteria + + for aoai_result in aoai_results: + logger.info(f"Processing aoai_result with id: {aoai_result.get('id', 'unknown') if isinstance(aoai_result, dict) else 'unknown'}, row keys: {aoai_result.keys() if hasattr(aoai_result, 'keys') else 'N/A'}") + if isinstance(aoai_result, dict) and 'results' in aoai_result: + logger.info(f"Processing aoai_result with id: {aoai_result.get('id', 'unknown')}, results count: {len(aoai_result['results'])}") + result_counts["total"] += len(aoai_result['results']) + for result_item in aoai_result['results']: + if isinstance(result_item, dict): + # Check if the result has a 'passed' field + if 'passed' in result_item and result_item['passed'] is not None: + testing_criteria = result_item.get("name", "") + if testing_criteria not in result_counts_stats: + result_counts_stats[testing_criteria] = { + "testing_criteria": testing_criteria, + "failed": 0, + "passed": 0 + } + if result_item['passed'] is True: + result_counts["passed"] += 1 + result_counts_stats[testing_criteria]["passed"] += 1 + + elif result_item['passed'] is False: + result_counts["failed"] += 1 + result_counts_stats[testing_criteria]["failed"] += 1 + # Check if the result indicates an error status + elif (('status' in result_item and result_item['status'] in ['error', 'errored']) + or (isinstance(result_item.get('sample'), dict) and result_item['sample'].get('error', None) is not None)): + result_counts["errored"] += 1 + elif hasattr(aoai_result, 'status') and aoai_result.status == 'error': + result_counts["errored"] += 1 + elif isinstance(aoai_result, dict) and aoai_result.get('status') == 'error': + result_counts["errored"] += 1 + + # Extract usage statistics from aoai_result.sample + sample_data_list = [] + if isinstance(aoai_result, dict) and aoai_result['results'] and isinstance(aoai_result['results'], list): + for result_item in aoai_result['results']: + if isinstance(result_item, dict) and 'sample' in result_item and result_item['sample']: + sample_data_list.append(result_item['sample']) + + for sample_data in sample_data_list: + if sample_data and isinstance(sample_data, dict) and 'usage' in sample_data: + usage_data = sample_data['usage'] + model_name = sample_data.get('model', 'unknown') + if model_name not in model_usage_stats: + model_usage_stats[model_name] = { + 'invocation_count': 0,
+ 'total_tokens': 0, + 'prompt_tokens': 0, + 'completion_tokens': 0, + 'cached_tokens': 0 + } + # Aggregate usage statistics + model_stats = model_usage_stats[model_name] + model_stats['invocation_count'] += 1 + if isinstance(usage_data, dict): + model_stats['total_tokens'] += usage_data.get('total_tokens', 0) + model_stats['prompt_tokens'] += usage_data.get('prompt_tokens', 0) + model_stats['completion_tokens'] += usage_data.get('completion_tokens', 0) + model_stats['cached_tokens'] += usage_data.get('cached_tokens', 0) + + # Convert model usage stats to list format matching EvaluationRunPerModelUsage + per_model_usage = [] + for model_name, stats in model_usage_stats.items(): + per_model_usage.append({ + 'model_name': model_name, + 'invocation_count': stats['invocation_count'], + 'total_tokens': stats['total_tokens'], + 'prompt_tokens': stats['prompt_tokens'], + 'completion_tokens': stats['completion_tokens'], + 'cached_tokens': stats['cached_tokens'] + }) + result_counts_stats_val = [] + logger.info(f"\r\n Result counts stats: {result_counts_stats}") + for criteria_name, stats_val in result_counts_stats.items(): + if isinstance(stats_val, dict): + logger.info(f"\r\n Criteria: {criteria_name}, stats: {stats_val}") + result_counts_stats_val.append({ + 'testing_criteria': criteria_name, + 'passed': stats_val.get('passed', 0), + 'failed': stats_val.get('failed', 0) + }) + return { + "result_counts": result_counts, + "per_model_usage": per_model_usage, + "per_testing_criteria_results": result_counts_stats_val + } + diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py index f9fff6626aff..d5de8467037e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py @@ -485,268 +485,3 @@ def get_loader(filename: Union[os.PathLike, str]) -> Union[JSONLDataFileLoader, # fallback to JSONL to maintain backward compatibility return JSONLDataFileLoader(filename) - - -def _convert_results_to_aoai_evaluation_results( - results: EvaluationResult, - logger: logging.Logger, - eval_meta_data: Optional[Dict[str, Any]] = None, - eval_run_summary: Optional[Dict[str, Any]] = None -) -> None: - """ - Convert evaluation results to AOAI evaluation results format. - - Each row of input results.rows looks like: - {"inputs.query":"What is the capital of France?","inputs.context":"France is in Europe", - "inputs.generated_response":"Paris is the capital of France.","inputs.ground_truth":"Paris is the capital of France.", - "outputs.F1_score.f1_score":1.0,"outputs.F1_score.f1_result":"pass","outputs.F1_score.f1_threshold":0.5} - - Convert each row into new RunOutputItem object with results array. 
- - :param results: The evaluation results to convert - :type results: EvaluationResult - :param eval_meta_data: The evaluation metadata, containing eval_id, eval_run_id, and testing_criteria - :type eval_meta_data: Dict[str, Any] - :param logger: Logger instance - :type logger: logging.Logger - :return: EvaluationResult with converted evaluation results in AOAI format - :rtype: EvaluationResult - """ - - if eval_meta_data is None: - return - - created_time = int(time.time()) - converted_rows = [] - - eval_id: Optional[str] = eval_meta_data.get("eval_id") - eval_run_id: Optional[str] = eval_meta_data.get("eval_run_id") - testing_criteria_list: Optional[List[Dict[str, Any]]] = eval_meta_data.get("testing_criteria") - - testing_criteria_name_types: Optional[Dict[str, str]] = {} - if testing_criteria_list is not None: - for criteria in testing_criteria_list: - criteria_name = criteria.get("name") - criteria_type = criteria.get("type") - if criteria_name is not None and criteria_type is not None: - testing_criteria_name_types[criteria_name] = criteria_type - - for row_idx, row in enumerate(results.get("rows", [])): - # Group outputs by test criteria name - criteria_groups = {criteria: {} for criteria in testing_criteria_name_types.keys()} - input_groups = {} - top_sample = [] - for key, value in row.items(): - if key.startswith("outputs."): - # Parse key: outputs.. - parts = key.split(".", 2) # Split into max 3 parts: ['outputs', '', ''] - if len(parts) >= 3: - criteria_name = parts[1] - metric_name = parts[2] - - if criteria_name not in criteria_groups: - criteria_groups[criteria_name] = {} - - criteria_groups[criteria_name][metric_name] = value - elif key.startswith("inputs."): - input_key = key.replace('inputs.', '') - if input_key not in input_groups: - input_groups[input_key] = value - - # Convert each criteria group to RunOutputItem result - run_output_results = [] - for criteria_name, metrics in criteria_groups.items(): - # Extract metrics for this criteria - score = None - label = None - reason = None - threshold = None - passed = None - sample = None - # Find score - look for various score patterns - for metric_key, metric_value in metrics.items(): - if metric_key.endswith("_score") or metric_key == "score": - score = metric_value - elif metric_key.endswith("_result") or metric_key == "result" or metric_key == "passed": - label = metric_value - passed = True if (str(metric_value).lower() == 'pass' or str(metric_value).lower() == 'true') else False - elif metric_key.endswith("_reason") or metric_key == "reason": - reason = metric_value - elif metric_key.endswith("_threshold") or metric_key == "threshold": - threshold = metric_value - elif metric_key == "sample": - sample = metric_value - elif not any(metric_key.endswith(suffix) for suffix in ["_result", "_reason", "_threshold"]): - # If no score found yet and this doesn't match other patterns, use as score - if score is None: - score = metric_value - - # Determine passed status - passed = True if (str(label).lower() == 'pass' or str(label).lower() == 'true') else False - - # Create result object for this criteria - result_obj = { - "type": testing_criteria_name_types[criteria_name] if testing_criteria_name_types and criteria_name in testing_criteria_name_types else "azure_ai_evaluator", # Use criteria name as type - "name": criteria_name, # Use criteria name as name - "metric": criteria_name # Use criteria name as metric - } - # Add optional fields if they exist - #if score is not None: - result_obj["score"] = score - #if label is not 
None: - result_obj["label"] = label - #if reason is not None: - result_obj["reason"] = reason - #if threshold is not None: - result_obj["threshold"] = threshold - #if passed is not None: - result_obj["passed"] = passed - - if sample is not None: - result_obj["sample"] = sample - top_sample.append(sample) # Save top sample for the row - elif (eval_run_summary and criteria_name in eval_run_summary - and isinstance(eval_run_summary[criteria_name], dict) - and "error_code" in eval_run_summary[criteria_name]): - error_info = { - "code": eval_run_summary[criteria_name].get("error_code", None), - "message": eval_run_summary[criteria_name].get("error_message", None), - } if eval_run_summary[criteria_name].get("error_code", None) is not None else None - sample = { - "error": error_info - } if error_info is not None else None - result_obj["sample"] = sample - if sample is not None: - top_sample.append(sample) - - run_output_results.append(result_obj) - - # Create RunOutputItem structure - run_output_item = { - "object": "eval.run.output_item", - "id": f"{row_idx+1}", - "run_id": eval_run_id, - "eval_id": eval_id, - "created_at": created_time, - "datasource_item_id": row_idx, - "datasource_item": input_groups, - "results": run_output_results, - "status": "completed" if len(run_output_results) > 0 else "error" - } - - run_output_item["sample"] = top_sample - - converted_rows.append(run_output_item) - - # Create converted results maintaining the same structure - results["evaluation_results_list"] = converted_rows - logger.info(f"Converted {len(converted_rows)} rows to AOAI evaluation format, eval_id: {eval_id}, eval_run_id: {eval_run_id}") - # Calculate summary statistics - evaluation_summary = _calculate_aoai_evaluation_summary(converted_rows, logger) - results["evaluation_summary"] = evaluation_summary - logger.info(f"Summary statistics calculated for {len(converted_rows)} rows, eval_id: {eval_id}, eval_run_id: {eval_run_id}") - - -def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logger) -> Dict[str, Any]: - """ - Calculate summary statistics for AOAI evaluation results. 
- - :param aoai_results: List of AOAI result objects (run_output_items) - :type aoai_results: list - :return: Summary statistics dictionary - :rtype: Dict[str, Any] - """ - # Calculate result counts based on aoaiResults - result_counts = { - "total": 0, - "errored": 0, - "failed": 0, - "passed": 0 - } - - # Count results by status and calculate per model usage - model_usage_stats = {} # Dictionary to aggregate usage by model - result_counts_stats = {} # Dictionary to aggregate usage by model - - for aoai_result in aoai_results: - logger.info(f"Processing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, row keys: {aoai_result.keys() if hasattr(aoai_result, 'keys') else 'N/A'}") - if isinstance(aoai_result, dict) and 'results' in aoai_result: - logger.info(f"Processing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, results count: {len(aoai_result['results'])}") - result_counts["total"] += len(aoai_result['results']) - for result_item in aoai_result['results']: - if isinstance(result_item, dict): - # Check if the result has a 'passed' field - if 'passed' in result_item: - testing_criteria = result_item.get("name", "") - if testing_criteria not in result_counts_stats: - result_counts_stats[testing_criteria] = { - "testing_criteria": testing_criteria, - "failed": 0, - "passed": 0 - } - if result_item['passed'] is True: - result_counts["passed"] += 1 - result_counts_stats[testing_criteria]["passed"] += 1 - - elif result_item['passed'] is False: - result_counts["failed"] += 1 - result_counts_stats[testing_criteria]["failed"] += 1 - # Check if the result indicates an error status - elif 'status' in result_item and result_item['status'] in ['error', 'errored']: - result_counts["errored"] += 1 - elif hasattr(aoai_result, 'status') and aoai_result.status == 'error': - result_counts["errored"] += 1 - elif isinstance(aoai_result, dict) and aoai_result.get('status') == 'error': - result_counts["errored"] += 1 - - # Extract usage statistics from aoai_result.sample - sample_data_list = None - if isinstance(aoai_result, dict) and 'sample' in aoai_result: - sample_data_list = aoai_result['sample'] - - for sample_data in sample_data_list: - if sample_data and isinstance(sample_data, dict) and 'usage' in sample_data: - usage_data = sample_data['usage'] - model_name = sample_data.get('model', 'unknown') - if model_name not in model_usage_stats: - model_usage_stats[model_name] = { - 'invocation_count': 0, - 'total_tokens': 0, - 'prompt_tokens': 0, - 'completion_tokens': 0, - 'cached_tokens': 0 - } - # Aggregate usage statistics - model_stats = model_usage_stats[model_name] - model_stats['invocation_count'] += 1 - if isinstance(usage_data, dict): - model_stats['total_tokens'] += usage_data.get('total_tokens', 0) - model_stats['prompt_tokens'] += usage_data.get('prompt_tokens', 0) - model_stats['completion_tokens'] += usage_data.get('completion_tokens', 0) - model_stats['cached_tokens'] += usage_data.get('cached_tokens', 0) - # Convert model usage stats to list format matching EvaluationRunPerModelUsage - per_model_usage = [] - for model_name, stats in model_usage_stats.items(): - per_model_usage.append({ - 'model_name': model_name, - 'invocation_count': stats['invocation_count'], - 'total_tokens': stats['total_tokens'], - 'prompt_tokens': stats['prompt_tokens'], - 'completion_tokens': stats['completion_tokens'], - 'cached_tokens': stats['cached_tokens'] - }) - result_counts_stats_val = [] - logger.info(f"\r\n Result counts stats: {result_counts_stats}") - for criteria_name, stats_val 
in result_counts_stats.items(): - if isinstance(stats_val, dict): - logger.info(f"\r\n Criteria: {criteria_name}, stats: {stats_val}") - result_counts_stats_val.append({ - 'testing_criteria': criteria_name, - 'passed': stats_val.get('passed', 0), - 'failed': stats_val.get('failed', 0) - }) - return { - "result_counts": result_counts, - "per_model_usage": per_model_usage, - "per_testing_criteria_results": result_counts_stats_val - } diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/evaluation_util_convert_error_summary.json b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/evaluation_util_convert_error_summary.json new file mode 100644 index 000000000000..985fd29987d1 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/evaluation_util_convert_error_summary.json @@ -0,0 +1,11 @@ +{ + "self_harm": { + "status": "Failed", + "duration": "0:00:00.000869", + "completed_lines": 0, + "failed_lines": 0, + "log_path": null, + "error_message": "(UserError) Missing inputs for line 1: 'data.item.query, data.item.response'", + "error_code": "INVALID VALUE" + } +} \ No newline at end of file diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/evaluation_util_convert_eval_meta_data.json b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/evaluation_util_convert_eval_meta_data.json index 95c7d54f5afa..f24024c18e81 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/evaluation_util_convert_eval_meta_data.json +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/evaluation_util_convert_eval_meta_data.json @@ -8,7 +8,8 @@ }, { "type": "azure_ai_evaluator", - "name": "violence" + "name": "violence", + "evaluator_name": "violence" } ] } \ No newline at end of file diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py index a7bc4d3f7acc..7bfdcd60c893 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py @@ -23,7 +23,9 @@ SexualEvaluator, SelfHarmEvaluator, HateUnfairnessEvaluator, + AzureOpenAIModelConfiguration ) +from azure.ai.evaluation._aoai.label_grader import AzureOpenAILabelGrader from azure.ai.evaluation._constants import ( DEFAULT_EVALUATION_RESULTS_FILE_NAME, _AggregationType, @@ -33,6 +35,7 @@ _aggregate_metrics, _apply_target_to_data, _rename_columns_conditionally, + _convert_results_to_aoai_evaluation_results ) from azure.ai.evaluation._evaluate._utils import _convert_name_map_into_property_entries from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _trace_destination_from_project_scope @@ -1081,6 +1084,197 @@ def evaluator(query, response, *, bar=None, **kwargs): assert "foo" in row3_kwargs, "Making a column mapping to an unnamed parameter should appear in kwargs" assert {"query", "response", "bar"}.isdisjoint(row3_kwargs), "Named parameters should not be in kwargs" + def test_convert_results_to_aoai_evaluation_results(self): + """Test _convert_results_to_aoai_evaluation_results function with test data""" + import logging + + # Load test data from the JSON file + parent = pathlib.Path(__file__).parent.resolve() + test_data_path = os.path.join(parent, "data", "evaluation_util_convert_old_output_test.jsonl") + + test_data_path = os.path.join(parent, "data", "evaluation_util_convert_old_output_test.jsonl") + test_input_eval_metadata_path = os.path.join(parent, "data", 
"evaluation_util_convert_eval_meta_data.json") + test_input_eval_error_summary_path = os.path.join(parent, "data", "evaluation_util_convert_error_summary.json") + + mock_model_config = AzureOpenAIModelConfiguration( + azure_deployment="test-deployment", + azure_endpoint="https://test-endpoint.openai.azure.com/", + api_key="test-api-key", + api_version="2024-12-01-preview", + ) + fake_project = {"subscription_id": "123", "resource_group_name": "123", "project_name": "123"} + evaluators = { + "labelgrader": AzureOpenAILabelGrader( + model_config=mock_model_config, + input=[{"content": "{{item.query}}", "role": "user"}], + labels=["positive", "negative", "neutral"], + passing_labels=["neutral"], + model="gpt-4o-2024-11-20", + name="labelgrader", + ), + "violence": ViolenceEvaluator(None, fake_project), + "self_harm": SelfHarmEvaluator(None, fake_project) + } + + # Create logger + logger = logging.getLogger("test_logger") + # Read and parse the JSONL file (contains multiple JSON objects) + test_rows = [] + with open(test_data_path, 'r') as f: + for line in f: + line = line.strip() + if line: + logger.info(line) + test_rows.append(json.loads(line)) + test_eval_input_metadata = {} + with open(test_input_eval_metadata_path, 'r') as f: + test_eval_input_metadata = json.load(f) + test_eval_error_summary = {} + with open(test_input_eval_error_summary_path, 'r') as f: + test_eval_error_summary = json.load(f) + + eval_id = "test_eval_group_123" + eval_run_id = "test_run_456" + # Create EvaluationResult structure + test_results = { + "metrics": {"overall_score": 0.75}, + "rows": test_rows, + "studio_url": "https://test-studio.com" + } + + + # Test the conversion function + def run_test(): + _convert_results_to_aoai_evaluation_results( + results=test_results, + logger=logger, + eval_run_id=eval_run_id, + eval_id=eval_id, + evaluators=evaluators, + eval_run_summary=test_eval_error_summary, + eval_meta_data=test_eval_input_metadata + ) + + # Run the async function + run_test() + converted_results = test_results + + # Verify the structure + assert "metrics" in converted_results + assert "rows" in converted_results + assert "studio_url" in converted_results + assert "evaluation_results_list" in converted_results + assert "evaluation_summary" in converted_results + + # Verify metrics preserved + assert converted_results["metrics"]["overall_score"] == 0.75 + + # Verify studio URL preserved + assert converted_results["studio_url"] == "https://test-studio.com" + + # Verify evaluation_results_list is same as rows (converted format) + assert len(converted_results["evaluation_results_list"]) == len(test_rows) + assert len(converted_results["evaluation_results_list"]) == len(converted_results["rows"]) + + # Verify conversion structure for each row + for i, converted_row in enumerate(converted_results["evaluation_results_list"]): + # Check RunOutputItem structure + assert "object" in converted_row + assert converted_row["object"] == "eval.run.output_item" + assert "id" in converted_row + assert "run_id" in converted_row + assert "eval_id" in converted_row + assert "created_at" in converted_row + assert "datasource_item_id" in converted_row + assert "results" in converted_row + assert "sample" in converted_row + + # Verify IDs + assert converted_row["run_id"] == "test_run_456" + assert converted_row["eval_id"] == "test_eval_group_123" + assert converted_row["datasource_item_id"] == i + + # Verify results array structure + assert isinstance(converted_row["results"], list) + + # Check that results contain expected 
evaluator results + result_names = [result.get("name") for result in converted_row["results"]] + + # Based on test data, should have violence and labelgrader + if i < len(test_rows): + original_row = test_rows[i] + expected_evaluators = set() + for key in original_row.keys(): + if key.startswith("outputs."): + parts = key.split(".", 2) + if len(parts) >= 2: + expected_evaluators.add(parts[1]) + + # Verify all expected evaluators are present in results + for evaluator in expected_evaluators: + assert evaluator in result_names + + # Check individual result structure + for result in converted_row["results"]: + assert "type" in result + assert "name" in result + assert "metric" in result + + # Verify evaluation summary structure + summary = converted_results["evaluation_summary"] + assert "result_counts" in summary + assert "per_model_usage" in summary + assert "per_testing_criteria_results" in summary + + # Check result counts structure + result_counts = summary["result_counts"] + assert "total" in result_counts + assert "passed" in result_counts + assert "failed" in result_counts + assert "errored" in result_counts + + logger.info(result_counts) + # Verify counts are non-negative integers + for count_type, count_value in result_counts.items(): + assert isinstance(count_value, int) + assert count_value >= 0 + + # Check per_testing_criteria_results structure + criteria_results = summary["per_testing_criteria_results"] + assert isinstance(criteria_results, list) + logger.info(criteria_results) + for criteria_result in criteria_results: + assert "testing_criteria" in criteria_result + assert "passed" in criteria_result + assert "failed" in criteria_result + assert isinstance(criteria_result["passed"], int) + assert isinstance(criteria_result["failed"], int) + + # Check per_model_usage structure + model_usage = summary["per_model_usage"] + assert isinstance(model_usage, list) + for usage_item in model_usage: + assert "model_name" in usage_item + assert "invocation_count" in usage_item + assert "total_tokens" in usage_item + assert "prompt_tokens" in usage_item + assert "completion_tokens" in usage_item + assert "cached_tokens" in usage_item + + # Test with empty results + empty_results = {"metrics": {}, "rows": [], "studio_url": None} + _convert_results_to_aoai_evaluation_results( + results=empty_results, + logger=logger, + eval_run_id=eval_run_id, + eval_id=eval_id, + evaluators=evaluators + ) + empty_converted = empty_results + + assert len(empty_converted["rows"]) == 0 + assert len(empty_converted["evaluation_results_list"]) == 0 + assert empty_converted["evaluation_summary"]["result_counts"]["total"] == 0 @pytest.mark.unittest class TestTagsInLoggingFunctions: @@ -1395,3 +1589,4 @@ def test_log_metrics_and_instance_results_onedp_no_redundant_tags(self, mock_cli assert ( not hasattr(call_args, "tags") or call_args.tags is None ), "Tags should not be redundantly set in update_evaluation_run" + \ No newline at end of file diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_utils.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_utils.py index 63d22a74353c..ed0a19c56b7a 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_utils.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_utils.py @@ -16,9 +16,6 @@ reformat_agent_response, reformat_tool_definitions, ) -from azure.ai.evaluation._evaluate._utils import ( - _convert_results_to_aoai_evaluation_results -) from azure.ai.evaluation._exceptions import EvaluationException, ErrorMessage from 
azure.monitor.opentelemetry.exporter import AzureMonitorLogExporter @@ -849,166 +846,3 @@ def test_empty_tool_list(self): tools = [] expected_output = "TOOL_DEFINITIONS:" self.assertEqual(reformat_tool_definitions(tools), expected_output) - - def test_convert_results_to_aoai_evaluation_results(self): - """Test _convert_results_to_aoai_evaluation_results function with test data""" - import asyncio - import logging - - # Load test data from the JSON file - parent = pathlib.Path(__file__).parent.resolve() - test_data_path = os.path.join(parent, "data", "evaluation_util_convert_old_output_test.jsonl") - - test_data_path = os.path.join(parent, "data", "evaluation_util_convert_old_output_test.jsonl") - test_input_eval_metadata_path = os.path.join(parent, "data", "evaluation_util_convert_eval_meta_data.json") - - # Create logger - logger = logging.getLogger("test_logger") - # Read and parse the JSONL file (contains multiple JSON objects) - test_rows = [] - with open(test_data_path, 'r') as f: - for line in f: - line = line.strip() - if line: - logger.info(line) - test_rows.append(json.loads(line)) - - eval_metadata = {} - # Read and parse the evaluation metadata JSON file - with open(test_input_eval_metadata_path, 'r') as f: - eval_metadata = json.load(f) - - # Create EvaluationResult structure - test_results = { - "metrics": {"overall_score": 0.75}, - "rows": test_rows, - "studio_url": "https://test-studio.com" - } - - - # Test the conversion function - def run_test(): - _convert_results_to_aoai_evaluation_results( - results=test_results, - logger=logger, - eval_meta_data=eval_metadata - ) - - # Run the async function - run_test() - converted_results = test_results - - # Verify the structure - self.assertIn("metrics", converted_results) - self.assertIn("rows", converted_results) - self.assertIn("studio_url", converted_results) - self.assertIn("evaluation_results_list", converted_results) - self.assertIn("evaluation_summary", converted_results) - - # Verify metrics preserved - self.assertEqual(converted_results["metrics"]["overall_score"], 0.75) - - # Verify studio URL preserved - self.assertEqual(converted_results["studio_url"], "https://test-studio.com") - - # Verify evaluation_results_list is same as rows (converted format) - self.assertEqual(len(converted_results["evaluation_results_list"]), len(test_rows)) - self.assertEqual(len(converted_results["evaluation_results_list"]), len(converted_results["rows"])) - - # Verify conversion structure for each row - for i, converted_row in enumerate(converted_results["evaluation_results_list"]): - # Check RunOutputItem structure - self.assertIn("object", converted_row) - self.assertEqual(converted_row["object"], "eval.run.output_item") - self.assertIn("id", converted_row) - self.assertIn("run_id", converted_row) - self.assertIn("eval_id", converted_row) - self.assertIn("created_at", converted_row) - self.assertIn("datasource_item_id", converted_row) - self.assertIn("results", converted_row) - self.assertIn("sample", converted_row) - - # Verify IDs - self.assertEqual(converted_row["run_id"], "test_run_456") - self.assertEqual(converted_row["eval_id"], "test_eval_group_123") - self.assertEqual(converted_row["datasource_item_id"], i) - - # Verify results array structure - self.assertIsInstance(converted_row["results"], list) - - # Check that results contain expected evaluator results - result_names = [result.get("name") for result in converted_row["results"]] - - # Based on test data, should have violence and labelgrader - if i < len(test_rows): - 
original_row = test_rows[i] - expected_evaluators = set() - for key in original_row.keys(): - if key.startswith("outputs."): - parts = key.split(".", 2) - if len(parts) >= 2: - expected_evaluators.add(parts[1]) - - # Verify all expected evaluators are present in results - for evaluator in expected_evaluators: - self.assertIn(evaluator, result_names) - - # Check individual result structure - for result in converted_row["results"]: - self.assertIn("type", result) - self.assertIn("name", result) - self.assertIn("metric", result) - - # Verify evaluation summary structure - summary = converted_results["evaluation_summary"] - self.assertIn("result_counts", summary) - self.assertIn("per_model_usage", summary) - self.assertIn("per_testing_criteria_results", summary) - - # Check result counts structure - result_counts = summary["result_counts"] - self.assertIn("total", result_counts) - self.assertIn("passed", result_counts) - self.assertIn("failed", result_counts) - self.assertIn("errored", result_counts) - - logger.info(result_counts) - # Verify counts are non-negative integers - for count_type, count_value in result_counts.items(): - self.assertIsInstance(count_value, int) - self.assertGreaterEqual(count_value, 0) - - # Check per_testing_criteria_results structure - criteria_results = summary["per_testing_criteria_results"] - self.assertIsInstance(criteria_results, list) - logger.info(criteria_results) - for criteria_result in criteria_results: - self.assertIn("testing_criteria", criteria_result) - self.assertIn("passed", criteria_result) - self.assertIn("failed", criteria_result) - self.assertIsInstance(criteria_result["passed"], int) - self.assertIsInstance(criteria_result["failed"], int) - - # Check per_model_usage structure - model_usage = summary["per_model_usage"] - self.assertIsInstance(model_usage, list) - for usage_item in model_usage: - self.assertIn("model_name", usage_item) - self.assertIn("invocation_count", usage_item) - self.assertIn("total_tokens", usage_item) - self.assertIn("prompt_tokens", usage_item) - self.assertIn("completion_tokens", usage_item) - self.assertIn("cached_tokens", usage_item) - - # Test with empty results - empty_results = {"metrics": {}, "rows": [], "studio_url": None} - _convert_results_to_aoai_evaluation_results( - results=empty_results, - logger=logger, - eval_meta_data=eval_metadata - ) - empty_converted = empty_results - - self.assertEqual(len(empty_converted["rows"]), 0) - self.assertEqual(len(empty_converted["evaluation_results_list"]), 0) - self.assertEqual(empty_converted["evaluation_summary"]["result_counts"]["total"], 0)
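
Reviewer note: a small illustrative sketch of how the new `_EvaluatorMetricMapping` tables introduced in `_constants.py` are meant to be chained. An evaluator class name resolves to its registry key via `EVAL_CLASS_NAME_MAP`, and that key resolves to per-metric names via `EVALUATOR_NAME_METRICS_MAPPINGS`, which is the lookup `_convert_results_to_aoai_evaluation_results` performs for `Callable` evaluators. The helper function and the script itself are hypothetical, not part of this change.

```python
# Illustrative only: mirrors the class-name -> registry-key -> metric-names lookups
# done for Callable evaluators in _convert_results_to_aoai_evaluation_results.
from azure.ai.evaluation._constants import _EvaluatorMetricMapping


def metrics_for_evaluator_class(class_name: str, criteria_name: str) -> list:
    """Resolve the metric names reported for an evaluator class, falling back to the criteria name."""
    eval_name = _EvaluatorMetricMapping.EVAL_CLASS_NAME_MAP.get(class_name)
    if eval_name:
        metrics = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(eval_name, [])
        if metrics:
            return list(metrics)
    return [criteria_name]


print(metrics_for_evaluator_class("RougeScoreEvaluator", "my_rouge"))
# ['rouge_precision', 'rouge_recall', 'rouge_f1_score']
print(metrics_for_evaluator_class("ViolenceEvaluator", "violence"))
# ['violence']
print(metrics_for_evaluator_class("SomeCustomEvaluator", "custom_check"))
# ['custom_check']  (unknown classes effectively fall back to the criteria name)
```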
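A minimal end-to-end sketch of the conversion itself may also help reviewers. It mirrors the unit test's setup (importing the private helper from `azure.ai.evaluation._evaluate._evaluate`, a placeholder project dict, and `ViolenceEvaluator(None, fake_project)`); the row content, IDs, and criteria name are made up for illustration, and no service call is made just by constructing the evaluator or running the conversion. The function mutates the `EvaluationResult` dict in place, adding `evaluation_results_list` (one `eval.run.output_item` per row) and `evaluation_summary` (`result_counts`, `per_model_usage`, `per_testing_criteria_results`).

```python
# Hypothetical usage sketch; values are illustrative, modelled on the unit test above.
import logging

from azure.ai.evaluation import ViolenceEvaluator
from azure.ai.evaluation._evaluate._evaluate import _convert_results_to_aoai_evaluation_results

fake_project = {"subscription_id": "123", "resource_group_name": "123", "project_name": "123"}
evaluators = {"violence": ViolenceEvaluator(None, fake_project)}

results = {
    "metrics": {},
    "studio_url": None,
    "rows": [{
        "inputs.query": "What is the capital of France?",
        "inputs.response": "Paris is the capital of France.",
        "outputs.violence.violence_score": 0,
        "outputs.violence.violence": "Very low",
        "outputs.violence.violence_reason": "No violent content detected.",
        "outputs.violence.violence_threshold": 3,
        "outputs.violence.violence_result": "pass",
    }],
}

_convert_results_to_aoai_evaluation_results(
    results=results,
    logger=logging.getLogger(__name__),
    eval_id="eval-group-1",      # illustrative IDs
    eval_run_id="eval-run-1",
    evaluators=evaluators,
)

item = results["evaluation_results_list"][0]
print(item["object"])                    # "eval.run.output_item"
print(item["results"][0]["metric"])      # "violence" (resolved via _EvaluatorMetricMapping)
print(item["results"][0]["passed"])      # True
print(results["evaluation_summary"]["result_counts"])
# {'total': 1, 'errored': 0, 'failed': 0, 'passed': 1}
print(results["evaluation_summary"]["per_testing_criteria_results"])
# [{'testing_criteria': 'violence', 'passed': 1, 'failed': 0}]
```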