Merged
Commits
29 commits
980e2fa  add eval result converter (YoYoJa, Oct 3, 2025)
57c73b8  Add result converter (YoYoJa, Oct 6, 2025)
1730b17  update converter params to optional (YoYoJa, Oct 6, 2025)
3bf93f7  add eval meta data (YoYoJa, Oct 7, 2025)
e47639d  merge converter change to include eval_meta_data (YoYoJa, Oct 7, 2025)
5b198b4  fix type (YoYoJa, Oct 8, 2025)
5fbbabe  remove useless file (YoYoJa, Oct 8, 2025)
6ca31a1  get eval meta data as input (YoYoJa, Oct 8, 2025)
ea93d1a  fix build errors (YoYoJa, Oct 8, 2025)
e6a9caa  remove useless import (YoYoJa, Oct 8, 2025)
f24f0e0  resolve comments (YoYoJa, Oct 8, 2025)
0abddb0  update (YoYoJa, Oct 8, 2025)
518b4af  update comments (YoYoJa, Oct 8, 2025)
74dfddc  merge otel (YoYoJa, Oct 8, 2025)
15d881e  merge result converter (YoYoJa, Oct 8, 2025)
5c44f70  fix checker failure (YoYoJa, Oct 9, 2025)
8d2eb5e  Groundedness Evaluator to not add tool result to tool call message (#… (singankit, Oct 9, 2025)
417277d  Add ledger certificate package (#43278) (catalinaperalta, Oct 9, 2025)
48f4b19  [Identity] Update test-resources bicep (#43304) (pvaneck, Oct 9, 2025)
92274fb  [Communication Shared] Adding the mypy fixes (#42925) (v-dharmarajv, Oct 9, 2025)
2ce023e  add error msg and error code (YoYoJa, Oct 9, 2025)
654e28f  merge main (YoYoJa, Oct 9, 2025)
32aad08  Surface evaluator error msg (YoYoJa, Oct 10, 2025)
d1449f5  surface out error (YoYoJa, Oct 10, 2025)
47d20e3  update (YoYoJa, Oct 10, 2025)
5cee7e4  update UT (YoYoJa, Oct 10, 2025)
9256912  fix usage (YoYoJa, Oct 10, 2025)
0ff811b  fix usage (YoYoJa, Oct 10, 2025)
18b0db2  resolve conflict (YoYoJa, Oct 10, 2025)
@@ -523,7 +523,7 @@ def _convert_results_to_aoai_evaluation_results(
     eval_run_id: Optional[str] = eval_meta_data.get("eval_run_id")
     testing_criteria_list: Optional[List[Dict[str, Any]]] = eval_meta_data.get("testing_criteria")

-    testing_criteria_name_types = {}
+    testing_criteria_name_types: Optional[Dict[str, str]] = {}
     if testing_criteria_list is not None:
         for criteria in testing_criteria_list:
             criteria_name = criteria.get("name")
@@ -591,30 +591,33 @@ def _convert_results_to_aoai_evaluation_results(
                 "metric": criteria_name # Use criteria name as metric
             }
             # Add optional fields if they exist
-            if score is not None:
-                result_obj["score"] = score
-            if label is not None:
-                result_obj["label"] = label
-            if reason is not None:
-                result_obj["reason"] = reason
-            if threshold is not None:
-                result_obj["threshold"] = threshold
-            if passed is not None:
-                result_obj["passed"] = passed
+            #if score is not None:
+            result_obj["score"] = score
+            #if label is not None:
+            result_obj["label"] = label
+            #if reason is not None:
+            result_obj["reason"] = reason
+            #if threshold is not None:
+            result_obj["threshold"] = threshold
+            #if passed is not None:
+            result_obj["passed"] = passed

             if sample is not None:
                 result_obj["sample"] = sample
                 top_sample.append(sample) # Save top sample for the row
-            elif criteria_name in eval_run_summary and "error_code" in eval_run_summary[criteria_name]:
+            elif (eval_run_summary and criteria_name in eval_run_summary
+                  and isinstance(eval_run_summary[criteria_name], dict)
+                  and "error_code" in eval_run_summary[criteria_name]):
                 error_info = {
                     "code": eval_run_summary[criteria_name].get("error_code", None),
                     "message": eval_run_summary[criteria_name].get("error_message", None),
-                }
+                } if eval_run_summary[criteria_name].get("error_code", None) is not None else None
                 sample = {
                     "error": error_info
-                }
+                } if error_info is not None else None
                 result_obj["sample"] = sample
-                top_sample.append(sample)
+                if sample is not None:
+                    top_sample.append(sample)

             run_output_results.append(result_obj)

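Note: the error path above builds error_info and sample with conditional expressions so a criterion with no recorded error_code yields None instead of an empty error object, and only non-None samples are collected. A minimal standalone sketch of that pattern, with an invented eval_run_summary for illustration (not data from this PR):

from typing import Any, Dict, Optional

def build_error_sample(criteria_name: str, eval_run_summary: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Illustrative helper mirroring the converter's error handling: return a sample
    dict only when the run summary recorded an error_code for this criterion."""
    summary_entry = eval_run_summary.get(criteria_name)
    if not isinstance(summary_entry, dict) or "error_code" not in summary_entry:
        return None
    error_info = {
        "code": summary_entry.get("error_code"),
        "message": summary_entry.get("error_message"),
    } if summary_entry.get("error_code") is not None else None
    return {"error": error_info} if error_info is not None else None

# Hypothetical data: one criterion that errored, one that did not.
summary = {"groundedness": {"error_code": "429", "error_message": "rate limited"}}
print(build_error_sample("groundedness", summary))  # {'error': {'code': '429', 'message': 'rate limited'}}
print(build_error_sample("coherence", summary))     # None
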
@@ -666,9 +669,9 @@ def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logge
     result_counts_stats = {} # Dictionary to aggregate usage by model

     for aoai_result in aoai_results:
-        logger.info(f"\r\nProcessing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, row keys: {aoai_result.keys() if hasattr(aoai_result, 'keys') else 'N/A'}")
+        logger.info(f"Processing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, row keys: {aoai_result.keys() if hasattr(aoai_result, 'keys') else 'N/A'}")
         if isinstance(aoai_result, dict) and 'results' in aoai_result:
-            logger.info(f"\r\n2 Processing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, results count: {len(aoai_result['results'])}")
+            logger.info(f"Processing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, results count: {len(aoai_result['results'])}")
             result_counts["total"] += len(aoai_result['results'])
             for result_item in aoai_result['results']:
                 if isinstance(result_item, dict):
@@ -697,39 +700,22 @@ def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logge
                         result_counts["errored"] += 1

         # Extract usage statistics from aoai_result.sample
-        sample_data = None
+        sample_data_list = None
         if isinstance(aoai_result, dict) and 'sample' in aoai_result:
-            logger.info(f"\r\n 2 Processing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, summary count: {len(aoai_result['sample'])}")
-            sample_data = aoai_result['sample']
-            if sample_data and hasattr(sample_data, 'usage') and sample_data.usage:
-                usage_data = sample_data.usage
-                model_name = sample_data.model if hasattr(sample_data, 'model') and sample_data.model else 'unknown'
-                if model_name not in model_usage_stats:
-                    model_usage_stats[model_name] = {
-                        'invocation_count': 0,
-                        'total_tokens': 0,
-                        'prompt_tokens': 0,
-                        'completion_tokens': 0,
-                        'cached_tokens': 0
-                    }
-                # Aggregate usage statistics
-                model_stats = model_usage_stats[model_name]
-                model_stats['invocation_count'] += 1
-                model_stats['total_tokens'] += usage_data.total_tokens if hasattr(usage_data, 'total_tokens') and usage_data.total_tokens else 0
-                model_stats['prompt_tokens'] += usage_data.prompt_tokens if hasattr(usage_data, 'prompt_tokens') and usage_data.prompt_tokens else 0
-                model_stats['completion_tokens'] += usage_data.completion_tokens if hasattr(usage_data, 'completion_tokens') and usage_data.completion_tokens else 0
-                model_stats['cached_tokens'] += usage_data.cached_tokens if hasattr(usage_data, 'cached_tokens') and usage_data.cached_tokens else 0
-            elif sample_data and isinstance(sample_data, dict) and 'usage' in sample_data:
-                usage_data = sample_data['usage']
-                model_name = sample_data.get('model', 'unknown')
-                if model_name not in model_usage_stats:
-                    model_usage_stats[model_name] = {
-                        'invocation_count': 0,
-                        'total_tokens': 0,
-                        'prompt_tokens': 0,
-                        'completion_tokens': 0,
-                        'cached_tokens': 0
-                    }
+            sample_data_list = aoai_result['sample']
+
+        for sample_data in sample_data_list:
+            if sample_data and isinstance(sample_data, dict) and 'usage' in sample_data:
+                usage_data = sample_data['usage']
+                model_name = sample_data.get('model', 'unknown')
+                if model_name not in model_usage_stats:
+                    model_usage_stats[model_name] = {
+                        'invocation_count': 0,
+                        'total_tokens': 0,
+                        'prompt_tokens': 0,
+                        'completion_tokens': 0,
+                        'cached_tokens': 0
+                    }
                 # Aggregate usage statistics
                 model_stats = model_usage_stats[model_name]
                 model_stats['invocation_count'] += 1
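Note: the rewritten block above treats aoai_result['sample'] as a list of sample dicts and totals token usage per model across them. A compact sketch of that aggregation logic, using invented sample data to show the expected shape (not data from this PR):

from collections import defaultdict
from typing import Any, Dict, List

def aggregate_model_usage(samples: List[Dict[str, Any]]) -> Dict[str, Dict[str, int]]:
    """Sum per-model token usage across a row's samples (illustrative only)."""
    stats: Dict[str, Dict[str, int]] = defaultdict(lambda: {
        'invocation_count': 0, 'total_tokens': 0, 'prompt_tokens': 0,
        'completion_tokens': 0, 'cached_tokens': 0,
    })
    for sample in samples:
        if not isinstance(sample, dict) or 'usage' not in sample:
            continue
        usage = sample['usage']
        model = sample.get('model', 'unknown')
        stats[model]['invocation_count'] += 1
        for key in ('total_tokens', 'prompt_tokens', 'completion_tokens', 'cached_tokens'):
            stats[model][key] += usage.get(key, 0) or 0
    return dict(stats)

# Hypothetical input shaped like the converter's per-row samples.
samples = [
    {"model": "gpt-4o", "usage": {"total_tokens": 120, "prompt_tokens": 90, "completion_tokens": 30}},
    {"model": "gpt-4o", "usage": {"total_tokens": 80, "prompt_tokens": 60, "completion_tokens": 20}},
]
print(aggregate_model_usage(samples)["gpt-4o"]["total_tokens"])  # 200
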
16 changes: 6 additions & 10 deletions sdk/evaluation/azure-ai-evaluation/tests/unittests/test_utils.py
@@ -860,7 +860,7 @@ def test_convert_results_to_aoai_evaluation_results(self):
         test_data_path = os.path.join(parent, "data", "evaluation_util_convert_old_output_test.jsonl")

         test_data_path = os.path.join(parent, "data", "evaluation_util_convert_old_output_test.jsonl")
-        test_input_eval_metadata_path = os.path.join(parent, "data", "evaluation_uril_convert_eval_meta_data.json")
+        test_input_eval_metadata_path = os.path.join(parent, "data", "evaluation_util_convert_eval_meta_data.json")

         # Create logger
         logger = logging.getLogger("test_logger")
@@ -888,15 +888,15 @@ def test_convert_results_to_aoai_evaluation_results(self):

         # Test the conversion function
         def run_test():
-            converted_results = _convert_results_to_aoai_evaluation_results(
+            _convert_results_to_aoai_evaluation_results(
                 results=test_results,
                 logger=logger,
                 eval_meta_data=eval_metadata
             )
-            return converted_results

         # Run the async function
-        converted_results = run_test()
+        run_test()
+        converted_results = test_results

         # Verify the structure
         self.assertIn("metrics", converted_results)
@@ -958,11 +958,6 @@ def run_test():
                 self.assertIn("type", result)
                 self.assertIn("name", result)
                 self.assertIn("metric", result)
-                # Optional fields that might be present
-                optional_fields = ["score", "label", "reason", "threshold", "passed", "sample"]
-                for field in optional_fields:
-                    if field in result:
-                        self.assertIsNotNone(result[field])

         # Verify evaluation summary structure
         summary = converted_results["evaluation_summary"]
@@ -1007,11 +1002,12 @@ def run_test():

         # Test with empty results
         empty_results = {"metrics": {}, "rows": [], "studio_url": None}
-        empty_converted = _convert_results_to_aoai_evaluation_results(
+        _convert_results_to_aoai_evaluation_results(
             results=empty_results,
             logger=logger,
             eval_meta_data=eval_metadata
         )
+        empty_converted = empty_results

         self.assertEqual(len(empty_converted["rows"]), 0)
         self.assertEqual(len(empty_converted["evaluation_results_list"]), 0)
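Note: the test updates above call _convert_results_to_aoai_evaluation_results without capturing a return value and then read the converted fields back from the input dict, which suggests the converter now mutates results in place rather than returning a new object. A hedged usage sketch under that assumption, with a stand-in function so the snippet runs on its own (the stand-in body and metadata values are invented, not the SDK's implementation):

import logging
from typing import Any, Dict

def convert_in_place(results: Dict[str, Any], logger: logging.Logger, eval_meta_data: Dict[str, Any]) -> None:
    """Stand-in for _convert_results_to_aoai_evaluation_results: mutates `results` instead of returning a copy."""
    results["evaluation_results_list"] = list(results.get("rows", []))
    results.setdefault("evaluation_summary", {})

results = {"metrics": {}, "rows": [], "studio_url": None}
convert_in_place(results, logging.getLogger("example"), {"eval_run_id": "example-run"})  # hypothetical metadata
assert results["evaluation_results_list"] == []  # converted fields appear on the same dict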