Skip to content
Merged
Changes from all commits
Commits
Show all changes
57 commits
Select commit Hold shift + click to select a range
980e2fa
add eval result converter
YoYoJa Oct 3, 2025
57c73b8
Add result converter
YoYoJa Oct 6, 2025
1730b17
update converter params to optional
YoYoJa Oct 6, 2025
3bf93f7
add eval meta data
YoYoJa Oct 7, 2025
e47639d
merge converter change to include eval_meta_data
YoYoJa Oct 7, 2025
5b198b4
fix type
YoYoJa Oct 8, 2025
5fbbabe
remove useless file
YoYoJa Oct 8, 2025
6ca31a1
get eval meta data as input
YoYoJa Oct 8, 2025
ea93d1a
fix build errors
YoYoJa Oct 8, 2025
e6a9caa
remove useless import
YoYoJa Oct 8, 2025
f24f0e0
resolve comments
YoYoJa Oct 8, 2025
0abddb0
update
YoYoJa Oct 8, 2025
518b4af
update comments
YoYoJa Oct 8, 2025
74dfddc
merge otel
YoYoJa Oct 8, 2025
15d881e
merge result converter
YoYoJa Oct 8, 2025
5c44f70
fix checker failure
YoYoJa Oct 9, 2025
2ce023e
add error msg and error code
YoYoJa Oct 9, 2025
654e28f
merge main
YoYoJa Oct 9, 2025
32aad08
Surface evaluator error msg
YoYoJa Oct 10, 2025
d1449f5
surface out error
YoYoJa Oct 10, 2025
47d20e3
update
YoYoJa Oct 10, 2025
5cee7e4
update UT
YoYoJa Oct 10, 2025
9256912
fix usage
YoYoJa Oct 10, 2025
0ff811b
fix usage
YoYoJa Oct 10, 2025
18b0db2
resolve conflict
YoYoJa Oct 10, 2025
36b0761
merge type updat
YoYoJa Oct 10, 2025
59b0aab
make eval_meta_data optional
YoYoJa Oct 12, 2025
d4d768c
remove useless lines
YoYoJa Oct 12, 2025
7de4cd6
update param name to add underscore
YoYoJa Oct 14, 2025
cb88b43
merge remote
YoYoJa Oct 14, 2025
28d5d17
parse updated annotation results
YoYoJa Oct 16, 2025
eab85ca
merge remote
YoYoJa Oct 16, 2025
74a39b9
update trace_id
YoYoJa Oct 16, 2025
31458fe
merge remote
YoYoJa Oct 16, 2025
cb1cc34
expose sample data for sdk evaluators
YoYoJa Oct 16, 2025
189dd83
update
YoYoJa Oct 16, 2025
a9416a8
merge and resolve conflict
YoYoJa Oct 16, 2025
3cc294c
update
YoYoJa Oct 17, 2025
26c8a53
update
YoYoJa Oct 17, 2025
d2e40f9
fix UT
YoYoJa Oct 17, 2025
4a0a86c
remove print
YoYoJa Oct 17, 2025
3664e9a
merge remote
YoYoJa Oct 17, 2025
fa3ae5c
merge remote
YoYoJa Oct 17, 2025
e386fed
fix tests
YoYoJa Oct 17, 2025
023bff7
fix test
YoYoJa Oct 20, 2025
244b530
merge remote
YoYoJa Oct 20, 2025
ce3af31
merge main
YoYoJa Oct 20, 2025
ef92791
merge master and fix bug
YoYoJa Oct 21, 2025
c40feed
merge remote
YoYoJa Oct 21, 2025
04912f3
merge main
YoYoJa Oct 21, 2025
76b1951
Jessli/convert (#43556) merge main
YoYoJa Oct 21, 2025
7450510
merge remote
YoYoJa Oct 21, 2025
fe2779d
fix bug
YoYoJa Oct 21, 2025
e856faa
Jessli/convert Fix bug (#43557)
YoYoJa Oct 21, 2025
06d3f87
merge remote
YoYoJa Oct 21, 2025
ab46e31
merge remote needuv/structured-results-otel-logging
YoYoJa Oct 21, 2025
99424d5
fix bug
YoYoJa Oct 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1736,6 +1736,8 @@ def _convert_results_to_aoai_evaluation_results(
criteria_type = criteria_name_types_from_meta[criteria_name].get("type", None)
evaluator_name = criteria_name_types_from_meta[criteria_name].get("evaluator_name", None)
if evaluator_name:
if criteria_type == "azure_ai_evaluator" and evaluator_name.startswith("builtin."):
evaluator_name = evaluator_name.replace("builtin.", "")
metrics_mapped = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(evaluator_name, [])
if metrics_mapped and len(metrics_mapped) > 0:
metrics.extend(metrics_mapped)
Expand Down Expand Up @@ -1798,6 +1800,7 @@ def _convert_results_to_aoai_evaluation_results(
result_per_metric[metric] = {"score": metric_value}
else:
result_per_metric[metric]["score"] = metric_value
_append_indirect_attachments_to_results(result_per_metric, "score", metric, metric_value)
elif metric_key.endswith("_result") or metric_key == "result" or metric_key.endswith("_label"):
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
label = metric_value
Expand All @@ -1809,6 +1812,8 @@ def _convert_results_to_aoai_evaluation_results(
else:
result_per_metric[metric]["label"] = metric_value
result_per_metric[metric]["passed"] = passed
_append_indirect_attachments_to_results(result_per_metric, "label", metric, label)
_append_indirect_attachments_to_results(result_per_metric, "passed", metric, passed)
elif (
metric_key.endswith("_reason") and not metric_key.endswith("_finish_reason")
) or metric_key == "reason":
Expand All @@ -1817,18 +1822,21 @@ def _convert_results_to_aoai_evaluation_results(
result_per_metric[metric] = {"reason": metric_value}
else:
result_per_metric[metric]["reason"] = metric_value
_append_indirect_attachments_to_results(result_per_metric, "reason", metric, metric_value)
elif metric_key.endswith("_threshold") or metric_key == "threshold":
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
if metric not in result_per_metric:
result_per_metric[metric] = {"threshold": metric_value}
else:
result_per_metric[metric]["threshold"] = metric_value
_append_indirect_attachments_to_results(result_per_metric, "threshold", metric, metric_value)
elif metric_key == "sample":
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
if metric not in result_per_metric:
result_per_metric[metric] = {"sample": metric_value}
else:
result_per_metric[metric]["sample"] = metric_value
_append_indirect_attachments_to_results(result_per_metric, "sample", metric, metric_value)
elif metric_key.endswith("_finish_reason"):
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
if metric not in result_per_metric:
Expand All @@ -1841,6 +1849,9 @@ def _convert_results_to_aoai_evaluation_results(
and "finish_reason" not in result_per_metric[metric]["sample"]
):
result_per_metric[metric]["sample"]["finish_reason"] = metric_value
_append_indirect_attachments_to_results(
result_per_metric, "sample", metric, metric_value, "finish_reason"
)
elif metric_key.endswith("_model"):
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
if metric not in result_per_metric:
Expand All @@ -1853,6 +1864,7 @@ def _convert_results_to_aoai_evaluation_results(
and "model" not in result_per_metric[metric]["sample"]
):
result_per_metric[metric]["sample"]["model"] = metric_value
_append_indirect_attachments_to_results(result_per_metric, "sample", metric, metric_value, "model")
elif metric_key.endswith("_sample_input"):
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
input_metric_val_json: Optional[List[Dict[str, Any]]] = []
Expand All @@ -1870,6 +1882,9 @@ def _convert_results_to_aoai_evaluation_results(
and "input" not in result_per_metric[metric]["sample"]
):
result_per_metric[metric]["sample"]["input"] = input_metric_val_json
_append_indirect_attachments_to_results(
result_per_metric, "sample", metric, input_metric_val_json, "input"
)
elif metric_key.endswith("_sample_output"):
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
output_metric_val_json: Optional[List[Dict[str, Any]]] = []
Expand All @@ -1887,6 +1902,9 @@ def _convert_results_to_aoai_evaluation_results(
and "output" not in result_per_metric[metric]["sample"]
):
result_per_metric[metric]["sample"]["output"] = output_metric_val_json
_append_indirect_attachments_to_results(
result_per_metric, "sample", metric, output_metric_val_json, "output"
)
elif metric_key.endswith("_total_tokens"):
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
if metric not in result_per_metric:
Expand All @@ -1901,6 +1919,9 @@ def _convert_results_to_aoai_evaluation_results(
result_per_metric[metric]["sample"]["usage"] = {"total_tokens": metric_value}
else:
result_per_metric[metric]["sample"]["usage"]["total_tokens"] = metric_value
_append_indirect_attachments_to_results(
result_per_metric, "sample", metric, metric_value, "usage", "total_tokens"
)
elif metric_key.endswith("_prompt_tokens"):
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
if metric not in result_per_metric:
Expand All @@ -1915,6 +1936,9 @@ def _convert_results_to_aoai_evaluation_results(
result_per_metric[metric]["sample"]["usage"] = {"prompt_tokens": metric_value}
else:
result_per_metric[metric]["sample"]["usage"]["prompt_tokens"] = metric_value
_append_indirect_attachments_to_results(
result_per_metric, "sample", metric, metric_value, "usage", "prompt_tokens"
)
elif metric_key.endswith("_completion_tokens"):
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
if metric not in result_per_metric:
Expand All @@ -1929,6 +1953,9 @@ def _convert_results_to_aoai_evaluation_results(
result_per_metric[metric]["sample"]["usage"] = {"completion_tokens": metric_value}
else:
result_per_metric[metric]["sample"]["usage"]["completion_tokens"] = metric_value
_append_indirect_attachments_to_results(
result_per_metric, "sample", metric, metric_value, "usage", "completion_tokens"
)
elif not any(
metric_key.endswith(suffix)
for suffix in [
Expand Down Expand Up @@ -1970,6 +1997,20 @@ def _convert_results_to_aoai_evaluation_results(
"metric": metric if metric is not None else criteria_name, # Use criteria name as metric
}
# Add optional fields
if (
metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"]
or metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["code_vulnerability"]
or metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["protected_material"]
):
copy_label = label
if copy_label is not None and isinstance(copy_label, bool) and copy_label == True:
label = "fail"
score = 0.0
passed = False
else:
label = "pass"
score = 1.0
passed = True
result_obj["score"] = score
result_obj["label"] = label
result_obj["reason"] = reason
Expand Down Expand Up @@ -2044,6 +2085,67 @@ def _convert_results_to_aoai_evaluation_results(
)


def _append_indirect_attachments_to_results(
current_result_dict: Dict[str, Any],
result_name: str,
metric: str,
metric_value: Any,
nested_result_name: Optional[str] = None,
secondnested_result_name: Optional[str] = None,
) -> None:
"""
Append indirect attachments to the current result dictionary.

:param current_result_dict: The current result dictionary to update
:type current_result_dict: Dict[str, Any]
:param result_name: The result name
:type result_name: str
:param metric: The metric name
:type metric: str
:param metric_value: The value of the metric
:type metric_value: Any
"""
if metric == "xpia" and result_name:
for metric_extended in ["xpia_manipulated_content", "xpia_intrusion", "xpia_information_gathering"]:
if nested_result_name is None:
if metric_extended not in current_result_dict:
current_result_dict[metric_extended] = {result_name: metric_value}
else:
current_result_dict[metric_extended][result_name] = metric_value
elif nested_result_name is not None and secondnested_result_name is None:
if metric_extended not in current_result_dict:
current_result_dict[metric_extended] = {result_name: {nested_result_name: metric_value}}
elif metric_extended in current_result_dict and result_name not in current_result_dict[metric_extended]:
current_result_dict[metric_extended][result_name] = {nested_result_name: metric_value}
elif (
metric_extended in current_result_dict
and result_name in current_result_dict[metric_extended]
and nested_result_name not in current_result_dict[metric_extended][result_name]
):
current_result_dict[metric_extended][result_name][nested_result_name] = metric_value
elif nested_result_name is not None and secondnested_result_name is not None:
if metric_extended not in current_result_dict:
current_result_dict[metric_extended] = {
result_name: {nested_result_name: {secondnested_result_name: metric_value}}
}
elif metric_extended in current_result_dict and result_name not in current_result_dict[metric_extended]:
current_result_dict[metric_extended][result_name] = {
nested_result_name: {secondnested_result_name: metric_value}
}
elif (
metric_extended in current_result_dict
and result_name in current_result_dict[metric_extended]
and nested_result_name not in current_result_dict[metric_extended][result_name]
):
current_result_dict[metric_extended][result_name][nested_result_name] = {
secondnested_result_name: metric_value
}
else:
(
current_result_dict[metric_extended][result_name][nested_result_name][secondnested_result_name]
) = metric_value


def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metric_list: List[str]) -> str:
"""
Get the metric name from the testing criteria and metric key.
Expand All @@ -2058,6 +2160,16 @@ def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metri
:rtype: str
"""
metric = None

if metric_key == "xpia_manipulated_content":
metric = "xpia_manipulated_content"
return metric
elif metric_key == "xpia_intrusion":
metric = "xpia_intrusion"
return metric
elif metric_key == "xpia_information_gathering":
metric = "xpia_information_gathering"
return metric
for expected_metric in metric_list:
if metric_key.startswith(expected_metric):
metric = expected_metric
Expand Down Expand Up @@ -2124,9 +2236,16 @@ def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logge

# Extract usage statistics from aoai_result.sample
sample_data_list = []
dup_usage_list = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"].copy()
dup_usage_list.remove("xpia")
if isinstance(aoai_result, dict) and aoai_result["results"] and isinstance(aoai_result["results"], list):
for result_item in aoai_result["results"]:
if isinstance(result_item, dict) and "sample" in result_item and result_item["sample"]:
if (
isinstance(result_item, dict)
and "sample" in result_item
and result_item["sample"]
and result_item["metric"] not in dup_usage_list
):
sample_data_list.append(result_item["sample"])

for sample_data in sample_data_list:
Expand Down