Skip to content
Merged
Changes from all commits
Commits
Show all changes
74 commits
Select commit Hold shift + click to select a range
980e2fa
add eval result converter
YoYoJa Oct 3, 2025
57c73b8
Add result converter
YoYoJa Oct 6, 2025
1730b17
update converter params to optional
YoYoJa Oct 6, 2025
3bf93f7
add eval meta data
YoYoJa Oct 7, 2025
e47639d
merge converter change to include eval_meta_data
YoYoJa Oct 7, 2025
5b198b4
fix type
YoYoJa Oct 8, 2025
5fbbabe
remove useless file
YoYoJa Oct 8, 2025
6ca31a1
get eval meta data as input
YoYoJa Oct 8, 2025
ea93d1a
fix build errors
YoYoJa Oct 8, 2025
e6a9caa
remove useless import
YoYoJa Oct 8, 2025
f24f0e0
resolve comments
YoYoJa Oct 8, 2025
0abddb0
update
YoYoJa Oct 8, 2025
518b4af
update comments
YoYoJa Oct 8, 2025
74dfddc
merge otel
YoYoJa Oct 8, 2025
15d881e
merge result converter
YoYoJa Oct 8, 2025
5c44f70
fix checker failure
YoYoJa Oct 9, 2025
2ce023e
add error msg and error code
YoYoJa Oct 9, 2025
654e28f
merge main
YoYoJa Oct 9, 2025
32aad08
Surface evaluator error msg
YoYoJa Oct 10, 2025
d1449f5
surface out error
YoYoJa Oct 10, 2025
47d20e3
update
YoYoJa Oct 10, 2025
5cee7e4
update UT
YoYoJa Oct 10, 2025
9256912
fix usage
YoYoJa Oct 10, 2025
0ff811b
fix usage
YoYoJa Oct 10, 2025
18b0db2
resolve conflict
YoYoJa Oct 10, 2025
36b0761
merge type update
YoYoJa Oct 10, 2025
59b0aab
make eval_meta_data optional
YoYoJa Oct 12, 2025
d4d768c
remove useless lines
YoYoJa Oct 12, 2025
7de4cd6
update param name to add underscore
YoYoJa Oct 14, 2025
cb88b43
merge remote
YoYoJa Oct 14, 2025
28d5d17
parse updated annotation results
YoYoJa Oct 16, 2025
eab85ca
merge remote
YoYoJa Oct 16, 2025
74a39b9
update trace_id
YoYoJa Oct 16, 2025
31458fe
merge remote
YoYoJa Oct 16, 2025
cb1cc34
expose sample data for sdk evaluators
YoYoJa Oct 16, 2025
189dd83
update
YoYoJa Oct 16, 2025
ad4ae74
Fix column mapping bug for AOAI evaluators with custom data mapping (…
ebwinters Oct 16, 2025
a9416a8
merge and resolve conflict
YoYoJa Oct 16, 2025
bfb61df
Modify logic for message body on Microsoft.ApplicationInsights.Messag…
rads-1996 Oct 17, 2025
0caa472
Set-VcpkgWriteModeCache -- add token timeout param for cmake generate…
azure-sdk Oct 17, 2025
3cc294c
update
YoYoJa Oct 17, 2025
26c8a53
update
YoYoJa Oct 17, 2025
d2e40f9
fix UT
YoYoJa Oct 17, 2025
4a0a86c
remove print
YoYoJa Oct 17, 2025
3664e9a
merge remote
YoYoJa Oct 17, 2025
fa3ae5c
merge remote
YoYoJa Oct 17, 2025
e386fed
fix tests
YoYoJa Oct 17, 2025
378dc67
Added Tests and Samples for Paginated Queries (#43472)
andrewmathew1 Oct 17, 2025
df111e1
[Test Proxy] Support AARCH64 platform (#43428)
mccoyp Oct 17, 2025
7f667d1
Delete doc/dev/how_to_request_a_feature_in_sdk.md (#43415)
msyyc Oct 20, 2025
023bff7
fix test
YoYoJa Oct 20, 2025
244b530
merge remote
YoYoJa Oct 20, 2025
ae645e9
[AutoRelease] t2-iothub-2025-10-03-03336(can only be merged by SDK ow…
azure-sdk Oct 20, 2025
9411aab
[AutoRelease] t2-redisenterprise-2025-10-17-18412(can only be merged …
azure-sdk Oct 20, 2025
8294a71
Extend basic test for "project_client.agents" to do more operations (…
dargilco Oct 20, 2025
727f0b9
Sync eng/common directory with azure-sdk-tools for PR 12478 (#43457)
azure-sdk Oct 20, 2025
7209d25
Reorder error and warning log line processing (#43456)
azure-sdk Oct 20, 2025
5705d5d
[App Configuration] - Release 1.7.2 (#43520)
zhiyuanliang-ms Oct 20, 2025
0735aa8
Modify CODEOWNERS for Azure SDK ownership changes (#43524)
mrm9084 Oct 20, 2025
ce3af31
merge main
YoYoJa Oct 20, 2025
2c429df
Migrate Confidential Ledger library from swagger to typespec codegen …
catalinaperalta Oct 20, 2025
f37a4c8
update scripts (#43527)
azure-sdk Oct 20, 2025
b277c60
[AutoPR azure-mgmt-mongocluster]-generated-from-SDK Generation - Pyth…
azure-sdk Oct 20, 2025
5ab4faa
App Configuration Provider - Key Vault Refresh (#41882)
mrm9084 Oct 20, 2025
6ed4352
Increment package version after release of azure-appconfiguration (#4…
azure-sdk Oct 20, 2025
ed2067a
Patch `azure-template` back to `green` (#43533)
scbedd Oct 20, 2025
033110b
added brackets for sql query keyword value (#43525)
andrewmathew1 Oct 20, 2025
9a00094
update changelog (#43532)
catalinaperalta Oct 20, 2025
2037678
App Config Provider - Provider Refactor (#43196)
mrm9084 Oct 20, 2025
ef92791
merge master and fix bug
YoYoJa Oct 21, 2025
c40feed
merge remote
YoYoJa Oct 21, 2025
04912f3
merge main
YoYoJa Oct 21, 2025
7450510
merge remote
YoYoJa Oct 21, 2025
fe2779d
fix bug
YoYoJa Oct 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1736,6 +1736,8 @@ def _convert_results_to_aoai_evaluation_results(
criteria_type = criteria_name_types_from_meta[criteria_name].get("type", None)
evaluator_name = criteria_name_types_from_meta[criteria_name].get("evaluator_name", None)
if evaluator_name:
if criteria_type=="azure_ai_evaluator" and evaluator_name.startswith("builtin."):
evaluator_name = evaluator_name.replace("builtin.", "")
metrics_mapped = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(evaluator_name, [])
if metrics_mapped and len(metrics_mapped) > 0:
metrics.extend(metrics_mapped)
Expand Down Expand Up @@ -1798,6 +1800,9 @@ def _convert_results_to_aoai_evaluation_results(
result_per_metric[metric] = {"score": metric_value}
else:
result_per_metric[metric]["score"] = metric_value
_append_indirect_attachments_to_results(
result_per_metric, "score", metric, metric_value
)
elif metric_key.endswith("_result") or metric_key == "result" or metric_key.endswith("_label"):
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
label = metric_value
Expand All @@ -1809,6 +1814,12 @@ def _convert_results_to_aoai_evaluation_results(
else:
result_per_metric[metric]["label"] = metric_value
result_per_metric[metric]["passed"] = passed
_append_indirect_attachments_to_results(
result_per_metric, "label", metric, label
)
_append_indirect_attachments_to_results(
result_per_metric, "passed", metric, passed
)
elif (
metric_key.endswith("_reason") and not metric_key.endswith("_finish_reason")
) or metric_key == "reason":
Expand All @@ -1817,18 +1828,27 @@ def _convert_results_to_aoai_evaluation_results(
result_per_metric[metric] = {"reason": metric_value}
else:
result_per_metric[metric]["reason"] = metric_value
_append_indirect_attachments_to_results(
result_per_metric, "reason", metric, metric_value
)
elif metric_key.endswith("_threshold") or metric_key == "threshold":
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
if metric not in result_per_metric:
result_per_metric[metric] = {"threshold": metric_value}
else:
result_per_metric[metric]["threshold"] = metric_value
_append_indirect_attachments_to_results(
result_per_metric, "threshold", metric, metric_value
)
elif metric_key == "sample":
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
if metric not in result_per_metric:
result_per_metric[metric] = {"sample": metric_value}
else:
result_per_metric[metric]["sample"] = metric_value
_append_indirect_attachments_to_results(
result_per_metric, "sample", metric, metric_value
)
elif metric_key.endswith("_finish_reason"):
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
if metric not in result_per_metric:
Expand All @@ -1841,6 +1861,9 @@ def _convert_results_to_aoai_evaluation_results(
and "finish_reason" not in result_per_metric[metric]["sample"]
):
result_per_metric[metric]["sample"]["finish_reason"] = metric_value
_append_indirect_attachments_to_results(
result_per_metric, "sample", metric, metric_value, "finish_reason"
)
elif metric_key.endswith("_model"):
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
if metric not in result_per_metric:
Expand All @@ -1853,6 +1876,9 @@ def _convert_results_to_aoai_evaluation_results(
and "model" not in result_per_metric[metric]["sample"]
):
result_per_metric[metric]["sample"]["model"] = metric_value
_append_indirect_attachments_to_results(
result_per_metric, "sample", metric, metric_value, "model"
)
elif metric_key.endswith("_sample_input"):
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
input_metric_val_json: Optional[List[Dict[str, Any]]] = []
Expand All @@ -1870,6 +1896,9 @@ def _convert_results_to_aoai_evaluation_results(
and "input" not in result_per_metric[metric]["sample"]
):
result_per_metric[metric]["sample"]["input"] = input_metric_val_json
_append_indirect_attachments_to_results(
result_per_metric, "sample", metric, input_metric_val_json, "input"
)
elif metric_key.endswith("_sample_output"):
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
output_metric_val_json: Optional[List[Dict[str, Any]]] = []
Expand All @@ -1887,6 +1916,9 @@ def _convert_results_to_aoai_evaluation_results(
and "output" not in result_per_metric[metric]["sample"]
):
result_per_metric[metric]["sample"]["output"] = output_metric_val_json
_append_indirect_attachments_to_results(
result_per_metric, "sample", metric, output_metric_val_json, "output"
)
elif metric_key.endswith("_total_tokens"):
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
if metric not in result_per_metric:
Expand All @@ -1901,6 +1933,9 @@ def _convert_results_to_aoai_evaluation_results(
result_per_metric[metric]["sample"]["usage"] = {"total_tokens": metric_value}
else:
result_per_metric[metric]["sample"]["usage"]["total_tokens"] = metric_value
_append_indirect_attachments_to_results(
result_per_metric, "sample", metric, metric_value, "usage", "total_tokens"
)
elif metric_key.endswith("_prompt_tokens"):
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
if metric not in result_per_metric:
Expand All @@ -1915,6 +1950,9 @@ def _convert_results_to_aoai_evaluation_results(
result_per_metric[metric]["sample"]["usage"] = {"prompt_tokens": metric_value}
else:
result_per_metric[metric]["sample"]["usage"]["prompt_tokens"] = metric_value
_append_indirect_attachments_to_results(
result_per_metric, "sample", metric, metric_value, "usage", "prompt_tokens"
)
elif metric_key.endswith("_completion_tokens"):
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
if metric not in result_per_metric:
Expand All @@ -1929,6 +1967,9 @@ def _convert_results_to_aoai_evaluation_results(
result_per_metric[metric]["sample"]["usage"] = {"completion_tokens": metric_value}
else:
result_per_metric[metric]["sample"]["usage"]["completion_tokens"] = metric_value
_append_indirect_attachments_to_results(
result_per_metric, "sample", metric, metric_value, "usage", "completion_tokens"
)
elif not any(
metric_key.endswith(suffix)
for suffix in [
Expand Down Expand Up @@ -1970,6 +2011,18 @@ def _convert_results_to_aoai_evaluation_results(
"metric": metric if metric is not None else criteria_name, # Use criteria name as metric
}
# Add optional fields
if(metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"]
or metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["code_vulnerability"]
or metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["protected_material"]):
copy_label = label
if copy_label is not None and isinstance(copy_label, bool) and copy_label == True:
label = "fail"
score = 0.0
passed = False
else:
label = "pass"
score = 1.0
passed = True
result_obj["score"] = score
result_obj["label"] = label
result_obj["reason"] = reason
Expand Down Expand Up @@ -2043,6 +2096,65 @@ def _convert_results_to_aoai_evaluation_results(
f"Summary statistics calculated for {len(converted_rows)} rows, eval_id: {eval_id}, eval_run_id: {eval_run_id}"
)

def _append_indirect_attachments_to_results(current_result_dict: Dict[str, Any],
result_name: str,
metric: str,
metric_value: Any,
nested_result_name: Optional[str] = None,
secondnested_result_name: Optional[str] = None) -> None:
"""
Append indirect attachments to the current result dictionary.

:param current_result_dict: The current result dictionary to update
:type current_result_dict: Dict[str, Any]
:param result_name: The result name
:type result_name: str
:param metric: The metric name
:type metric: str
:param metric_value: The value of the metric
:type metric_value: Any
"""
if metric == "xpia" and result_name:
for metric_extended in ["xpia_manipulated_content", "xpia_intrusion", "xpia_information_gathering"]:
if nested_result_name is None:
if metric_extended not in current_result_dict:
current_result_dict[metric_extended] = { result_name: metric_value }
else:
current_result_dict[metric_extended][result_name] = metric_value
elif nested_result_name is not None and secondnested_result_name is None:
if metric_extended not in current_result_dict:
current_result_dict[metric_extended] = {result_name : {nested_result_name: metric_value}}
elif (metric_extended in current_result_dict
and result_name not in current_result_dict[metric_extended]
):
current_result_dict[metric_extended][result_name] = {nested_result_name: metric_value}
elif (
metric_extended in current_result_dict
and result_name in current_result_dict[metric_extended]
and nested_result_name not in current_result_dict[metric_extended][result_name]
):
current_result_dict[metric_extended][result_name][nested_result_name] = metric_value
elif nested_result_name is not None and secondnested_result_name is not None:
if metric_extended not in current_result_dict:
current_result_dict[metric_extended] = {
result_name: {nested_result_name: {secondnested_result_name: metric_value}}
}
elif (metric_extended in current_result_dict
and result_name not in current_result_dict[metric_extended]
):
current_result_dict[metric_extended][result_name] = {
nested_result_name: {secondnested_result_name: metric_value}
}
elif (
metric_extended in current_result_dict
and result_name in current_result_dict[metric_extended]
and nested_result_name not in current_result_dict[metric_extended][result_name]
):
current_result_dict[metric_extended][result_name][nested_result_name] = {
secondnested_result_name: metric_value
}
else:
current_result_dict[metric_extended][result_name][nested_result_name][secondnested_result_name] = metric_value

def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metric_list: List[str]) -> str:
"""
Expand All @@ -2058,6 +2170,16 @@ def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metri
:rtype: str
"""
metric = None

if metric_key == "xpia_manipulated_content":
metric = "xpia_manipulated_content"
return metric
elif metric_key == "xpia_intrusion":
metric = "xpia_intrusion"
return metric
elif metric_key == "xpia_information_gathering":
metric = "xpia_information_gathering"
return metric
for expected_metric in metric_list:
if metric_key.startswith(expected_metric):
metric = expected_metric
Expand Down Expand Up @@ -2124,9 +2246,12 @@ def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logge

# Extract usage statistics from aoai_result.sample
sample_data_list = []
dup_usage_list = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"].copy()
dup_usage_list.remove("xpia")
if isinstance(aoai_result, dict) and aoai_result["results"] and isinstance(aoai_result["results"], list):
for result_item in aoai_result["results"]:
if isinstance(result_item, dict) and "sample" in result_item and result_item["sample"]:
if (isinstance(result_item, dict) and "sample" in result_item and result_item["sample"]
and result_item["metric"] not in dup_usage_list):
sample_data_list.append(result_item["sample"])

for sample_data in sample_data_list:
Expand Down
Loading