diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py index 766779f179d3..618509ede551 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- +import json import logging import re @@ -60,6 +61,7 @@ def _split_evaluators_and_grader_configs( :return: Tuple of two dictionaries, the first containing evaluators and the second containing AOAI graders. :rtype: Tuple[Dict[str, Callable], Dict[str, AoaiGrader]] """ + LOGGER.info(f"AOAI: Splitting {len(evaluators)} evaluators into AOAI graders and standard evaluators...") true_evaluators = {} aoai_graders = {} for key, value in evaluators.items(): @@ -67,6 +69,7 @@ def _split_evaluators_and_grader_configs( aoai_graders[key] = value else: true_evaluators[key] = value + LOGGER.info(f"AOAI: Found {len(aoai_graders)} AOAI graders and {len(true_evaluators)} standard evaluators.") return true_evaluators, aoai_graders @@ -103,11 +106,18 @@ def _begin_aoai_evaluation( LOGGER.info("AOAI: Aoai graders detected among evaluator inputs. Preparing to create OAI eval group...") all_eval_run_info: List[OAIEvalRunCreationInfo] = [] - for selected_graders, selected_column_mapping in _get_graders_and_column_mappings(graders, column_mappings): + grader_mapping_list = list(_get_graders_and_column_mappings(graders, column_mappings)) + LOGGER.info(f"AOAI: Will create {len(grader_mapping_list)} separate evaluation run(s) based on column mappings.") + + for idx, (selected_graders, selected_column_mapping) in enumerate(grader_mapping_list): + LOGGER.info( + f"AOAI: Starting evaluation run {idx + 1}/{len(grader_mapping_list)} with {len(selected_graders)} grader(s)..." + ) all_eval_run_info.append( _begin_single_aoai_evaluation(selected_graders, data, selected_column_mapping, run_name) ) + LOGGER.info(f"AOAI: Successfully created {len(all_eval_run_info)} evaluation run(s).") return all_eval_run_info @@ -133,6 +143,7 @@ def _begin_single_aoai_evaluation( """ # Format data for eval group creation + LOGGER.info(f"AOAI: Preparing evaluation for {len(graders)} grader(s): {list(graders.keys())}") grader_name_list = [] grader_list = [] # It's expected that all graders supplied for a single eval run use the same credentials @@ -143,10 +154,12 @@ def _begin_single_aoai_evaluation( grader_name_list.append(name) grader_list.append(grader._grader_config) effective_column_mapping: Dict[str, str] = column_mapping or {} + LOGGER.info(f"AOAI: Generating data source config with {len(effective_column_mapping)} column mapping(s)...") data_source_config = _generate_data_source_config(data, effective_column_mapping) + LOGGER.info(f"AOAI: Data source config generated with schema type: {data_source_config.get('type')}") # Create eval group - # import pdb; pdb.set_trace() + LOGGER.info(f"AOAI: Creating eval group with {len(grader_list)} testing criteria...") eval_group_info = client.evals.create( data_source_config=data_source_config, testing_criteria=grader_list, metadata={"is_foundry_eval": "true"} ) @@ -167,6 +180,7 @@ def _begin_single_aoai_evaluation( grader_name_map[criteria.id] = name # Create eval run + LOGGER.info(f"AOAI: Creating eval run '{run_name}' with {len(data)} data rows...") eval_run_id = _begin_eval_run(client, eval_group_info.id, run_name, data, effective_column_mapping) LOGGER.info( f"AOAI: Eval run created with id {eval_run_id}." @@ -197,13 +211,16 @@ def _get_evaluation_run_results(all_run_info: List[OAIEvalRunCreationInfo]) -> T :raises EvaluationException: If the evaluation run fails or is not completed before timing out. """ + LOGGER.info(f"AOAI: Retrieving results from {len(all_run_info)} evaluation run(s)...") run_metrics = {} output_df = pd.DataFrame() - for run_info in all_run_info: + for idx, run_info in enumerate(all_run_info): + LOGGER.info(f"AOAI: Fetching results for run {idx + 1}/{len(all_run_info)} (ID: {run_info['eval_run_id']})...") cur_output_df, cur_run_metrics = _get_single_run_results(run_info) output_df = pd.concat([output_df, cur_output_df], axis=1) run_metrics.update(cur_run_metrics) + LOGGER.info(f"AOAI: Successfully retrieved all results. Combined dataframe shape: {output_df.shape}") return output_df, run_metrics @@ -223,8 +240,10 @@ def _get_single_run_results( :raises EvaluationException: If the evaluation run fails or is not completed before timing out. """ # Wait for evaluation run to complete + LOGGER.info(f"AOAI: Waiting for eval run {run_info['eval_run_id']} to complete...") run_results = _wait_for_run_conclusion(run_info["client"], run_info["eval_group_id"], run_info["eval_run_id"]) + LOGGER.info(f"AOAI: Eval run {run_info['eval_run_id']} completed with status: {run_results.status}") if run_results.status != "completed": raise EvaluationException( message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}" @@ -235,6 +254,7 @@ def _get_single_run_results( ) # Convert run results into a dictionary of metrics + LOGGER.info(f"AOAI: Processing results and calculating metrics for run {run_info['eval_run_id']}...") run_metrics: Dict[str, Any] = {} if run_results.per_testing_criteria_results is None: msg = ( @@ -255,8 +275,10 @@ def _get_single_run_results( ratio = passed / (passed + failed) if (passed + failed) else 0.0 formatted_column_name = f"{grader_name}.pass_rate" run_metrics[formatted_column_name] = ratio + LOGGER.info(f"AOAI: Grader '{grader_name}': {passed} passed, {failed} failed, pass_rate={ratio:.4f}") # Collect all results with pagination + LOGGER.info(f"AOAI: Collecting output items for run {run_info['eval_run_id']} with pagination...") all_results: List[Any] = [] next_cursor: Optional[str] = None limit = 100 # Max allowed by API @@ -280,6 +302,7 @@ def _get_single_run_results( else: break + LOGGER.info(f"AOAI: Collected {len(all_results)} total output items across all pages.") listed_results: Dict[str, List[Any]] = {"index": []} # Raw data has no order guarantees; capture datasource_item_id per row for ordering. for row_result in all_results: @@ -329,6 +352,7 @@ def _get_single_run_results( # Ensure all columns are the same length as the 'index' list num_rows = len(listed_results["index"]) + LOGGER.info(f"AOAI: Processing {num_rows} result rows into dataframe...") for col_name in list(listed_results.keys()): if col_name != "index": col_length = len(listed_results[col_name]) @@ -356,6 +380,7 @@ def _get_single_run_results( expected = run_info.get("expected_rows", None) if expected is not None: pre_len = len(output_df) + LOGGER.info(f"AOAI: Validating result count: expected {expected} rows, received {pre_len} rows.") # Assumes original datasource_item_id space is 0..expected-1 output_df = output_df.reindex(range(expected)) if pre_len != expected: @@ -388,6 +413,9 @@ def _get_single_run_results( # Reset to RangeIndex so downstream concatenation aligns on position output_df.reset_index(drop=True, inplace=True) + LOGGER.info( + f"AOAI: Successfully processed run {run_info['eval_run_id']} with final dataframe shape: {output_df.shape}" + ) return output_df, run_metrics @@ -481,11 +509,16 @@ def _get_graders_and_column_mappings( :rtype: List[Tuple[Dict[str, AoaiGrader], Optional[Dict[str, str]]]] """ + LOGGER.info(f"AOAI: Organizing {len(graders)} graders with column mappings...") if column_mappings is None: + LOGGER.info("AOAI: No column mappings provided, each grader will have its own eval run.") return [({name: grader}, None) for name, grader in graders.items()] default_mapping = column_mappings.get("default", None) if default_mapping is None: default_mapping = {} + LOGGER.info( + f"AOAI: Using default mapping with {len(default_mapping)} entries for graders without specific mappings." + ) return [ ({name: grader}, None if column_mappings is None else column_mappings.get(name, default_mapping)) for name, grader in graders.items() @@ -593,17 +626,23 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di helper function. """ # Extract referenced data paths from mapping values of the form ${data.} (ignore ${run.outputs.*}) + LOGGER.info( + f"AOAI: Generating data source config for {len(input_data_df)} rows with {len(column_mapping)} column mapping(s)..." + ) referenced_paths: List[str] = [] for v in column_mapping.values(): m = DATA_PATH_PATTERN.match(v) if m: referenced_paths.append(m.group(1)) + LOGGER.info(f"AOAI: Found {len(referenced_paths)} referenced paths in column mappings: {referenced_paths}") # Decide if we have nested structures has_nested = any("." in p for p in referenced_paths) + LOGGER.info(f"AOAI: Schema generation mode: {'nested' if has_nested else 'flat'}") if not referenced_paths or not has_nested: # Legacy flat behavior (existing logic): treat each mapping key as independent string field + LOGGER.info("AOAI: Using flat schema generation (no nested structures detected).") data_source_config = { "type": "custom", "item_schema": { @@ -617,6 +656,7 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di for key in column_mapping.keys(): props[key] = {"type": "string"} req.append(key) + LOGGER.info(f"AOAI: Flat schema generated with {len(props)} properties: {list(props.keys())}") return data_source_config # NEW: If all nested paths share the same first segment (e.g. 'item'), @@ -625,12 +665,14 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di first_segments = {p.split(".")[0] for p in referenced_paths} strip_wrapper = False wrapper_name = None + LOGGER.info(f"AOAI: First segments in referenced paths: {first_segments}") if len(first_segments) == 1: only_seg = next(iter(first_segments)) # We only strip if that segment looks like the canonical wrapper. if only_seg == WRAPPER_KEY: strip_wrapper = True wrapper_name = only_seg + LOGGER.info(f"AOAI: All paths start with wrapper '{WRAPPER_KEY}', will strip from schema.") effective_paths = referenced_paths if strip_wrapper: @@ -645,9 +687,12 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di # If stripping produced at least one usable path, adopt; else fall back to original. if stripped: effective_paths = stripped + LOGGER.info(f"AOAI: Effective paths after stripping wrapper: {effective_paths}") + LOGGER.info(f"AOAI: Building nested schema from {len(effective_paths)} effective paths...") nested_schema = _build_schema_tree_from_paths(effective_paths, force_leaf_type="string") + LOGGER.info(f"AOAI: Nested schema generated successfully with type '{nested_schema.get('type')}'") return { "type": "custom", "item_schema": nested_schema, @@ -697,6 +742,23 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str] :return: A dictionary that can be used as the data source input for an OAI evaluation run. :rtype: Dict[str, Any] """ + + def _convert_value_to_string(val: Any) -> str: + """Convert a value to string representation for AOAI evaluation.""" + if val is None: + return "" + elif isinstance(val, (str, int, float, bool)): + return str(val) + else: + try: # Attempt to JSON serialize lists/dicts + return json.dumps(val, ensure_ascii=False) + except (TypeError, ValueError): + # Fallback for unserializable objects + return str(val) + + LOGGER.info( + f"AOAI: Building data source from {len(input_data_df)} rows with {len(column_mapping)} column mappings..." + ) # Gather path specs: list of tuples (original_mapping_value, relative_parts, dataframe_column_name) # relative_parts excludes the wrapper (so schema + content align). path_specs: List[Tuple[str, List[str], str]] = [] @@ -746,24 +808,21 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str] leaf_name = pieces[-1] path_specs.append((formatted_entry, [leaf_name], run_col)) + LOGGER.info(f"AOAI: Processed {len(path_specs)} path specifications from column mappings.") content: List[Dict[str, Any]] = [] for _, row in input_data_df.iterrows(): item_root: Dict[str, Any] = {} + # Track which dataframe columns have been processed via column_mapping + processed_cols: Set[str] = set() + for _, rel_parts, df_col in path_specs: # Safely fetch value val = row.get(df_col, None) # Convert value to string to match schema's "type": "string" leaves. - # (If you later infer types, you can remove the stringify.) - if val is None: - str_val = "" - elif isinstance(val, (str, int, float, bool)): - str_val = str(val) - else: - # Lists / dicts / other -> string for now - str_val = str(val) + str_val = _convert_value_to_string(val) # Insert into nested dict cursor = item_root @@ -776,8 +835,19 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str] leaf_key = rel_parts[-1] cursor[leaf_key] = str_val + # Mark this dataframe column as processed + processed_cols.add(df_col) + + # Add any unmapped dataframe columns directly to item_root + for col_name in input_data_df.columns: + if col_name not in processed_cols: + val = row.get(col_name, None) + str_val = _convert_value_to_string(val) + item_root[col_name] = str_val + content.append({WRAPPER_KEY: item_root}) + LOGGER.info(f"AOAI: Generated {len(content)} content items for data source.") return { "type": "jsonl", "source": { @@ -812,6 +882,7 @@ def _begin_eval_run( :rtype: str """ + LOGGER.info(f"AOAI: Creating eval run '{run_name}' for eval group {eval_group_id}...") data_source = _get_data_source(input_data_df, column_mapping) eval_run = client.evals.runs.create( eval_id=eval_group_id, @@ -820,6 +891,7 @@ def _begin_eval_run( metadata={"sample_generation": "off", "file_format": "jsonl", "is_foundry_eval": "true"}, # TODO decide if we want to add our own timeout value? ) + LOGGER.info(f"AOAI: Eval run created successfully with ID: {eval_run.id}") return eval_run.id @@ -856,8 +928,11 @@ def _wait_for_run_conclusion( if total_wait > max_wait_seconds: wait_interval -= total_wait - max_wait_seconds sleep(wait_interval) + iters += 1 response = client.evals.runs.retrieve(eval_id=eval_group_id, run_id=eval_run_id) + LOGGER.info(f"AOAI: Polling iteration {iters}, status: {response.status}, total wait: {total_wait:.1f}s") if response.status not in ["queued", "in_progress"]: + LOGGER.info(f"AOAI: Eval run {eval_run_id} reached terminal status: {response.status}") return response if total_wait > max_wait_seconds: raise EvaluationException( diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/flat_test_data.jsonl b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/flat_test_data.jsonl new file mode 100644 index 000000000000..1ae330125fc8 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/flat_test_data.jsonl @@ -0,0 +1,3 @@ +{"query": "What is the capital of France?", "response": "Paris is the capital of France.", "ground_truth": "Paris"} +{"query": "What is 2+2?", "response": "The answer is 4.", "ground_truth": "4"} +{"query": "Who wrote Hamlet?", "response": "William Shakespeare wrote Hamlet.", "ground_truth": "Shakespeare"} diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/nested_test_data.jsonl b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/nested_test_data.jsonl new file mode 100644 index 000000000000..17e785c16b5d --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/nested_test_data.jsonl @@ -0,0 +1,3 @@ +{"item": {"query": "What security policies exist?", "context": {"company": {"policy": {"security": {"passwords": {"rotation_days": 90, "min_length": 12}, "network": {"vpn": {"required": true, "provider": "Cisco"}}}}}}, "response": "Password rotation is required every 90 days with minimum 12 characters. VPN is required using Cisco provider.", "ground_truth": "Security policies include password rotation every 90 days and VPN requirement."}} +{"item": {"query": "What are the database settings?", "context": {"company": {"infrastructure": {"database": {"host": "db.example.com", "port": 5432, "type": "PostgreSQL"}}}}, "response": "The database is PostgreSQL hosted at db.example.com on port 5432.", "ground_truth": "PostgreSQL database on db.example.com:5432"}} +{"item": {"query": "What is the deployment process?", "context": {"company": {"devops": {"deployment": {"strategy": "blue-green", "frequency": "daily", "tools": ["Jenkins", "Kubernetes"]}}}}, "response": "We use blue-green deployment strategy daily with Jenkins and Kubernetes.", "ground_truth": "Blue-green deployment daily using Jenkins and Kubernetes"}} diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/wrapped_flat_test_data.jsonl b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/wrapped_flat_test_data.jsonl new file mode 100644 index 000000000000..195bd72ae4e7 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/wrapped_flat_test_data.jsonl @@ -0,0 +1,2 @@ +{"item": {"query": "Simple query", "response": "Simple response", "ground_truth": "Simple truth"}} +{"item": {"query": "Another query", "response": "Another response", "ground_truth": "Another truth"}} diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py new file mode 100644 index 000000000000..c74991160e9d --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py @@ -0,0 +1,510 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ + +import pytest +import pandas as pd +import os +import pathlib +from typing import Dict, Any + +from azure.ai.evaluation._evaluate._evaluate_aoai import ( + _generate_data_source_config, + _get_data_source, + _build_schema_tree_from_paths, + WRAPPER_KEY, +) + + +def _get_file(name): + """Get the file from the unittest data folder.""" + data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data") + return os.path.join(data_path, name) + + +@pytest.fixture +def flat_test_data(): + """Fixture for flat structure test data.""" + return pd.DataFrame( + [ + { + "query": "What is the capital of France?", + "response": "Paris is the capital of France.", + "ground_truth": "Paris", + }, + {"query": "What is 2+2?", "response": "The answer is 4.", "ground_truth": "4"}, + { + "query": "Who wrote Hamlet?", + "response": "William Shakespeare wrote Hamlet.", + "ground_truth": "Shakespeare", + }, + ] + ) + + +@pytest.fixture +def nested_test_data(): + """Fixture for nested structure test data.""" + return pd.DataFrame( + [ + { + "item.query": "What security policies exist?", + "item.context.company.policy.security.passwords.rotation_days": "90", + "item.context.company.policy.security.network.vpn.required": "true", + "item.response": "Password rotation is required every 90 days.", + "item.ground_truth": "Security policies include password rotation.", + }, + { + "item.query": "What are the database settings?", + "item.context.company.infrastructure.database.host": "db.example.com", + "item.context.company.infrastructure.database.port": "5432", + "item.response": "The database is PostgreSQL.", + "item.ground_truth": "PostgreSQL database", + }, + ] + ) + + +@pytest.fixture +def flat_test_data_file(): + """Fixture for flat test data file path.""" + return _get_file("flat_test_data.jsonl") + + +@pytest.fixture +def nested_test_data_file(): + """Fixture for nested test data file path.""" + return _get_file("nested_test_data.jsonl") + + +@pytest.fixture +def wrapped_flat_test_data_file(): + """Fixture for wrapped flat test data file path.""" + return _get_file("wrapped_flat_test_data.jsonl") + + +@pytest.mark.unittest +class TestBuildSchemaTreeFromPaths: + """Test suite for the _build_schema_tree_from_paths helper function.""" + + def test_single_level_paths(self): + """Test building schema with single-level paths.""" + paths = ["query", "response", "ground_truth"] + schema = _build_schema_tree_from_paths(paths, force_leaf_type="string") + + assert schema["type"] == "object" + assert "properties" in schema + assert "required" in schema + assert set(schema["properties"].keys()) == {"query", "response", "ground_truth"} + assert all(prop["type"] == "string" for prop in schema["properties"].values()) + assert set(schema["required"]) == {"query", "response", "ground_truth"} + + def test_nested_paths(self): + """Test building schema with nested paths.""" + paths = [ + "context.company.policy.security.passwords.rotation_days", + "context.company.policy.security.network.vpn.required", + "query", + ] + schema = _build_schema_tree_from_paths(paths, force_leaf_type="string") + + assert schema["type"] == "object" + assert "context" in schema["properties"] + assert schema["properties"]["context"]["type"] == "object" + + # Navigate nested structure + company = schema["properties"]["context"]["properties"]["company"] + assert company["type"] == "object" + + policy = company["properties"]["policy"] + assert policy["type"] == "object" + + security = policy["properties"]["security"] + assert security["type"] == "object" + + # Check leaf nodes + passwords = security["properties"]["passwords"] + assert passwords["properties"]["rotation_days"]["type"] == "string" + + network = security["properties"]["network"] + assert network["properties"]["vpn"]["properties"]["required"]["type"] == "string" + + # Check required arrays exist at each level + assert "required" in schema + assert "required" in schema["properties"]["context"] + + def test_empty_paths(self): + """Test building schema with empty paths list.""" + paths = [] + schema = _build_schema_tree_from_paths(paths, force_leaf_type="object") + + assert schema["type"] == "object" + + def test_mixed_depth_paths(self): + """Test building schema with paths of different depths.""" + paths = ["simple_field", "nested.field.deep", "nested.field.shallow", "another.path"] + schema = _build_schema_tree_from_paths(paths, force_leaf_type="string") + + assert "simple_field" in schema["properties"] + assert schema["properties"]["simple_field"]["type"] == "string" + + assert "nested" in schema["properties"] + nested = schema["properties"]["nested"] + assert nested["type"] == "object" + assert "field" in nested["properties"] + + +@pytest.mark.unittest +class TestGenerateDataSourceConfig: + """Test suite for the _generate_data_source_config function.""" + + def test_flat_column_mapping(self, flat_test_data): + """Test generating data source config with flat column mappings.""" + column_mapping = { + "query": "${data.query}", + "response": "${data.response}", + "ground_truth": "${data.ground_truth}", + } + + config = _generate_data_source_config(flat_test_data, column_mapping) + + assert config["type"] == "custom" + assert "item_schema" in config + assert config["item_schema"]["type"] == "object" + + properties = config["item_schema"]["properties"] + assert "query" in properties + assert "response" in properties + assert "ground_truth" in properties + + # All should be strings in flat mode + assert properties["query"]["type"] == "string" + assert properties["response"]["type"] == "string" + assert properties["ground_truth"]["type"] == "string" + + def test_nested_column_mapping_with_wrapper(self, nested_test_data): + """Test generating data source config with nested paths under wrapper.""" + column_mapping = { + "query": "${data.item.query}", + "passwords_rotation": "${data.item.context.company.policy.security.passwords.rotation_days}", + "vpn_required": "${data.item.context.company.policy.security.network.vpn.required}", + "response": "${data.item.response}", + } + + config = _generate_data_source_config(nested_test_data, column_mapping) + + assert config["type"] == "custom" + assert "item_schema" in config + schema = config["item_schema"] + + # Should be nested object since paths contain dots + assert schema["type"] == "object" + + # The wrapper should be stripped, so we should see inner structure + assert "query" in schema["properties"] + assert "response" in schema["properties"] + assert "context" in schema["properties"] + + # Verify nested structure + context = schema["properties"]["context"] + assert context["type"] == "object" + assert "company" in context["properties"] + + def test_nested_column_mapping_without_wrapper(self, nested_test_data): + """Test generating data source config with nested paths not using standard wrapper.""" + column_mapping = { + "query": "${data.custom.query}", + "field": "${data.custom.nested.field}", + } + + config = _generate_data_source_config(nested_test_data, column_mapping) + + assert config["type"] == "custom" + assert "item_schema" in config + schema = config["item_schema"] + + # Should be nested + assert schema["type"] == "object" + # Without wrapper stripping, should see 'custom' at top level + assert "custom" in schema["properties"] + + def test_mixed_data_and_run_outputs(self, flat_test_data): + """Test column mapping with both data and run.outputs references.""" + column_mapping = { + "query": "${data.query}", + "response": "${run.outputs.response}", + "ground_truth": "${data.ground_truth}", + } + + config = _generate_data_source_config(flat_test_data, column_mapping) + + # Only data.* paths should be in schema + properties = config["item_schema"]["properties"] + assert "query" in properties + assert "ground_truth" in properties + # run.outputs.response shouldn't create a schema property directly + + def test_empty_column_mapping(self, flat_test_data): + """Test with empty column mapping.""" + column_mapping = {} + + config = _generate_data_source_config(flat_test_data, column_mapping) + + # Should return flat schema with no properties + assert config["type"] == "custom" + assert config["item_schema"]["type"] == "object" + assert config["item_schema"]["properties"] == {} + + def test_no_data_references(self, flat_test_data): + """Test column mapping with no ${data.*} references.""" + column_mapping = {"response": "${run.outputs.response}", "result": "${run.outputs.result}"} + + config = _generate_data_source_config(flat_test_data, column_mapping) + + # Should return flat schema since no data paths referenced + assert config["type"] == "custom" + assert "response" in config["item_schema"]["properties"] + assert "result" in config["item_schema"]["properties"] + + def test_single_nested_path(self, flat_test_data): + """Test with a single nested path to ensure nested mode activates.""" + column_mapping = {"nested_field": "${data.item.context.field}"} + + config = _generate_data_source_config(flat_test_data, column_mapping) + + # Should generate nested schema + assert config["type"] == "custom" + schema = config["item_schema"] + assert schema["type"] == "object" + # After wrapper stripping, should see context + assert "context" in schema["properties"] + + +@pytest.mark.unittest +class TestGetDataSource: + """Test suite for the _get_data_source function.""" + + def test_flat_data_source_generation(self, flat_test_data): + """Test generating data source from flat data.""" + column_mapping = { + "query": "${data.query}", + "response": "${data.response}", + "ground_truth": "${data.ground_truth}", + } + + data_source = _get_data_source(flat_test_data, column_mapping) + + assert data_source["type"] == "jsonl" + assert "source" in data_source + assert data_source["source"]["type"] == "file_content" + + content = data_source["source"]["content"] + assert len(content) == 3 + + # Each item should be wrapped + for item in content: + assert WRAPPER_KEY in item + assert "query" in item[WRAPPER_KEY] + assert "response" in item[WRAPPER_KEY] + assert "ground_truth" in item[WRAPPER_KEY] + + def test_nested_data_source_generation(self, nested_test_data): + """Test generating data source from nested data.""" + column_mapping = { + "query": "${data.item.query}", + "rotation_days": "${data.item.context.company.policy.security.passwords.rotation_days}", + "vpn_required": "${data.item.context.company.policy.security.network.vpn.required}", + "response": "${data.item.response}", + } + + data_source = _get_data_source(nested_test_data, column_mapping) + + assert data_source["type"] == "jsonl" + content = data_source["source"]["content"] + assert len(content) == 2 + + # Verify nested structure is built correctly + first_item = content[0][WRAPPER_KEY] + assert "query" in first_item + assert "context" in first_item + assert "company" in first_item["context"] + assert "policy" in first_item["context"]["company"] + + # Check leaf values + passwords = first_item["context"]["company"]["policy"]["security"]["passwords"] + assert passwords["rotation_days"] == "90" + + vpn = first_item["context"]["company"]["policy"]["security"]["network"]["vpn"] + assert vpn["required"] == "true" + + def test_data_source_with_run_outputs(self, flat_test_data): + """Test data source generation with run.outputs mappings.""" + # Add __outputs column to simulate target function output + flat_test_data["__outputs.model_response"] = [ + "Generated response 1", + "Generated response 2", + "Generated response 3", + ] + + column_mapping = { + "query": "${data.query}", + "response": "${run.outputs.model_response}", + "ground_truth": "${data.ground_truth}", + } + + data_source = _get_data_source(flat_test_data, column_mapping) + + content = data_source["source"]["content"] + + # run.outputs should be mapped with just leaf name + for i, item in enumerate(content): + assert "model_response" in item[WRAPPER_KEY] + assert item[WRAPPER_KEY]["model_response"] == f"Generated response {i+1}" + + def test_data_source_with_unmapped_columns(self, flat_test_data): + """Test that unmapped columns are included in output.""" + # Add extra column not in mapping + flat_test_data["extra_field"] = ["extra1", "extra2", "extra3"] + + column_mapping = {"query": "${data.query}", "response": "${data.response}"} + + data_source = _get_data_source(flat_test_data, column_mapping) + + content = data_source["source"]["content"] + + # Unmapped columns should appear directly in item + for i, item in enumerate(content): + assert "extra_field" in item[WRAPPER_KEY] + assert "ground_truth" in item[WRAPPER_KEY] # Also unmapped + + def test_data_source_with_none_values(self, flat_test_data): + """Test data source generation handles None values correctly.""" + flat_test_data.loc[1, "response"] = None + + column_mapping = { + "query": "${data.query}", + "response": "${data.response}", + "ground_truth": "${data.ground_truth}", + } + + data_source = _get_data_source(flat_test_data, column_mapping) + + content = data_source["source"]["content"] + + # None should be converted to empty string + assert content[1][WRAPPER_KEY]["response"] == "" + + def test_data_source_with_numeric_values(self, flat_test_data): + """Test data source generation converts numeric values to strings.""" + flat_test_data["score"] = [95, 87, 92] + flat_test_data["confidence"] = [0.95, 0.87, 0.92] + + column_mapping = {"query": "${data.query}", "score": "${data.score}", "confidence": "${data.confidence}"} + + data_source = _get_data_source(flat_test_data, column_mapping) + + content = data_source["source"]["content"] + + # Numeric values should be converted to strings + assert content[0][WRAPPER_KEY]["score"] == "95" + assert content[0][WRAPPER_KEY]["confidence"] == "0.95" + assert isinstance(content[0][WRAPPER_KEY]["score"], str) + assert isinstance(content[0][WRAPPER_KEY]["confidence"], str) + + def test_empty_dataframe(self): + """Test data source generation with empty dataframe.""" + empty_df = pd.DataFrame() + column_mapping = {"query": "${data.query}"} + + data_source = _get_data_source(empty_df, column_mapping) + + assert data_source["type"] == "jsonl" + assert len(data_source["source"]["content"]) == 0 + + def test_complex_nested_structure(self): + """Test with complex multi-level nested structure.""" + df = pd.DataFrame( + [ + { + "item.a.b.c.d": "deep_value", + "item.a.b.x": "mid_value", + "item.a.y": "shallow_value", + "item.z": "top_value", + } + ] + ) + + column_mapping = { + "deep": "${data.item.a.b.c.d}", + "mid": "${data.item.a.b.x}", + "shallow": "${data.item.a.y}", + "top": "${data.item.z}", + } + + data_source = _get_data_source(df, column_mapping) + + content = data_source["source"]["content"] + item = content[0][WRAPPER_KEY] + + # Verify nested structure + assert item["a"]["b"]["c"]["d"] == "deep_value" + assert item["a"]["b"]["x"] == "mid_value" + assert item["a"]["y"] == "shallow_value" + assert item["z"] == "top_value" + + def test_data_source_preserves_row_order(self, flat_test_data): + """Test that data source preserves the order of rows.""" + column_mapping = {"query": "${data.query}", "response": "${data.response}"} + + data_source = _get_data_source(flat_test_data, column_mapping) + content = data_source["source"]["content"] + + # Verify order matches input + assert content[0][WRAPPER_KEY]["query"] == flat_test_data.iloc[0]["query"] + assert content[1][WRAPPER_KEY]["query"] == flat_test_data.iloc[1]["query"] + assert content[2][WRAPPER_KEY]["query"] == flat_test_data.iloc[2]["query"] + + +@pytest.mark.unittest +class TestDataSourceConfigIntegration: + """Integration tests for schema and data source generation working together.""" + + def test_flat_schema_and_data_alignment(self, flat_test_data): + """Test that schema and data are aligned for flat structure.""" + column_mapping = { + "query": "${data.query}", + "response": "${data.response}", + "ground_truth": "${data.ground_truth}", + } + + config = _generate_data_source_config(flat_test_data, column_mapping) + data_source = _get_data_source(flat_test_data, column_mapping) + + schema_props = config["item_schema"]["properties"] + data_item = data_source["source"]["content"][0][WRAPPER_KEY] + + # All schema properties should exist in data + for prop_name in schema_props.keys(): + assert prop_name in data_item + + def test_nested_schema_and_data_alignment(self, nested_test_data): + """Test that schema and data are aligned for nested structure.""" + column_mapping = { + "query": "${data.item.query}", + "rotation_days": "${data.item.context.company.policy.security.passwords.rotation_days}", + "response": "${data.item.response}", + } + + config = _generate_data_source_config(nested_test_data, column_mapping) + data_source = _get_data_source(nested_test_data, column_mapping) + + # Both should handle nested structure consistently + assert config["item_schema"]["type"] == "object" + assert WRAPPER_KEY in data_source["source"]["content"][0] + + # Verify nested paths exist in data + item = data_source["source"]["content"][0][WRAPPER_KEY] + assert "query" in item + assert "context" in item + assert "company" in item["context"] diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_nested_integration.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_nested_integration.py new file mode 100644 index 000000000000..8bfbdf1edad0 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_nested_integration.py @@ -0,0 +1,289 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ + +import pytest +import pandas as pd +from unittest.mock import Mock, patch, MagicMock +from typing import Dict, Any + +from azure.ai.evaluation._evaluate._evaluate_aoai import ( + _generate_data_source_config, + _get_data_source, + _begin_eval_run, + WRAPPER_KEY, +) + + +@pytest.mark.unittest +class TestAOAINestedDataIntegration: + """Test suite for AOAI evaluation integration with nested data structures.""" + + def test_aoai_eval_run_with_flat_data(self): + """Test _begin_eval_run with flat data structure.""" + # Setup test data + input_df = pd.DataFrame( + [ + {"query": "What is AI?", "response": "AI is...", "ground_truth": "AI"}, + {"query": "What is ML?", "response": "ML is...", "ground_truth": "ML"}, + ] + ) + + column_mapping = { + "query": "${data.query}", + "response": "${data.response}", + "ground_truth": "${data.ground_truth}", + } + + # Mock the client + mock_client = Mock() + mock_run = Mock() + mock_run.id = "test-run-123" + mock_client.evals.runs.create.return_value = mock_run + + # Call the function + run_id = _begin_eval_run( + client=mock_client, + eval_group_id="test-group-456", + run_name="test-run", + input_data_df=input_df, + column_mapping=column_mapping, + ) + + # Verify the client was called + assert run_id == "test-run-123" + mock_client.evals.runs.create.assert_called_once() + + # Get the call arguments + call_kwargs = mock_client.evals.runs.create.call_args[1] + + # Verify eval_id + assert call_kwargs["eval_id"] == "test-group-456" + assert call_kwargs["name"] == "test-run" + + # Verify data_source structure + data_source = call_kwargs["data_source"] + assert data_source["type"] == "jsonl" + assert "source" in data_source + assert data_source["source"]["type"] == "file_content" + + # Verify content + content = data_source["source"]["content"] + assert len(content) == 2 + + # Each item should be wrapped + for item in content: + assert WRAPPER_KEY in item + assert "query" in item[WRAPPER_KEY] + assert "response" in item[WRAPPER_KEY] + assert "ground_truth" in item[WRAPPER_KEY] + + def test_aoai_eval_run_with_nested_data(self): + """Test _begin_eval_run with nested data structure.""" + # Setup nested test data + input_df = pd.DataFrame( + [ + { + "item.query": "Security question", + "item.context.company.policy.security.passwords.rotation_days": "90", + "item.context.company.policy.security.network.vpn.required": "true", + "item.response": "Password rotation is 90 days.", + "item.ground_truth": "90", + } + ] + ) + + column_mapping = { + "query": "${data.item.query}", + "rotation_days": "${data.item.context.company.policy.security.passwords.rotation_days}", + "vpn_required": "${data.item.context.company.policy.security.network.vpn.required}", + "response": "${data.item.response}", + "ground_truth": "${data.item.ground_truth}", + } + + # Mock the client + mock_client = Mock() + mock_run = Mock() + mock_run.id = "nested-run-789" + mock_client.evals.runs.create.return_value = mock_run + + # Call the function + run_id = _begin_eval_run( + client=mock_client, + eval_group_id="nested-group-101", + run_name="nested-test-run", + input_data_df=input_df, + column_mapping=column_mapping, + ) + + # Verify + assert run_id == "nested-run-789" + mock_client.evals.runs.create.assert_called_once() + + # Get the data source + call_kwargs = mock_client.evals.runs.create.call_args[1] + data_source = call_kwargs["data_source"] + content = data_source["source"]["content"] + + # Verify nested structure was built + assert len(content) == 1 + item_root = content[0][WRAPPER_KEY] + + # Check nested paths exist + assert "query" in item_root + assert "context" in item_root + assert "company" in item_root["context"] + assert "policy" in item_root["context"]["company"] + assert "security" in item_root["context"]["company"]["policy"] + assert "passwords" in item_root["context"]["company"]["policy"]["security"] + assert "rotation_days" in item_root["context"]["company"]["policy"]["security"]["passwords"] + assert item_root["context"]["company"]["policy"]["security"]["passwords"]["rotation_days"] == "90" + + def test_data_source_config_matches_data_source_for_nested(self): + """Test that schema config and data source align for nested structures.""" + input_df = pd.DataFrame( + [ + { + "item.query": "Test query", + "item.context.field1": "value1", + "item.context.field2": "value2", + "item.response": "Test response", + } + ] + ) + + column_mapping = { + "query": "${data.item.query}", + "field1": "${data.item.context.field1}", + "field2": "${data.item.context.field2}", + "response": "${data.item.response}", + } + + # Generate both config and data source + config = _generate_data_source_config(input_df, column_mapping) + data_source = _get_data_source(input_df, column_mapping) + + # Verify config structure + assert config["type"] == "custom" + schema = config["item_schema"] + assert schema["type"] == "object" + + # Verify schema has nested structure (wrapper stripped) + assert "query" in schema["properties"] + assert "context" in schema["properties"] + assert schema["properties"]["context"]["type"] == "object" + + # Verify data source structure matches + content = data_source["source"]["content"] + item_root = content[0][WRAPPER_KEY] + + # All schema properties should exist in data + assert "query" in item_root + assert "context" in item_root + assert "field1" in item_root["context"] + assert "field2" in item_root["context"] + assert "response" in item_root + + def test_data_source_config_matches_data_source_for_flat(self): + """Test that schema config and data source align for flat structures.""" + input_df = pd.DataFrame([{"query": "Test", "response": "Answer", "score": "5"}]) + + column_mapping = {"query": "${data.query}", "response": "${data.response}", "score": "${data.score}"} + + # Generate both config and data source + config = _generate_data_source_config(input_df, column_mapping) + data_source = _get_data_source(input_df, column_mapping) + + # Verify flat config structure + assert config["type"] == "custom" + schema = config["item_schema"] + assert schema["type"] == "object" + + # Flat mode: properties match mapping keys + assert set(schema["properties"].keys()) == {"query", "response", "score"} + + # Verify data source + content = data_source["source"]["content"] + item_root = content[0][WRAPPER_KEY] + + # All properties should exist + assert "query" in item_root + assert "response" in item_root + assert "score" in item_root + + def test_data_source_with_run_outputs_and_nested_data(self): + """Test data source generation with both run outputs and nested data.""" + input_df = pd.DataFrame( + [ + { + "item.query": "Test query", + "item.context.metadata.id": "123", + "__outputs.generated_response": "Generated text", + } + ] + ) + + column_mapping = { + "query": "${data.item.query}", + "metadata_id": "${data.item.context.metadata.id}", + "response": "${run.outputs.generated_response}", + } + + # Generate data source + data_source = _get_data_source(input_df, column_mapping) + + # Verify structure + content = data_source["source"]["content"] + item_root = content[0][WRAPPER_KEY] + + # Nested data paths + assert "query" in item_root + assert "context" in item_root + assert "metadata" in item_root["context"] + assert item_root["context"]["metadata"]["id"] == "123" + + # Run outputs (just leaf name) + assert "generated_response" in item_root + assert item_root["generated_response"] == "Generated text" + + def test_complex_nested_structure_multiple_branches(self): + """Test nested structure with multiple branches at same level.""" + input_df = pd.DataFrame( + [ + { + "item.user.name": "Alice", + "item.user.email": "alice@example.com", + "item.system.version": "1.0", + "item.system.region": "us-east", + "item.query": "Test", + } + ] + ) + + column_mapping = { + "name": "${data.item.user.name}", + "email": "${data.item.user.email}", + "version": "${data.item.system.version}", + "region": "${data.item.system.region}", + "query": "${data.item.query}", + } + + # Generate config and data + config = _generate_data_source_config(input_df, column_mapping) + data_source = _get_data_source(input_df, column_mapping) + + # Verify schema has both branches + schema = config["item_schema"] + assert "user" in schema["properties"] + assert "system" in schema["properties"] + assert "query" in schema["properties"] + + # Verify data has both branches + item_root = data_source["source"]["content"][0][WRAPPER_KEY] + assert "user" in item_root + assert "system" in item_root + assert item_root["user"]["name"] == "Alice" + assert item_root["user"]["email"] == "alice@example.com" + assert item_root["system"]["version"] == "1.0" + assert item_root["system"]["region"] == "us-east"