
Commit ccbe558

ninghul0lawrence authored and committed
Evaluation: Fix the output_path parameter of evaluate API doesn't support relative path (Azure#38241)
* Fix output_path parameter doesn't support relative path
* add comments
* fix the test
* update
* minor update
* update
1 parent fc56e05 commit ccbe558
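
With this fix, the `output_path` argument of `evaluate` accepts a relative path (including a bare filename), resolved against the caller's working directory. A minimal usage sketch, assuming a local JSONL data file with `response` and `ground_truth` columns; the data file name and evaluator key are placeholders, not taken from this commit:

```python
import os

from azure.ai.evaluation import F1ScoreEvaluator, evaluate

# "data.jsonl" is an assumed local file with "response" and "ground_truth" columns.
result = evaluate(
    data="data.jsonl",
    evaluators={"f1_score": F1ScoreEvaluator()},
    # A bare filename now works; results land relative to the caller's cwd,
    # which the patched run contexts below preserve across the batch run.
    output_path="eval_results.jsonl",
)

print(os.path.abspath("eval_results.jsonl"))  # where the results were written
```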

File tree

6 files changed: +33, -11 lines changed


sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 4 additions & 3 deletions
````diff
@@ -8,13 +8,14 @@
 - The `parallel` parameter has been removed from composite evaluators: `QAEvaluator`, `ContentSafetyChatEvaluator`, and `ContentSafetyMultimodalEvaluator`. To control evaluator parallelism, you can now use the `_parallel` keyword argument, though please note that this private parameter may change in the future.
 
 ### Bugs Fixed
+- Fixed an issue where the `output_path` parameter in the `evaluate` API did not support relative path.
 - Output of adversarial simulators are of type `JsonLineList` and the helper function `to_eval_qr_json_lines` now outputs context from both user and assistant turns along with `category` if it exists in the conversation
-- Fixed an issue where during long-running simulations, API token expires causing "Forbidden" error. Instead, users can now set an environment variable `AZURE_TOKEN_REFRESH_INTERVAL` to refresh the token more frequently to prevent expiration and ensure continuous operation of the simulation.
+- Fixed an issue where during long-running simulations, API token expires causing "Forbidden" error. Instead, users can now set an environment variable `AZURE_TOKEN_REFRESH_INTERVAL` to refresh the token more frequently to prevent expiration and ensure continuous operation of the simulation.
 
 ### Other Changes
 - Refined error messages for serviced-based evaluators and simulators.
 - Introduced environment variable `AI_EVALS_DISABLE_EXPERIMENTAL_WARNING` to disable the warning message for experimental features.
-- Changed the randomization pattern for `AdversarialSimulator` such that there is an almost equal number of Adversarial harm categories (e.g. Hate + Unfairness, Self-Harm, Violence, Sex) represented in the `AdversarialSimulator` outputs. Previously, for 200 `max_simulation_results` a user might see 140 results belonging to the 'Hate + Unfairness' category and 40 results belonging to the 'Self-Harm' category. Now, user will see 50 results for each of Hate + Unfairness, Self-Harm, Violence, and Sex.
+- Changed the randomization pattern for `AdversarialSimulator` such that there is an almost equal number of Adversarial harm categories (e.g. Hate + Unfairness, Self-Harm, Violence, Sex) represented in the `AdversarialSimulator` outputs. Previously, for 200 `max_simulation_results` a user might see 140 results belonging to the 'Hate + Unfairness' category and 40 results belonging to the 'Self-Harm' category. Now, user will see 50 results for each of Hate + Unfairness, Self-Harm, Violence, and Sex.
 - For the `DirectAttackSimulator`, the prompt templates used to generate simulated outputs for each Adversarial harm category will no longer be in a randomized order by default. To override this behavior, pass `randomize_order=True` when you call the `DirectAttackSimulator`, for example:
 ```python
 adversarial_simulator = DirectAttackSimulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
@@ -83,7 +84,7 @@ outputs = asyncio.run(custom_simulator(
 - `SimilarityEvaluator`
 - `RetrievalEvaluator`
 - The following evaluators will now have a new key in their result output including LLM reasoning behind the score. The new key will follow the pattern "<metric_name>_reason". The reasoning is the result of a more detailed prompt template being used to generate the LLM response. Note that this requires the maximum number of tokens used to run these evaluators to be increased.
-
+
 | Evaluator | New `max_token` for Generation |
 | --- | --- |
 | `CoherenceEvaluator` | 800 |
````
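
Both environment variables mentioned in the changelog are read at runtime, so setting them before the simulator or evaluator starts is enough. A small sketch; the concrete values (seconds for the refresh interval, a truthy string for the warning switch) are assumptions for illustration, not documented defaults:

```python
import os

# Refresh the simulation API token more often during long-running simulations.
# The unit and value here are assumed, not documented defaults.
os.environ["AZURE_TOKEN_REFRESH_INTERVAL"] = "600"

# Suppress the experimental-feature warning; the accepted value format is assumed.
os.environ["AI_EVALS_DISABLE_EXPERIMENTAL_WARNING"] = "true"
```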

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -36,8 +36,12 @@ def __init__(self, client: Union[CodeClient, ProxyClient]) -> None:
         self.client = client
         self._is_batch_timeout_set_by_system = False
         self._is_otel_timeout_set_by_system = False
+        self._original_cwd = os.getcwd()
 
     def __enter__(self) -> None:
+        # Preserve current working directory, as PF may change it without restoring it afterward
+        self._original_cwd = os.getcwd()
+
         if isinstance(self.client, CodeClient):
             ClientUserAgentUtil.append_user_agent(USER_AGENT)
             inject_openai_api()
@@ -64,6 +68,8 @@ def __exit__(
         exc_value: Optional[BaseException],
         exc_tb: Optional[types.TracebackType],
     ) -> None:
+        os.chdir(self._original_cwd)
+
         if isinstance(self.client, CodeClient):
             recover_openai_api()
 
```
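
The change reduces to a simple pattern: record `os.getcwd()` when the context is entered and `os.chdir` back to it on exit, so anything promptflow does to the working directory inside the block cannot leak out. A self-contained sketch of that pattern (illustrative names, not the SDK class):

```python
import os
import tempfile


class PreserveCwd:
    """Illustrative context manager that restores the caller's working directory on exit."""

    def __enter__(self) -> "PreserveCwd":
        self._original_cwd = os.getcwd()  # remember where the caller started
        return self

    def __exit__(self, exc_type, exc_value, exc_tb) -> None:
        os.chdir(self._original_cwd)  # undo any chdir done inside the block


before = os.getcwd()
with PreserveCwd():
    os.chdir(tempfile.gettempdir())  # simulate a library changing the cwd
assert os.getcwd() == before  # the caller's relative paths still resolve as expected
```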

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -17,8 +17,12 @@ class TargetRunContext:
 
     def __init__(self, upload_snapshot: bool) -> None:
         self._upload_snapshot = upload_snapshot
+        self._original_cwd = os.getcwd()
 
     def __enter__(self) -> None:
+        # Preserve current working directory, as PF may change it without restoring it afterward
+        self._original_cwd = os.getcwd()
+
         # Address "[WinError 32] The process cannot access the file" error,
         # caused by conflicts when the venv and target function are in the same directory.
         # Setting PF_FLOW_ENTRY_IN_TMP to true uploads only the flex entry file (flow.flex.yaml).
@@ -31,5 +35,7 @@ def __exit__(
         exc_value: Optional[BaseException],
         exc_tb: Optional[types.TracebackType],
     ) -> None:
+        os.chdir(self._original_cwd)
+
         if not self._upload_snapshot:
             os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
```
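
Restoring the working directory is exactly what makes a relative `output_path` reliable: if the batch engine leaves the process in another directory, a bare filename would silently resolve there instead of where the caller started. A runnable illustration using only the standard library:

```python
import os
import tempfile
from pathlib import Path

start = Path.cwd()
with tempfile.TemporaryDirectory() as tmp:
    os.chdir(tmp)  # simulates PF changing the cwd without changing it back
    print(Path("eval_results.jsonl").resolve())  # would land inside the temp directory
    os.chdir(start)  # what the patched __exit__ now guarantees

# Back in the original directory, the relative path resolves where the caller expects.
print(Path("eval_results.jsonl").resolve())
```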

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -391,7 +391,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
         )
 
         output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
-        if not os.path.exists(output_dir):
+        if output_dir and not os.path.exists(output_dir):
             msg = f"The output directory '{output_dir}' does not exist. Please create the directory manually."
             raise EvaluationException(
                 message=msg,
@@ -698,7 +698,7 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
     if output_dict:
         print("======= Combined Run Summary (Per Evaluator) =======\n")
         print(json.dumps(output_dict, indent=4))
-        print("\n====================================================")
+        print("\n====================================================\n")
 
 
 def _evaluate( # pylint: disable=too-many-locals,too-many-statements
@@ -888,9 +888,9 @@ def eval_batch_run(
     result_df_dict = result_df.to_dict("records")
     result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url} # type: ignore
 
+    _print_summary(per_evaluator_results)
+
     if output_path:
         _write_output(output_path, result)
 
-    _print_summary(per_evaluator_results)
-
     return result
```
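
The added `output_dir and` guard matters because `os.path.dirname()` returns an empty string for a bare filename, and `os.path.exists("")` is `False`, so the old check rejected relative filenames even though the current directory is writable. A quick demonstration of the behavior the new condition relies on (using a builtin exception in place of `EvaluationException`):

```python
import os

print(os.path.dirname("eval_results.jsonl"))      # '' -> no directory component
print(os.path.exists(""))                         # False -> the old check always raised here
print(os.path.dirname("./results/output.jsonl"))  # './results' -> still validated as before

output_path = "eval_results.jsonl"
output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
if output_dir and not os.path.exists(output_dir):  # empty string is falsy, so bare filenames pass
    raise FileNotFoundError(f"The output directory '{output_dir}' does not exist.")
```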

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -211,6 +211,8 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
     with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
         json.dump(data_dict, f)
 
+    print(f'Evaluation results saved to "{p.resolve()}".\n')
+
 
 def _apply_column_mapping(
     source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
```
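
Printing `p.resolve()` rather than the path as supplied means the message always shows an absolute location, even when the user passed a relative `output_path`. For example:

```python
from pathlib import Path

p = Path("eval_test_results.jsonl")  # a relative path, as a user might now pass
print(f'Evaluation results saved to "{p.resolve()}".')  # resolve() yields the absolute path
```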

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py

Lines changed: 11 additions & 4 deletions
```diff
@@ -396,14 +396,18 @@ def test_evaluate_output_dir_not_exist(self, mock_model_config, questions_file):
 
         assert "The output directory './not_exist_dir' does not exist." in exc_info.value.args[0]
 
-    @pytest.mark.parametrize("use_pf_client", [True, False])
-    def test_evaluate_output_path(self, evaluate_test_data_jsonl_file, tmpdir, use_pf_client):
-        output_path = os.path.join(tmpdir, "eval_test_results.jsonl")
+    @pytest.mark.parametrize("use_relative_path", [True, False])
+    def test_evaluate_output_path(self, evaluate_test_data_jsonl_file, tmpdir, use_relative_path):
+        # output_path is a file
+        if use_relative_path:
+            output_path = os.path.join(tmpdir, "eval_test_results.jsonl")
+        else:
+            output_path = "eval_test_results.jsonl"
+
         result = evaluate(
             data=evaluate_test_data_jsonl_file,
             evaluators={"g": F1ScoreEvaluator()},
             output_path=output_path,
-            _use_pf_client=use_pf_client,
         )
 
         assert result is not None
@@ -415,6 +419,9 @@ def test_evaluate_output_path(self, evaluate_test_data_jsonl_file, tmpdir, use_p
         data_from_file = json.loads(content)
         assert result["metrics"] == data_from_file["metrics"]
 
+        os.remove(output_path)
+
+        # output_path is a directory
         result = evaluate(
             data=evaluate_test_data_jsonl_file,
             evaluators={"g": F1ScoreEvaluator()},
```
