
Commit ccbe558

ninghul0lawrence authored and committed
Evaluation: Fix the output_path parameter of evaluate API doesn't support relative path (Azure#38241)
* Fix output_path parameter doesn't support relative path
* add comments
* fix the test
* update
* minor update
* update
1 parent fc56e05 commit ccbe558
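
With this fix, the `output_path` argument of `evaluate` accepts a relative path (including a bare filename), resolved against the caller's working directory. A minimal usage sketch, assuming a local JSONL data file with `response` and `ground_truth` columns; the data file name and evaluator key are placeholders, not taken from this commit:

```python
import os

from azure.ai.evaluation import F1ScoreEvaluator, evaluate

# "data.jsonl" is an assumed local file with "response" and "ground_truth" columns.
result = evaluate(
    data="data.jsonl",
    evaluators={"f1_score": F1ScoreEvaluator()},
    # A bare filename now works; results land relative to the caller's cwd,
    # which the patched run contexts below preserve across the batch run.
    output_path="eval_results.jsonl",
)

print(os.path.abspath("eval_results.jsonl"))  # where the results were written
```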

File tree

6 files changed: +33, -11 lines changed


sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 4 additions & 3 deletions
````diff
@@ -8,13 +8,14 @@
 - The `parallel` parameter has been removed from composite evaluators: `QAEvaluator`, `ContentSafetyChatEvaluator`, and `ContentSafetyMultimodalEvaluator`. To control evaluator parallelism, you can now use the `_parallel` keyword argument, though please note that this private parameter may change in the future.
 
 ### Bugs Fixed
+- Fixed an issue where the `output_path` parameter in the `evaluate` API did not support relative path.
 - Output of adversarial simulators are of type `JsonLineList` and the helper function `to_eval_qr_json_lines` now outputs context from both user and assistant turns along with `category` if it exists in the conversation
-- Fixed an issue where during long-running simulations, API token expires causing "Forbidden" error. Instead, users can now set an environment variable `AZURE_TOKEN_REFRESH_INTERVAL` to refresh the token more frequently to prevent expiration and ensure continuous operation of the simulation.
+- Fixed an issue where during long-running simulations, API token expires causing "Forbidden" error. Instead, users can now set an environment variable `AZURE_TOKEN_REFRESH_INTERVAL` to refresh the token more frequently to prevent expiration and ensure continuous operation of the simulation.
 
 ### Other Changes
 - Refined error messages for serviced-based evaluators and simulators.
 - Introduced environment variable `AI_EVALS_DISABLE_EXPERIMENTAL_WARNING` to disable the warning message for experimental features.
-- Changed the randomization pattern for `AdversarialSimulator` such that there is an almost equal number of Adversarial harm categories (e.g. Hate + Unfairness, Self-Harm, Violence, Sex) represented in the `AdversarialSimulator` outputs. Previously, for 200 `max_simulation_results` a user might see 140 results belonging to the 'Hate + Unfairness' category and 40 results belonging to the 'Self-Harm' category. Now, user will see 50 results for each of Hate + Unfairness, Self-Harm, Violence, and Sex.
+- Changed the randomization pattern for `AdversarialSimulator` such that there is an almost equal number of Adversarial harm categories (e.g. Hate + Unfairness, Self-Harm, Violence, Sex) represented in the `AdversarialSimulator` outputs. Previously, for 200 `max_simulation_results` a user might see 140 results belonging to the 'Hate + Unfairness' category and 40 results belonging to the 'Self-Harm' category. Now, user will see 50 results for each of Hate + Unfairness, Self-Harm, Violence, and Sex.
 - For the `DirectAttackSimulator`, the prompt templates used to generate simulated outputs for each Adversarial harm category will no longer be in a randomized order by default. To override this behavior, pass `randomize_order=True` when you call the `DirectAttackSimulator`, for example:
 ```python
 adversarial_simulator = DirectAttackSimulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
@@ -83,7 +84,7 @@ outputs = asyncio.run(custom_simulator(
 - `SimilarityEvaluator`
 - `RetrievalEvaluator`
 - The following evaluators will now have a new key in their result output including LLM reasoning behind the score. The new key will follow the pattern "<metric_name>_reason". The reasoning is the result of a more detailed prompt template being used to generate the LLM response. Note that this requires the maximum number of tokens used to run these evaluators to be increased.
-
+
 | Evaluator | New `max_token` for Generation |
 | --- | --- |
 | `CoherenceEvaluator` | 800 |
````
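
Both environment variables mentioned in the changelog are read at runtime, so setting them before the simulator or evaluator starts is enough. A small sketch; the concrete values (seconds for the refresh interval, a truthy string for the warning switch) are assumptions for illustration, not documented defaults:

```python
import os

# Refresh the simulation API token more often during long-running simulations.
# The unit and value here are assumed, not documented defaults.
os.environ["AZURE_TOKEN_REFRESH_INTERVAL"] = "600"

# Suppress the experimental-feature warning; the accepted value format is assumed.
os.environ["AI_EVALS_DISABLE_EXPERIMENTAL_WARNING"] = "true"
```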

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -36,8 +36,12 @@ def __init__(self, client: Union[CodeClient, ProxyClient]) -> None:
         self.client = client
         self._is_batch_timeout_set_by_system = False
         self._is_otel_timeout_set_by_system = False
+        self._original_cwd = os.getcwd()
 
     def __enter__(self) -> None:
+        # Preserve current working directory, as PF may change it without restoring it afterward
+        self._original_cwd = os.getcwd()
+
         if isinstance(self.client, CodeClient):
             ClientUserAgentUtil.append_user_agent(USER_AGENT)
             inject_openai_api()
@@ -64,6 +68,8 @@ def __exit__(
         exc_value: Optional[BaseException],
         exc_tb: Optional[types.TracebackType],
     ) -> None:
+        os.chdir(self._original_cwd)
+
         if isinstance(self.client, CodeClient):
             recover_openai_api()
 
```
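
The change reduces to a simple pattern: record `os.getcwd()` when the context is entered and `os.chdir` back to it on exit, so anything promptflow does to the working directory inside the block cannot leak out. A self-contained sketch of that pattern (illustrative names, not the SDK class):

```python
import os
import tempfile


class PreserveCwd:
    """Illustrative context manager that restores the caller's working directory on exit."""

    def __enter__(self) -> "PreserveCwd":
        self._original_cwd = os.getcwd()  # remember where the caller started
        return self

    def __exit__(self, exc_type, exc_value, exc_tb) -> None:
        os.chdir(self._original_cwd)  # undo any chdir done inside the block


before = os.getcwd()
with PreserveCwd():
    os.chdir(tempfile.gettempdir())  # simulate a library changing the cwd
assert os.getcwd() == before  # the caller's relative paths still resolve as expected
```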

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -17,8 +17,12 @@ class TargetRunContext:
 
     def __init__(self, upload_snapshot: bool) -> None:
         self._upload_snapshot = upload_snapshot
+        self._original_cwd = os.getcwd()
 
     def __enter__(self) -> None:
+        # Preserve current working directory, as PF may change it without restoring it afterward
+        self._original_cwd = os.getcwd()
+
         # Address "[WinError 32] The process cannot access the file" error,
         # caused by conflicts when the venv and target function are in the same directory.
         # Setting PF_FLOW_ENTRY_IN_TMP to true uploads only the flex entry file (flow.flex.yaml).
@@ -31,5 +35,7 @@ def __exit__(
         exc_value: Optional[BaseException],
         exc_tb: Optional[types.TracebackType],
     ) -> None:
+        os.chdir(self._original_cwd)
+
         if not self._upload_snapshot:
             os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
```
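
Restoring the working directory is exactly what makes a relative `output_path` reliable: if the batch engine leaves the process in another directory, a bare filename would silently resolve there instead of where the caller started. A runnable illustration using only the standard library:

```python
import os
import tempfile
from pathlib import Path

start = Path.cwd()
with tempfile.TemporaryDirectory() as tmp:
    os.chdir(tmp)  # simulates PF changing the cwd without changing it back
    print(Path("eval_results.jsonl").resolve())  # would land inside the temp directory
    os.chdir(start)  # what the patched __exit__ now guarantees

# Back in the original directory, the relative path resolves where the caller expects.
print(Path("eval_results.jsonl").resolve())
```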

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -391,7 +391,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
         )
 
         output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
-        if not os.path.exists(output_dir):
+        if output_dir and not os.path.exists(output_dir):
             msg = f"The output directory '{output_dir}' does not exist. Please create the directory manually."
             raise EvaluationException(
                 message=msg,
@@ -698,7 +698,7 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
     if output_dict:
         print("======= Combined Run Summary (Per Evaluator) =======\n")
         print(json.dumps(output_dict, indent=4))
-        print("\n====================================================")
+        print("\n====================================================\n")
 
 
 def _evaluate( # pylint: disable=too-many-locals,too-many-statements
@@ -888,9 +888,9 @@ def eval_batch_run(
     result_df_dict = result_df.to_dict("records")
     result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url} # type: ignore
 
+    _print_summary(per_evaluator_results)
+
     if output_path:
         _write_output(output_path, result)
 
-    _print_summary(per_evaluator_results)
-
     return result
```
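
The added `output_dir and` guard matters because `os.path.dirname()` returns an empty string for a bare filename, and `os.path.exists("")` is `False`, so the old check rejected relative filenames even though the current directory is writable. A quick demonstration of the behavior the new condition relies on (using a builtin exception in place of `EvaluationException`):

```python
import os

print(os.path.dirname("eval_results.jsonl"))      # '' -> no directory component
print(os.path.exists(""))                         # False -> the old check always raised here
print(os.path.dirname("./results/output.jsonl"))  # './results' -> still validated as before

output_path = "eval_results.jsonl"
output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
if output_dir and not os.path.exists(output_dir):  # empty string is falsy, so bare filenames pass
    raise FileNotFoundError(f"The output directory '{output_dir}' does not exist.")
```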

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -211,6 +211,8 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
     with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
         json.dump(data_dict, f)
 
+    print(f'Evaluation results saved to "{p.resolve()}".\n')
+
 
 def _apply_column_mapping(
     source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
```
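
Printing `p.resolve()` rather than the path as supplied means the message always shows an absolute location, even when the user passed a relative `output_path`. For example:

```python
from pathlib import Path

p = Path("eval_test_results.jsonl")  # a relative path, as a user might now pass
print(f'Evaluation results saved to "{p.resolve()}".')  # resolve() yields the absolute path
```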

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py

Lines changed: 11 additions & 4 deletions
```diff
@@ -396,14 +396,18 @@ def test_evaluate_output_dir_not_exist(self, mock_model_config, questions_file):
 
         assert "The output directory './not_exist_dir' does not exist." in exc_info.value.args[0]
 
-    @pytest.mark.parametrize("use_pf_client", [True, False])
-    def test_evaluate_output_path(self, evaluate_test_data_jsonl_file, tmpdir, use_pf_client):
-        output_path = os.path.join(tmpdir, "eval_test_results.jsonl")
+    @pytest.mark.parametrize("use_relative_path", [True, False])
+    def test_evaluate_output_path(self, evaluate_test_data_jsonl_file, tmpdir, use_relative_path):
+        # output_path is a file
+        if use_relative_path:
+            output_path = os.path.join(tmpdir, "eval_test_results.jsonl")
+        else:
+            output_path = "eval_test_results.jsonl"
+
         result = evaluate(
             data=evaluate_test_data_jsonl_file,
             evaluators={"g": F1ScoreEvaluator()},
             output_path=output_path,
-            _use_pf_client=use_pf_client,
         )
 
         assert result is not None
@@ -415,6 +419,9 @@ def test_evaluate_output_path(self, evaluate_test_data_jsonl_file, tmpdir, use_p
         data_from_file = json.loads(content)
         assert result["metrics"] == data_from_file["metrics"]
 
+        os.remove(output_path)
+
+        # output_path is a directory
         result = evaluate(
             data=evaluate_test_data_jsonl_file,
             evaluators={"g": F1ScoreEvaluator()},
```
