fix: fix numerical NaN experiment run logging error in EvalTask.
PiperOrigin-RevId: 641981976
jsondai authored and copybara-github committed Jun 10, 2024
1 parent 4e2d87f commit 641faec
Showing 2 changed files with 65 additions and 7 deletions.
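
The substance of the change: before summary metrics are logged to the experiment run, any float NaN value is now replaced with the string "NaN", so the metrics-logging call no longer trips over a non-finite number. A minimal, self-contained sketch of that sanitization pattern (the helper name below is illustrative, not part of the commit):

    import numpy as np

    def sanitize_summary_metrics(summary_metrics: dict) -> dict:
        # Mirrors the dict comprehension added to _eval_tasks.py: float NaN
        # values become the string "NaN"; everything else passes through.
        return {
            k: ("NaN" if isinstance(v, float) and np.isnan(v) else v)
            for k, v in summary_metrics.items()
        }

    print(sanitize_summary_metrics({"mock_metric/mean": 1.0, "mock_metric/std": np.nan}))
    # {'mock_metric/mean': 1.0, 'mock_metric/std': 'NaN'}
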
61 changes: 57 additions & 4 deletions tests/unit/vertexai/test_evaluation.py
@@ -28,11 +28,13 @@
)
from vertexai import generative_models
from vertexai.preview import evaluation
from vertexai.preview.evaluation import _base as eval_base
from vertexai.preview.evaluation import _evaluation
from vertexai.preview.evaluation import utils
import numpy as np
import pandas as pd
import pytest


_TEST_PROJECT = "test-project"
_TEST_LOCATION = "us-central1"
_TEST_METRICS = (
@@ -78,6 +80,7 @@
text,text,text\n
"""

_TEST_EXPERIMENT = "test-experiment"

_MOCK_EXACT_MATCH_RESULT = (
gapic_evaluation_service_types.EvaluateInstancesResponse(
@@ -135,6 +138,19 @@
]
}
)
MOCK_EVAL_RESULT = eval_base.EvalResult(
summary_metrics={
"row_count": 1,
"mock_metric/mean": 1.0,
"mock_metric/std": np.nan,
},
metrics_table=pd.DataFrame(
{
"response": ["test"],
"mock_metric": [1.0],
}
),
)


@pytest.fixture
@@ -163,23 +179,22 @@ def teardown_method(self):
initializer.global_pool.shutdown(wait=True)

def test_create_eval_task(self):
- test_experiment = "test_experiment_name"
test_content_column_name = "test_content_column_name"
test_reference_column_name = "test_reference_column_name"
test_response_column_name = "test_response_column_name"

test_eval_task = evaluation.EvalTask(
dataset=_TEST_EVAL_DATASET,
metrics=_TEST_METRICS,
- experiment=test_experiment,
+ experiment=_TEST_EXPERIMENT,
content_column_name=test_content_column_name,
reference_column_name=test_reference_column_name,
response_column_name=test_response_column_name,
)

assert test_eval_task.dataset.equals(_TEST_EVAL_DATASET)
assert test_eval_task.metrics == _TEST_METRICS
- assert test_eval_task.experiment == test_experiment
+ assert test_eval_task.experiment == _TEST_EXPERIMENT
assert test_eval_task.content_column_name == test_content_column_name
assert test_eval_task.reference_column_name == test_reference_column_name
assert test_eval_task.response_column_name == test_response_column_name
@@ -470,6 +485,44 @@ def test_compute_pairwise_metrics_without_inference(self, api_transport):
== 0.5
)

def test_eval_result_experiment_run_logging(self):
test_eval_task = evaluation.EvalTask(
dataset=_TEST_EVAL_DATASET,
metrics=_TEST_METRICS,
experiment=_TEST_EXPERIMENT,
)

with mock.patch.multiple(
metadata._experiment_tracker,
_experiment=mock.MagicMock(name=_TEST_EXPERIMENT),
_experiment_run=None,
set_experiment=mock.DEFAULT,
reset=mock.DEFAULT,
):
with mock.patch.multiple(
vertexai.preview,
start_run=mock.MagicMock(),
log_params=mock.DEFAULT,
log_metrics=mock.DEFAULT,
) as mock_metadata:
with mock.patch.object(
target=_evaluation,
attribute="evaluate",
side_effect=[MOCK_EVAL_RESULT],
):
test_result = test_eval_task.evaluate()

assert test_result.summary_metrics["row_count"] == 1
assert test_result.summary_metrics["mock_metric/mean"] == 1.0
assert test_result.summary_metrics["mock_metric/std"] == "NaN"
mock_metadata["log_metrics"].assert_called_once_with(
{
"row_count": 1,
"mock_metric/mean": 1.0,
"mock_metric/std": "NaN",
}
)


@pytest.mark.usefixtures("google_auth_mock")
class TestEvaluationErrors:
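
The new test above stubs out the experiment tracker, the start_run/log_params/log_metrics entry points, and the evaluation backend, then asserts that log_metrics receives the string "NaN" rather than a float NaN. A stripped-down sketch of the same mock.patch.object side_effect pattern, with purely hypothetical stand-in names:

    from unittest import mock

    class _FakeEvaluationModule:
        # Stand-in for the module whose evaluate() the real test patches.
        @staticmethod
        def evaluate():
            raise RuntimeError("would call the real evaluation service")

    def run_task():
        # Code under test simply delegates to the (patched) evaluate().
        return _FakeEvaluationModule.evaluate()

    with mock.patch.object(
        target=_FakeEvaluationModule,
        attribute="evaluate",
        side_effect=[{"mock_metric/std": "NaN"}],  # value returned on the first call
    ):
        assert run_task() == {"mock_metric/std": "NaN"}
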
11 changes: 8 additions & 3 deletions vertexai/preview/evaluation/_eval_tasks.py
@@ -28,6 +28,7 @@
from vertexai.preview.evaluation.metrics import (
_base as metrics_base,
)
import numpy as np

if TYPE_CHECKING:
import pandas as pd
@@ -284,9 +285,14 @@ def _evaluate_with_experiment(
reference_column_name=self.reference_column_name,
response_column_name=response_column_name,
)

eval_result.summary_metrics = {
k: ("NaN" if isinstance(v, float) and np.isnan(v) else v)
for k, v in eval_result.summary_metrics.items()
}
try:
vertexai.preview.log_metrics(eval_result.summary_metrics)
- except (ValueError, TypeError, exceptions.InvalidArgument) as e:
+ except (TypeError, exceptions.InvalidArgument) as e:
_LOGGER.warning(f"Experiment metrics logging failed: {str(e)}")
return eval_result

@@ -366,8 +372,7 @@ def _validate_experiment_run(self) -> None:
if metadata._experiment_tracker.experiment_run:
raise ValueError(
"Experiment run already exists. Please specify the name of the"
- " experiment run to assign current session with in this evaluate"
- " method."
+ " experiment run to assign current session within this evaluation."
)

def _log_eval_experiment_param(
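
For context, a hypothetical end-to-end sketch of the code path this change touches (the project ID, dataset contents, and metric name are illustrative, and running it requires real Google Cloud credentials):

    import pandas as pd
    import vertexai
    from vertexai.preview import evaluation

    vertexai.init(project="my-project", location="us-central1")

    eval_task = evaluation.EvalTask(
        dataset=pd.DataFrame(
            {
                "response": ["Paris", "Berlin"],
                "reference": ["Paris", "Munich"],
            }
        ),
        metrics=["exact_match"],  # illustrative metric name
        experiment="my-eval-experiment",
    )

    result = eval_task.evaluate()
    # With this fix, a NaN summary statistic (for example, a std computed over
    # a single row) is logged to the experiment run as the string "NaN"
    # instead of hitting the logging error this commit fixes.
    print(result.summary_metrics)
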
