diff --git a/google/cloud/aiplatform/__init__.py b/google/cloud/aiplatform/__init__.py index 8107756229..bc1043ef03 100644 --- a/google/cloud/aiplatform/__init__.py +++ b/google/cloud/aiplatform/__init__.py @@ -86,6 +86,9 @@ log_params = metadata.metadata._experiment_tracker.log_params log_metrics = metadata.metadata._experiment_tracker.log_metrics +log_classification_metrics = ( + metadata.metadata._experiment_tracker.log_classification_metrics +) get_experiment_df = metadata.metadata._experiment_tracker.get_experiment_df start_run = metadata.metadata._experiment_tracker.start_run start_execution = metadata.metadata._experiment_tracker.start_execution @@ -110,6 +113,7 @@ "log", "log_params", "log_metrics", + "log_classification_metrics", "log_time_series_metrics", "get_experiment_df", "get_pipeline_df", diff --git a/google/cloud/aiplatform/metadata/experiment_run_resource.py b/google/cloud/aiplatform/metadata/experiment_run_resource.py index d61b62b7b2..326948dcec 100644 --- a/google/cloud/aiplatform/metadata/experiment_run_resource.py +++ b/google/cloud/aiplatform/metadata/experiment_run_resource.py @@ -39,6 +39,10 @@ from google.cloud.aiplatform.metadata import metadata from google.cloud.aiplatform.metadata import resource from google.cloud.aiplatform.metadata import utils as metadata_utils +from google.cloud.aiplatform.metadata.schema import utils as schema_utils +from google.cloud.aiplatform.metadata.schema.google import ( + artifact_schema as google_artifact_schema, +) from google.cloud.aiplatform.tensorboard import tensorboard_resource from google.cloud.aiplatform.utils import rest_utils @@ -990,6 +994,108 @@ def log_metrics(self, metrics: Dict[str, Union[float, int, str]]): # TODO: query the latest metrics artifact resource before logging. self._metadata_node.update(metadata={constants._METRIC_KEY: metrics}) + @_v1_not_supported + def log_classification_metrics( + self, + *, + labels: Optional[List[str]] = None, + matrix: Optional[List[List[int]]] = None, + fpr: Optional[List[float]] = None, + tpr: Optional[List[float]] = None, + threshold: Optional[List[float]] = None, + display_name: Optional[str] = None, + ): + """Create an artifact for classification metrics and log to ExperimentRun. Currently supports confusion matrix and ROC curve. + + ``` + my_run = aiplatform.ExperimentRun('my-run', experiment='my-experiment') + my_run.log_classification_metrics( + display_name='my-classification-metrics', + labels=['cat', 'dog'], + matrix=[[9, 1], [1, 9]], + fpr=[0.1, 0.5, 0.9], + tpr=[0.1, 0.7, 0.9], + threshold=[0.9, 0.5, 0.1], + ) + ``` + + Args: + labels (List[str]): + Optional. List of label names for the confusion matrix. Must be set if 'matrix' is set. + matrix (List[List[int]): + Optional. Values for the confusion matrix. Must be set if 'labels' is set. + fpr (List[float]): + Optional. List of false positive rates for the ROC curve. Must be set if 'tpr' or 'thresholds' is set. + tpr (List[float]): + Optional. List of true positive rates for the ROC curve. Must be set if 'fpr' or 'thresholds' is set. + threshold (List[float]): + Optional. List of thresholds for the ROC curve. Must be set if 'fpr' or 'tpr' is set. + display_name (str): + Optional. The user-defined name for the classification metric artifact. 
+ + Raises: + ValueError: if 'labels' and 'matrix' are not set together + or if 'labels' and 'matrix' are not in the same length + or if 'fpr' and 'tpr' and 'threshold' are not set together + or if 'fpr' and 'tpr' and 'threshold' are not in the same length + """ + if (labels or matrix) and not (labels and matrix): + raise ValueError("labels and matrix must be set together.") + + if (fpr or tpr or threshold) and not (fpr and tpr and threshold): + raise ValueError("fpr, tpr, and thresholds must be set together.") + + if labels and matrix: + if len(matrix) != len(labels): + raise ValueError( + "Length of labels and matrix must be the same. " + "Got lengths {} and {} respectively.".format( + len(labels), len(matrix) + ) + ) + annotation_specs = [ + schema_utils.AnnotationSpec(display_name=label) for label in labels + ] + confusion_matrix = schema_utils.ConfusionMatrix( + annotation_specs=annotation_specs, + matrix=matrix, + ) + + if fpr and tpr and threshold: + if ( + len(fpr) != len(tpr) + or len(fpr) != len(threshold) + or len(tpr) != len(threshold) + ): + raise ValueError( + "Length of fpr, tpr and threshold must be the same. " + "Got lengths {}, {} and {} respectively.".format( + len(fpr), len(tpr), len(threshold) + ) + ) + + confidence_metrics = [ + schema_utils.ConfidenceMetric( + confidence_threshold=confidence_threshold, + false_positive_rate=false_positive_rate, + recall=recall, + ) + for confidence_threshold, false_positive_rate, recall in zip( + threshold, fpr, tpr + ) + ] + + classification_metrics = google_artifact_schema.ClassificationMetrics( + display_name=display_name, + confusion_matrix=confusion_matrix, + confidence_metrics=confidence_metrics, + ) + + classfication_metrics = classification_metrics.create() + self._metadata_node.add_artifacts_and_executions( + artifact_resource_names=[classfication_metrics.resource_name] + ) + @_v1_not_supported def get_time_series_data_frame(self) -> "pd.DataFrame": # noqa: F821 """Returns all time series in this Run as a DataFrame. @@ -1149,6 +1255,65 @@ def get_metrics(self) -> Dict[str, Union[float, int, str]]: else: return self._metadata_node.metadata[constants._METRIC_KEY] + @_v1_not_supported + def get_classification_metrics(self) -> List[Dict[str, Union[str, List]]]: + """Get all the classification metrics logged to this run. + + ``` + my_run = aiplatform.ExperimentRun('my-run', experiment='my-experiment') + metric = my_run.get_classification_metrics()[0] + print(metric) + ## print result: + { + "id": "e6c893a4-222e-4c60-a028-6a3b95dfc109", + "display_name": "my-classification-metrics", + "labels": ["cat", "dog"], + "matrix": [[9,1], [1,9]], + "fpr": [0.1, 0.5, 0.9], + "tpr": [0.1, 0.7, 0.9], + "thresholds": [0.9, 0.5, 0.1] + } + ``` + + Returns: + List of classification metrics logged to this experiment run. 
+ """ + + artifact_list = artifact.Artifact.list( + filter=metadata_utils._make_filter_string( + in_context=[self.resource_name], + schema_title=google_artifact_schema.ClassificationMetrics.schema_title, + ), + project=self.project, + location=self.location, + credentials=self.credentials, + ) + + metrics = [] + for metric_artifact in artifact_list: + metric = {} + metric["id"] = metric_artifact.name + metric["display_name"] = metric_artifact.display_name + metadata = metric_artifact.metadata + if "confusionMatrix" in metadata: + metric["labels"] = [ + d["displayName"] + for d in metadata["confusionMatrix"]["annotationSpecs"] + ] + metric["matrix"] = metadata["confusionMatrix"]["rows"] + + if "confidenceMetrics" in metadata: + metric["fpr"] = [ + d["falsePositiveRate"] for d in metadata["confidenceMetrics"] + ] + metric["tpr"] = [d["recall"] for d in metadata["confidenceMetrics"]] + metric["threshold"] = [ + d["confidenceThreshold"] for d in metadata["confidenceMetrics"] + ] + metrics.append(metric) + + return metrics + @_v1_not_supported def associate_execution(self, execution: execution.Execution): """Associate an execution to this experiment run. diff --git a/google/cloud/aiplatform/metadata/metadata.py b/google/cloud/aiplatform/metadata/metadata.py index 6f67a6ddf6..d103a79733 100644 --- a/google/cloud/aiplatform/metadata/metadata.py +++ b/google/cloud/aiplatform/metadata/metadata.py @@ -15,8 +15,7 @@ # limitations under the License. # - -from typing import Dict, Union, Optional, Any +from typing import Dict, Union, Optional, Any, List from google.api_core import exceptions from google.auth import credentials as auth_credentials @@ -371,6 +370,62 @@ def log_metrics(self, metrics: Dict[str, Union[float, int, str]]): # query the latest metrics artifact resource before logging. self._experiment_run.log_metrics(metrics=metrics) + def log_classification_metrics( + self, + *, + labels: Optional[List[str]] = None, + matrix: Optional[List[List[int]]] = None, + fpr: Optional[List[float]] = None, + tpr: Optional[List[float]] = None, + threshold: Optional[List[float]] = None, + display_name: Optional[str] = None, + ): + """Create an artifact for classification metrics and log to ExperimentRun. Currently support confusion matrix and ROC curve. + + ``` + my_run = aiplatform.ExperimentRun('my-run', experiment='my-experiment') + my_run.log_classification_metrics( + display_name='my-classification-metrics', + labels=['cat', 'dog'], + matrix=[[9, 1], [1, 9]], + fpr=[0.1, 0.5, 0.9], + tpr=[0.1, 0.7, 0.9], + threshold=[0.9, 0.5, 0.1], + ) + ``` + + Args: + labels (List[str]): + Optional. List of label names for the confusion matrix. Must be set if 'matrix' is set. + matrix (List[List[int]): + Optional. Values for the confusion matrix. Must be set if 'labels' is set. + fpr (List[float]): + Optional. List of false positive rates for the ROC curve. Must be set if 'tpr' or 'thresholds' is set. + tpr (List[float]): + Optional. List of true positive rates for the ROC curve. Must be set if 'fpr' or 'thresholds' is set. + threshold (List[float]): + Optional. List of thresholds for the ROC curve. Must be set if 'fpr' or 'tpr' is set. + display_name (str): + Optional. The user-defined name for the classification metric artifact. 
+ + Raises: + ValueError: if 'labels' and 'matrix' are not set together + or if 'labels' and 'matrix' are not in the same length + or if 'fpr' and 'tpr' and 'threshold' are not set together + or if 'fpr' and 'tpr' and 'threshold' are not in the same length + """ + + self._validate_experiment_and_run(method_name="log_classification_metrics") + # query the latest metrics artifact resource before logging. + self._experiment_run.log_classification_metrics( + display_name=display_name, + labels=labels, + matrix=matrix, + fpr=fpr, + tpr=tpr, + threshold=threshold, + ) + def _validate_experiment_and_run(self, method_name: str): """Validates Experiment and Run are set and raises informative error message. diff --git a/google/cloud/aiplatform/metadata/schema/google/artifact_schema.py b/google/cloud/aiplatform/metadata/schema/google/artifact_schema.py index e52f2f98b5..4941e42480 100644 --- a/google/cloud/aiplatform/metadata/schema/google/artifact_schema.py +++ b/google/cloud/aiplatform/metadata/schema/google/artifact_schema.py @@ -15,7 +15,7 @@ # limitations under the License. import copy -from typing import Optional, Dict +from typing import Optional, Dict, List from google.cloud.aiplatform.compat.types import artifact as gca_artifact from google.cloud.aiplatform.metadata.schema import base_artifact @@ -24,6 +24,12 @@ # The artifact property key for the resource_name _ARTIFACT_PROPERTY_KEY_RESOURCE_NAME = "resourceName" +_CLASSIFICATION_METRICS_AGGREGATION_TYPE = [ + "AGGREGATION_TYPE_UNSPECIFIED", + "MACRO_AVERAGE", + "MICRO_AVERAGE", +] + class VertexDataset(base_artifact.BaseArtifactSchema): """An artifact representing a Vertex Dataset.""" @@ -278,9 +284,17 @@ class ClassificationMetrics(base_artifact.BaseArtifactSchema): def __init__( self, *, + aggregation_type: Optional[str] = None, + aggregation_threshold: Optional[float] = None, + recall: Optional[float] = None, + precision: Optional[float] = None, + f1_score: Optional[float] = None, + accuracy: Optional[float] = None, au_prc: Optional[float] = None, au_roc: Optional[float] = None, log_loss: Optional[float] = None, + confusion_matrix: Optional[utils.ConfusionMatrix] = None, + confidence_metrics: Optional[List[utils.ConfidenceMetric]] = None, artifact_id: Optional[str] = None, uri: Optional[str] = None, display_name: Optional[str] = None, @@ -290,6 +304,22 @@ def __init__( state: Optional[gca_artifact.Artifact.State] = gca_artifact.Artifact.State.LIVE, ): """Args: + aggregation_type (str): + Optional. The way to generate the aggregated metrics. Choose from the following options: + "AGGREGATION_TYPE_UNSPECIFIED": Indicating unset, used for per-class sliced metrics + "MACRO_AVERAGE": The unweighted average, default behavior + "MICRO_AVERAGE": The weighted average + aggregation_threshold (float): + Optional. The threshold used to generate aggregated metrics, default 0 for multi-class classification, 0.5 for binary classification. + recall (float): + Optional. Recall (True Positive Rate) for the given confidence threshold. + precision (float): + Optional. Precision for the given confidence threshold. + f1_score (float): + Optional. The harmonic mean of recall and precision. + accuracy (float): + Optional. Accuracy is the fraction of predictions given the correct label. + For multiclass this is a micro-average metric. au_prc (float): Optional. The Area Under Precision-Recall Curve metric. Micro-averaged for the overall evaluation. @@ -298,6 +328,10 @@ def __init__( Micro-averaged for the overall evaluation. log_loss (float): Optional. 
The Log Loss metric. + confusion_matrix (utils.ConfusionMatrix): + Optional. Aggregated confusion matrix. + confidence_metrics (List[utils.ConfidenceMetric]): + Optional. List of metrics for different confidence thresholds. artifact_id (str): Optional. The portion of the Artifact name with the format. This is globally unique in a metadataStore: @@ -323,12 +357,35 @@ def __init__( check the validity of state transitions. """ extended_metadata = copy.deepcopy(metadata) if metadata else {} - if au_prc: + if aggregation_type: + if aggregation_type not in _CLASSIFICATION_METRICS_AGGREGATION_TYPE: + ## Todo: add negative test case for this + raise ValueError( + "aggregation_type can only be 'AGGREGATION_TYPE_UNSPECIFIED', 'MACRO_AVERAGE', or 'MICRO_AVERAGE'." + ) + extended_metadata["aggregationType"] = aggregation_type + if aggregation_threshold is not None: + extended_metadata["aggregationThreshold"] = aggregation_threshold + if recall is not None: + extended_metadata["recall"] = recall + if precision is not None: + extended_metadata["precision"] = precision + if f1_score is not None: + extended_metadata["f1Score"] = f1_score + if accuracy is not None: + extended_metadata["accuracy"] = accuracy + if au_prc is not None: extended_metadata["auPrc"] = au_prc - if au_roc: + if au_roc is not None: extended_metadata["auRoc"] = au_roc - if log_loss: + if log_loss is not None: extended_metadata["logLoss"] = log_loss + if confusion_matrix: + extended_metadata["confusionMatrix"] = confusion_matrix.to_dict() + if confidence_metrics: + extended_metadata["confidenceMetrics"] = [ + confidence_metric.to_dict() for confidence_metric in confidence_metrics + ] super(ClassificationMetrics, self).__init__( uri=uri, diff --git a/google/cloud/aiplatform/metadata/schema/utils.py b/google/cloud/aiplatform/metadata/schema/utils.py index 1b4a5e4f6c..c6e23735b6 100644 --- a/google/cloud/aiplatform/metadata/schema/utils.py +++ b/google/cloud/aiplatform/metadata/schema/utils.py @@ -143,6 +143,155 @@ def to_dict(self): return results +@dataclass +class AnnotationSpec: + """A class that represents the annotation spec of a Confusion Matrix. + Args: + display_name (str): + Optional. Display name for a column of a confusion matrix. + id (str): + Optional. Id for a column of a confusion matrix. + """ + + display_name: Optional[str] = None + id: Optional[str] = None + + def to_dict(self): + """ML metadata schema dictionary representation of this DataClass""" + results = {} + if self.display_name: + results["displayName"] = self.display_name + if self.id: + results["id"] = self.id + + return results + + +@dataclass +class ConfusionMatrix: + """A class that represents a Confusion Matrix. + Args: + matrix (List[List[int]]): + Required. A 2D array of integers that represets the values for the confusion matrix. + annotation_specs: (List(AnnotationSpec)): + Optional. List of column annotation specs which contains display_name (str) and id (str) + """ + + matrix: List[List[int]] + annotation_specs: Optional[List[AnnotationSpec]] = None + + def to_dict(self): + ## Todo: add a validation to check 'matrix' and 'annotation_specs' have the same length + """ML metadata schema dictionary representation of this DataClass""" + results = {} + if self.annotation_specs: + results["annotationSpecs"] = [ + annotation_spec.to_dict() for annotation_spec in self.annotation_specs + ] + if self.matrix: + results["rows"] = self.matrix + + return results + + +@dataclass +class ConfidenceMetric: + """A class that represents a Confidence Metric. 
+ Args: + confidence_threshold (float): + Required. Metrics are computed with an assumption that the Model never returns predictions with a score lower than this value. + For binary classification this is the positive class threshold. For multi-class classification this is the confidence threshold. + recall (float): + Optional. Recall (True Positive Rate) for the given confidence threshold. + precision (float): + Optional. Precision for the given confidence threshold. + f1_score (float): + Optional. The harmonic mean of recall and precision. + max_predictions (int): + Optional. Metrics are computed with an assumption that the Model always returns at most this many predictions (ordered by their score, descendingly). + But they all still need to meet the `confidence_threshold`. + false_positive_rate (float): + Optional. False Positive Rate for the given confidence threshold. + accuracy (float): + Optional. Accuracy is the fraction of predictions given the correct label. For multiclass this is a micro-average metric. + true_positive_count (int): + Optional. The number of Model created labels that match a ground truth label. + false_positive_count (int): + Optional. The number of Model created labels that do not match a ground truth label. + false_negative_count (int): + Optional. The number of ground truth labels that are not matched by a Model created label. + true_negative_count (int): + Optional. The number of labels that were not created by the Model, but if they would, they would not match a ground truth label. + recall_at_1 (float): + Optional. The Recall (True Positive Rate) when only considering the label that has the highest prediction score + and not below the confidence threshold for each DataItem. + precision_at_1 (float): + Optional. The precision when only considering the label that has the highest prediction score + and not below the confidence threshold for each DataItem. + false_positive_rate_at_1 (float): + Optional. The False Positive Rate when only considering the label that has the highest prediction score + and not below the confidence threshold for each DataItem. + f1_score_at_1 (float): + Optional. The harmonic mean of recallAt1 and precisionAt1. + confusion_matrix (ConfusionMatrix): + Optional. Confusion matrix for the given confidence threshold. 
+ """ + + confidence_threshold: float + recall: Optional[float] = None + precision: Optional[float] = None + f1_score: Optional[float] = None + max_predictions: Optional[int] = None + false_positive_rate: Optional[float] = None + accuracy: Optional[float] = None + true_positive_count: Optional[int] = None + false_positive_count: Optional[int] = None + false_negative_count: Optional[int] = None + true_negative_count: Optional[int] = None + recall_at_1: Optional[float] = None + precision_at_1: Optional[float] = None + false_positive_rate_at_1: Optional[float] = None + f1_score_at_1: Optional[float] = None + confusion_matrix: Optional[ConfusionMatrix] = None + + def to_dict(self): + """ML metadata schema dictionary representation of this DataClass""" + results = {} + results["confidenceThreshold"] = self.confidence_threshold + if self.recall is not None: + results["recall"] = self.recall + if self.precision is not None: + results["precision"] = self.precision + if self.f1_score is not None: + results["f1Score"] = self.f1_score + if self.max_predictions is not None: + results["maxPredictions"] = self.max_predictions + if self.false_positive_rate is not None: + results["falsePositiveRate"] = self.false_positive_rate + if self.accuracy is not None: + results["accuracy"] = self.accuracy + if self.true_positive_count is not None: + results["truePositiveCount"] = self.true_positive_count + if self.false_positive_count is not None: + results["falsePositiveCount"] = self.false_positive_count + if self.false_negative_count is not None: + results["falseNegativeCount"] = self.false_negative_count + if self.true_negative_count is not None: + results["trueNegativeCount"] = self.true_negative_count + if self.recall_at_1 is not None: + results["recallAt1"] = self.recall_at_1 + if self.precision_at_1 is not None: + results["precisionAt1"] = self.precision_at_1 + if self.false_positive_rate_at_1 is not None: + results["falsePositiveRateAt1"] = self.false_positive_rate_at_1 + if self.f1_score_at_1 is not None: + results["f1ScoreAt1"] = self.f1_score_at_1 + if self.confusion_matrix: + results["confusionMatrix"] = self.confusion_matrix.to_dict() + + return results + + def create_uri_from_resource_name(resource_name: str) -> str: """Construct the service URI for a given resource_name. Args: diff --git a/samples/model-builder/conftest.py b/samples/model-builder/conftest.py index 0f984aa7ee..b5bec845ba 100644 --- a/samples/model-builder/conftest.py +++ b/samples/model-builder/conftest.py @@ -1,4 +1,4 @@ -# Copyright 2021 Google LLC +# Copyright 2022 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -643,6 +643,12 @@ def mock_time_series_metrics(): yield mock +@pytest.fixture +def mock_classification_metrics(): + mock = MagicMock() + yield mock + + @pytest.fixture def mock_get_execution(mock_execution): with patch.object(aiplatform, "Execution") as mock_get_execution: @@ -825,6 +831,13 @@ def mock_log_params(): yield mock_log_params +@pytest.fixture +def mock_log_classification_metrics(): + with patch.object(aiplatform, "log_classification_metrics") as mock_log_metrics: + mock_log_metrics.return_value = None + yield mock_log_metrics + + @pytest.fixture def mock_log_pipeline_job(): with patch.object(aiplatform, "log") as mock_log_pipeline_job: @@ -883,6 +896,13 @@ def mock_get_time_series_metrics(mock_time_series_metrics, mock_experiment_run): yield mock_get_time_series_metrics +@pytest.fixture +def mock_get_classification_metrics(mock_classification_metrics, mock_experiment_run): + with patch.object(mock_experiment_run, "get_classification_metrics") as mock_get_classification_metrics: + mock_get_classification_metrics.return_value = mock_classification_metrics + yield mock_get_classification_metrics + + """ ---------------------------------------------------------------------------- Model Versioning Fixtures diff --git a/samples/model-builder/experiment_tracking/get_experiment_run_classification_metrics_sample.py b/samples/model-builder/experiment_tracking/get_experiment_run_classification_metrics_sample.py new file mode 100644 index 0000000000..284ed9f968 --- /dev/null +++ b/samples/model-builder/experiment_tracking/get_experiment_run_classification_metrics_sample.py @@ -0,0 +1,34 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Union + +from google.cloud import aiplatform + + +# [START aiplatform_sdk_get_experiment_run_classification_metrics_sample] +def get_experiment_run_classification_metrics_sample( + run_name: str, + experiment: Union[str, aiplatform.Experiment], + project: str, + location: str, +) -> List[Dict[str, Union[str, List]]]: + experiment_run = aiplatform.ExperimentRun( + run_name=run_name, experiment=experiment, project=project, location=location + ) + + return experiment_run.get_classification_metrics() + + +# [END aiplatform_sdk_get_experiment_run_classification_metrics_sample] diff --git a/samples/model-builder/experiment_tracking/get_experiment_run_classification_metrics_sample_test.py b/samples/model-builder/experiment_tracking/get_experiment_run_classification_metrics_sample_test.py new file mode 100644 index 0000000000..3f6deb80bf --- /dev/null +++ b/samples/model-builder/experiment_tracking/get_experiment_run_classification_metrics_sample_test.py @@ -0,0 +1,34 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import get_experiment_run_classification_metrics_sample + +import pytest + +import test_constants as constants + + +@pytest.mark.usefixtures("mock_get_run") +def test_get_experiment_run_classification_metrics_sample(mock_get_classification_metrics, mock_classification_metrics): + + classification_metrics = get_experiment_run_classification_metrics_sample.get_experiment_run_classification_metrics_sample( + run_name=constants.EXPERIMENT_RUN_NAME, + experiment=constants.EXPERIMENT_NAME, + project=constants.PROJECT, + location=constants.LOCATION, + ) + + mock_get_classification_metrics.assert_called_with() + + assert classification_metrics is mock_classification_metrics diff --git a/samples/model-builder/experiment_tracking/log_classification_metrics_sample.py b/samples/model-builder/experiment_tracking/log_classification_metrics_sample.py new file mode 100644 index 0000000000..e178356c9c --- /dev/null +++ b/samples/model-builder/experiment_tracking/log_classification_metrics_sample.py @@ -0,0 +1,47 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional + +from google.cloud import aiplatform + + +# [START aiplatform_sdk_log_classification_metrics_sample] +def log_classification_metrics_sample( + experiment_name: str, + run_name: str, + project: str, + location: str, + labels: Optional[List[str]] = None, + matrix: Optional[List[List[int]]] = None, + fpr: Optional[List[float]] = None, + tpr: Optional[List[float]] = None, + threshold: Optional[List[float]] = None, + display_name: Optional[str] = None, +) -> None: + aiplatform.init(experiment=experiment_name, project=project, location=location) + + aiplatform.start_run(run=run_name, resume=True) + + aiplatform.log_classification_metrics( + labels=labels, + matrix=matrix, + fpr=fpr, + tpr=tpr, + threshold=threshold, + display_name=display_name, + ) + + +# [END aiplatform_sdk_log_classification_metrics_sample] diff --git a/samples/model-builder/experiment_tracking/log_classification_metrics_sample_test.py b/samples/model-builder/experiment_tracking/log_classification_metrics_sample_test.py new file mode 100644 index 0000000000..c15fd0b123 --- /dev/null +++ b/samples/model-builder/experiment_tracking/log_classification_metrics_sample_test.py @@ -0,0 +1,45 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import log_classification_metrics_sample + +import pytest + +import test_constants as constants + + +@pytest.mark.usefixtures("mock_sdk_init", "mock_start_run") +def test_log_metrics_sample(mock_log_classification_metrics): + + log_classification_metrics_sample.log_classification_metrics_sample( + experiment_name=constants.EXPERIMENT_NAME, + run_name=constants.EXPERIMENT_RUN_NAME, + project=constants.PROJECT, + location=constants.LOCATION, + labels=constants.CLASSIFICATION_METRICS["labels"], + matrix=constants.CLASSIFICATION_METRICS["matrix"], + fpr=constants.CLASSIFICATION_METRICS["fpr"], + tpr=constants.CLASSIFICATION_METRICS["tpr"], + threshold=constants.CLASSIFICATION_METRICS["threshold"], + display_name=constants.CLASSIFICATION_METRICS["display_name"], + ) + + mock_log_classification_metrics.assert_called_with( + labels=constants.CLASSIFICATION_METRICS["labels"], + matrix=constants.CLASSIFICATION_METRICS["matrix"], + fpr=constants.CLASSIFICATION_METRICS["fpr"], + tpr=constants.CLASSIFICATION_METRICS["tpr"], + threshold=constants.CLASSIFICATION_METRICS["threshold"], + display_name=constants.CLASSIFICATION_METRICS["display_name"], + ) diff --git a/samples/model-builder/test_constants.py b/samples/model-builder/test_constants.py index 1ff2b1d96e..76f8d7673b 100644 --- a/samples/model-builder/test_constants.py +++ b/samples/model-builder/test_constants.py @@ -272,7 +272,14 @@ METRICS = {"accuracy": 0.1} PARAMS = {"learning_rate": 0.1} - +CLASSIFICATION_METRICS = { + "display_name": "my-classification-metrics", + "labels": ["cat", "dog"], + "matrix": [[9, 1], [1, 9]], + "fpr": [0.1, 0.5, 0.9], + "tpr": [0.1, 0.7, 0.9], + "threshold": [0.9, 0.5, 0.1], +} TEMPLATE_PATH = "pipeline.json" STEP = 1 diff --git a/tests/system/aiplatform/test_experiments.py b/tests/system/aiplatform/test_experiments.py index ada7c68f82..83d96d945e 100644 --- a/tests/system/aiplatform/test_experiments.py +++ b/tests/system/aiplatform/test_experiments.py @@ -37,6 +37,15 @@ _TIME_SERIES_METRIC_KEY = "accuracy" +_CLASSIFICATION_METRICS = { + "display_name": "my-classification-metrics", + "labels": ["cat", "dog"], + "matrix": [[9, 1], [1, 9]], + "fpr": [0.1, 0.5, 0.9], + "tpr": [0.1, 0.7, 0.9], + "threshold": [0.9, 0.5, 0.1], +} + @pytest.mark.usefixtures( "prepare_staging_bucket", "delete_staging_bucket", "tear_down_resources" @@ -145,6 +154,28 @@ def test_log_time_series_metrics(self): _TIME_SERIES_METRIC_KEY: [float(value) for value in range(5)], } + def test_log_classification_metrics(self, shared_state): + aiplatform.init( + project=e2e_base._PROJECT, + location=e2e_base._LOCATION, + experiment=self._experiment_name, + ) + aiplatform.start_run(_RUN, resume=True) + aiplatform.log_classification_metrics( + display_name=_CLASSIFICATION_METRICS["display_name"], + labels=_CLASSIFICATION_METRICS["labels"], + matrix=_CLASSIFICATION_METRICS["matrix"], + fpr=_CLASSIFICATION_METRICS["fpr"], + tpr=_CLASSIFICATION_METRICS["tpr"], + threshold=_CLASSIFICATION_METRICS["threshold"], + ) + + run = aiplatform.ExperimentRun(run_name=_RUN, experiment=self._experiment_name) + metrics = run.get_classification_metrics()[0] + 
metric_artifact = aiplatform.Artifact(metrics.pop("id")) + assert metrics == _CLASSIFICATION_METRICS + metric_artifact.delete() + def test_create_artifact(self, shared_state): ds = aiplatform.Artifact.create( schema_title="system.Dataset", diff --git a/tests/unit/aiplatform/test_metadata.py b/tests/unit/aiplatform/test_metadata.py index ba5a527683..a8a73b899e 100644 --- a/tests/unit/aiplatform/test_metadata.py +++ b/tests/unit/aiplatform/test_metadata.py @@ -56,6 +56,7 @@ from google.cloud.aiplatform.metadata import metadata from google.cloud.aiplatform.metadata import metadata_store from google.cloud.aiplatform.metadata import utils as metadata_utils + from google.cloud.aiplatform import utils from test_pipeline_jobs import mock_pipeline_service_get # noqa: F401 @@ -123,6 +124,16 @@ _TEST_METRICS = {_TEST_METRIC_KEY_1: 222, _TEST_METRIC_KEY_2: 1} _TEST_OTHER_METRICS = {_TEST_METRIC_KEY_2: 0.9} +# classification_metrics +_TEST_CLASSIFICATION_METRICS = { + "display_name": "my-classification-metrics", + "labels": ["cat", "dog"], + "matrix": [[9, 1], [1, 9]], + "fpr": [0.1, 0.5, 0.9], + "tpr": [0.1, 0.7, 0.9], + "threshold": [0.9, 0.5, 0.1], +} + # schema _TEST_WRONG_SCHEMA_TITLE = "system.WrongSchema" @@ -408,6 +419,50 @@ def query_execution_inputs_and_outputs_mock(): yield query_execution_inputs_and_outputs_mock +_TEST_CLASSIFICATION_METRICS_METADATA = { + "confusionMatrix": { + "annotationSpecs": [{"displayName": "cat"}, {"displayName": "dog"}], + "rows": [[9, 1], [1, 9]], + }, + "confidenceMetrics": [ + {"confidenceThreshold": 0.9, "recall": 0.1, "falsePositiveRate": 0.1}, + {"confidenceThreshold": 0.5, "recall": 0.7, "falsePositiveRate": 0.5}, + {"confidenceThreshold": 0.1, "recall": 0.9, "falsePositiveRate": 0.9}, + ], +} + +_TEST_CLASSIFICATION_METRICS_ARTIFACT = GapicArtifact( + name=_TEST_ARTIFACT_NAME, + display_name=_TEST_CLASSIFICATION_METRICS["display_name"], + schema_title=constants.GOOGLE_CLASSIFICATION_METRICS, + schema_version=constants._DEFAULT_SCHEMA_VERSION, + metadata=_TEST_CLASSIFICATION_METRICS_METADATA, + state=GapicArtifact.State.LIVE, +) + + +@pytest.fixture +def create_classification_metrics_artifact_mock(): + with patch.object( + MetadataServiceClient, "create_artifact" + ) as create_classification_metrics_artifact_mock: + create_classification_metrics_artifact_mock.return_value = ( + _TEST_CLASSIFICATION_METRICS_ARTIFACT + ) + yield create_classification_metrics_artifact_mock + + +@pytest.fixture +def get_classification_metrics_artifact_mock(): + with patch.object( + MetadataServiceClient, "get_artifact" + ) as get_classification_metrics_artifact_mock: + get_classification_metrics_artifact_mock.return_value = ( + _TEST_CLASSIFICATION_METRICS_ARTIFACT + ) + yield get_classification_metrics_artifact_mock + + @pytest.fixture def get_artifact_mock(): with patch.object(MetadataServiceClient, "get_artifact") as get_artifact_mock: @@ -1131,6 +1186,56 @@ def test_log_metrics(self, update_context_mock): update_context_mock.assert_called_once_with(context=_TRUE_CONTEXT) + @pytest.mark.usefixtures( + "get_metadata_store_mock", + "get_experiment_mock", + "create_experiment_run_context_mock", + "add_context_children_mock", + ) + def test_log_classification_metrics( + self, + create_classification_metrics_artifact_mock, + get_classification_metrics_artifact_mock, + add_context_artifacts_and_executions_mock, + ): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + experiment=_TEST_EXPERIMENT, + ) + aiplatform.start_run(_TEST_RUN) + 
aiplatform.log_classification_metrics( + display_name=_TEST_CLASSIFICATION_METRICS["display_name"], + labels=_TEST_CLASSIFICATION_METRICS["labels"], + matrix=_TEST_CLASSIFICATION_METRICS["matrix"], + fpr=_TEST_CLASSIFICATION_METRICS["fpr"], + tpr=_TEST_CLASSIFICATION_METRICS["tpr"], + threshold=_TEST_CLASSIFICATION_METRICS["threshold"], + ) + + expected_artifact = GapicArtifact( + display_name=_TEST_CLASSIFICATION_METRICS["display_name"], + schema_title=constants.GOOGLE_CLASSIFICATION_METRICS, + schema_version=constants._DEFAULT_SCHEMA_VERSION, + metadata=_TEST_CLASSIFICATION_METRICS_METADATA, + state=GapicArtifact.State.LIVE, + ) + create_classification_metrics_artifact_mock.assert_called_once_with( + parent=_TEST_PARENT, + artifact=expected_artifact, + artifact_id=None, + ) + + get_classification_metrics_artifact_mock.assert_called_once_with( + name=_TEST_ARTIFACT_NAME, retry=base._DEFAULT_RETRY + ) + + add_context_artifacts_and_executions_mock.assert_called_once_with( + context=_TEST_EXPERIMENT_RUN_CONTEXT_NAME, + artifacts=[_TEST_ARTIFACT_NAME], + executions=None, + ) + @pytest.mark.usefixtures( "get_metadata_store_mock", "get_experiment_mock", diff --git a/tests/unit/aiplatform/test_metadata_schema.py b/tests/unit/aiplatform/test_metadata_schema.py index 0003838968..8af4d351da 100644 --- a/tests/unit/aiplatform/test_metadata_schema.py +++ b/tests/unit/aiplatform/test_metadata_schema.py @@ -64,7 +64,7 @@ _TEST_DESCRIPTION = "test description" _TEST_METADATA = {"test-param1": 1, "test-param2": "test-value", "test-param3": True} _TEST_UPDATED_METADATA = { - "test-param1": 2, + "test-param1": 2.0, "test-param2": "test-value-1", "test-param3": False, } @@ -748,14 +748,46 @@ def test_classification_metrics_title_is_set_correctly(self): assert artifact.schema_title == "google.ClassificationMetrics" def test_classification_metrics_constructor_parameters_are_set_correctly(self): + aggregation_type = "MACRO_AVERAGE" + aggregation_threshold = 0.5 + recall = 0.5 + precision = 0.5 + f1_score = 0.5 + accuracy = 0.5 au_prc = 1.0 au_roc = 2.0 log_loss = 0.5 + confusion_matrix = utils.ConfusionMatrix( + matrix=[[9.0, 1.0], [1.0, 9.0]], + annotation_specs=[ + utils.AnnotationSpec(display_name="cat"), + utils.AnnotationSpec(display_name="dog"), + ], + ) + confidence_metrics = [ + utils.ConfidenceMetric( + confidence_threshold=0.9, recall=0.1, false_positive_rate=0.1 + ), + utils.ConfidenceMetric( + confidence_threshold=0.5, recall=0.5, false_positive_rate=0.7 + ), + utils.ConfidenceMetric( + confidence_threshold=0.1, recall=0.9, false_positive_rate=0.9 + ), + ] artifact = google_artifact_schema.ClassificationMetrics( + aggregation_type=aggregation_type, + aggregation_threshold=aggregation_threshold, + recall=recall, + precision=precision, + f1_score=f1_score, + accuracy=accuracy, au_prc=au_prc, au_roc=au_roc, log_loss=log_loss, + confusion_matrix=confusion_matrix, + confidence_metrics=confidence_metrics, artifact_id=_TEST_ARTIFACT_ID, uri=_TEST_URI, display_name=_TEST_DISPLAY_NAME, @@ -764,12 +796,22 @@ def test_classification_metrics_constructor_parameters_are_set_correctly(self): metadata=_TEST_UPDATED_METADATA, ) expected_metadata = { - "test-param1": 2.0, - "test-param2": "test-value-1", - "test-param3": False, - "auPrc": 1.0, - "auRoc": 2.0, - "logLoss": 0.5, + "test-param1": _TEST_UPDATED_METADATA["test-param1"], + "test-param2": _TEST_UPDATED_METADATA["test-param2"], + "test-param3": _TEST_UPDATED_METADATA["test-param3"], + "aggregationType": aggregation_type, + "aggregationThreshold": 
aggregation_threshold, + "recall": recall, + "precision": precision, + "f1Score": f1_score, + "accuracy": accuracy, + "auPrc": au_prc, + "auRoc": au_roc, + "logLoss": log_loss, + "confusionMatrix": confusion_matrix.to_dict(), + "confidenceMetrics": [ + confidence_metric.to_dict() for confidence_metric in confidence_metrics + ], } assert artifact.artifact_id == _TEST_ARTIFACT_ID
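Taken together, the changes give the experiment-tracking surface a log/read round trip for classification metrics. A minimal end-to-end sketch follows; the project, location, experiment, and run names are placeholders, not values from this change:

```
from google.cloud import aiplatform

# Placeholder project/location/experiment/run names.
aiplatform.init(
    project="my-project", location="us-central1", experiment="my-experiment"
)
aiplatform.start_run("my-run")

# Log a confusion matrix and an ROC curve to the active run.
aiplatform.log_classification_metrics(
    display_name="my-classification-metrics",
    labels=["cat", "dog"],
    matrix=[[9, 1], [1, 9]],
    fpr=[0.1, 0.5, 0.9],
    tpr=[0.1, 0.7, 0.9],
    threshold=[0.9, 0.5, 0.1],
)
aiplatform.end_run()

# Read the metrics back: each entry is a dict with "id", "display_name",
# "labels", "matrix", "fpr", "tpr", and "threshold" keys.
my_run = aiplatform.ExperimentRun("my-run", experiment="my-experiment")
print(my_run.get_classification_metrics()[0])
```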