From 3ca2d283c8b1a73b77f7bba9cfa72dd59059bc50 Mon Sep 17 00:00:00 2001 From: Jaycee Li <102714969+jaycee-li@users.noreply.github.com> Date: Mon, 26 Sep 2022 09:00:06 -0700 Subject: [PATCH 01/14] Experiments complex metrics (#8) * feat: new class and API for metrics * update system test * update high level log method * fix system test * update example * change from system schema to google schema --- google/cloud/aiplatform/__init__.py | 4 + .../metadata/experiment_run_resource.py | 156 ++++++++++++++++++ google/cloud/aiplatform/metadata/metadata.py | 59 ++++++- samples/model-builder/conftest.py | 6 + .../log_classification_metrics_sample.py | 47 ++++++ .../log_classification_metrics_sample_test.py | 38 +++++ samples/model-builder/test_constants.py | 9 +- tests/system/aiplatform/test_experiments.py | 31 ++++ tests/unit/aiplatform/test_metadata.py | 72 ++++++++ 9 files changed, 419 insertions(+), 3 deletions(-) create mode 100644 samples/model-builder/experiment_tracking/log_classification_metrics_sample.py create mode 100644 samples/model-builder/experiment_tracking/log_classification_metrics_sample_test.py diff --git a/google/cloud/aiplatform/__init__.py b/google/cloud/aiplatform/__init__.py index 8107756229..bc1043ef03 100644 --- a/google/cloud/aiplatform/__init__.py +++ b/google/cloud/aiplatform/__init__.py @@ -86,6 +86,9 @@ log_params = metadata.metadata._experiment_tracker.log_params log_metrics = metadata.metadata._experiment_tracker.log_metrics +log_classification_metrics = ( + metadata.metadata._experiment_tracker.log_classification_metrics +) get_experiment_df = metadata.metadata._experiment_tracker.get_experiment_df start_run = metadata.metadata._experiment_tracker.start_run start_execution = metadata.metadata._experiment_tracker.start_execution @@ -110,6 +113,7 @@ "log", "log_params", "log_metrics", + "log_classification_metrics", "log_time_series_metrics", "get_experiment_df", "get_pipeline_df", diff --git a/google/cloud/aiplatform/metadata/experiment_run_resource.py b/google/cloud/aiplatform/metadata/experiment_run_resource.py index d61b62b7b2..31026fb23b 100644 --- a/google/cloud/aiplatform/metadata/experiment_run_resource.py +++ b/google/cloud/aiplatform/metadata/experiment_run_resource.py @@ -39,6 +39,7 @@ from google.cloud.aiplatform.metadata import metadata from google.cloud.aiplatform.metadata import resource from google.cloud.aiplatform.metadata import utils as metadata_utils +from google.cloud.aiplatform.metadata import schema from google.cloud.aiplatform.tensorboard import tensorboard_resource from google.cloud.aiplatform.utils import rest_utils @@ -990,6 +991,103 @@ def log_metrics(self, metrics: Dict[str, Union[float, int, str]]): # TODO: query the latest metrics artifact resource before logging. self._metadata_node.update(metadata={constants._METRIC_KEY: metrics}) + def log_classification_metrics( + self, + *, + labels: Optional[List[str]] = None, + matrix: Optional[List[List[int]]] = None, + fpr: Optional[List[float]] = None, + tpr: Optional[List[float]] = None, + threshold: Optional[List[float]] = None, + display_name: Optional[str] = None, + ): + """Create an artifact for classification metrics and log to ExperimentRun. Currently support confusion matrix and ROC curve. 
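+        Example usage: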
+ + ``` + my_run = aiplatform.ExperimentRun('my-run', experiment='my-experiment') + my_run.log_classification_metrics( + display_name='my-classification-metrics', + labels=['cat', 'dog'], + matrix=[[9, 1], [1, 9]], + fpr=[0.1, 0.5, 0.9], + tpr=[0.1, 0.7, 0.9], + threshold=[0.9, 0.5, 0.1], + ) + ``` + + Args: + labels (List[str]): + Optional. List of label names for the confusion matrix. Must be set if 'matrix' is set. + matrix (List[List[int]): + Optional. Values for the confusion matrix. Must be set if 'labels' is set. + fpr (List[float]): + Optional. List of false positive rates for the ROC curve. Must be set if 'tpr' or 'thresholds' is set. + tpr (List[float]): + Optional. List of true positive rates for the ROC curve. Must be set if 'fpr' or 'thresholds' is set. + threshold (List[float]): + Optional. List of thresholds for the ROC curve. Must be set if 'fpr' or 'tpr' is set. + display_name (str): + Optional. The user-defined name for the classification metric artifact. + + Raises: + ValueError: if 'labels' and 'matrix' are not set together + or if 'labels' and 'matrix' are not in the same length + or if 'fpr' and 'tpr' and 'threshold' are not set together + or if 'fpr' and 'tpr' and 'threshold' are not in the same length + """ + if (labels or matrix) and not (labels and matrix): + raise ValueError("labels and matrix must be set together.") + + if (fpr or tpr or threshold) and not (fpr and tpr and threshold): + raise ValueError("fpr, tpr, and thresholds must be set together.") + + metadata = {} + if labels and matrix: + if len(matrix) != len(labels): + raise ValueError( + "Length of labels and matrix must be the same. " + "Got lengths {} and {} respectively.".format( + len(labels), len(matrix) + ) + ) + + confusion_matrix = { + "annotationSpecs": [{"displayName": label} for label in labels], + "rows": matrix, + } + metadata["confusionMatrix"] = confusion_matrix + + if fpr and tpr and threshold: + if ( + len(fpr) != len(tpr) + or len(fpr) != len(threshold) + or len(tpr) != len(threshold) + ): + raise ValueError( + "Length of fpr, tpr and threshold must be the same. " + "Got lengths {}, {} and {} respectively.".format( + len(fpr), len(tpr), len(threshold) + ) + ) + + metadata["confidenceMetrics"] = [ + { + "confidenceThreshold": threshold[i], + "recall": tpr[i], + "falsePositiveRate": fpr[i], + } + for i in range(len(fpr)) + ] + + classification_metrics = schema.google.artifact_schema.ClassificationMetrics( + display_name=display_name, + metadata=metadata, + ) + classfication_metrics = classification_metrics.create() + self._metadata_node.add_artifacts_and_executions( + artifact_resource_names=[classfication_metrics.resource_name] + ) + @_v1_not_supported def get_time_series_data_frame(self) -> "pd.DataFrame": # noqa: F821 """Returns all time series in this Run as a DataFrame. @@ -1149,6 +1247,64 @@ def get_metrics(self) -> Dict[str, Union[float, int, str]]: else: return self._metadata_node.metadata[constants._METRIC_KEY] + def get_classification_metrics(self) -> List[Dict[str, Union[str, List]]]: + """Get all the classification metrics logged to this run. 
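+        Example usage: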
+ + ``` + my_run = aiplatform.ExperimentRun('my-run', experiment='my-experiment') + metric = my_run.get_classification_metrics()[0] + print(metric) + ## print result: + { + "id": "e6c893a4-222e-4c60-a028-6a3b95dfc109", + "display_name": "my-classification-metrics", + "labels": ["cat", "dog"], + "matrix": [[9,1], [1,9]], + "fpr": [0.1, 0.5, 0.9], + "tpr": [0.1, 0.7, 0.9], + "thresholds": [0.9, 0.5, 0.1] + } + ``` + + Returns: + List of classification metrics logged to this experiment run. + """ + + artifact_list = artifact.Artifact.list( + filter=metadata_utils._make_filter_string( + in_context=[self.resource_name], + schema_title="google.ClassificationMetrics", + ), + project=self.project, + location=self.location, + credentials=self.credentials, + ) + + metrics = [] + for metric_artifact in artifact_list: + metric = {} + metric["id"] = metric_artifact.name + metric["display_name"] = metric_artifact.display_name + metadata = metric_artifact.metadata + if "confusionMatrix" in metadata: + metric["labels"] = [ + d["displayName"] + for d in metadata["confusionMatrix"]["annotationSpecs"] + ] + metric["matrix"] = metadata["confusionMatrix"]["rows"] + + if "confidenceMetrics" in metadata: + metric["fpr"] = [ + d["falsePositiveRate"] for d in metadata["confidenceMetrics"] + ] + metric["tpr"] = [d["recall"] for d in metadata["confidenceMetrics"]] + metric["threshold"] = [ + d["confidenceThreshold"] for d in metadata["confidenceMetrics"] + ] + metrics.append(metric) + + return metrics + @_v1_not_supported def associate_execution(self, execution: execution.Execution): """Associate an execution to this experiment run. diff --git a/google/cloud/aiplatform/metadata/metadata.py b/google/cloud/aiplatform/metadata/metadata.py index 6f67a6ddf6..d103a79733 100644 --- a/google/cloud/aiplatform/metadata/metadata.py +++ b/google/cloud/aiplatform/metadata/metadata.py @@ -15,8 +15,7 @@ # limitations under the License. # - -from typing import Dict, Union, Optional, Any +from typing import Dict, Union, Optional, Any, List from google.api_core import exceptions from google.auth import credentials as auth_credentials @@ -371,6 +370,62 @@ def log_metrics(self, metrics: Dict[str, Union[float, int, str]]): # query the latest metrics artifact resource before logging. self._experiment_run.log_metrics(metrics=metrics) + def log_classification_metrics( + self, + *, + labels: Optional[List[str]] = None, + matrix: Optional[List[List[int]]] = None, + fpr: Optional[List[float]] = None, + tpr: Optional[List[float]] = None, + threshold: Optional[List[float]] = None, + display_name: Optional[str] = None, + ): + """Create an artifact for classification metrics and log to ExperimentRun. Currently support confusion matrix and ROC curve. + + ``` + my_run = aiplatform.ExperimentRun('my-run', experiment='my-experiment') + my_run.log_classification_metrics( + display_name='my-classification-metrics', + labels=['cat', 'dog'], + matrix=[[9, 1], [1, 9]], + fpr=[0.1, 0.5, 0.9], + tpr=[0.1, 0.7, 0.9], + threshold=[0.9, 0.5, 0.1], + ) + ``` + + Args: + labels (List[str]): + Optional. List of label names for the confusion matrix. Must be set if 'matrix' is set. + matrix (List[List[int]): + Optional. Values for the confusion matrix. Must be set if 'labels' is set. + fpr (List[float]): + Optional. List of false positive rates for the ROC curve. Must be set if 'tpr' or 'thresholds' is set. + tpr (List[float]): + Optional. List of true positive rates for the ROC curve. Must be set if 'fpr' or 'thresholds' is set. 
+ threshold (List[float]): + Optional. List of thresholds for the ROC curve. Must be set if 'fpr' or 'tpr' is set. + display_name (str): + Optional. The user-defined name for the classification metric artifact. + + Raises: + ValueError: if 'labels' and 'matrix' are not set together + or if 'labels' and 'matrix' are not in the same length + or if 'fpr' and 'tpr' and 'threshold' are not set together + or if 'fpr' and 'tpr' and 'threshold' are not in the same length + """ + + self._validate_experiment_and_run(method_name="log_classification_metrics") + # query the latest metrics artifact resource before logging. + self._experiment_run.log_classification_metrics( + display_name=display_name, + labels=labels, + matrix=matrix, + fpr=fpr, + tpr=tpr, + threshold=threshold, + ) + def _validate_experiment_and_run(self, method_name: str): """Validates Experiment and Run are set and raises informative error message. diff --git a/samples/model-builder/conftest.py b/samples/model-builder/conftest.py index 0f984aa7ee..a94ba6fe15 100644 --- a/samples/model-builder/conftest.py +++ b/samples/model-builder/conftest.py @@ -824,6 +824,12 @@ def mock_log_params(): mock_log_params.return_value = None yield mock_log_params +@pytest.fixture +def mock_log_classification_metrics(): + with patch.object(aiplatform, "log_classification_metrics") as mock_log_metrics: + mock_log_metrics.return_value = None + yield mock_log_metrics + @pytest.fixture def mock_log_pipeline_job(): diff --git a/samples/model-builder/experiment_tracking/log_classification_metrics_sample.py b/samples/model-builder/experiment_tracking/log_classification_metrics_sample.py new file mode 100644 index 0000000000..7b908e5b13 --- /dev/null +++ b/samples/model-builder/experiment_tracking/log_classification_metrics_sample.py @@ -0,0 +1,47 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
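+# Sample: logs a confusion matrix and ROC curve values to a Vertex AI experiment run.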
+ +from typing import List, Optional + +from google.cloud import aiplatform + + +# [START aiplatform_sdk_log_classification_metrics_sample] +def log_classification_metrics_sample( + experiment_name: str, + run_name: str, + project: str, + location: str, + labels: Optional[List[str]] = None, + matrix: Optional[List[List[int]]] = None, + fpr: Optional[List[float]] = None, + tpr: Optional[List[float]] = None, + threshold: Optional[List[float]] = None, + display_name: Optional[str] = None, +): + aiplatform.init(experiment=experiment_name, project=project, location=location) + + aiplatform.start_run(run=run_name, resume=True) + + aiplatform.log_classification_metrics( + labels=labels, + matrix=matrix, + fpr=fpr, + tpr=tpr, + threshold=threshold, + display_name=display_name, + ) + + +# [END aiplatform_sdk_log_params_sample] diff --git a/samples/model-builder/experiment_tracking/log_classification_metrics_sample_test.py b/samples/model-builder/experiment_tracking/log_classification_metrics_sample_test.py new file mode 100644 index 0000000000..794fcc413b --- /dev/null +++ b/samples/model-builder/experiment_tracking/log_classification_metrics_sample_test.py @@ -0,0 +1,38 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import log_classification_metrics_sample + +import pytest + +import test_constants as constants + + +@pytest.mark.usefixtures("mock_sdk_init", "mock_start_run") +def test_log_metrics_sample(mock_log_classification_metrics): + + log_classification_metrics_sample.log_classification_metrics_sample( + experiment_name=constants.EXPERIMENT_NAME, + run_name=constants.EXPERIMENT_RUN_NAME, + project=constants.PROJECT, + location=constants.LOCATION, + labels=constants.CLASSIFICATION_METRICS["labels"], + matrix=constants.CLASSIFICATION_METRICS["matrix"], + fpr=constants.CLASSIFICATION_METRICS["fpr"], + tpr=constants.CLASSIFICATION_METRICS["tpr"], + threshold=constants.CLASSIFICATION_METRICS["threshold"], + display_name=constants.CLASSIFICATION_METRICS["display_name"], + ) + + mock_log_classification_metrics.assert_called_with(constants.CLASSIFICATION_METRICS) diff --git a/samples/model-builder/test_constants.py b/samples/model-builder/test_constants.py index 1ff2b1d96e..76f8d7673b 100644 --- a/samples/model-builder/test_constants.py +++ b/samples/model-builder/test_constants.py @@ -272,7 +272,14 @@ METRICS = {"accuracy": 0.1} PARAMS = {"learning_rate": 0.1} - +CLASSIFICATION_METRICS = { + "display_name": "my-classification-metrics", + "labels": ["cat", "dog"], + "matrix": [[9, 1], [1, 9]], + "fpr": [0.1, 0.5, 0.9], + "tpr": [0.1, 0.7, 0.9], + "threshold": [0.9, 0.5, 0.1], +} TEMPLATE_PATH = "pipeline.json" STEP = 1 diff --git a/tests/system/aiplatform/test_experiments.py b/tests/system/aiplatform/test_experiments.py index ada7c68f82..83d96d945e 100644 --- a/tests/system/aiplatform/test_experiments.py +++ b/tests/system/aiplatform/test_experiments.py @@ -37,6 +37,15 @@ _TIME_SERIES_METRIC_KEY = "accuracy" +_CLASSIFICATION_METRICS = { + "display_name": 
"my-classification-metrics", + "labels": ["cat", "dog"], + "matrix": [[9, 1], [1, 9]], + "fpr": [0.1, 0.5, 0.9], + "tpr": [0.1, 0.7, 0.9], + "threshold": [0.9, 0.5, 0.1], +} + @pytest.mark.usefixtures( "prepare_staging_bucket", "delete_staging_bucket", "tear_down_resources" @@ -145,6 +154,28 @@ def test_log_time_series_metrics(self): _TIME_SERIES_METRIC_KEY: [float(value) for value in range(5)], } + def test_log_classification_metrics(self, shared_state): + aiplatform.init( + project=e2e_base._PROJECT, + location=e2e_base._LOCATION, + experiment=self._experiment_name, + ) + aiplatform.start_run(_RUN, resume=True) + aiplatform.log_classification_metrics( + display_name=_CLASSIFICATION_METRICS["display_name"], + labels=_CLASSIFICATION_METRICS["labels"], + matrix=_CLASSIFICATION_METRICS["matrix"], + fpr=_CLASSIFICATION_METRICS["fpr"], + tpr=_CLASSIFICATION_METRICS["tpr"], + threshold=_CLASSIFICATION_METRICS["threshold"], + ) + + run = aiplatform.ExperimentRun(run_name=_RUN, experiment=self._experiment_name) + metrics = run.get_classification_metrics()[0] + metric_artifact = aiplatform.Artifact(metrics.pop("id")) + assert metrics == _CLASSIFICATION_METRICS + metric_artifact.delete() + def test_create_artifact(self, shared_state): ds = aiplatform.Artifact.create( schema_title="system.Dataset", diff --git a/tests/unit/aiplatform/test_metadata.py b/tests/unit/aiplatform/test_metadata.py index ba5a527683..3dacac2547 100644 --- a/tests/unit/aiplatform/test_metadata.py +++ b/tests/unit/aiplatform/test_metadata.py @@ -123,6 +123,16 @@ _TEST_METRICS = {_TEST_METRIC_KEY_1: 222, _TEST_METRIC_KEY_2: 1} _TEST_OTHER_METRICS = {_TEST_METRIC_KEY_2: 0.9} +# classification_metrics +_TEST_CLASSIFICATION_METRICS = { + "display_name": "my-classification-metrics", + "labels": ["cat", "dog"], + "matrix": [[9, 1], [1, 9]], + "fpr": [0.1, 0.5, 0.9], + "tpr": [0.1, 0.7, 0.9], + "threshold": [0.9, 0.5, 0.1], +} + # schema _TEST_WRONG_SCHEMA_TITLE = "system.WrongSchema" @@ -408,6 +418,34 @@ def query_execution_inputs_and_outputs_mock(): yield query_execution_inputs_and_outputs_mock +_TEST_CLASSIFICATION_METRICS_METADATA = { + "confusionMatrix": { + "annotationSpecs": [{"displayName": "cat"}, {"displayName": "dog"}], + "rows": [{"row": [9, 1]}, {"row": [1, 9]}], + }, + "confidenceMetrics": [ + {"confidenceThreshold": 0.9, "recall": 0.1, "falsePositiveRate": 0.1}, + {"confidenceThreshold": 0.5, "recall": 0.5, "falsePositiveRate": 0.7}, + {"confidenceThreshold": 0.1, "recall": 0.9, "falsePositiveRate": 0.9}, + ], +} + +_TEST_CLASSIFICATION_METRICS_ARTIFACT = GapicArtifact( + name=_TEST_ARTIFACT_NAME, + display_name=_TEST_CLASSIFICATION_METRICS["display_name"], + schema_title=constants.GOOGLE_CLASSIFICATION_METRICS, + schema_version=constants._DEFAULT_SCHEMA_VERSION, + metadata=_TEST_CLASSIFICATION_METRICS_METADATA, +) + + +@pytest.fixture +def create_artifact_mock(): + with patch.object(MetadataServiceClient, "create_artifact") as create_artifact_mock: + create_artifact_mock.return_value = _TEST_CLASSIFICATION_METRICS_ARTIFACT + yield create_artifact_mock + + @pytest.fixture def get_artifact_mock(): with patch.object(MetadataServiceClient, "get_artifact") as get_artifact_mock: @@ -1131,6 +1169,40 @@ def test_log_metrics(self, update_context_mock): update_context_mock.assert_called_once_with(context=_TRUE_CONTEXT) + @pytest.mark.usefixtures( + "get_metadata_store_mock", + "get_experiment_mock", + "create_experiment_run_context_mock", + "add_context_children_mock", + ) + def test_log_classification_metrics( + self, + 
create_artifact_mock, + add_context_artifacts_and_executions_mock, + ): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + experiment=_TEST_EXPERIMENT, + ) + aiplatform.start_run(_TEST_RUN) + aiplatform.log_classification_metrics( + display_name=_TEST_CLASSIFICATION_METRICS["display_name"], + labels=_TEST_CLASSIFICATION_METRICS["labels"], + matrix=_TEST_CLASSIFICATION_METRICS["matrix"], + fpr=_TEST_CLASSIFICATION_METRICS["fpr"], + tpr=_TEST_CLASSIFICATION_METRICS["tpr"], + threshold=_TEST_CLASSIFICATION_METRICS["threshold"], + ) + + create_artifact_mock.assert_called_once_with( + metadata=_TEST_CLASSIFICATION_METRICS_METADATA + ) + + add_context_artifacts_and_executions_mock.assert_called_once_with( + artifact_resource_names=[_TEST_ARTIFACT_NAME] + ) + @pytest.mark.usefixtures( "get_metadata_store_mock", "get_experiment_mock", From ee36af93f08f860813b5fc863c635a766e5e497e Mon Sep 17 00:00:00 2001 From: jaycee-li Date: Mon, 26 Sep 2022 09:15:36 -0700 Subject: [PATCH 02/14] fix: import error --- google/cloud/aiplatform/metadata/experiment_run_resource.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/google/cloud/aiplatform/metadata/experiment_run_resource.py b/google/cloud/aiplatform/metadata/experiment_run_resource.py index 31026fb23b..3aedaee44d 100644 --- a/google/cloud/aiplatform/metadata/experiment_run_resource.py +++ b/google/cloud/aiplatform/metadata/experiment_run_resource.py @@ -39,7 +39,9 @@ from google.cloud.aiplatform.metadata import metadata from google.cloud.aiplatform.metadata import resource from google.cloud.aiplatform.metadata import utils as metadata_utils -from google.cloud.aiplatform.metadata import schema +from google.cloud.aiplatform.metadata.schema.google import ( + artifact_schema as google_artifact_schema, +) from google.cloud.aiplatform.tensorboard import tensorboard_resource from google.cloud.aiplatform.utils import rest_utils @@ -1079,7 +1081,7 @@ def log_classification_metrics( for i in range(len(fpr)) ] - classification_metrics = schema.google.artifact_schema.ClassificationMetrics( + classification_metrics = google_artifact_schema.ClassificationMetrics( display_name=display_name, metadata=metadata, ) From c1aa713f71e8c23ea83c72ab087a4e1176b0c4c6 Mon Sep 17 00:00:00 2001 From: jaycee-li Date: Mon, 26 Sep 2022 09:26:24 -0700 Subject: [PATCH 03/14] Update log_classification_metrics_sample.py --- .../experiment_tracking/log_classification_metrics_sample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/model-builder/experiment_tracking/log_classification_metrics_sample.py b/samples/model-builder/experiment_tracking/log_classification_metrics_sample.py index 7b908e5b13..0f93ab35bf 100644 --- a/samples/model-builder/experiment_tracking/log_classification_metrics_sample.py +++ b/samples/model-builder/experiment_tracking/log_classification_metrics_sample.py @@ -44,4 +44,4 @@ def log_classification_metrics_sample( ) -# [END aiplatform_sdk_log_params_sample] +# [END aiplatform_sdk_log_classification_metrics_sample] From 0ea90bf1090e6edd88ce52ef17e6549ccbfc8f2f Mon Sep 17 00:00:00 2001 From: Jaycee Li <102714969+jaycee-li@users.noreply.github.com> Date: Mon, 26 Sep 2022 09:43:09 -0700 Subject: [PATCH 04/14] Update samples/model-builder/experiment_tracking/log_classification_metrics_sample.py Co-authored-by: Dan Lee <71398022+dandhlee@users.noreply.github.com> --- .../experiment_tracking/log_classification_metrics_sample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/samples/model-builder/experiment_tracking/log_classification_metrics_sample.py b/samples/model-builder/experiment_tracking/log_classification_metrics_sample.py index 0f93ab35bf..e178356c9c 100644 --- a/samples/model-builder/experiment_tracking/log_classification_metrics_sample.py +++ b/samples/model-builder/experiment_tracking/log_classification_metrics_sample.py @@ -29,7 +29,7 @@ def log_classification_metrics_sample( tpr: Optional[List[float]] = None, threshold: Optional[List[float]] = None, display_name: Optional[str] = None, -): +) -> None: aiplatform.init(experiment=experiment_name, project=project, location=location) aiplatform.start_run(run=run_name, resume=True) From 33b03ff7b31c5d8a6ee57a22f8e67d8f57329ca9 Mon Sep 17 00:00:00 2001 From: jaycee-li Date: Mon, 26 Sep 2022 09:56:33 -0700 Subject: [PATCH 05/14] Update log_classification_metrics_sample_test.py --- .../log_classification_metrics_sample_test.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/samples/model-builder/experiment_tracking/log_classification_metrics_sample_test.py b/samples/model-builder/experiment_tracking/log_classification_metrics_sample_test.py index 794fcc413b..c15fd0b123 100644 --- a/samples/model-builder/experiment_tracking/log_classification_metrics_sample_test.py +++ b/samples/model-builder/experiment_tracking/log_classification_metrics_sample_test.py @@ -35,4 +35,11 @@ def test_log_metrics_sample(mock_log_classification_metrics): display_name=constants.CLASSIFICATION_METRICS["display_name"], ) - mock_log_classification_metrics.assert_called_with(constants.CLASSIFICATION_METRICS) + mock_log_classification_metrics.assert_called_with( + labels=constants.CLASSIFICATION_METRICS["labels"], + matrix=constants.CLASSIFICATION_METRICS["matrix"], + fpr=constants.CLASSIFICATION_METRICS["fpr"], + tpr=constants.CLASSIFICATION_METRICS["tpr"], + threshold=constants.CLASSIFICATION_METRICS["threshold"], + display_name=constants.CLASSIFICATION_METRICS["display_name"], + ) From 5bf0d13f146cfd0705be97987ff784bf0d331c72 Mon Sep 17 00:00:00 2001 From: Jaycee Li <102714969+jaycee-li@users.noreply.github.com> Date: Mon, 26 Sep 2022 10:59:45 -0700 Subject: [PATCH 06/14] Update samples/model-builder/conftest.py Co-authored-by: Dan Lee <71398022+dandhlee@users.noreply.github.com> --- samples/model-builder/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/samples/model-builder/conftest.py b/samples/model-builder/conftest.py index a94ba6fe15..a303a3be6d 100644 --- a/samples/model-builder/conftest.py +++ b/samples/model-builder/conftest.py @@ -824,6 +824,7 @@ def mock_log_params(): mock_log_params.return_value = None yield mock_log_params + @pytest.fixture def mock_log_classification_metrics(): with patch.object(aiplatform, "log_classification_metrics") as mock_log_metrics: From df5c5d1b3f21f36289b32228048a634b34ea44ab Mon Sep 17 00:00:00 2001 From: jaycee-li Date: Mon, 26 Sep 2022 13:20:47 -0700 Subject: [PATCH 07/14] fix: unit test --- tests/unit/aiplatform/test_metadata.py | 41 ++++++++++++++++++-------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/tests/unit/aiplatform/test_metadata.py b/tests/unit/aiplatform/test_metadata.py index 3dacac2547..aa0907def1 100644 --- a/tests/unit/aiplatform/test_metadata.py +++ b/tests/unit/aiplatform/test_metadata.py @@ -54,8 +54,12 @@ from google.cloud.aiplatform.metadata import constants from google.cloud.aiplatform.metadata import experiment_run_resource from google.cloud.aiplatform.metadata import metadata +from 
google.cloud.aiplatform.metadata import artifact +from google.cloud.aiplatform.metadata import context from google.cloud.aiplatform.metadata import metadata_store from google.cloud.aiplatform.metadata import utils as metadata_utils +from google.cloud.aiplatform.metadata.schema import base_artifact + from google.cloud.aiplatform import utils from test_pipeline_jobs import mock_pipeline_service_get # noqa: F401 @@ -430,7 +434,7 @@ def query_execution_inputs_and_outputs_mock(): ], } -_TEST_CLASSIFICATION_METRICS_ARTIFACT = GapicArtifact( +_TEST_CLASSIFICATION_METRICS_ARTIFACT_RESOURCE = GapicArtifact( name=_TEST_ARTIFACT_NAME, display_name=_TEST_CLASSIFICATION_METRICS["display_name"], schema_title=constants.GOOGLE_CLASSIFICATION_METRICS, @@ -438,12 +442,29 @@ def query_execution_inputs_and_outputs_mock(): metadata=_TEST_CLASSIFICATION_METRICS_METADATA, ) +_TEST_CLASSIFICATION_METRICS_ARTIFACT = artifact.Artifact._empty_constructor() +_TEST_CLASSIFICATION_METRICS_ARTIFACT._gca_resource = ( + _TEST_CLASSIFICATION_METRICS_ARTIFACT_RESOURCE +) + @pytest.fixture -def create_artifact_mock(): - with patch.object(MetadataServiceClient, "create_artifact") as create_artifact_mock: - create_artifact_mock.return_value = _TEST_CLASSIFICATION_METRICS_ARTIFACT - yield create_artifact_mock +def create_classification_metrics_artifact_mock(): + with patch.object( + base_artifact.BaseArtifactSchema, "create" + ) as create_classification_metrics_artifact_mock: + create_classification_metrics_artifact_mock.return_value = ( + _TEST_CLASSIFICATION_METRICS_ARTIFACT + ) + yield create_classification_metrics_artifact_mock + + +@pytest.fixture +def context_add_artifacts_and_executions_mock(): + with patch.object( + context.Context, "add_artifacts_and_executions" + ) as context_add_artifacts_and_executions_mock: + yield context_add_artifacts_and_executions_mock @pytest.fixture @@ -1174,11 +1195,11 @@ def test_log_metrics(self, update_context_mock): "get_experiment_mock", "create_experiment_run_context_mock", "add_context_children_mock", + "create_classification_metrics_artifact_mock", ) def test_log_classification_metrics( self, - create_artifact_mock, - add_context_artifacts_and_executions_mock, + context_add_artifacts_and_executions_mock, ): aiplatform.init( project=_TEST_PROJECT, @@ -1195,11 +1216,7 @@ def test_log_classification_metrics( threshold=_TEST_CLASSIFICATION_METRICS["threshold"], ) - create_artifact_mock.assert_called_once_with( - metadata=_TEST_CLASSIFICATION_METRICS_METADATA - ) - - add_context_artifacts_and_executions_mock.assert_called_once_with( + context_add_artifacts_and_executions_mock.assert_called_once_with( artifact_resource_names=[_TEST_ARTIFACT_NAME] ) From a82194a6c5d6a0dd9f467009d9074b8e2bb1ca51 Mon Sep 17 00:00:00 2001 From: jaycee-li Date: Wed, 28 Sep 2022 12:50:35 -0700 Subject: [PATCH 08/14] fix comments --- .../aiplatform/metadata/experiment_run_resource.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/google/cloud/aiplatform/metadata/experiment_run_resource.py b/google/cloud/aiplatform/metadata/experiment_run_resource.py index 3aedaee44d..a17d616b86 100644 --- a/google/cloud/aiplatform/metadata/experiment_run_resource.py +++ b/google/cloud/aiplatform/metadata/experiment_run_resource.py @@ -993,6 +993,7 @@ def log_metrics(self, metrics: Dict[str, Union[float, int, str]]): # TODO: query the latest metrics artifact resource before logging. 
self._metadata_node.update(metadata={constants._METRIC_KEY: metrics}) + @_v1_not_supported def log_classification_metrics( self, *, @@ -1003,7 +1004,7 @@ def log_classification_metrics( threshold: Optional[List[float]] = None, display_name: Optional[str] = None, ): - """Create an artifact for classification metrics and log to ExperimentRun. Currently support confusion matrix and ROC curve. + """Create an artifact for classification metrics and log to ExperimentRun. Currently supports confusion matrix and ROC curve. ``` my_run = aiplatform.ExperimentRun('my-run', experiment='my-experiment') @@ -1074,11 +1075,13 @@ def log_classification_metrics( metadata["confidenceMetrics"] = [ { - "confidenceThreshold": threshold[i], - "recall": tpr[i], - "falsePositiveRate": fpr[i], + "confidenceThreshold": confidenceThreshold, + "recall": recall, + "falsePositiveRate": falsePositiveRate, } - for i in range(len(fpr)) + for falsePositiveRate, recall, confidenceThreshold in zip( + fpr, tpr, threshold + ) ] classification_metrics = google_artifact_schema.ClassificationMetrics( @@ -1249,6 +1252,7 @@ def get_metrics(self) -> Dict[str, Union[float, int, str]]: else: return self._metadata_node.metadata[constants._METRIC_KEY] + @_v1_not_supported def get_classification_metrics(self) -> List[Dict[str, Union[str, List]]]: """Get all the classification metrics logged to this run. From 9c04a50a485a88dc074531a00888c1eb5b6be7b3 Mon Sep 17 00:00:00 2001 From: jaycee-li Date: Wed, 28 Sep 2022 18:33:20 -0700 Subject: [PATCH 09/14] fix comments and update google.ClassificationMetrics --- tests/unit/aiplatform/test_metadata_schema.py | 55 ++++++++++++++++--- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/tests/unit/aiplatform/test_metadata_schema.py b/tests/unit/aiplatform/test_metadata_schema.py index 0003838968..2a1705feeb 100644 --- a/tests/unit/aiplatform/test_metadata_schema.py +++ b/tests/unit/aiplatform/test_metadata_schema.py @@ -64,7 +64,6 @@ _TEST_DESCRIPTION = "test description" _TEST_METADATA = {"test-param1": 1, "test-param2": "test-value", "test-param3": True} _TEST_UPDATED_METADATA = { - "test-param1": 2, "test-param2": "test-value-1", "test-param3": False, } @@ -748,14 +747,46 @@ def test_classification_metrics_title_is_set_correctly(self): assert artifact.schema_title == "google.ClassificationMetrics" def test_classification_metrics_constructor_parameters_are_set_correctly(self): + aggregation_type = "MACRO_AVERAGE" + aggregation_threshold = 0.5 + recall = 0.5 + precision = 0.5 + f1_score = 0.5 + accuracy = 0.5 au_prc = 1.0 au_roc = 2.0 log_loss = 0.5 + confusion_matrix = utils.ConfusionMatrix( + matrix=[[9.0, 1.0], [1.0, 9.0]], + annotation_specs=[ + utils.AnnotationSpec(display_name="cat"), + utils.AnnotationSpec(display_name="dog"), + ], + ) + confidence_metrics = [ + utils.ConfidenceMetric( + confidence_threshold=0.9, recall=0.1, false_positive_rate=0.1 + ), + utils.ConfidenceMetric( + confidence_threshold=0.5, recall=0.5, false_positive_rate=0.7 + ), + utils.ConfidenceMetric( + confidence_threshold=0.1, recall=0.9, false_positive_rate=0.9 + ), + ] artifact = google_artifact_schema.ClassificationMetrics( + aggregation_type=aggregation_type, + aggregation_threshold=aggregation_threshold, + recall=recall, + precision=precision, + f1_score=f1_score, + accuracy=accuracy, au_prc=au_prc, au_roc=au_roc, log_loss=log_loss, + confusion_matrix=confusion_matrix, + confidence_metrics=confidence_metrics, artifact_id=_TEST_ARTIFACT_ID, uri=_TEST_URI, display_name=_TEST_DISPLAY_NAME, @@ -764,12 +795,22 
@@ def test_classification_metrics_constructor_parameters_are_set_correctly(self): metadata=_TEST_UPDATED_METADATA, ) expected_metadata = { - "test-param1": 2.0, - "test-param2": "test-value-1", - "test-param3": False, - "auPrc": 1.0, - "auRoc": 2.0, - "logLoss": 0.5, + "test-param1": _TEST_UPDATED_METADATA["test-param1"], + "test-param2": _TEST_UPDATED_METADATA["test-param2"], + "test-param3": _TEST_UPDATED_METADATA["test-param3"], + "aggregationType": aggregation_type, + "aggregationThreshold": aggregation_threshold, + "recall": recall, + "precision": precision, + "f1Score": f1_score, + "accuracy": accuracy, + "auPrc": au_prc, + "auRoc": au_roc, + "logLoss": log_loss, + "confusionMatrix": confusion_matrix.to_dict(), + "confidenceMetrics": [ + confidence_metric.to_dict() for confidence_metric in confidence_metrics + ], } assert artifact.artifact_id == _TEST_ARTIFACT_ID From 796f196804935a5b5eda0ee969bb73142632f2e6 Mon Sep 17 00:00:00 2001 From: jaycee-li Date: Wed, 28 Sep 2022 18:36:30 -0700 Subject: [PATCH 10/14] fix comments and update ClassificationMetrics class --- .../metadata/experiment_run_resource.py | 37 +++-- .../metadata/schema/google/artifact_schema.py | 58 ++++++- .../cloud/aiplatform/metadata/schema/utils.py | 148 ++++++++++++++++++ tests/unit/aiplatform/test_metadata.py | 56 ++++--- tests/unit/aiplatform/test_metadata_schema.py | 1 + 5 files changed, 262 insertions(+), 38 deletions(-) diff --git a/google/cloud/aiplatform/metadata/experiment_run_resource.py b/google/cloud/aiplatform/metadata/experiment_run_resource.py index a17d616b86..326948dcec 100644 --- a/google/cloud/aiplatform/metadata/experiment_run_resource.py +++ b/google/cloud/aiplatform/metadata/experiment_run_resource.py @@ -39,6 +39,7 @@ from google.cloud.aiplatform.metadata import metadata from google.cloud.aiplatform.metadata import resource from google.cloud.aiplatform.metadata import utils as metadata_utils +from google.cloud.aiplatform.metadata.schema import utils as schema_utils from google.cloud.aiplatform.metadata.schema.google import ( artifact_schema as google_artifact_schema, ) @@ -1044,7 +1045,6 @@ def log_classification_metrics( if (fpr or tpr or threshold) and not (fpr and tpr and threshold): raise ValueError("fpr, tpr, and thresholds must be set together.") - metadata = {} if labels and matrix: if len(matrix) != len(labels): raise ValueError( @@ -1053,12 +1053,13 @@ def log_classification_metrics( len(labels), len(matrix) ) ) - - confusion_matrix = { - "annotationSpecs": [{"displayName": label} for label in labels], - "rows": matrix, - } - metadata["confusionMatrix"] = confusion_matrix + annotation_specs = [ + schema_utils.AnnotationSpec(display_name=label) for label in labels + ] + confusion_matrix = schema_utils.ConfusionMatrix( + annotation_specs=annotation_specs, + matrix=matrix, + ) if fpr and tpr and threshold: if ( @@ -1073,21 +1074,23 @@ def log_classification_metrics( ) ) - metadata["confidenceMetrics"] = [ - { - "confidenceThreshold": confidenceThreshold, - "recall": recall, - "falsePositiveRate": falsePositiveRate, - } - for falsePositiveRate, recall, confidenceThreshold in zip( - fpr, tpr, threshold + confidence_metrics = [ + schema_utils.ConfidenceMetric( + confidence_threshold=confidence_threshold, + false_positive_rate=false_positive_rate, + recall=recall, + ) + for confidence_threshold, false_positive_rate, recall in zip( + threshold, fpr, tpr ) ] classification_metrics = google_artifact_schema.ClassificationMetrics( display_name=display_name, - metadata=metadata, + 
confusion_matrix=confusion_matrix, + confidence_metrics=confidence_metrics, ) + classfication_metrics = classification_metrics.create() self._metadata_node.add_artifacts_and_executions( artifact_resource_names=[classfication_metrics.resource_name] @@ -1279,7 +1282,7 @@ def get_classification_metrics(self) -> List[Dict[str, Union[str, List]]]: artifact_list = artifact.Artifact.list( filter=metadata_utils._make_filter_string( in_context=[self.resource_name], - schema_title="google.ClassificationMetrics", + schema_title=google_artifact_schema.ClassificationMetrics.schema_title, ), project=self.project, location=self.location, diff --git a/google/cloud/aiplatform/metadata/schema/google/artifact_schema.py b/google/cloud/aiplatform/metadata/schema/google/artifact_schema.py index e52f2f98b5..e1b25f048d 100644 --- a/google/cloud/aiplatform/metadata/schema/google/artifact_schema.py +++ b/google/cloud/aiplatform/metadata/schema/google/artifact_schema.py @@ -15,7 +15,7 @@ # limitations under the License. import copy -from typing import Optional, Dict +from typing import Optional, Dict, List from google.cloud.aiplatform.compat.types import artifact as gca_artifact from google.cloud.aiplatform.metadata.schema import base_artifact @@ -24,6 +24,12 @@ # The artifact property key for the resource_name _ARTIFACT_PROPERTY_KEY_RESOURCE_NAME = "resourceName" +_CLASSIFICATION_METRICS_AGGREGATION_TYPE = [ + "AGGREGATION_TYPE_UNSPECIFIED", + "MACRO_AVERAGE", + "MICRO_AVERAGE", +] + class VertexDataset(base_artifact.BaseArtifactSchema): """An artifact representing a Vertex Dataset.""" @@ -278,9 +284,17 @@ class ClassificationMetrics(base_artifact.BaseArtifactSchema): def __init__( self, *, + aggregation_type: Optional[str] = None, + aggregation_threshold: Optional[float] = None, + recall: Optional[float] = None, + precision: Optional[float] = None, + f1_score: Optional[float] = None, + accuracy: Optional[float] = None, au_prc: Optional[float] = None, au_roc: Optional[float] = None, log_loss: Optional[float] = None, + confusion_matrix: Optional[utils.ConfusionMatrix] = None, + confidence_metrics: Optional[List[utils.ConfidenceMetric]] = None, artifact_id: Optional[str] = None, uri: Optional[str] = None, display_name: Optional[str] = None, @@ -290,6 +304,22 @@ def __init__( state: Optional[gca_artifact.Artifact.State] = gca_artifact.Artifact.State.LIVE, ): """Args: + aggregation_type (str): + Optional. The way to generate the aggregated metrics. Choose from the following options: + "AGGREGATION_TYPE_UNSPECIFIED": Indicating unset, used for per-class sliced metrics + "MACRO_AVERAGE": The unweighted average, default behavior + "MICRO_AVERAGE": The weighted average + aggregation_threshold (float): + Optional. The threshold used to generate aggregated metrics, default 0 for multi-class classification, 0.5 for binary classification. + recall (float): + Optional. Recall (True Positive Rate) for the given confidence threshold. + precision (float): + Optional. Precision for the given confidence threshold. + f1_score (float): + Optional. The harmonic mean of recall and precision. + accuracy (float): + Optional. Accuracy is the fraction of predictions given the correct label. + For multiclass this is a micro-average metric. au_prc (float): Optional. The Area Under Precision-Recall Curve metric. Micro-averaged for the overall evaluation. @@ -298,6 +328,10 @@ def __init__( Micro-averaged for the overall evaluation. log_loss (float): Optional. The Log Loss metric. + confusion_matrix (utils.ConfusionMatrix): + Optional. 
Aggregated confusion matrix. + confidence_metrics (List[utils.ConfidenceMetric]): + Optional. List of metrics for different confidence thresholds. artifact_id (str): Optional. The portion of the Artifact name with the format. This is globally unique in a metadataStore: @@ -323,12 +357,34 @@ def __init__( check the validity of state transitions. """ extended_metadata = copy.deepcopy(metadata) if metadata else {} + if aggregation_type: + if aggregation_type not in _CLASSIFICATION_METRICS_AGGREGATION_TYPE: + raise ValueError( + "aggregation_type can only be 'AGGREGATION_TYPE_UNSPECIFIED', 'MACRO_AVERAGE', or 'MICRO_AVERAGE'." + ) + extended_metadata["aggregationType"] = aggregation_type + if aggregation_threshold: + extended_metadata["aggregationThreshold"] = aggregation_threshold + if recall: + extended_metadata["recall"] = recall + if precision: + extended_metadata["precision"] = precision + if f1_score: + extended_metadata["f1Score"] = f1_score + if accuracy: + extended_metadata["accuracy"] = accuracy if au_prc: extended_metadata["auPrc"] = au_prc if au_roc: extended_metadata["auRoc"] = au_roc if log_loss: extended_metadata["logLoss"] = log_loss + if confusion_matrix: + extended_metadata["confusionMatrix"] = confusion_matrix.to_dict() + if confidence_metrics: + extended_metadata["confidenceMetrics"] = [ + confidence_metric.to_dict() for confidence_metric in confidence_metrics + ] super(ClassificationMetrics, self).__init__( uri=uri, diff --git a/google/cloud/aiplatform/metadata/schema/utils.py b/google/cloud/aiplatform/metadata/schema/utils.py index 1b4a5e4f6c..fbda44d6ae 100644 --- a/google/cloud/aiplatform/metadata/schema/utils.py +++ b/google/cloud/aiplatform/metadata/schema/utils.py @@ -143,6 +143,154 @@ def to_dict(self): return results +@dataclass +class AnnotationSpec: + """A class that represents the annotation spec of a Confusion Matrix. + Args: + display_name (str): + Optional. Display name for a column of a confusion matrix. + id (List[str]): + Optional. Id for a column of a confusion matrix. + """ + + display_name: Optional[str] = None + id: Optional[str] = None + + def to_dict(self): + """ML metadata schema dictionary representation of this DataClass""" + results = {} + if self.display_name: + results["displayName"] = self.display_name + if self.id: + results["id"] = self.id + + return results + + +@dataclass +class ConfusionMatrix: + """A class that represents a Confusion Matrix. + Args: + matrix (List[List[int]]): + Required. A 2D array of integers that represets the values for the confusion matrix. + annotation_specs: (List(AnnotationSpec)): + Optional. List of column annotation specs which contains display_name (str) and id (str) + """ + + matrix: List[List[int]] + annotation_specs: Optional[List[AnnotationSpec]] = None + + def to_dict(self): + """ML metadata schema dictionary representation of this DataClass""" + results = {} + if self.annotation_specs: + results["annotationSpecs"] = [ + annotation_spec.to_dict() for annotation_spec in self.annotation_specs + ] + if self.matrix: + results["rows"] = self.matrix + + return results + + +@dataclass +class ConfidenceMetric: + """A class that represents a Confidence Metric. + Args: + confidence_threshold (float): + Required. Metrics are computed with an assumption that the Model never returns predictions with a score lower than this value. + For binary classification this is the positive class threshold. For multi-class classification this is the confidence threshold. + recall (float): + Optional. 
Recall (True Positive Rate) for the given confidence threshold. + precision (float): + Optional. Precision for the given confidence threshold. + f1_score (float): + Optional. The harmonic mean of recall and precision. + max_predictions (int): + Optional. Metrics are computed with an assumption that the Model always returns at most this many predictions (ordered by their score, descendingly). + But they all still need to meet the `confidence_threshold`. + false_positive_rate (float): + Optional. False Positive Rate for the given confidence threshold. + accuracy (float): + Optional. Accuracy is the fraction of predictions given the correct label. For multiclass this is a micro-average metric. + true_positive_count (int): + Optional. The number of Model created labels that match a ground truth label. + false_positive_count (int): + Optional. The number of Model created labels that do not match a ground truth label. + false_negative_count (int): + Optional. The number of ground truth labels that are not matched by a Model created label. + true_negative_count (int): + Optional. The number of labels that were not created by the Model, but if they would, they would not match a ground truth label. + recall_at_1 (float): + Optional. The Recall (True Positive Rate) when only considering the label that has the highest prediction score + and not below the confidence threshold for each DataItem. + precision_at_1 (float): + Optional. The precision when only considering the label that has the highest prediction score + and not below the confidence threshold for each DataItem. + false_positive_rate_at_1 (float): + Optional. The False Positive Rate when only considering the label that has the highest prediction score + and not below the confidence threshold for each DataItem. + f1_score_at_1 (float): + Optional. The harmonic mean of recallAt1 and precisionAt1. + confusion_matrix (ConfusionMatrix): + Optional. Confusion matrix for the given confidence threshold. 
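+        Example (illustrative):
+            metric = ConfidenceMetric(confidence_threshold=0.5, recall=0.7, false_positive_rate=0.5)
+            metric.to_dict()
+            # {'confidenceThreshold': 0.5, 'recall': 0.7, 'falsePositiveRate': 0.5}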
+ """ + + confidence_threshold: float + recall: Optional[float] = None + precision: Optional[float] = None + f1_score: Optional[float] = None + max_predictions: Optional[int] = None + false_positive_rate: Optional[float] = None + accuracy: Optional[float] = None + true_positive_count: Optional[int] = None + false_positive_count: Optional[int] = None + false_negative_count: Optional[int] = None + true_negative_count: Optional[int] = None + recall_at_1: Optional[float] = None + precision_at_1: Optional[float] = None + false_positive_rate_at_1: Optional[float] = None + f1_score_at_1: Optional[float] = None + confusion_matrix: Optional[ConfusionMatrix] = None + + def to_dict(self): + """ML metadata schema dictionary representation of this DataClass""" + results = {} + results["confidenceThreshold"] = self.confidence_threshold + if self.recall: + results["recall"] = self.recall + if self.precision: + results["precision"] = self.precision + if self.f1_score: + results["f1Score"] = self.f1_score + if self.max_predictions: + results["maxPredictions"] = self.max_predictions + if self.false_positive_rate: + results["falsePositiveRate"] = self.false_positive_rate + if self.accuracy: + results["accuracy"] = self.accuracy + if self.true_positive_count: + results["truePositiveCount"] = self.true_positive_count + if self.false_positive_count: + results["falsePositiveCount"] = self.false_positive_count + if self.false_negative_count: + results["falseNegativeCount"] = self.false_negative_count + if self.true_negative_count: + results["trueNegativeCount"] = self.true_negative_count + if self.recall_at_1: + results["recallAt1"] = self.recall_at_1 + if self.precision_at_1: + results["precisionAt1"] = self.precision_at_1 + if self.false_positive_rate_at_1: + results["falsePositiveRateAt1"] = self.false_positive_rate_at_1 + if self.f1_score_at_1: + results["f1ScoreAt1"] = self.f1_score_at_1 + if self.confusion_matrix: + results["confusionMatrix"] = self.confusion_matrix.to_dict() + + return results + + def create_uri_from_resource_name(resource_name: str) -> str: """Construct the service URI for a given resource_name. 
Args: diff --git a/tests/unit/aiplatform/test_metadata.py b/tests/unit/aiplatform/test_metadata.py index aa0907def1..a8a73b899e 100644 --- a/tests/unit/aiplatform/test_metadata.py +++ b/tests/unit/aiplatform/test_metadata.py @@ -54,11 +54,8 @@ from google.cloud.aiplatform.metadata import constants from google.cloud.aiplatform.metadata import experiment_run_resource from google.cloud.aiplatform.metadata import metadata -from google.cloud.aiplatform.metadata import artifact -from google.cloud.aiplatform.metadata import context from google.cloud.aiplatform.metadata import metadata_store from google.cloud.aiplatform.metadata import utils as metadata_utils -from google.cloud.aiplatform.metadata.schema import base_artifact from google.cloud.aiplatform import utils @@ -425,33 +422,29 @@ def query_execution_inputs_and_outputs_mock(): _TEST_CLASSIFICATION_METRICS_METADATA = { "confusionMatrix": { "annotationSpecs": [{"displayName": "cat"}, {"displayName": "dog"}], - "rows": [{"row": [9, 1]}, {"row": [1, 9]}], + "rows": [[9, 1], [1, 9]], }, "confidenceMetrics": [ {"confidenceThreshold": 0.9, "recall": 0.1, "falsePositiveRate": 0.1}, - {"confidenceThreshold": 0.5, "recall": 0.5, "falsePositiveRate": 0.7}, + {"confidenceThreshold": 0.5, "recall": 0.7, "falsePositiveRate": 0.5}, {"confidenceThreshold": 0.1, "recall": 0.9, "falsePositiveRate": 0.9}, ], } -_TEST_CLASSIFICATION_METRICS_ARTIFACT_RESOURCE = GapicArtifact( +_TEST_CLASSIFICATION_METRICS_ARTIFACT = GapicArtifact( name=_TEST_ARTIFACT_NAME, display_name=_TEST_CLASSIFICATION_METRICS["display_name"], schema_title=constants.GOOGLE_CLASSIFICATION_METRICS, schema_version=constants._DEFAULT_SCHEMA_VERSION, metadata=_TEST_CLASSIFICATION_METRICS_METADATA, -) - -_TEST_CLASSIFICATION_METRICS_ARTIFACT = artifact.Artifact._empty_constructor() -_TEST_CLASSIFICATION_METRICS_ARTIFACT._gca_resource = ( - _TEST_CLASSIFICATION_METRICS_ARTIFACT_RESOURCE + state=GapicArtifact.State.LIVE, ) @pytest.fixture def create_classification_metrics_artifact_mock(): with patch.object( - base_artifact.BaseArtifactSchema, "create" + MetadataServiceClient, "create_artifact" ) as create_classification_metrics_artifact_mock: create_classification_metrics_artifact_mock.return_value = ( _TEST_CLASSIFICATION_METRICS_ARTIFACT @@ -460,11 +453,14 @@ def create_classification_metrics_artifact_mock(): @pytest.fixture -def context_add_artifacts_and_executions_mock(): +def get_classification_metrics_artifact_mock(): with patch.object( - context.Context, "add_artifacts_and_executions" - ) as context_add_artifacts_and_executions_mock: - yield context_add_artifacts_and_executions_mock + MetadataServiceClient, "get_artifact" + ) as get_classification_metrics_artifact_mock: + get_classification_metrics_artifact_mock.return_value = ( + _TEST_CLASSIFICATION_METRICS_ARTIFACT + ) + yield get_classification_metrics_artifact_mock @pytest.fixture @@ -1195,11 +1191,12 @@ def test_log_metrics(self, update_context_mock): "get_experiment_mock", "create_experiment_run_context_mock", "add_context_children_mock", - "create_classification_metrics_artifact_mock", ) def test_log_classification_metrics( self, - context_add_artifacts_and_executions_mock, + create_classification_metrics_artifact_mock, + get_classification_metrics_artifact_mock, + add_context_artifacts_and_executions_mock, ): aiplatform.init( project=_TEST_PROJECT, @@ -1216,8 +1213,27 @@ def test_log_classification_metrics( threshold=_TEST_CLASSIFICATION_METRICS["threshold"], ) - context_add_artifacts_and_executions_mock.assert_called_once_with( - 
artifact_resource_names=[_TEST_ARTIFACT_NAME] + expected_artifact = GapicArtifact( + display_name=_TEST_CLASSIFICATION_METRICS["display_name"], + schema_title=constants.GOOGLE_CLASSIFICATION_METRICS, + schema_version=constants._DEFAULT_SCHEMA_VERSION, + metadata=_TEST_CLASSIFICATION_METRICS_METADATA, + state=GapicArtifact.State.LIVE, + ) + create_classification_metrics_artifact_mock.assert_called_once_with( + parent=_TEST_PARENT, + artifact=expected_artifact, + artifact_id=None, + ) + + get_classification_metrics_artifact_mock.assert_called_once_with( + name=_TEST_ARTIFACT_NAME, retry=base._DEFAULT_RETRY + ) + + add_context_artifacts_and_executions_mock.assert_called_once_with( + context=_TEST_EXPERIMENT_RUN_CONTEXT_NAME, + artifacts=[_TEST_ARTIFACT_NAME], + executions=None, ) @pytest.mark.usefixtures( diff --git a/tests/unit/aiplatform/test_metadata_schema.py b/tests/unit/aiplatform/test_metadata_schema.py index 2a1705feeb..8af4d351da 100644 --- a/tests/unit/aiplatform/test_metadata_schema.py +++ b/tests/unit/aiplatform/test_metadata_schema.py @@ -64,6 +64,7 @@ _TEST_DESCRIPTION = "test description" _TEST_METADATA = {"test-param1": 1, "test-param2": "test-value", "test-param3": True} _TEST_UPDATED_METADATA = { + "test-param1": 2.0, "test-param2": "test-value-1", "test-param3": False, } From fbc98ab4d67b576c1a0d20f92b6d6ec7257e75c3 Mon Sep 17 00:00:00 2001 From: jaycee-li Date: Thu, 29 Sep 2022 09:24:43 -0700 Subject: [PATCH 11/14] fix: ClassificationMetrics doesn't catch params with value=0 --- .../metadata/schema/google/artifact_schema.py | 16 +++++----- .../cloud/aiplatform/metadata/schema/utils.py | 30 +++++++++---------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/google/cloud/aiplatform/metadata/schema/google/artifact_schema.py b/google/cloud/aiplatform/metadata/schema/google/artifact_schema.py index e1b25f048d..44a5670339 100644 --- a/google/cloud/aiplatform/metadata/schema/google/artifact_schema.py +++ b/google/cloud/aiplatform/metadata/schema/google/artifact_schema.py @@ -363,21 +363,21 @@ def __init__( "aggregation_type can only be 'AGGREGATION_TYPE_UNSPECIFIED', 'MACRO_AVERAGE', or 'MICRO_AVERAGE'." ) extended_metadata["aggregationType"] = aggregation_type - if aggregation_threshold: + if aggregation_threshold is not None: extended_metadata["aggregationThreshold"] = aggregation_threshold - if recall: + if recall is not None: extended_metadata["recall"] = recall - if precision: + if precision is not None: extended_metadata["precision"] = precision - if f1_score: + if f1_score is not None: extended_metadata["f1Score"] = f1_score - if accuracy: + if accuracy is not None: extended_metadata["accuracy"] = accuracy - if au_prc: + if au_prc is not None: extended_metadata["auPrc"] = au_prc - if au_roc: + if au_roc is not None: extended_metadata["auRoc"] = au_roc - if log_loss: + if log_loss is not None: extended_metadata["logLoss"] = log_loss if confusion_matrix: extended_metadata["confusionMatrix"] = confusion_matrix.to_dict() diff --git a/google/cloud/aiplatform/metadata/schema/utils.py b/google/cloud/aiplatform/metadata/schema/utils.py index fbda44d6ae..336699065b 100644 --- a/google/cloud/aiplatform/metadata/schema/utils.py +++ b/google/cloud/aiplatform/metadata/schema/utils.py @@ -149,7 +149,7 @@ class AnnotationSpec: Args: display_name (str): Optional. Display name for a column of a confusion matrix. - id (List[str]): + id (str): Optional. Id for a column of a confusion matrix. 
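+        Example (illustrative):
+            AnnotationSpec(display_name="cat").to_dict()  # {'displayName': 'cat'}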
""" @@ -257,33 +257,33 @@ def to_dict(self): """ML metadata schema dictionary representation of this DataClass""" results = {} results["confidenceThreshold"] = self.confidence_threshold - if self.recall: + if self.recall is not None: results["recall"] = self.recall - if self.precision: + if self.precision is not None: results["precision"] = self.precision - if self.f1_score: + if self.f1_score is not None: results["f1Score"] = self.f1_score - if self.max_predictions: + if self.max_predictions is not None: results["maxPredictions"] = self.max_predictions - if self.false_positive_rate: + if self.false_positive_rate is not None: results["falsePositiveRate"] = self.false_positive_rate - if self.accuracy: + if self.accuracy is not None: results["accuracy"] = self.accuracy - if self.true_positive_count: + if self.true_positive_count is not None: results["truePositiveCount"] = self.true_positive_count - if self.false_positive_count: + if self.false_positive_count is not None: results["falsePositiveCount"] = self.false_positive_count - if self.false_negative_count: + if self.false_negative_count is not None: results["falseNegativeCount"] = self.false_negative_count - if self.true_negative_count: + if self.true_negative_count is not None: results["trueNegativeCount"] = self.true_negative_count - if self.recall_at_1: + if self.recall_at_1 is not None: results["recallAt1"] = self.recall_at_1 - if self.precision_at_1: + if self.precision_at_1 is not None: results["precisionAt1"] = self.precision_at_1 - if self.false_positive_rate_at_1: + if self.false_positive_rate_at_1 is not None: results["falsePositiveRateAt1"] = self.false_positive_rate_at_1 - if self.f1_score_at_1: + if self.f1_score_at_1 is not None: results["f1ScoreAt1"] = self.f1_score_at_1 if self.confusion_matrix: results["confusionMatrix"] = self.confusion_matrix.to_dict() From 6fb76ba776d48d11a6d95c38aaf44066045308ef Mon Sep 17 00:00:00 2001 From: jaycee-li Date: Thu, 29 Sep 2022 12:05:26 -0700 Subject: [PATCH 12/14] add sample for get_classification_metrics --- samples/model-builder/conftest.py | 12 ++++++- ...iment_run_classification_metrics_sample.py | 34 +++++++++++++++++++ ..._run_classification_metrics_sample_test.py | 34 +++++++++++++++++++ 3 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 samples/model-builder/experiment_tracking/get_experiment_run_classification_metrics_sample.py create mode 100644 samples/model-builder/experiment_tracking/get_experiment_run_classification_metrics_sample_test.py diff --git a/samples/model-builder/conftest.py b/samples/model-builder/conftest.py index a303a3be6d..6a4fde868a 100644 --- a/samples/model-builder/conftest.py +++ b/samples/model-builder/conftest.py @@ -1,4 +1,4 @@ -# Copyright 2021 Google LLC +# Copyright 2022 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -642,6 +642,11 @@ def mock_time_series_metrics(): mock = MagicMock() yield mock +@pytest.fixture +def mock_classification_metrics(): + mock = MagicMock() + yield mock + @pytest.fixture def mock_get_execution(mock_execution): @@ -889,6 +894,11 @@ def mock_get_time_series_metrics(mock_time_series_metrics, mock_experiment_run): mock_get_time_series_metrics.return_value = mock_time_series_metrics yield mock_get_time_series_metrics +@pytest.fixture +def mock_get_classification_metrics(mock_classification_metrics, mock_experiment_run): + with patch.object(mock_experiment_run, "get_classification_metrics") as mock_get_classification_metrics: + mock_get_classification_metrics.return_value = mock_classification_metrics + yield mock_get_classification_metrics """ ---------------------------------------------------------------------------- diff --git a/samples/model-builder/experiment_tracking/get_experiment_run_classification_metrics_sample.py b/samples/model-builder/experiment_tracking/get_experiment_run_classification_metrics_sample.py new file mode 100644 index 0000000000..e2676094c5 --- /dev/null +++ b/samples/model-builder/experiment_tracking/get_experiment_run_classification_metrics_sample.py @@ -0,0 +1,34 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, Union, List + +from google.cloud import aiplatform + + +# [START aiplatform_sdk_get_experiment_run_classification_metrics_sample] +def get_experiment_run_classification_metrics_sample( + run_name: str, + experiment: Union[str, aiplatform.Experiment], + project: str, + location: str, +) -> List[Dict[str, Union[str, List]]]: + experiment_run = aiplatform.ExperimentRun( + run_name=run_name, experiment=experiment, project=project, location=location + ) + + return experiment_run.get_classification_metrics() + + +# [END aiplatform_sdk_get_experiment_run_classification_metrics_sample] diff --git a/samples/model-builder/experiment_tracking/get_experiment_run_classification_metrics_sample_test.py b/samples/model-builder/experiment_tracking/get_experiment_run_classification_metrics_sample_test.py new file mode 100644 index 0000000000..438cdf9199 --- /dev/null +++ b/samples/model-builder/experiment_tracking/get_experiment_run_classification_metrics_sample_test.py @@ -0,0 +1,34 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import get_experiment_run_params_sample + +import pytest + +import test_constants as constants + + +@pytest.mark.usefixtures("mock_get_run") +def test_get_experiment_run_classification_metrics_sample(mock_get_classification_metrics, mock_classification_metrics): + + classification_metrics = get_experiment_run_params_sample.get_experiment_run_classification_metrics_sample( + run_name=constants.EXPERIMENT_RUN_NAME, + experiment=constants.EXPERIMENT_NAME, + project=constants.PROJECT, + location=constants.LOCATION, + ) + + mock_get_classification_metrics.assert_called_with() + + assert classification_metrics is mock_classification_metrics From 96ef2f388227a4299f96efcbccfd919e23a1943d Mon Sep 17 00:00:00 2001 From: jaycee-li Date: Thu, 29 Sep 2022 12:22:34 -0700 Subject: [PATCH 13/14] fix linting --- samples/model-builder/conftest.py | 3 +++ .../get_experiment_run_classification_metrics_sample.py | 2 +- .../get_experiment_run_classification_metrics_sample_test.py | 4 ++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/samples/model-builder/conftest.py b/samples/model-builder/conftest.py index 6a4fde868a..b5bec845ba 100644 --- a/samples/model-builder/conftest.py +++ b/samples/model-builder/conftest.py @@ -642,6 +642,7 @@ def mock_time_series_metrics(): mock = MagicMock() yield mock + @pytest.fixture def mock_classification_metrics(): mock = MagicMock() @@ -894,12 +895,14 @@ def mock_get_time_series_metrics(mock_time_series_metrics, mock_experiment_run): mock_get_time_series_metrics.return_value = mock_time_series_metrics yield mock_get_time_series_metrics + @pytest.fixture def mock_get_classification_metrics(mock_classification_metrics, mock_experiment_run): with patch.object(mock_experiment_run, "get_classification_metrics") as mock_get_classification_metrics: mock_get_classification_metrics.return_value = mock_classification_metrics yield mock_get_classification_metrics + """ ---------------------------------------------------------------------------- Model Versioning Fixtures diff --git a/samples/model-builder/experiment_tracking/get_experiment_run_classification_metrics_sample.py b/samples/model-builder/experiment_tracking/get_experiment_run_classification_metrics_sample.py index e2676094c5..284ed9f968 100644 --- a/samples/model-builder/experiment_tracking/get_experiment_run_classification_metrics_sample.py +++ b/samples/model-builder/experiment_tracking/get_experiment_run_classification_metrics_sample.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, Union, List +from typing import Dict, List, Union from google.cloud import aiplatform diff --git a/samples/model-builder/experiment_tracking/get_experiment_run_classification_metrics_sample_test.py b/samples/model-builder/experiment_tracking/get_experiment_run_classification_metrics_sample_test.py index 438cdf9199..3f6deb80bf 100644 --- a/samples/model-builder/experiment_tracking/get_experiment_run_classification_metrics_sample_test.py +++ b/samples/model-builder/experiment_tracking/get_experiment_run_classification_metrics_sample_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import get_experiment_run_params_sample +import get_experiment_run_classification_metrics_sample import pytest @@ -22,7 +22,7 @@ @pytest.mark.usefixtures("mock_get_run") def test_get_experiment_run_classification_metrics_sample(mock_get_classification_metrics, mock_classification_metrics): - classification_metrics = get_experiment_run_params_sample.get_experiment_run_classification_metrics_sample( + classification_metrics = get_experiment_run_classification_metrics_sample.get_experiment_run_classification_metrics_sample( run_name=constants.EXPERIMENT_RUN_NAME, experiment=constants.EXPERIMENT_NAME, project=constants.PROJECT, From 98cd8051bff0511c3e139bac59244c7c3015762b Mon Sep 17 00:00:00 2001 From: jaycee-li Date: Thu, 29 Sep 2022 16:54:00 -0700 Subject: [PATCH 14/14] add todos --- .../cloud/aiplatform/metadata/schema/google/artifact_schema.py | 1 + google/cloud/aiplatform/metadata/schema/utils.py | 1 + 2 files changed, 2 insertions(+) diff --git a/google/cloud/aiplatform/metadata/schema/google/artifact_schema.py b/google/cloud/aiplatform/metadata/schema/google/artifact_schema.py index 44a5670339..4941e42480 100644 --- a/google/cloud/aiplatform/metadata/schema/google/artifact_schema.py +++ b/google/cloud/aiplatform/metadata/schema/google/artifact_schema.py @@ -359,6 +359,7 @@ def __init__( extended_metadata = copy.deepcopy(metadata) if metadata else {} if aggregation_type: if aggregation_type not in _CLASSIFICATION_METRICS_AGGREGATION_TYPE: + ## Todo: add negative test case for this raise ValueError( "aggregation_type can only be 'AGGREGATION_TYPE_UNSPECIFIED', 'MACRO_AVERAGE', or 'MICRO_AVERAGE'." ) diff --git a/google/cloud/aiplatform/metadata/schema/utils.py b/google/cloud/aiplatform/metadata/schema/utils.py index 336699065b..c6e23735b6 100644 --- a/google/cloud/aiplatform/metadata/schema/utils.py +++ b/google/cloud/aiplatform/metadata/schema/utils.py @@ -181,6 +181,7 @@ class ConfusionMatrix: annotation_specs: Optional[List[AnnotationSpec]] = None def to_dict(self): + ## Todo: add a validation to check 'matrix' and 'annotation_specs' have the same length """ML metadata schema dictionary representation of this DataClass""" results = {} if self.annotation_specs:
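
---

Note on the value=0 fix in patch 11: Python truthiness treats 0, 0.0, and empty containers as falsy, so guarding optional metric fields with `if value:` silently drops legitimate zero-valued metrics (for example recall=0.0 or truePositiveCount=0), which is exactly what the switch to `is not None` addresses. A minimal standalone sketch of the difference, using a plain function rather than the SDK classes:

```python
# Minimal sketch (not SDK code): why `if value:` drops zero-valued metrics
# while `if value is not None:` keeps them.


def to_dict_truthy(recall=None, true_positive_count=None):
    results = {}
    if recall:                      # buggy: 0.0 is falsy, so it is skipped
        results["recall"] = recall
    if true_positive_count:         # buggy: 0 is falsy, so it is skipped
        results["truePositiveCount"] = true_positive_count
    return results


def to_dict_explicit(recall=None, true_positive_count=None):
    results = {}
    if recall is not None:          # fixed: only values that were never set are skipped
        results["recall"] = recall
    if true_positive_count is not None:
        results["truePositiveCount"] = true_positive_count
    return results


print(to_dict_truthy(recall=0.0, true_positive_count=0))    # {} -> metrics lost
print(to_dict_explicit(recall=0.0, true_positive_count=0))  # {'recall': 0.0, 'truePositiveCount': 0}
```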
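Note on the TODO added in patch 14 to `ConfusionMatrix.to_dict`: it asks for a length check between `matrix` and `annotation_specs`. One possible shape for that validation, sketched outside the SDK with plain lists — the names and the extra square-matrix check here are illustrative assumptions, not the committed implementation:

```python
# Illustrative sketch of the validation the TODO asks for: one row per
# annotation spec, and each row as wide as the number of rows.
from typing import List, Optional


def validate_confusion_matrix(
    matrix: List[List[int]],
    annotation_spec_names: Optional[List[str]] = None,
) -> None:
    if annotation_spec_names is not None and len(matrix) != len(annotation_spec_names):
        raise ValueError(
            f"Length of matrix ({len(matrix)}) must match the number of "
            f"annotation specs ({len(annotation_spec_names)})."
        )
    for row in matrix:
        if len(row) != len(matrix):
            raise ValueError("Each confusion matrix row must have one entry per annotation spec.")


validate_confusion_matrix([[9, 1], [1, 9]], ["cat", "dog"])  # passes
```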