Skip to content

Commit

Permalink
[MAINTENANCE] Add Metric and Domain base classes (#10920)
Browse files Browse the repository at this point in the history
  • Loading branch information
NathanFarmer authored Feb 13, 2025
1 parent df0a2d0 commit c6c3cc4
Show file tree
Hide file tree
Showing 9 changed files with 437 additions and 0 deletions.
3 changes: 3 additions & 0 deletions great_expectations/metrics/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
## Notes

To enable auto-complete for metrics in VS Code or PyCharm, you need to have a mypy plugin installed.
1 change: 1 addition & 0 deletions great_expectations/metrics/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .column_values.between import ColumnValuesBetween
Empty file.
12 changes: 12 additions & 0 deletions great_expectations/metrics/column_values/between.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from typing import Optional

from great_expectations.core.types import Comparable
from great_expectations.metrics.domain import ColumnValues
from great_expectations.metrics.metric import Metric


class ColumnValuesBetween(Metric, ColumnValues):
    """Metric for row-level between-bounds checks on a single column.

    By the naming convention implemented in ``Metric._get_metric_name``, this
    class registers under the metric name ``"column_values.between"``.

    Attributes:
        min_value: Lower bound for the check; ``None`` leaves it unbounded below.
        max_value: Upper bound for the check; ``None`` leaves it unbounded above.
        strict_min: presumably makes the lower bound exclusive — confirm against
            the metric provider implementation.
        strict_max: presumably makes the upper bound exclusive — confirm against
            the metric provider implementation.
    """

    min_value: Optional[Comparable] = None
    max_value: Optional[Comparable] = None
    strict_min: bool = False
    strict_max: bool = False
60 changes: 60 additions & 0 deletions great_expectations/metrics/domain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from typing import Annotated, Optional

from great_expectations.compatibility.pydantic import BaseModel, Field, StrictStr

NonEmptyString = Annotated[StrictStr, Field(min_length=1)]


class AbstractClassInstantiationError(TypeError):
    """Raised when an abstract base class is instantiated directly."""

    def __init__(self, class_name: str) -> None:
        message = f"Cannot instantiate abstract class `{class_name}`."
        super().__init__(message)


class Domain(BaseModel):
    """Abstract base class for all domain types over which metrics are computed.

    Direct instantiation is forbidden; only concrete subclasses may be created.
    """

    batch_id: NonEmptyString

    def __new__(cls, *args, **kwargs):
        # Guard clause: only subclasses may be instantiated, never Domain itself.
        if cls is not Domain:
            return super().__new__(cls)
        raise AbstractClassInstantiationError(cls.__name__)


class Values(Domain):
    """Abstract base class for domain types that compute row-level calculations.

    Cannot be instantiated directly; use a concrete subclass such as ColumnValues.
    """

    table: NonEmptyString
    row_condition: Optional[StrictStr] = None

    def __new__(cls, *args, **kwargs):
        # Guard clause: reject direct instantiation of this abstract class.
        if cls is not Values:
            return super().__new__(cls)
        raise AbstractClassInstantiationError(cls.__name__)


class ColumnValues(Values):
    """A domain type for metrics that compute row-level calculations on a single column.

    The ColumnValues domain type is used to define metrics that evaluate conditions
    or compute values for each row in a single column. This class is intended to be
    used as a mixin with the Metric class when defining a new Metric.

    Attributes:
        batch_id (str): Unique identifier for the batch being processed.
        table (str): Name of the table containing the column.
        column (str): Name of the column to compute metrics on.
        row_condition (Optional[str]): A condition that can be used to filter rows.
            See: https://docs.greatexpectations.io/docs/core/customize_expectations/expectation_conditions/#create-an-expectation-condition

    Examples:
        A metric with a ColumnValues domain for column nullity values computed on each row:

        >>> class Null(Metric, ColumnValues):
        ...     ...

    See Also:
        Metric: The abstract base class for defining all metrics
    """

    column: NonEmptyString
148 changes: 148 additions & 0 deletions great_expectations/metrics/metric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import re
from typing import ClassVar, Final

from typing_extensions import dataclass_transform

from great_expectations.compatibility.pydantic import BaseModel, ModelMetaclass, root_validator
from great_expectations.metrics.domain import AbstractClassInstantiationError, Domain
from great_expectations.validator.metric_configuration import (
MetricConfiguration,
MetricConfigurationID,
)

ALLOWABLE_METRIC_MIXINS: Final[int] = 1


class MixinTypeError(TypeError):
    """Raised when a Metric subclass does not declare exactly one Domain mixin."""

    def __init__(self, class_name: str, mixin_superclass_name: str) -> None:
        message = (
            f"`{class_name}` must use a single "
            f"`{mixin_superclass_name}` subclass mixin."
        )
        super().__init__(message)


@dataclass_transform()
class MetaMetric(ModelMetaclass):
    """Metaclass enforcing that every concrete Metric declares one Domain mixin."""

    def __new__(cls, name, bases, attrs):
        # The Metric base class itself is exempt from the mixin requirement.
        if name != "Metric":
            expected_base_count = ALLOWABLE_METRIC_MIXINS + 1
            has_domain_mixin = any(issubclass(base_type, Domain) for base_type in bases)
            if len(bases) != expected_base_count or not has_domain_mixin:
                raise MixinTypeError(name, "Domain")
        return super().__new__(cls, name, bases, attrs)


class Metric(BaseModel, metaclass=MetaMetric):
    """The abstract base class for defining all metrics.

    A Metric represents a measurable property that can be computed over a specific
    domain of data (e.g., a column, table, or column pair). All concrete metric
    implementations must inherit from this class and specify their domain type as
    a mixin.

    Examples:
        A metric for column nullity values computed on each row:

        >>> class Null(Metric, ColumnValues):
        ...     ...

        A metric for a single table row count value:

        >>> class RowCount(Metric, Table):
        ...     ...

    Notes:
        - The Metric class cannot be instantiated directly - it must be subclassed.
        - Subclasses must specify a single Domain type as a mixin (enforced by
          MetaMetric at class-creation time).
        - Once Metrics are instantiated, they are immutable (Config.frozen).

    See Also:
        Domain: The base class for all domain types
        MetricConfiguration: Configuration class for metric computation
    """

    # Registry name of the metric (e.g. "column_values.between"); computed from
    # the class and Domain-mixin names in __new__ via _get_metric_name.
    name: ClassVar[str]
    # Low-level configuration used to compute the metric; auto-populated by the
    # _set_config root validator when the caller does not supply one.
    config: MetricConfiguration

    class Config:
        # NOTE(review): presumably required so the non-pydantic
        # MetricConfiguration can be used as a field type — confirm.
        arbitrary_types_allowed = True
        # Instances are immutable after construction.
        frozen = True

    def __new__(cls, *args, **kwargs):
        # Block direct instantiation of the abstract base class.
        if cls is Metric:
            raise AbstractClassInstantiationError(cls.__name__)
        # Set the registry name on the concrete class before __init__ runs the
        # validators; _to_config (called from _set_config) reads cls.name.
        cls.name = cls._get_metric_name()
        return super().__new__(cls)

    @root_validator(pre=True)
    @classmethod
    def _set_config(cls, values: dict) -> dict:
        """Populate the `config` field from the raw input values when absent."""
        if "config" not in values or values["config"] is None:
            values["config"] = cls._to_config(values)
        return values

    @property
    def id(self) -> MetricConfigurationID:
        """The unique identifier of this metric's configuration."""
        return self.config.id

    @staticmethod
    def _pascal_to_snake(class_name: str) -> str:
        """Convert a PascalCase class name to snake_case."""
        # Adds an underscore between a sequence of uppercase letters and an uppercase-lowercase pair
        # APIFunctionMetric -> API_FunctionMetric
        class_name = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1_\2", class_name)
        # Adds an underscore between a lowercase letter/digit and an uppercase letter
        # APIFunctionMetric -> API_Function_Metric
        class_name = re.sub(r"([a-z\d])([A-Z])", r"\1_\2", class_name)
        # Convert the entire string to lowercase
        # API_Function_Metric -> api_function_metric
        return class_name.lower()

    @classmethod
    def _get_metric_name(cls) -> str:
        """The name of the metric as it exists in the registry.

        Returns "<domain_snake>.<metric_snake minus the domain prefix>", e.g.
        ColumnValuesBetween with the ColumnValues mixin -> "column_values.between".
        """
        for base_type in cls.__bases__:
            if issubclass(base_type, Domain):
                domain_class_name = str(base_type.__name__)
                metric_class_name = str(cls.__name__)
                domain_class_snake_case = Metric._pascal_to_snake(domain_class_name)
                metric_class_snake_case = Metric._pascal_to_snake(metric_class_name)
                # the convention is that the metric class name includes the domain class name
                # but the metric names don't repeat the domain name, so we remove it
                return ".".join(
                    [
                        domain_class_snake_case,
                        metric_class_snake_case.replace(domain_class_snake_case, "").strip("_"),
                    ]
                )

        # this should never be reached
        # that a Domain exists in __bases__ should have been confirmed in MetaMetric.__new__
        raise MixinTypeError(cls.__name__, "Domain")

    @classmethod
    def _to_config(cls, model_values: dict) -> MetricConfiguration:
        """Returns a MetricConfiguration instance for this Metric.

        Fields declared on the Domain mixin become metric_domain_kwargs; fields
        declared only on the concrete subclass (on neither the mixin nor Metric
        itself) become metric_value_kwargs.
        """
        metric_domain_kwargs = {}
        metric_value_kwargs = {}
        for base_type in cls.__bases__:
            if issubclass(base_type, Domain):
                domain_fields = base_type.__fields__
                metric_fields = Metric.__fields__
                # Value kwargs: fields unique to the concrete metric subclass.
                value_fields = {
                    field_name: field_info
                    for field_name, field_info in cls.__fields__.items()
                    if field_name not in domain_fields and field_name not in metric_fields
                }
                for field_name, field_info in domain_fields.items():
                    metric_domain_kwargs[field_name] = model_values.get(
                        field_name, field_info.default
                    )
                for field_name, field_info in value_fields.items():
                    metric_value_kwargs[field_name] = model_values.get(
                        field_name, field_info.default
                    )

        return MetricConfiguration(
            metric_name=cls.name,
            metric_domain_kwargs=metric_domain_kwargs,
            metric_value_kwargs=metric_value_kwargs,
        )
Empty file added tests/metrics/__init__.py
Empty file.
51 changes: 51 additions & 0 deletions tests/metrics/test_domain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import pytest

from great_expectations.compatibility.pydantic import ValidationError, errors
from great_expectations.metrics.domain import (
AbstractClassInstantiationError,
ColumnValues,
Domain,
Values,
)

BATCH_ID = "my_data_source-my_data_asset-year_2025"
TABLE = "my_table"
COLUMN = "my_column"


class TestAbstractClasses:
    """Verify that the abstract domain classes reject direct instantiation."""

    @pytest.mark.unit
    def test_domain_instantiation_raises(self):
        # Domain is abstract: constructing it must raise even with valid kwargs.
        pytest.raises(AbstractClassInstantiationError, Domain, batch_id=BATCH_ID)

    @pytest.mark.unit
    def test_values_instantiation_raises(self):
        # Values is abstract: constructing it must raise even with valid kwargs.
        pytest.raises(AbstractClassInstantiationError, Values, batch_id=BATCH_ID, table=TABLE)


class TestColumnMap:
    """Tests for the concrete ColumnValues domain type."""

    @pytest.mark.unit
    def test_column_map_instantiation_success(self):
        # ColumnValues is concrete, so instantiation with valid kwargs succeeds.
        ColumnValues(batch_id=BATCH_ID, table=TABLE, column=COLUMN)

    @pytest.mark.unit
    @pytest.mark.parametrize(
        "kwargs",
        [
            {"batch_id": "", "table": TABLE, "column": COLUMN},
            {"batch_id": BATCH_ID, "table": "", "column": COLUMN},
            {"batch_id": BATCH_ID, "table": TABLE, "column": ""},
        ],
    )
    def test_column_map_arguments_empty_string_raises(self, kwargs: dict):
        """Each NonEmptyString field must reject the empty string."""
        with pytest.raises(ValidationError) as e:
            ColumnValues(**kwargs)
        all_errors = e.value.raw_errors
        # At least one raw error must wrap a min-length violation.
        # (Fix: the original wrapped this condition in a redundant
        # `True if ... else False` ternary; the boolean expression suffices.)
        assert any(
            hasattr(error, "exc") and isinstance(error.exc, errors.AnyStrMinLengthError)
            for error in all_errors
        )
Loading

0 comments on commit c6c3cc4

Please sign in to comment.