Skip to content

Commit

Permalink
[MAINTENANCE] Add Metric and Domain base classes (#10920)
Browse files Browse the repository at this point in the history
  • Loading branch information
NathanFarmer authored Feb 13, 2025
1 parent df0a2d0 commit c6c3cc4
Show file tree
Hide file tree
Showing 9 changed files with 437 additions and 0 deletions.
3 changes: 3 additions & 0 deletions great_expectations/metrics/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
## Notes

To enable auto-complete for metrics in VS Code or PyCharm, you need to have a mypy plugin installed.
1 change: 1 addition & 0 deletions great_expectations/metrics/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .column_values.between import ColumnValuesBetween
Empty file.
12 changes: 12 additions & 0 deletions great_expectations/metrics/column_values/between.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from typing import Optional

from great_expectations.core.types import Comparable
from great_expectations.metrics.domain import ColumnValues
from great_expectations.metrics.metric import Metric


class ColumnValuesBetween(Metric, ColumnValues):
    """Metric for row-level between-bounds checks on a single column.

    By the naming convention implemented in ``Metric._get_metric_name``, this
    class registers under the metric name ``"column_values.between"``.

    Attributes:
        min_value: Lower bound for the check; ``None`` leaves it unbounded below.
        max_value: Upper bound for the check; ``None`` leaves it unbounded above.
        strict_min: presumably makes the lower bound exclusive — confirm against
            the metric provider implementation.
        strict_max: presumably makes the upper bound exclusive — confirm against
            the metric provider implementation.
    """

    min_value: Optional[Comparable] = None
    max_value: Optional[Comparable] = None
    strict_min: bool = False
    strict_max: bool = False
60 changes: 60 additions & 0 deletions great_expectations/metrics/domain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from typing import Annotated, Optional

from great_expectations.compatibility.pydantic import BaseModel, Field, StrictStr

NonEmptyString = Annotated[StrictStr, Field(min_length=1)]


class AbstractClassInstantiationError(TypeError):
    """Raised when an abstract base class is instantiated directly."""

    def __init__(self, class_name: str) -> None:
        message = f"Cannot instantiate abstract class `{class_name}`."
        super().__init__(message)


class Domain(BaseModel):
    """Abstract base class for all domain types over which metrics are computed.

    Direct instantiation is forbidden; only concrete subclasses may be created.
    """

    batch_id: NonEmptyString

    def __new__(cls, *args, **kwargs):
        # Guard clause: only subclasses may be instantiated, never Domain itself.
        if cls is not Domain:
            return super().__new__(cls)
        raise AbstractClassInstantiationError(cls.__name__)


class Values(Domain):
    """Abstract base class for domain types that compute row-level calculations.

    Cannot be instantiated directly; use a concrete subclass such as ColumnValues.
    """

    table: NonEmptyString
    row_condition: Optional[StrictStr] = None

    def __new__(cls, *args, **kwargs):
        # Guard clause: reject direct instantiation of this abstract class.
        if cls is not Values:
            return super().__new__(cls)
        raise AbstractClassInstantiationError(cls.__name__)


class ColumnValues(Values):
    """A domain type for metrics that compute row-level calculations on a single column.

    The ColumnValues domain type is used to define metrics that evaluate conditions
    or compute values for each row in a single column. This class is intended to be
    used as a mixin with the Metric class when defining a new Metric.

    Attributes:
        batch_id (str): Unique identifier for the batch being processed.
        table (str): Name of the table containing the column.
        column (str): Name of the column to compute metrics on.
        row_condition (Optional[str]): A condition that can be used to filter rows.
            See: https://docs.greatexpectations.io/docs/core/customize_expectations/expectation_conditions/#create-an-expectation-condition

    Examples:
        A metric with a ColumnValues domain for column nullity values computed on each row:

        >>> class Null(Metric, ColumnValues):
        ...     ...

    See Also:
        Metric: The abstract base class for defining all metrics
    """

    column: NonEmptyString
148 changes: 148 additions & 0 deletions great_expectations/metrics/metric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import re
from typing import ClassVar, Final

from typing_extensions import dataclass_transform

from great_expectations.compatibility.pydantic import BaseModel, ModelMetaclass, root_validator
from great_expectations.metrics.domain import AbstractClassInstantiationError, Domain
from great_expectations.validator.metric_configuration import (
MetricConfiguration,
MetricConfigurationID,
)

ALLOWABLE_METRIC_MIXINS: Final[int] = 1


class MixinTypeError(TypeError):
    """Raised when a Metric subclass does not declare exactly one Domain mixin."""

    def __init__(self, class_name: str, mixin_superclass_name: str) -> None:
        message = (
            f"`{class_name}` must use a single "
            f"`{mixin_superclass_name}` subclass mixin."
        )
        super().__init__(message)


@dataclass_transform()
class MetaMetric(ModelMetaclass):
    """Metaclass enforcing that every concrete Metric declares one Domain mixin."""

    def __new__(cls, name, bases, attrs):
        # The Metric base class itself is exempt from the mixin requirement.
        if name != "Metric":
            expected_base_count = ALLOWABLE_METRIC_MIXINS + 1
            has_domain_mixin = any(issubclass(base_type, Domain) for base_type in bases)
            if len(bases) != expected_base_count or not has_domain_mixin:
                raise MixinTypeError(name, "Domain")
        return super().__new__(cls, name, bases, attrs)


class Metric(BaseModel, metaclass=MetaMetric):
    """The abstract base class for defining all metrics.

    A Metric represents a measurable property that can be computed over a specific
    domain of data (e.g., a column, table, or column pair). All concrete metric
    implementations must inherit from this class and specify their domain type as
    a mixin.

    Examples:
        A metric for column nullity values computed on each row:

        >>> class Null(Metric, ColumnValues):
        ...     ...

        A metric for a single table row count value:

        >>> class RowCount(Metric, Table):
        ...     ...

    Notes:
        - The Metric class cannot be instantiated directly - it must be subclassed.
        - Subclasses must specify a single Domain type as a mixin (enforced by
          MetaMetric at class-creation time).
        - Once Metrics are instantiated, they are immutable (Config.frozen).

    See Also:
        Domain: The base class for all domain types
        MetricConfiguration: Configuration class for metric computation
    """

    # Registry name of the metric (e.g. "column_values.between"); computed from
    # the class and Domain-mixin names in __new__ via _get_metric_name.
    name: ClassVar[str]
    # Low-level configuration used to compute the metric; auto-populated by the
    # _set_config root validator when the caller does not supply one.
    config: MetricConfiguration

    class Config:
        # NOTE(review): presumably required so the non-pydantic
        # MetricConfiguration can be used as a field type — confirm.
        arbitrary_types_allowed = True
        # Instances are immutable after construction.
        frozen = True

    def __new__(cls, *args, **kwargs):
        # Block direct instantiation of the abstract base class.
        if cls is Metric:
            raise AbstractClassInstantiationError(cls.__name__)
        # Set the registry name on the concrete class before __init__ runs the
        # validators; _to_config (called from _set_config) reads cls.name.
        cls.name = cls._get_metric_name()
        return super().__new__(cls)

    @root_validator(pre=True)
    @classmethod
    def _set_config(cls, values: dict) -> dict:
        """Populate the `config` field from the raw input values when absent."""
        if "config" not in values or values["config"] is None:
            values["config"] = cls._to_config(values)
        return values

    @property
    def id(self) -> MetricConfigurationID:
        """The unique identifier of this metric's configuration."""
        return self.config.id

    @staticmethod
    def _pascal_to_snake(class_name: str) -> str:
        """Convert a PascalCase class name to snake_case."""
        # Adds an underscore between a sequence of uppercase letters and an uppercase-lowercase pair
        # APIFunctionMetric -> API_FunctionMetric
        class_name = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1_\2", class_name)
        # Adds an underscore between a lowercase letter/digit and an uppercase letter
        # APIFunctionMetric -> API_Function_Metric
        class_name = re.sub(r"([a-z\d])([A-Z])", r"\1_\2", class_name)
        # Convert the entire string to lowercase
        # API_Function_Metric -> api_function_metric
        return class_name.lower()

    @classmethod
    def _get_metric_name(cls) -> str:
        """The name of the metric as it exists in the registry.

        Returns "<domain_snake>.<metric_snake minus the domain prefix>", e.g.
        ColumnValuesBetween with the ColumnValues mixin -> "column_values.between".
        """
        for base_type in cls.__bases__:
            if issubclass(base_type, Domain):
                domain_class_name = str(base_type.__name__)
                metric_class_name = str(cls.__name__)
                domain_class_snake_case = Metric._pascal_to_snake(domain_class_name)
                metric_class_snake_case = Metric._pascal_to_snake(metric_class_name)
                # the convention is that the metric class name includes the domain class name
                # but the metric names don't repeat the domain name, so we remove it
                return ".".join(
                    [
                        domain_class_snake_case,
                        metric_class_snake_case.replace(domain_class_snake_case, "").strip("_"),
                    ]
                )

        # this should never be reached
        # that a Domain exists in __bases__ should have been confirmed in MetaMetric.__new__
        raise MixinTypeError(cls.__name__, "Domain")

    @classmethod
    def _to_config(cls, model_values: dict) -> MetricConfiguration:
        """Returns a MetricConfiguration instance for this Metric.

        Fields declared on the Domain mixin become metric_domain_kwargs; fields
        declared only on the concrete subclass (on neither the mixin nor Metric
        itself) become metric_value_kwargs.
        """
        metric_domain_kwargs = {}
        metric_value_kwargs = {}
        for base_type in cls.__bases__:
            if issubclass(base_type, Domain):
                domain_fields = base_type.__fields__
                metric_fields = Metric.__fields__
                # Value kwargs: fields unique to the concrete metric subclass.
                value_fields = {
                    field_name: field_info
                    for field_name, field_info in cls.__fields__.items()
                    if field_name not in domain_fields and field_name not in metric_fields
                }
                for field_name, field_info in domain_fields.items():
                    metric_domain_kwargs[field_name] = model_values.get(
                        field_name, field_info.default
                    )
                for field_name, field_info in value_fields.items():
                    metric_value_kwargs[field_name] = model_values.get(
                        field_name, field_info.default
                    )

        return MetricConfiguration(
            metric_name=cls.name,
            metric_domain_kwargs=metric_domain_kwargs,
            metric_value_kwargs=metric_value_kwargs,
        )
Empty file added tests/metrics/__init__.py
Empty file.
51 changes: 51 additions & 0 deletions tests/metrics/test_domain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import pytest

from great_expectations.compatibility.pydantic import ValidationError, errors
from great_expectations.metrics.domain import (
AbstractClassInstantiationError,
ColumnValues,
Domain,
Values,
)

BATCH_ID = "my_data_source-my_data_asset-year_2025"
TABLE = "my_table"
COLUMN = "my_column"


class TestAbstractClasses:
    """Verify that the abstract domain classes reject direct instantiation."""

    @pytest.mark.unit
    def test_domain_instantiation_raises(self):
        # Domain is abstract: constructing it must raise even with valid kwargs.
        pytest.raises(AbstractClassInstantiationError, Domain, batch_id=BATCH_ID)

    @pytest.mark.unit
    def test_values_instantiation_raises(self):
        # Values is abstract: constructing it must raise even with valid kwargs.
        pytest.raises(AbstractClassInstantiationError, Values, batch_id=BATCH_ID, table=TABLE)


class TestColumnMap:
    """Tests for the concrete ColumnValues domain type."""

    @pytest.mark.unit
    def test_column_map_instantiation_success(self):
        # ColumnValues is concrete, so instantiation with valid kwargs succeeds.
        ColumnValues(batch_id=BATCH_ID, table=TABLE, column=COLUMN)

    @pytest.mark.unit
    @pytest.mark.parametrize(
        "kwargs",
        [
            {"batch_id": "", "table": TABLE, "column": COLUMN},
            {"batch_id": BATCH_ID, "table": "", "column": COLUMN},
            {"batch_id": BATCH_ID, "table": TABLE, "column": ""},
        ],
    )
    def test_column_map_arguments_empty_string_raises(self, kwargs: dict):
        """Each NonEmptyString field must reject the empty string."""
        with pytest.raises(ValidationError) as e:
            ColumnValues(**kwargs)
        all_errors = e.value.raw_errors
        # At least one raw error must wrap a min-length violation.
        # (Fix: the original wrapped this condition in a redundant
        # `True if ... else False` ternary; the boolean expression suffices.)
        assert any(
            hasattr(error, "exc") and isinstance(error.exc, errors.AnyStrMinLengthError)
            for error in all_errors
        )
Loading

0 comments on commit c6c3cc4

Please sign in to comment.