diff --git a/python/pyiceberg/transforms.py b/python/pyiceberg/transforms.py index fdb4978d60f3..270918e67b41 100644 --- a/python/pyiceberg/transforms.py +++ b/python/pyiceberg/transforms.py @@ -18,6 +18,7 @@ import base64 import struct from abc import ABC, abstractmethod +from enum import IntEnum from functools import singledispatch from typing import ( Any, @@ -58,6 +59,10 @@ VOID = "void" BUCKET = "bucket" TRUNCATE = "truncate" +YEAR = "year" +MONTH = "month" +DAY = "day" +HOUR = "hour" BUCKET_PARSER = ParseNumberFromBrackets(BUCKET) TRUNCATE_PARSER = ParseNumberFromBrackets(TRUNCATE) @@ -92,6 +97,14 @@ def validate(cls, v: Any): return BucketTransform(num_buckets=BUCKET_PARSER.match(v)) elif v.startswith(TRUNCATE): return TruncateTransform(width=TRUNCATE_PARSER.match(v)) + elif v == YEAR: + return YearTransform() + elif v == MONTH: + return MonthTransform() + elif v == DAY: + return DayTransform() + elif v == HOUR: + return HourTransform() else: return UnknownTransform(transform=v) return v @@ -141,7 +154,6 @@ class BucketTransform(Transform[S, int]): num_buckets (int): The number of buckets. """ - _source_type: IcebergType = PrivateAttr() _num_buckets: PositiveInt = PrivateAttr() def __init__(self, num_buckets: int, **data: Any): @@ -215,6 +227,217 @@ def __repr__(self) -> str: return f"BucketTransform(num_buckets={self._num_buckets})" +class TimeResolution(IntEnum): + YEAR = 6 + MONTH = 5 + WEEK = 4 + DAY = 3 + HOUR = 2 + MINUTE = 1 + SECOND = 0 + + +class TimeTransform(Transform[S, int], Singleton): + @property + @abstractmethod + def granularity(self) -> TimeResolution: + ... + + def satisfies_order_of(self, other: Transform) -> bool: + return self.granularity <= other.granularity if hasattr(other, "granularity") else False + + def result_type(self, source: IcebergType) -> IcebergType: + return IntegerType() + + @property + def dedup_name(self) -> str: + return "time" + + @property + def preserves_order(self) -> bool: + return True + + +class YearTransform(TimeTransform): + """Transforms a datetime value into a year value. + + Example: + >>> transform = YearTransform() + >>> transform.transform(TimestampType())(1512151975038194) + 47 + """ + + __root__: Literal["year"] = Field(default="year") + + def transform(self, source: IcebergType) -> Callable[[Optional[S]], Optional[int]]: + source_type = type(source) + if source_type == DateType: + + def year_func(v): + return datetime.days_to_years(v) + + elif source_type in {TimestampType, TimestamptzType}: + + def year_func(v): + return datetime.micros_to_years(v) + + else: + raise ValueError(f"Cannot apply year transform for type: {source}") + + return lambda v: year_func(v) if v is not None else None + + def can_transform(self, source: IcebergType) -> bool: + return type(source) in { + DateType, + TimestampType, + TimestamptzType, + } + + @property + def granularity(self) -> TimeResolution: + return TimeResolution.YEAR + + def to_human_string(self, _: IcebergType, value: Optional[S]) -> str: + return datetime.to_human_year(value) if isinstance(value, int) else "null" + + def __repr__(self) -> str: + return "YearTransform()" + + +class MonthTransform(TimeTransform): + """Transforms a datetime value into a month value. + + Example: + >>> transform = MonthTransform() + >>> transform.transform(DateType())(17501) + 575 + """ + + __root__: Literal["month"] = Field(default="month") + + def transform(self, source: IcebergType) -> Callable[[Optional[S]], Optional[int]]: + source_type = type(source) + if source_type == DateType: + + def month_func(v): + return datetime.days_to_months(v) + + elif source_type in {TimestampType, TimestamptzType}: + + def month_func(v): + return datetime.micros_to_months(v) + + else: + raise ValueError(f"Cannot apply month transform for type: {source}") + + return lambda v: month_func(v) if v else None + + def can_transform(self, source: IcebergType) -> bool: + return type(source) in { + DateType, + TimestampType, + TimestamptzType, + } + + @property + def granularity(self) -> TimeResolution: + return TimeResolution.MONTH + + def to_human_string(self, _: IcebergType, value: Optional[S]) -> str: + return datetime.to_human_month(value) if isinstance(value, int) else "null" + + def __repr__(self) -> str: + return "MonthTransform()" + + +class DayTransform(TimeTransform): + """Transforms a datetime value into a day value. + + Example: + >>> transform = MonthTransform() + >>> transform.transform(DateType())(17501) + 17501 + """ + + __root__: Literal["day"] = Field(default="day") + + def transform(self, source: IcebergType) -> Callable[[Optional[S]], Optional[int]]: + source_type = type(source) + if source_type == DateType: + + def day_func(v): + return v + + elif source_type in {TimestampType, TimestamptzType}: + + def day_func(v): + return datetime.micros_to_days(v) + + else: + raise ValueError(f"Cannot apply day transform for type: {source}") + + return lambda v: day_func(v) if v else None + + def can_transform(self, source: IcebergType) -> bool: + return type(source) in { + DateType, + TimestampType, + TimestamptzType, + } + + def result_type(self, source: IcebergType) -> IcebergType: + return DateType() + + @property + def granularity(self) -> TimeResolution: + return TimeResolution.DAY + + def to_human_string(self, _: IcebergType, value: Optional[S]) -> str: + return datetime.to_human_day(value) if isinstance(value, int) else "null" + + def __repr__(self) -> str: + return "DayTransform()" + + +class HourTransform(TimeTransform): + """Transforms a datetime value into a hour value. + + Example: + >>> transform = HourTransform() + >>> transform.transform(TimestampType())(1512151975038194) + 420042 + """ + + __root__: Literal["hour"] = Field(default="hour") + + def transform(self, source: IcebergType) -> Callable[[Optional[S]], Optional[int]]: + if type(source) in {TimestampType, TimestamptzType}: + + def hour_func(v): + return datetime.micros_to_hours(v) + + else: + raise ValueError(f"Cannot apply hour transform for type: {source}") + + return lambda v: hour_func(v) if v else None + + def can_transform(self, source: IcebergType) -> bool: + return type(source) in { + TimestampType, + TimestamptzType, + } + + @property + def granularity(self) -> TimeResolution: + return TimeResolution.HOUR + + def to_human_string(self, _: IcebergType, value: Optional[S]) -> str: + return datetime.to_human_hour(value) if isinstance(value, int) else "null" + + def __repr__(self) -> str: + return "HourTransform()" + + def _base64encode(buffer: bytes) -> str: """Converts bytes to base64 string""" return base64.b64encode(buffer).decode("ISO-8859-1") @@ -230,7 +453,6 @@ class IdentityTransform(Transform[S, S]): """ __root__: Literal["identity"] = Field(default="identity") - _source_type: IcebergType = PrivateAttr() def transform(self, source: IcebergType) -> Callable[[Optional[S]], Optional[S]]: return lambda v: v @@ -389,7 +611,6 @@ class UnknownTransform(Transform): """ __root__: Literal["unknown"] = Field(default="unknown") - _source_type: IcebergType = PrivateAttr() _transform: str = PrivateAttr() def __init__(self, transform: str, **data: Any): diff --git a/python/pyiceberg/utils/datetime.py b/python/pyiceberg/utils/datetime.py index 9e491ce9ce69..5bfcf772fef1 100644 --- a/python/pyiceberg/utils/datetime.py +++ b/python/pyiceberg/utils/datetime.py @@ -98,11 +98,26 @@ def micros_to_timestamptz(micros: int): return EPOCH_TIMESTAMPTZ + dt +def to_human_year(year_ordinal: int) -> str: + """Converts a DateType value to human string""" + return f"{EPOCH_TIMESTAMP.year + year_ordinal:0=4d}" + + +def to_human_month(month_ordinal: int) -> str: + """Converts a DateType value to human string""" + return f"{EPOCH_TIMESTAMP.year + month_ordinal // 12:0=4d}-{1 + month_ordinal % 12:0=2d}" + + def to_human_day(day_ordinal: int) -> str: """Converts a DateType value to human string""" return (EPOCH_DATE + timedelta(days=day_ordinal)).isoformat() +def to_human_hour(hour_ordinal: int) -> str: + """Converts a DateType value to human string""" + return (EPOCH_TIMESTAMP + timedelta(hours=hour_ordinal)).isoformat("-", "hours") + + def to_human_time(micros_from_midnight: int) -> str: """Converts a TimeType value to human string""" return micros_to_time(micros_from_midnight).isoformat() @@ -116,3 +131,30 @@ def to_human_timestamptz(timestamp_micros: int) -> str: def to_human_timestamp(timestamp_micros: int) -> str: """Converts a TimestampType value to human string""" return (EPOCH_TIMESTAMP + timedelta(microseconds=timestamp_micros)).isoformat() + + +def micros_to_hours(timestamp: int) -> int: + """Converts a timestamp in microseconds to a date in hours""" + return int((datetime.utcfromtimestamp(timestamp // 1_000_000) - EPOCH_TIMESTAMP).total_seconds() / 3600) + + +def days_to_months(days: int) -> int: + """Creates a date from the number of days from 1970-01-01""" + d = days_to_date(days) + return (d.year - EPOCH_DATE.year) * 12 + (d.month - EPOCH_DATE.month) + + +def micros_to_months(timestamp: int) -> int: + dt = micros_to_timestamp(timestamp) + return (dt.year - EPOCH_TIMESTAMP.year) * 12 + (dt.month - EPOCH_TIMESTAMP.month) - (1 if dt.day < EPOCH_TIMESTAMP.day else 0) + + +def days_to_years(days: int) -> int: + return days_to_date(days).year - EPOCH_DATE.year + + +def micros_to_years(timestamp: int) -> int: + dt = micros_to_timestamp(timestamp) + return (dt.year - EPOCH_TIMESTAMP.year) - ( + 1 if dt.month < EPOCH_TIMESTAMP.month or (dt.month == EPOCH_TIMESTAMP.month and dt.day < EPOCH_TIMESTAMP.day) else 0 + ) diff --git a/python/tests/test_transforms.py b/python/tests/test_transforms.py index bdebae94e43b..eec3e4aebefb 100644 --- a/python/tests/test_transforms.py +++ b/python/tests/test_transforms.py @@ -24,11 +24,15 @@ from pyiceberg import transforms from pyiceberg.transforms import ( BucketTransform, + DayTransform, + HourTransform, IdentityTransform, + MonthTransform, Transform, TruncateTransform, UnknownTransform, VoidTransform, + YearTransform, ) from pyiceberg.types import ( BinaryType, @@ -139,6 +143,125 @@ def test_string_with_surrogate_pair(): assert bucket_transform(string_with_surrogate_pair) == mmh3.hash(as_bytes) +@pytest.mark.parametrize( + "date,date_transform,expected", + [ + (47, YearTransform(), "2017"), + (575, MonthTransform(), "2017-12"), + (17501, DayTransform(), "2017-12-01"), + ], +) +def test_date_to_human_string(date, date_transform, expected): + assert date_transform.to_human_string(DateType(), date) == expected + + +@pytest.mark.parametrize( + "date_transform", + [ + YearTransform(), + MonthTransform(), + DayTransform(), + ], +) +def test_none_date_to_human_string(date_transform): + assert date_transform.to_human_string(DateType(), None) == "null" + + +def test_hour_to_human_string(): + assert HourTransform().to_human_string(TimestampType(), None) == "null" + assert HourTransform().to_human_string(TimestampType(), 420042) == "2017-12-01-18" + + +@pytest.mark.parametrize( + "negative_value,time_transform,expected", + [ + (-1, YearTransform(), "1969"), + (-1, MonthTransform(), "1969-12"), + (-1, DayTransform(), "1969-12-31"), + (-1, HourTransform(), "1969-12-31-23"), + ], +) +def test_negative_value_to_human_string(negative_value, time_transform, expected): + assert time_transform.to_human_string(TimestampType(), negative_value) == expected + + +@pytest.mark.parametrize( + "type_var", + [ + DateType(), + TimestampType(), + TimestamptzType(), + ], +) +def test_time_methods(type_var): + assert YearTransform().can_transform(type_var) + assert MonthTransform().can_transform(type_var) + assert DayTransform().can_transform(type_var) + assert YearTransform().preserves_order + assert MonthTransform().preserves_order + assert DayTransform().preserves_order + assert YearTransform().result_type(type_var) == IntegerType() + assert MonthTransform().result_type(type_var) == IntegerType() + assert DayTransform().result_type(type_var) == DateType() + assert YearTransform().dedup_name == "time" + assert MonthTransform().dedup_name == "time" + assert DayTransform().dedup_name == "time" + + +@pytest.mark.parametrize( + "transform,type_var,value,expected", + [ + (DayTransform(), DateType(), 17501, 17501), + (DayTransform(), DateType(), -1, -1), + (MonthTransform(), DateType(), 17501, 575), + (MonthTransform(), DateType(), -1, -1), + (YearTransform(), DateType(), 17501, 47), + (YearTransform(), DateType(), -1, -1), + (YearTransform(), TimestampType(), 1512151975038194, 47), + (YearTransform(), TimestampType(), -1, -1), + (MonthTransform(), TimestamptzType(), 1512151975038194, 575), + (MonthTransform(), TimestamptzType(), -1, -1), + (DayTransform(), TimestampType(), 1512151975038194, 17501), + (DayTransform(), TimestampType(), -1, -1), + ], +) +def test_time_apply_method(transform, type_var, value, expected): + assert transform.transform(type_var)(value) == expected + + +@pytest.mark.parametrize( + "type_var", + [ + TimestampType(), + TimestamptzType(), + ], +) +def test_hour_method(type_var): + assert HourTransform().can_transform(type_var) + assert HourTransform().result_type(type_var) == IntegerType() + assert HourTransform().transform(type_var)(1512151975038194) == 420042 + assert HourTransform().dedup_name == "time" + + +@pytest.mark.parametrize( + "transform,other_transform", + [ + (YearTransform(), MonthTransform()), + (YearTransform(), DayTransform()), + (YearTransform(), HourTransform()), + (MonthTransform(), DayTransform()), + (MonthTransform(), HourTransform()), + (DayTransform(), HourTransform()), + ], +) +def test_satisfies_order_of_method(transform, other_transform): + assert transform.satisfies_order_of(transform) + assert other_transform.satisfies_order_of(transform) + assert not transform.satisfies_order_of(other_transform) + assert not transform.satisfies_order_of(VoidTransform()) + assert not other_transform.satisfies_order_of(IdentityTransform()) + + @pytest.mark.parametrize( "type_var,value,expected", [ @@ -318,3 +441,65 @@ def test_void_transform_str(): def test_void_transform_repr(): assert repr(VoidTransform()) == "VoidTransform()" + + +def test_year_transform_serialize(): + assert YearTransform().json() == '"year"' + + +def test_year_transform_deserialize(): + transform = TestType.parse_raw('"year"').__root__ + assert transform == YearTransform() + + +def test_month_transform_serialize(): + assert MonthTransform().json() == '"month"' + + +def test_month_transform_deserialize(): + transform = TestType.parse_raw('"month"').__root__ + assert transform == MonthTransform() + + +def test_day_transform_serialize(): + assert DayTransform().json() == '"day"' + + +def test_day_transform_deserialize(): + transform = TestType.parse_raw('"day"').__root__ + assert transform == DayTransform() + + +def test_hour_transform_serialize(): + assert HourTransform().json() == '"hour"' + + +def test_hour_transform_deserialize(): + transform = TestType.parse_raw('"hour"').__root__ + assert transform == HourTransform() + + +@pytest.mark.parametrize( + "transform,transform_str", + [ + (YearTransform(), "year"), + (MonthTransform(), "month"), + (DayTransform(), "day"), + (HourTransform(), "hour"), + ], +) +def test_datetime_transform_str(transform, transform_str): + assert str(transform) == transform_str + + +@pytest.mark.parametrize( + "transform,transform_repr", + [ + (YearTransform(), "YearTransform()"), + (MonthTransform(), "MonthTransform()"), + (DayTransform(), "DayTransform()"), + (HourTransform(), "HourTransform()"), + ], +) +def test_datetime_transform_repr(transform, transform_repr): + assert repr(transform) == transform_repr