Skip to content

Commit

Permalink
feat: add deterministic hash methods to all types (#573)
Browse files Browse the repository at this point in the history
### Summary of Changes

- feat: add deterministic `__hash__` methods to all types
- test: add tests to all new hash implementations
- feat: add `__eq__` method to `TaggedTable` and `TimeSeries`

The `__hash__` implementation is needed to check more efficiently
whether data may be the same.
A deterministic implementation is used (based on `xxhash`, a fast
non-cryptographic hash algorithm) to allow these comparisons to be
performed across different interpreters (processes).

The `__hash__` implementation of `Schema` was updated to be deterministic.
  • Loading branch information
WinPlay02 authored Mar 18, 2024
1 parent dbdf11e commit f6a3ca7
Show file tree
Hide file tree
Showing 24 changed files with 764 additions and 6 deletions.
122 changes: 119 additions & 3 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ scikit-learn = "^1.2.0"
seaborn = "^0.13.0"
torch = {version = "^2.2.0", source = "torch_cuda121"}
torchvision = {version = "^0.17.0", source = "torch_cuda121"}
xxhash = "^3.4.1"

[tool.poetry.group.dev.dependencies]
pytest = ">=7.2.1,<9.0.0"
Expand Down
12 changes: 12 additions & 0 deletions src/safeds/data/image/containers/_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import torch
import torch.nn.functional as func
import xxhash
from PIL.Image import open as pil_image_open
from torch import Tensor

Expand Down Expand Up @@ -109,6 +110,17 @@ def __eq__(self, other: object) -> bool:
and torch.all(torch.eq(self._image_tensor, other._set_device(self.device)._image_tensor)).item()
)

def __hash__(self) -> int:
    """
    Compute a deterministic hash value for this image.

    Only the width, the height, and the channel count are fed into the hash; the pixel data is not read. Images
    with identical dimensions therefore share a hash value even if their contents differ.

    Returns
    -------
    hash : int
        The xxh3 64-bit digest of the three dimensions, as an integer.
    """
    dimensions = (self.width, self.height, self.channel)
    # Each dimension is serialized as 8 big-endian bytes (big-endian is also the default of `int.to_bytes`).
    payload = b"".join(dimension.to_bytes(8, byteorder="big") for dimension in dimensions)
    return xxhash.xxh3_64(payload).intdigest()

def __sizeof__(self) -> int:
"""
Return the complete size of this object.
Expand Down
12 changes: 12 additions & 0 deletions src/safeds/data/tabular/containers/_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import numpy as np
import pandas as pd
import seaborn as sns
import xxhash

from safeds.data.image.containers import Image
from safeds.data.tabular.typing import ColumnType
Expand Down Expand Up @@ -191,6 +192,17 @@ def __getitem__(self, index: int | slice) -> T | Column[T]:
data = self._data[index].reset_index(drop=True).rename(self.name)
return Column._from_pandas_series(data, self._type)

def __hash__(self) -> int:
    """
    Compute a deterministic hash value for this column.

    The hash covers the column name, the textual representation of the column type, and the number of rows.
    The stored values themselves are not hashed.

    Returns
    -------
    hash : int
        The xxh3 64-bit digest of the pieces listed above, as an integer.
    """
    name_bytes = self.name.encode("utf-8")
    type_bytes = repr(self.type).encode("utf-8")
    # 8 big-endian bytes; big-endian is also the default byte order of `int.to_bytes`.
    row_count_bytes = self.number_of_rows.to_bytes(8, byteorder="big")
    return xxhash.xxh3_64(name_bytes + type_bytes + row_count_bytes).intdigest()

def __iter__(self) -> Iterator[T]:
r"""
Create an iterator for the data of this column. This way e.g. for-each loops can be used on it.
Expand Down
13 changes: 13 additions & 0 deletions src/safeds/data/tabular/containers/_row.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@

import sys
import functools
import operator
from collections.abc import Callable, Mapping
from typing import TYPE_CHECKING, Any

import pandas as pd
import xxhash

from safeds.data.tabular.typing import ColumnType, Schema
from safeds.exceptions import UnknownColumnNameError
Expand Down Expand Up @@ -216,6 +218,17 @@ def __getitem__(self, column_name: str) -> Any:
"""
return self.get_value(column_name)

def __hash__(self) -> int:
    """
    Compute a deterministic hash value for this row.

    The hash is built from the hash of the schema followed by a zero byte and, for every column name yielded by
    iterating over the row, the xxh3 64-bit digest of the string form of that column's value.

    Returns
    -------
    hash : int
        The xxh3 64-bit digest of the concatenated pieces, as an integer.
    """
    # 8 big-endian bytes; big-endian is also the default byte order of `int.to_bytes`.
    schema_bytes = hash(self._schema).to_bytes(8, byteorder="big")
    value_digests = [
        xxhash.xxh3_64(str(self.get_value(column_name))).intdigest().to_bytes(8, byteorder="big")
        for column_name in self
    ]
    # Join once instead of folding with `+`, which re-copies the growing buffer for every column.
    # The leading zero byte is kept so the resulting digest matches the previous layout exactly.
    payload = schema_bytes + b"\0" + b"".join(value_digests)
    return xxhash.xxh3_64(payload).intdigest()

def __iter__(self) -> Iterator[Any]:
"""
Create an iterator for the column names of this row.
Expand Down
12 changes: 12 additions & 0 deletions src/safeds/data/tabular/containers/_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import openpyxl
import pandas as pd
import seaborn as sns
import xxhash
from pandas import DataFrame
from scipy import stats

Expand Down Expand Up @@ -457,6 +458,17 @@ def __eq__(self, other: object) -> bool:
return table1.column_names == table2.column_names
return table1._schema == table2._schema and table1._data.equals(table2._data)

def __hash__(self) -> int:
    """
    Compute a deterministic hash value for this table.

    The hash covers the hash of the schema and the number of rows. The cell values are not hashed.

    Returns
    -------
    hash : int
        The xxh3 64-bit digest of both pieces, as an integer.
    """
    # Both integers are serialized as 8 big-endian bytes (big-endian is also the default of `int.to_bytes`).
    schema_bytes = hash(self._schema).to_bytes(8, byteorder="big")
    row_count_bytes = self.number_of_rows.to_bytes(8, byteorder="big")
    return xxhash.xxh3_64(schema_bytes + row_count_bytes).intdigest()

def __repr__(self) -> str:
r"""
Display the table in only one line.
Expand Down
27 changes: 27 additions & 0 deletions src/safeds/data/tabular/containers/_tagged_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import sys
from typing import TYPE_CHECKING

import xxhash

from safeds.data.tabular.containers import Column, Row, Table
from safeds.exceptions import (
ColumnIsTargetError,
Expand Down Expand Up @@ -165,6 +167,31 @@ def __init__(
self._features: Table = _data.keep_only_columns(feature_names)
self._target: Column = _data.get_column(target_name)

def __eq__(self, other: object) -> bool:
    """
    Check whether this tagged table equals another object.

    Returns
    -------
    'True' if the target column, the feature table, and the underlying table all compare equal; 'False'
    otherwise. `NotImplemented` is returned when `other` is not a `TaggedTable`.
    """
    # Identity implies equality, so the component-wise comparison can be skipped.
    if self is other:
        return True
    if not isinstance(other, TaggedTable):
        return NotImplemented
    return (
        self.target == other.target
        and self.features == other.features
        and Table.__eq__(self, other)
    )

def __hash__(self) -> int:
    """
    Compute a deterministic hash value for this tagged table.

    The hash combines, in this order, the hash of the target column, the hash of the feature table, and the
    hash that `Table.__hash__` computes for this object.

    Returns
    -------
    hash : int
        The xxh3 64-bit digest of the three combined hashes, as an integer.
    """
    component_hashes = (hash(self.target), hash(self.features), Table.__hash__(self))
    # Each component is serialized as 8 big-endian bytes (big-endian is also the default of `int.to_bytes`).
    payload = b"".join(component.to_bytes(8, byteorder="big") for component in component_hashes)
    return xxhash.xxh3_64(payload).intdigest()

def __sizeof__(self) -> int:
"""
Return the complete size of this object.
Expand Down
26 changes: 26 additions & 0 deletions src/safeds/data/tabular/containers/_time_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import xxhash

from safeds.data.image.containers import Image
from safeds.data.tabular.containers import Column, Row, Table, TaggedTable
Expand Down Expand Up @@ -194,6 +195,31 @@ def __init__(
raise UnknownColumnNameError([time_name])
self._time: Column = _data.get_column(time_name)

def __eq__(self, other: object) -> bool:
    """
    Check whether this time series equals another object.

    Returns
    -------
    'True' if the time column compares equal and `TaggedTable.__eq__` also reports equality; 'False' otherwise.
    `NotImplemented` is returned when `other` is not a `TimeSeries`.
    """
    # Identity implies equality, so the component-wise comparison can be skipped.
    if self is other:
        return True
    if not isinstance(other, TimeSeries):
        return NotImplemented
    return (
        self.time == other.time
        and TaggedTable.__eq__(self, other)
    )

def __hash__(self) -> int:
    """
    Compute a deterministic hash value for this time series.

    The hash combines the hash of the time column with the hash that `TaggedTable.__hash__` computes for this
    object.

    Returns
    -------
    hash : int
        The xxh3 64-bit digest of both combined hashes, as an integer.
    """
    # Both hashes are serialized as 8 big-endian bytes (big-endian is also the default of `int.to_bytes`).
    time_bytes = hash(self.time).to_bytes(8, byteorder="big")
    tagged_table_bytes = TaggedTable.__hash__(self).to_bytes(8, byteorder="big")
    return xxhash.xxh3_64(time_bytes + tagged_table_bytes).intdigest()

def __sizeof__(self) -> int:
"""
Return the complete size of this object.
Expand Down
13 changes: 13 additions & 0 deletions src/safeds/data/tabular/transformation/_table_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,26 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

import xxhash

if TYPE_CHECKING:
from safeds.data.tabular.containers import Table


class TableTransformer(ABC):
"""Learn a transformation for a set of columns in a `Table` and transform another `Table` with the same columns."""

def __hash__(self) -> int:
    """
    Compute a deterministic hash value for a table transformer.

    The hash covers the qualified name of the concrete class and whether the transformer is fitted. No other
    state is hashed here.

    Returns
    -------
    hash : int
        The xxh3 64-bit digest of both pieces, as an integer.
    """
    class_name_bytes = self.__class__.__qualname__.encode("utf-8")
    # A single byte marks the fitted state: 1 when fitted, 0 otherwise.
    fitted_flag = b"\x01" if self.is_fitted() else b"\x00"
    return xxhash.xxh3_64(class_name_bytes + fitted_flag).intdigest()

@abstractmethod
def fit(self, table: Table, column_names: list[str] | None) -> TableTransformer:
"""
Expand Down
6 changes: 4 additions & 2 deletions src/safeds/data/tabular/typing/_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING

import xxhash

from safeds.data.tabular.typing import Anything, Integer, Nothing, RealNumber
from safeds.data.tabular.typing._column_type import ColumnType
from safeds.exceptions import UnknownColumnNameError
Expand Down Expand Up @@ -66,7 +68,7 @@ def __init__(self, schema: dict[str, ColumnType]):

def __hash__(self) -> int:
"""
Return a hash value for the schema.
Return a deterministic hash value for the schema.
Returns
-------
Expand All @@ -81,7 +83,7 @@ def __hash__(self) -> int:
"""
column_names = self._schema.keys()
column_types = map(repr, self._schema.values())
return hash(tuple(zip(column_names, column_types, strict=True)))
return xxhash.xxh3_64(str(tuple(zip(column_names, column_types, strict=True)))).intdigest()

def __repr__(self) -> str:
"""
Expand Down
12 changes: 12 additions & 0 deletions src/safeds/ml/classical/classification/_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

import xxhash
from sklearn.metrics import accuracy_score as sk_accuracy_score

from safeds.data.tabular.containers import Table, TaggedTable
Expand All @@ -17,6 +18,17 @@
class Classifier(ABC):
"""Abstract base class for all classifiers."""

def __hash__(self) -> int:
    """
    Compute a deterministic hash value for a classifier.

    The hash covers the qualified name of the concrete class and whether the classifier is fitted. No other
    state is hashed here.

    Returns
    -------
    hash : int
        The xxh3 64-bit digest of both pieces, as an integer.
    """
    class_name_bytes = self.__class__.__qualname__.encode("utf-8")
    # A single byte marks the fitted state: 1 when fitted, 0 otherwise.
    fitted_flag = b"\x01" if self.is_fitted() else b"\x00"
    return xxhash.xxh3_64(class_name_bytes + fitted_flag).intdigest()

@abstractmethod
def fit(self, training_set: TaggedTable) -> Classifier:
"""
Expand Down
12 changes: 12 additions & 0 deletions src/safeds/ml/classical/regression/_regressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

import xxhash
from sklearn.metrics import mean_absolute_error as sk_mean_absolute_error
from sklearn.metrics import mean_squared_error as sk_mean_squared_error

Expand All @@ -16,6 +17,17 @@
class Regressor(ABC):
"""Abstract base class for all regressors."""

def __hash__(self) -> int:
    """
    Compute a deterministic hash value for a regressor.

    The hash covers the qualified name of the concrete class and whether the regressor is fitted. No other
    state is hashed here.

    Returns
    -------
    hash : int
        The xxh3 64-bit digest of both pieces, as an integer.
    """
    class_name_bytes = self.__class__.__qualname__.encode("utf-8")
    # A single byte marks the fitted state: 1 when fitted, 0 otherwise.
    fitted_flag = b"\x01" if self.is_fitted() else b"\x00"
    return xxhash.xxh3_64(class_name_bytes + fitted_flag).intdigest()

@abstractmethod
def fit(self, training_set: TaggedTable) -> Regressor:
"""
Expand Down
38 changes: 38 additions & 0 deletions tests/safeds/data/image/containers/test_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,44 @@ def test_should_raise(self, resource_path: str, device: Device) -> None:
assert (image.__eq__(other)) is NotImplemented


class TestHash:
    """Tests for `hash(image)`: same file gives the same hash, the two chosen different files do not."""

    @pytest.mark.parametrize("device", _test_devices(), ids=_test_devices_ids())
    @pytest.mark.parametrize(
        "resource_path",
        _test_images_all(),
        ids=_test_images_all_ids(),
    )
    def test_should_hash_be_equal(self, resource_path: str, device: Device) -> None:
        """Loading the same file twice on one device must produce equal hashes."""
        _skip_if_device_not_available(device)
        first_image = Image.from_file(resolve_resource_path(resource_path), device)
        second_image = Image.from_file(resolve_resource_path(resource_path), device)
        assert hash(first_image) == hash(second_image)

    @pytest.mark.parametrize("device", _test_devices(), ids=_test_devices_ids())
    def test_should_hash_not_be_equal(self, device: Device) -> None:
        """The plane image and the white square image must produce different hashes."""
        _skip_if_device_not_available(device)
        plane_image = Image.from_file(resolve_resource_path(_plane_png_path), device)
        white_square_image = Image.from_file(resolve_resource_path(_white_square_png_path), device)
        assert hash(plane_image) != hash(white_square_image)

    @pytest.mark.parametrize(
        "resource_path",
        _test_images_all(),
        ids=_test_images_all_ids(),
    )
    def test_should_hash_be_equal_different_devices(self, resource_path: str) -> None:
        """The same file loaded on the CPU and on CUDA must produce equal hashes."""
        _skip_if_device_not_available(_device_cuda)
        cpu_image = Image.from_file(resolve_resource_path(resource_path), torch.device("cpu"))
        cuda_image = Image.from_file(resolve_resource_path(resource_path), torch.device("cuda"))
        assert hash(cpu_image) == hash(cuda_image)

    def test_should_hash_not_be_equal_different_devices(self) -> None:
        """Different files loaded on different devices must still produce different hashes."""
        _skip_if_device_not_available(_device_cuda)
        cpu_plane_image = Image.from_file(resolve_resource_path(_plane_png_path), torch.device("cpu"))
        cuda_white_square_image = Image.from_file(resolve_resource_path(_white_square_png_path), torch.device("cuda"))
        assert hash(cpu_plane_image) != hash(cuda_white_square_image)


@pytest.mark.parametrize("device", _test_devices(), ids=_test_devices_ids())
class TestResize:
@pytest.mark.parametrize(
Expand Down
36 changes: 36 additions & 0 deletions tests/safeds/data/tabular/containers/_column/test_hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from typing import Any

import pytest
from safeds.data.tabular.containers import Column, Row


@pytest.mark.parametrize(
    ("column1", "column2"),
    [
        pytest.param(Column("a"), Column("a"), id="empty columns"),
        pytest.param(Column("a", [1, 2, 3]), Column("a", [1, 2, 3]), id="equal columns"),
        pytest.param(Column("a", [1, 2, 3]), Column("a", [1, 2, 4]), id="different values"),
    ],
)
def test_should_return_same_hash_for_equal_columns(column1: Column, column2: Column) -> None:
    """
    Check that the given pairs of columns hash to the same value.

    The "different values" case is expected to collide as well: the two columns differ only in one stored value.
    """
    first_hash = hash(column1)
    second_hash = hash(column2)
    assert first_hash == second_hash


@pytest.mark.parametrize(
    ("column1", "column2"),
    [
        pytest.param(Column("a"), Column("b"), id="different names"),
        pytest.param(Column("a", [1, 2, 3]), Column("a", ["1", "2", "3"]), id="different types"),
    ],
)
def test_should_return_different_hash_for_unequal_columns(column1: Column, column2: Column) -> None:
    """Check that columns with different names, or with integer versus string values, hash differently."""
    first_hash = hash(column1)
    second_hash = hash(column2)
    assert first_hash != second_hash
Loading

0 comments on commit f6a3ca7

Please sign in to comment.