Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion src/datachain/lib/udf.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,9 +160,15 @@ def hash(self) -> str:
"""
Creates SHA hash of this UDF function. It takes into account function,
inputs and outputs.

For function-based UDFs, hashes self._func.
For class-based UDFs, hashes the process method.
"""
# Hash user code: either _func (function-based) or process method (class-based)
func_to_hash = self._func if self._func else self.process
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion (code-quality): Replace if-expression with or (or-if-exp-identity)

Suggested change
func_to_hash = self._func if self._func else self.process
func_to_hash = self._func or self.process


Explanation: Here we find ourselves setting a value if it evaluates to True, and otherwise
using a default.

The 'After' case is a bit easier to read and avoids the duplication of
self._func.

It works because the left-hand side is evaluated first. If it evaluates to
true then func_to_hash will be set to this and the right-hand side will not be
evaluated. If it evaluates to false the right-hand side will be evaluated and
func_to_hash will be set to self.process.


parts = [
hash_callable(self._func),
hash_callable(func_to_hash),
self.params.hash() if self.params else "",
self.output.hash(),
]
Expand Down
60 changes: 54 additions & 6 deletions tests/unit/test_datachain_hash.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
from unittest.mock import patch

import pandas as pd
import pytest
from pydantic import BaseModel

import datachain as dc
from datachain import func
from datachain.lib.dc import C

# Shared sample table (5 people, names + ages) used as input by the
# read_csv / read_json / read_pandas / read_parquet hash tests below.
DF_DATA = {
    "first_name": ["Alice", "Bob", "Charlie", "David", "Eva"],
    "age": [25, 30, 35, 40, 45],
}


class Person(BaseModel):
name: str
Expand Down Expand Up @@ -55,13 +61,55 @@ def mock_get_listing():


def test_read_values():
    """
    Hash of the chain started with read_values is currently inconsistent
    (a randomly named dataset is created in the process, so the value differs
    between runs). Goal of this test is just to check it doesn't break.
    """
    # Only assert that hashing completes and yields something; the exact
    # value is intentionally not pinned (see docstring).
    assert dc.read_values(num=[1, 2, 3]).hash() is not None


def test_read_csv(test_session, tmp_dir):
    """
    Hash of the chain started with read_csv is currently inconsistent.
    Goal of this test is just to check it doesn't break.
    """
    csv_path = tmp_dir / "test.csv"
    # Materialize the sample frame to disk so read_csv has a real file URI.
    df = pd.DataFrame(DF_DATA)
    df.to_csv(csv_path, index=False)
    chain = dc.read_csv(csv_path.as_uri(), session=test_session)
    assert chain.hash() is not None


@pytest.mark.filterwarnings("ignore::pydantic.warnings.PydanticDeprecatedSince20")
def test_read_json(test_session, tmp_dir):
    """
    Hash of the chain started with read_json is currently inconsistent.
    Goal of this test is just to check it doesn't break.
    """
    jsonl_path = tmp_dir / "test.jsonl"
    # Round-trip the sample frame through JSONL so read_json has real input.
    source = dc.read_pandas(pd.DataFrame(DF_DATA), session=test_session)
    source.to_jsonl(jsonl_path)
    result = dc.read_json(
        jsonl_path.as_uri(), format="jsonl", session=test_session
    ).hash()
    assert result is not None
assert dc.read_values(num=[1, 2, 3]).hash() == ""


def test_read_pandas(test_session):
    """
    Hash of the chain started with read_pandas is currently inconsistent.
    Goal of this test is just to check it doesn't break.
    """
    # The tmp_dir fixture was removed from the signature: this test never
    # touches the filesystem, and pytest injects only the fixtures declared.
    df = pd.DataFrame(DF_DATA)
    assert dc.read_pandas(df, session=test_session).hash() is not None


def test_read_parquet(test_session, tmp_dir):
    """
    Hash of the chain started with read_parquet is currently inconsistent.
    Goal of this test is just to check it doesn't break.
    """
    parquet_path = tmp_dir / "test.parquet"
    # Write the sample frame out as parquet so read_parquet has a real file URI.
    frame = pd.DataFrame(DF_DATA)
    dc.read_pandas(frame, session=test_session).to_parquet(parquet_path)
    chain = dc.read_parquet(parquet_path.as_uri(), session=test_session)
    assert chain.hash() is not None


def test_read_storage(mock_get_listing):
Expand Down
28 changes: 28 additions & 0 deletions tests/unit/test_query_steps_hash.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,22 @@ def custom_feature_gen(m_fr):
)


# Class-based UDFs for testing hash calculation.
# NOTE(review): the expected hashes hardcoded in test_udf_mapper_hash are
# presumably derived from the source of process() — confirm before editing;
# if so, any change inside the method body (even comments) invalidates them.
class DoubleMapper(Mapper):
    """Class-based Mapper that overrides process(); doubles its input."""

    def process(self, x):
        return x * 2


# NOTE(review): the expected hashes hardcoded in test_udf_generator_hash are
# presumably derived from the source of process() — confirm before editing;
# if so, any change inside the method body (even comments) invalidates them.
class TripleGenerator(Generator):
    """Class-based Generator that overrides process(); yields 3x and 3x+1."""

    def process(self, x):
        yield x * 3
        yield x * 3 + 1


@pytest.fixture
def numbers_dataset(test_session):
"""
Expand Down Expand Up @@ -394,6 +410,12 @@ def test_subtract_hash(test_session, numbers_dataset, on, _hash):
{"x": CustomFeature},
"b4edceaa18ed731085e1c433a6d21deabec8d92dfc338fb1d709ed7951977fc5",
),
(
DoubleMapper(),
["x"],
{"double": int},
"7994436106fef0486b04078b02ee437be3aa73ade2d139fb8c020e2199515e26",
),
],
)
def test_udf_mapper_hash(
Expand Down Expand Up @@ -428,6 +450,12 @@ def test_udf_mapper_hash(
{"x": CustomFeature},
"7ff702d242612cbb83cbd1777aa79d2792fb2a341db5ea406cd9fd3f42543b9c",
),
(
TripleGenerator(),
["x"],
{"triple": int},
"02b4c6bf98ffa011b7c62f3374f219f21796ece5b001d99e4c2f69edf0a94f4a",
),
],
)
def test_udf_generator_hash(
Expand Down