Merged
58 commits
24a82a5
added hash functions
ilongin Sep 8, 2025
eedd2b8
added hash functions to all steps, not implemented for now
ilongin Sep 8, 2025
1c8dcce
using custom hash method
ilongin Sep 8, 2025
b051b9a
added tests for select and select except
ilongin Sep 8, 2025
72dc833
fixing method name
Sep 9, 2025
8cc6582
added hash for filter
Sep 10, 2025
172e677
added one filter test case
ilongin Sep 10, 2025
781e935
Merge branch 'main' into ilongin/1319-datachain-hash
ilongin Sep 10, 2025
dfcf0a9
added hash for SQLMutate
ilongin Sep 10, 2025
7862515
added order by hash and tests
ilongin Sep 10, 2025
f17ea92
added sqllimit hash
ilongin Sep 10, 2025
46051be
added sqloffset hash
ilongin Sep 10, 2025
7032af4
added count hash
ilongin Sep 10, 2025
01cbd9c
added distinct hash
ilongin Sep 10, 2025
26e64a3
added union hash logic
ilongin Sep 10, 2025
579ab94
added test for sqlunion hash
ilongin Sep 10, 2025
119316c
added hash for join
ilongin Sep 10, 2025
e60ff36
added hash for group by
ilongin Sep 11, 2025
590acb1
added hash subtract
ilongin Sep 11, 2025
214eb6f
Merge branch 'main' into ilongin/1319-datachain-hash
ilongin Sep 11, 2025
f8e2f40
added comments
ilongin Sep 11, 2025
c4a493d
added hash for udf
Sep 14, 2025
caba039
finish adding udf hash tests
Sep 14, 2025
5c26a8c
created hash utils
Sep 14, 2025
1d6d64c
added hash utils test file
Sep 15, 2025
94d7258
removing prints
Sep 15, 2025
279dc06
added hash callable test
Sep 15, 2025
93b4f7c
updated query step hash and added tests
Sep 15, 2025
bf03730
added basic datachain hash tests
Sep 15, 2025
46abe90
added new test
ilongin Sep 16, 2025
3e260b3
added more steps
ilongin Sep 16, 2025
d75b483
added diff test which is currently inconsistent
ilongin Sep 16, 2025
2d83c18
skipping tests
ilongin Sep 16, 2025
7cbef9c
renamed var
ilongin Sep 16, 2025
0d53b4a
fixing tests
ilongin Sep 17, 2025
125cbd1
fixing diff
ilongin Sep 17, 2025
b36fd66
fixing column element
ilongin Sep 17, 2025
b5cfaea
updated naming
ilongin Sep 17, 2025
bae876f
renamned hash methods
ilongin Sep 17, 2025
041e8e8
refactoring
ilongin Sep 17, 2025
99ca59a
Merge branch 'main' into ilongin/1319-datachain-hash
ilongin Sep 17, 2025
3def3bd
refactoring and more test cases
ilongin Sep 19, 2025
1cb1216
changed to Union
ilongin Sep 22, 2025
6c631e5
Merge branch 'main' into ilongin/1319-datachain-hash
ilongin Sep 23, 2025
4d7fbf0
added func rand in hash tests
ilongin Sep 23, 2025
d188adf
using ensure sequence
ilongin Sep 23, 2025
fd1099d
added test with multiple chained diffs
ilongin Sep 23, 2025
64a9988
fixing hash test
ilongin Sep 23, 2025
32c987c
fixing hash callable
ilongin Sep 23, 2025
17e5e19
debugging
ilongin Sep 23, 2025
718c515
fixing hash functions
ilongin Sep 23, 2025
731636b
fixing tests to work for saas
ilongin Sep 23, 2025
6b77576
added correct window function
ilongin Sep 23, 2025
43670d1
added more tests
ilongin Sep 23, 2025
9af27b4
Merge branch 'main' into ilongin/1319-datachain-hash
ilongin Sep 24, 2025
29ba477
fixing calling right hash method
ilongin Sep 24, 2025
e6576d2
added prints
ilongin Sep 24, 2025
d9458e1
returned to random hash for now
ilongin Sep 24, 2025
20 changes: 7 additions & 13 deletions src/datachain/diff/__init__.py
@@ -1,5 +1,3 @@
import random
import string
from collections.abc import Sequence
from enum import Enum
from typing import TYPE_CHECKING, Optional, Union
@@ -11,16 +9,12 @@
if TYPE_CHECKING:
from datachain.lib.dc import DataChain


C = Column


def get_status_col_name() -> str:
"""Returns new unique status col name"""
return "diff_" + "".join(
random.choice(string.ascii_letters) # noqa: S311
for _ in range(10)
)
STATUS_COL_NAME = "diff_7aeed3aa17ba4d50b8d1c368c76e16a6"
LEFT_DIFF_COL_NAME = "diff_95f95344064a4b819c8625cd1a5cfc2b"
RIGHT_DIFF_COL_NAME = "diff_5808838a49b54849aa461d7387376d34"
@ilongin ilongin (Contributor Author) Sep 17, 2025

This was needed to make the hash function consistent. Before, we were generating random names for these temporary columns, which are added during the diff process and removed at the end, but that caused the .hash() of a chain containing .diff() to be inconsistent, since column names added with mutate are part of the hash calculation.

The workaround is not to generate a fully random column name each time, but to set a constant that is already random enough not to collide with users' columns; since it is constant, the hash is consistent as well.

Contributor

I am sorry, but I can't resist putting it here 😅
[image]
Although the question is: how will it work with multiple diff operations applied to the same chain? 🤔 Are there any cases where they might overlap?

Contributor Author

I just didn't have any other ideas for how to fix this, but it shouldn't affect our diff logic. Good point about multiple diffs in the single chain - I've added the test for that and it works fine.

Regarding randomization: those random names were used for columns so they wouldn't collide with user columns, and when you think about it, it's not very different from just setting a large "random" UUID. The chance of collision is the same, ~0... but it does look strange in code, I do agree with that :D

@dreadatour dreadatour (Contributor) Sep 23, 2025

Regarding randomization, those randoms were used for column names to not collide with user columns and when you think about it, it's not very different than just setting large "random" uuid.

Yes, I know 😅 This image is just a perfect fit for this case, I could not resist 🤣

Good point about multiple diffs in the single chain - I've added the test for that and it works fine.

Awesome! Thank you! 🙏
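The consistency issue this thread settles can be illustrated with a toy model. Everything below (the hashing scheme, the helper names) is illustrative only, not the actual datachain internals; only the constant column name itself comes from the diff above:

```python
# Illustrative sketch: why constant helper-column names keep a chain hash
# stable while per-run random names (the old approach) do not.
import hashlib
import random
import string

# Constant name taken from the diff above.
STATUS_COL_NAME = "diff_7aeed3aa17ba4d50b8d1c368c76e16a6"

def random_col_name() -> str:
    # Old approach: a fresh random name on every call.
    return "diff_" + "".join(random.choice(string.ascii_letters) for _ in range(10))

def chain_hash(step_columns: list[str]) -> str:
    # Stand-in for a chain hash: column names added by steps (e.g. mutate)
    # feed into the digest.
    return hashlib.sha256("|".join(step_columns).encode()).hexdigest()

# Two identical chains built with random helper names hash differently:
unstable = chain_hash(["id", random_col_name()]) != chain_hash(["id", random_col_name()])
# With a constant helper name the hash is reproducible:
stable = chain_hash(["id", STATUS_COL_NAME]) == chain_hash(["id", STATUS_COL_NAME])
```

Collision risk is unchanged: one fixed 128-bit-random suffix is no more likely to clash with a user column than a fresh random suffix per run.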



class CompareStatus(str, Enum):
@@ -101,9 +95,9 @@ def _to_list(obj: Optional[Union[str, Sequence[str]]]) -> Optional[list[str]]:
compare = right_compare = [c for c in cols if c in right_cols and c not in on] # type: ignore[misc]

# get diff column names
diff_col = status_col or get_status_col_name()
ldiff_col = get_status_col_name()
rdiff_col = get_status_col_name()
diff_col = status_col or STATUS_COL_NAME
ldiff_col = LEFT_DIFF_COL_NAME
rdiff_col = RIGHT_DIFF_COL_NAME

# adding helper diff columns, which will be removed after
left = left.mutate(**{ldiff_col: 1})
@@ -227,7 +221,7 @@ def compare_and_split(
)
```
"""
status_col = get_status_col_name()
status_col = STATUS_COL_NAME

res = _compare(
left,
147 changes: 147 additions & 0 deletions src/datachain/hash_utils.py
@@ -0,0 +1,147 @@
import hashlib
import inspect
import json
import textwrap
from collections.abc import Sequence
from typing import TypeVar, Union

from sqlalchemy.sql.elements import (
BinaryExpression,
BindParameter,
ColumnElement,
Label,
Over,
UnaryExpression,
)
from sqlalchemy.sql.functions import Function

T = TypeVar("T", bound=ColumnElement)
ColumnLike = Union[str, T]


def serialize_column_element(expr: Union[str, ColumnElement]) -> dict: # noqa: PLR0911
"""
Recursively serialize a SQLAlchemy ColumnElement into a deterministic structure.
Contributor

can we use str(query) or something like this? seems too complicated atm

Contributor Author

It's not possible because:

  1. It depends on the dialect. By default SQLAlchemy uses the default dialect, but if the table from which we generate the expression is bound to an engine, it will create a different string representation for different DBs.
  2. Bind parameter names are unstable - there is no guarantee that col == 5 will always produce "col = :param_1"; sometimes it can produce "col = :param_2".
  3. SQLAlchemy version changes - the stringification format is not a stable API. A minor upgrade could produce different SQL text for the same expression → inconsistent hashes.

Contributor

It depends on a dialect. By default SQLAlchemy uses default dialect, but if table from which we generate expression is bound to engine it will create different string representation for different DBs

do we care about this for hash?

SQLAlchemy version changes - The stringification format is not a stable API. A minor upgrade could produce different SQL text for the same expression → inconsistent hashes.

same - is it important? it seems not tbh ...

Bind parameter names are unstable - there is no guarantee that col == 5 will always produce "col = :param_1" .. sometimes it can produce "col = :param_2"

what are the cases?

Contributor Author

I think we should care about all 3 of those items.

About the unstable param names - I think the param names depend on ordering, which should not affect the hash (e.g. the order of elements in an and_ clause, etc.). But in addition, constants are not included in the representation, so these two expressions produce the same hash even though they are clearly different:

>>> from datachain import C
>>> expr = C("age") == 20
>>> str(expr)
'age = :age_1'
>>> expr = C("age") == 30
>>> str(expr)
'age = :age_1'

@shcheklein shcheklein (Contributor) Sep 26, 2025

. but in addition, constants are not included in representation

can we / don't we take all values separately in the Step? it should be simple I guess?

I don't think we should care about order - again, we are complicating this ahead of time I feel

I think we should care about all 3 of those items.

could you elaborate please?

@ilongin ilongin (Contributor Author) Sep 26, 2025

can we / don't we take all values separately in the Step? it should be simple I guess?

The input to .filter() (and many other DataChain methods) is a list of sqlalchemy.ColumnElement, which can be a column, binary expression, unary expression, over, or function (all the classes handled in the questionable serialize_column_element function, btw).
So in this example: chain.filter(C("numbers") > 3, C("numbers") < 10) we have 2 arguments, both binary expressions. If we just did str(e) for each expression, we would not get the constant values in that string -> the hash is incorrect. To get the constant values we need to walk the binary expression down the tree, and that is exactly what serialize_column_element does, but it also returns a correct hashable representation (dicts), which is then hashed in another method.
TL;DR: it's impossible to achieve a correct / consistent hash just by calling str() on the inputs.

BTW str(dc.func.and_(C("numbers") > 3, C("file.path").glob("*.jpg"))) == and().

I think we should care about all 3 of those items.

could you elaborate please?

  1. If we don't have to, I don't think hash calculation should be dialect dependent. The more generic the hash, the better. If the hash is coupled to a dialect, two identical chains will produce different hashes locally and on Studio, and we always want to avoid this.
  2. Explained above why str(expression) is not enough - it's not consistent and won't work correctly for hash calculation.
  3. str(expr) is just a string representation of the query class and is prone to changes - I don't think it's meant to be used for important parts of the code, let alone hash calculation, but more for printing queries in logs etc.

@shcheklein shcheklein (Contributor) Sep 27, 2025

  1. Can we use something like str(expr.compile(..., compile_kwargs={"literal_binds": True}))?
    1 and 3. Still not clear why it should be that stable. It is made to restart a job right away, but not to make it portable. If we need it - we can add it later.

I would say - Let's simplify unless we really have a blocker for a basic requirement.

Contributor

Thanks for the detailed response!

Contributor Author

My very first thought was to use compile with literal_binds, but I quickly decided not to. Besides the fact that the current implementation is straightforward and not complex, there are multiple reasons not to go with compile, each of them a deal breaker IMO:

  1. The already mentioned consistency across compilers - maybe it's not a direct requirement now, but it's still nice to have consistent hashing across every backend. What matters now is that otherwise we wouldn't be able to write tests that expect a specific hash value, as Studio (PG) and local (SQLite) tests would end up with different values, which means we couldn't test for regressions - and I would also need to refactor the current tests.
  2. Alias naming - SQLAlchemy sometimes automatically adds aliases to compiled statements, and those are really not stable; I'm not sure what they depend on, but I've read it can be due to compiler internal state. There might also be issues with whitespace, parentheses, changing order of elements in AND / OR functions, etc.
  3. I've read in the past that literal_binds are not completely stable - they do not work in some cases, though this would need further investigation. In general, that method doesn't guarantee stable results, and that is an issue for hashing.
  4. It's 30-40% slower than plain Python object traversal (this might not be that important, though).

"""

# Binary operations: col > 5, col1 + col2, etc.
if isinstance(expr, BinaryExpression):
op = (
expr.operator.__name__
if hasattr(expr.operator, "__name__")
else str(expr.operator)
)
return {
"type": "binary",
"op": op,
"left": serialize_column_element(expr.left),
"right": serialize_column_element(expr.right),
}

# Unary operations: -col, NOT col, etc.
if isinstance(expr, UnaryExpression):
op = (
expr.operator.__name__
if expr.operator is not None and hasattr(expr.operator, "__name__")
else str(expr.operator)
)

return {
"type": "unary",
"op": op,
"element": serialize_column_element(expr.element), # type: ignore[arg-type]
}

# Function calls: func.lower(col), func.count(col), etc.
if isinstance(expr, Function):
return {
"type": "function",
"name": expr.name,
"clauses": [serialize_column_element(c) for c in expr.clauses],
}

# Window functions: func.row_number().over(partition_by=..., order_by=...)
if isinstance(expr, Over):
return {
"type": "window",
"function": serialize_column_element(expr.element),
"partition_by": [
serialize_column_element(p) for p in getattr(expr, "partition_by", [])
],
"order_by": [
serialize_column_element(o) for o in getattr(expr, "order_by", [])
],
}

# Labeled expressions: col.label("alias")
if isinstance(expr, Label):
return {
"type": "label",
"name": expr.name,
"element": serialize_column_element(expr.element),
}

# Bound values (constants)
if isinstance(expr, BindParameter):
return {"type": "bind", "value": expr.value}

# Plain columns
if hasattr(expr, "name"):
return {"type": "column", "name": expr.name}

# Fallback: stringify unknown nodes
return {"type": "other", "repr": str(expr)}


def hash_column_elements(columns: Sequence[ColumnLike]) -> str:
"""
Hash a list of ColumnElements deterministically, dialect agnostic.
Only accepts ordered iterables (like list or tuple).
"""
serialized = [serialize_column_element(c) for c in columns]
json_str = json.dumps(serialized, sort_keys=True) # stable JSON
return hashlib.sha256(json_str.encode("utf-8")).hexdigest()
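The determinism of this last step rests on json.dumps(sort_keys=True). A minimal standalone sketch (plain dicts stand in for serialized ColumnElements):

```python
# Sketch: sorted-key JSON makes the digest independent of dict key order.
import hashlib
import json

def hash_serialized(items: list[dict]) -> str:
    # Same scheme as hash_column_elements: stable JSON, then SHA-256.
    json_str = json.dumps(items, sort_keys=True)
    return hashlib.sha256(json_str.encode("utf-8")).hexdigest()

# Identical content, different key insertion order -> identical hash.
a = hash_serialized([{"type": "column", "name": "age"}])
b = hash_serialized([{"name": "age", "type": "column"}])
# List order still matters, as it should for ordered column sequences.
c = hash_serialized([{"type": "column", "name": "age"}, {"type": "bind", "value": 5}])
d = hash_serialized([{"type": "bind", "value": 5}, {"type": "column", "name": "age"}])
```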


def hash_callable(func):
"""
Calculate a hash from a callable.
Rules:
- Named functions (def) → use source code for stable, cross-version hashing
- Lambdas → use bytecode (deterministic in same Python runtime)
"""
if not callable(func):
raise TypeError("Expected a callable")

# Determine if it is a lambda
is_lambda = func.__name__ == "<lambda>"

if not is_lambda:
# Try to get exact source of named function
try:
lines, _ = inspect.getsourcelines(func)
Contributor

why do we extract source code here? do we include it in the hash? (this is not what is expected)

Contributor Author

Yes, we use the UDF function source to calculate the hash. I know we spoke about how the user should be able to change UDF function code and continue from that UDF, and this doesn't block that.

These are implementation details / my plans for UDF checkpoints:

  1. At the beginning of a UDF, create a special partial checkpoint of the chain that enters the UDF (without the UDF itself). All partial results will be connected to this checkpoint, and it stays active until the UDF is 100% successful.
  2. When the UDF is 100% successful / done, we create another normal / non-partial checkpoint that now includes the UDF itself in the calculation (here we use the function code for the hash). This lets us continue from this UDF if something breaks in the future (avoiding UDF re-calculation), and also lets us recalculate the UDF if the user decides to change the UDF code later, even after the UDF has completed successfully (maybe the user realized there is still something missing in that UDF). I think this is important, as we need to give the user the ability to re-run that UDF from the beginning without re-running the whole job, for example.

Let me know if you see any issues with the approach.

Contributor

yep, my main concern is complexity, while it doesn't fully solve the issue - anything outside the function's code can change in a way that changes the UDF - e.g. it uses some helper that changed, or some package? We won't be able to detect this - so unless I'm missing something, I would keep it simple, skip recalculation no matter what, and have some mechanism to trigger a full recalculation (or a partial one on the UDF side - e.g. by introducing some version of the function as a param).

Contributor Author

True, this hash will only catch changes to the function code itself, not changes to helper methods or libraries. But my feeling was that it's good enough for users (we can clearly communicate this to them), and it's definitely better than only giving the option to recalculate the whole job... idk.

BTW, it's not that hard to make helper-method changes affect the hash as well; the only problematic part is external libs (for them we could only include name + type in the hash, so the hash would change if the user stops using one or adds another, etc.)

Contributor

it's def better than to only give option to recalculate the whole job.

100%

this hash will only catch function code changes itself, not the helper method code changes or lib changes.

the worst thing here is that it will be unpredictable - sometimes it works, sometimes it doesn't. I don't think we want that level of magic

let's please simplify this, at least for now

payload = textwrap.dedent("".join(lines)).strip()
except (OSError, TypeError):
# Fallback: bytecode if source not available
payload = func.__code__.co_code
else:
# For lambdas, fall back directly to bytecode
payload = func.__code__.co_code

# Normalize annotations
annotations = {
k: getattr(v, "__name__", str(v)) for k, v in func.__annotations__.items()
}

# Extras to distinguish functions with same code but different metadata
extras = {
"name": func.__name__,
"defaults": func.__defaults__,
"annotations": annotations,
}

# Compute SHA256
h = hashlib.sha256()
h.update(str(payload).encode() if isinstance(payload, str) else payload)
h.update(str(extras).encode())
return h.hexdigest()
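A condensed sketch of this source-vs-bytecode strategy, showing that it is deterministic for both named functions and lambdas within one runtime. This is a simplification of the hash_callable shown above (the real one also mixes in annotations):

```python
# Sketch: named functions hash by source, lambdas fall back to bytecode.
import hashlib
import inspect
import textwrap

def func_hash(fn) -> str:
    if fn.__name__ != "<lambda>":
        try:
            # Source-based payload: stable across runs of the same file.
            lines, _ = inspect.getsourcelines(fn)
            payload = textwrap.dedent("".join(lines)).strip().encode()
        except (OSError, TypeError):
            # Source unavailable (e.g. defined via exec) -> bytecode.
            payload = fn.__code__.co_code
    else:
        # Lambdas: bytecode, deterministic within the same interpreter.
        payload = fn.__code__.co_code
    h = hashlib.sha256()
    h.update(payload)
    # Distinguish same-bodied callables with different metadata.
    h.update(str((fn.__name__, fn.__defaults__)).encode())
    return h.hexdigest()

def add_one(x):
    return x + 1

named_stable = func_hash(add_one) == func_hash(add_one)
lambda_stable = func_hash(lambda x: x + 1) == func_hash(lambda x: x + 1)
```

As the thread notes, this only tracks the callable's own code: edits to helpers it calls, or to imported libraries, do not change the digest.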
10 changes: 9 additions & 1 deletion src/datachain/lib/dc/datachain.py
@@ -209,6 +209,14 @@ def __repr__(self) -> str:
self.print_schema(file=file)
return file.getvalue()

def hash(self) -> str:
"""
Calculates a SHA hash of this chain. Hash calculation is fast and
consistent. It takes into account all the steps added to the chain and
their inputs. The order of the steps is significant.
"""
return self._query.hash()
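The order sensitivity noted in the docstring can be sketched with a simple digest-chaining scheme. This is illustrative only, assuming hypothetical per-step hashes; it is not the actual DatasetQuery.hash implementation:

```python
# Sketch: a chain hash built from per-step digests is order-sensitive.
import hashlib

def chain_hash(step_hashes: list[str]) -> str:
    # Feed raw digest bytes of each step, in order, into one SHA-256.
    h = hashlib.sha256()
    for s in step_hashes:
        h.update(bytes.fromhex(s))
    return h.hexdigest()

# Hypothetical step digests for a filter and a limit step.
filter_h = hashlib.sha256(b"filter: num > 1").hexdigest()
limit_h = hashlib.sha256(b"limit: 10").hexdigest()

forward = chain_hash([filter_h, limit_h])
reversed_ = chain_hash([limit_h, filter_h])
```

Swapping the two steps yields a different chain hash, so `.filter(...).limit(10)` and `.limit(10).filter(...)` would be treated as different chains.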

def _as_delta(
self,
on: Optional[Union[str, Sequence[str]]] = None,
@@ -682,7 +690,7 @@ def save( # type: ignore[override]

if job_id := os.getenv("DATACHAIN_JOB_ID"):
catalog.metastore.create_checkpoint(
job_id, # type: ignore[arg-type]
job_id,
_hash=hashlib.sha256( # TODO this will be replaced with self.hash()
str(uuid4()).encode()
).hexdigest(),
7 changes: 7 additions & 0 deletions src/datachain/lib/signal_schema.py
@@ -1,4 +1,6 @@
import copy
import hashlib
import json
import warnings
from collections.abc import Iterator, Sequence
from dataclasses import dataclass
@@ -257,6 +259,11 @@ def serialize(self) -> dict[str, Any]:
signals["_custom_types"] = custom_types
return signals

def hash(self) -> str:
"""Create SHA hash of this schema"""
json_str = json.dumps(self.serialize(), sort_keys=True, separators=(",", ":"))
Contributor

how often / when do we call it?

it is a very heavy operation AFAIU ...

what if the implementation becomes unstable (e.g. some model names are different each time, etc.) - will we catch it with tests? how can we guarantee this?

Contributor Author

We call it for each UDF in the chain when we calculate the whole chain hash. The chain hash is calculated:

  1. On .save()
  2. In each UDF itself

IMO we should not worry about this being a bottleneck. I will test with some big schema to be 100% sure, but since this only converts columns and their types to a string (it doesn't do any IO or similar), I would guess it's not even on the same page as UDF calculation or applying all the steps in the chain.

Regarding instability, can you give an example of something that could happen? I'm trying to come up with a scenario but don't have many ideas... SignalSchema.hash() depends on schema serialization, which should always be consistent; otherwise we have a big issue in our system regardless of checkpoints.

Contributor

I don't tbh ... I remember we have things like random names for generated DataModels in some places ... SignalSchema overall and its serialization are quite complex - I think it is worth reviewing it at a high level

Contributor

In each UDF itself

Once? or per each input?

Contributor Author

For each UDF we calculate it twice, for params and outputs - both are instances of SignalSchema in the UDF codebase.

return hashlib.sha256(json_str.encode("utf-8")).hexdigest()

@staticmethod
def _split_subtypes(type_name: str) -> list[str]:
"""This splits a list of subtypes, including proper square bracket handling."""
20 changes: 20 additions & 0 deletions src/datachain/lib/udf.py
@@ -1,3 +1,4 @@
import hashlib
import sys
import traceback
from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
@@ -12,6 +13,7 @@
from datachain.asyn import AsyncMapper
from datachain.cache import temporary_cache
from datachain.dataset import RowDict
from datachain.hash_utils import hash_callable
from datachain.lib.convert.flatten import flatten
from datachain.lib.file import DataModel, File
from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
@@ -61,6 +63,9 @@ class UDFAdapter:
batch_size: Optional[int] = None
batch: int = 1

def hash(self) -> str:
return self.inner.hash()

def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
if use_partitioning:
return Partition()
@@ -151,6 +156,21 @@ def __init__(self):
self.output = None
self._func = None

def hash(self) -> str:
"""
Creates a SHA hash of this UDF function. It takes into account the
function, its inputs, and its outputs.
"""
parts = [
hash_callable(self._func),
self.params.hash() if self.params else "",
self.output.hash(),
]

return hashlib.sha256(
b"".join([bytes.fromhex(part) for part in parts])
).hexdigest()
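The digest combination above can be sketched in isolation. The `combine` helper below is hypothetical, but it mirrors the `bytes.fromhex` join shown in the diff, where a missing params hash contributes an empty string (and thus empty bytes):

```python
# Sketch: combine per-component hex digests (function, params, output)
# into one digest by concatenating their raw bytes and hashing again.
import hashlib

def combine(parts: list[str]) -> str:
    # bytes.fromhex("") == b"", so an absent component ("" for params=None)
    # simply contributes nothing to the outer digest.
    return hashlib.sha256(b"".join(bytes.fromhex(p) for p in parts)).hexdigest()

# Hypothetical component digests.
func_h = hashlib.sha256(b"func-source").hexdigest()
params_h = hashlib.sha256(b"params-schema").hexdigest()
output_h = hashlib.sha256(b"output-schema").hexdigest()

with_params = combine([func_h, params_h, output_h])
without_params = combine([func_h, "", output_h])  # params is None -> ""
```

Presence or absence of the params schema changes the UDF hash, and so does any change in a component digest.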

def process(self, *args, **kwargs):
"""Processing function that needs to be defined by user"""
if not self._func: