audeering · hagenw · Jun 19, 2024 · Jun 19, 2024 · Jun 19, 2024 · Jun 19, 2024
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -4,7 +4,7 @@ on:
   push:
     branches: [ main ]
   pull_request:
-    branches: [ main ]
+    branches: [ main, pyarrow ]
 
 jobs:
   build:
@@ -55,7 +55,8 @@ jobs:
       run: |
         pip install "audeer==2.0.0"
         pip install "audiofile==0.4.0"
-        pip install "pandas==2.2.0"
+        pip install "numpy<2.0.0"
+        pip install "pandas==2.1.0"
         pip install "pyarrow==10.0.1"
         pip install "pyyaml==5.4.1"
       if: matrix.requirements == 'minimum'

diff --git a/audformat/core/table.py b/audformat/core/table.py
@@ -1844,7 +1844,7 @@ def _assert_table_index(
         )
 
 
-def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes:
+def _dataframe_hash(df: pd.DataFrame) -> bytes:
     """Hash a dataframe.
 
     The hash value takes into account:
@@ -1860,27 +1860,31 @@ def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes:
 
     Args:
         df: dataframe
-        max_rows: if not ``None``,
-            the maximum number of rows,
-            taken into account for hashing
 
     Returns:
         MD5 hash in bytes
 
     """
-    # Idea for implementation from
-    # https://github.com/streamlit/streamlit/issues/7086#issuecomment-1654504410
+    # Function-scope imports: only needed by this helper
+    import sys
+
+    import numpy as np
     md5 = hashlib.md5()
-    if max_rows is not None and len(df) > max_rows:  # pragma: nocover (not yet used)
-        df = df.sample(n=max_rows, random_state=0)
-        # Hash length of dataframe, as we have to track if this changes
-        md5.update(str(len(df)).encode("utf-8"))
-    try:
-        md5.update(bytes(str(pd.util.hash_pandas_object(df)), "utf-8"))
-    except TypeError:
-        # Use pickle if pandas cannot hash the object,
-        # e.g. if it contains numpy.arrays.
-        md5.update(f"{pickle.dumps(df, pickle.HIGHEST_PROTOCOL)}".encode("utf-8"))
+    for _, y in df.reset_index().items():
+        # Convert every column to a numpy array,
+        # and hash its string representation
+        if y.dtype == "Int64":
+            # Enforce consistent conversion to numpy.array
+            # for integers across different pandas versions
+            # (since pandas 2.2.x, Int64 is converted to float if it contains <NA>)
+            y = y.astype("float")
+        # numpy abbreviates arrays with more than 1000 elements
+        # to "[a b c ... x y z]" when converting them to a string,
+        # which would exclude all rows in between from the hash.
+        # Lifting the threshold leaves shorter arrays untouched,
+        # so hashes of small tables do not change
+        with np.printoptions(threshold=sys.maxsize):
+            md5.update(bytes(str(y.to_numpy()), "utf-8"))
     return md5.digest()
 
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -28,14 +28,14 @@ classifiers = [
     'Programming Language :: Python :: 3.11',
     'Topic :: Scientific/Engineering',
 ]
-requires-python = '>=3.9'  # pandas >=2.2.0
+requires-python = '>=3.9'  # pandas >=2.1.0
 dependencies = [
     'audeer >=2.0.0',
     'audiofile >=0.4.0',
     'iso-639',
     'iso3166',
     'oyaml',
-    'pandas >=2.2.0',  # hash values, see https://github.com/pandas-dev/pandas/issues/58999
+    'pandas >=2.1.0',  # for pyarrow -> timedelta conversion
     'pyarrow >=10.0.1',  # for pyarrow strings in pandas
     'pyyaml >=5.4.1',
 ]

diff --git a/tests/test_table.py b/tests/test_table.py
@@ -1312,15 +1312,15 @@ def get_md5(path: str) -> str:
     [
         (
             "files",
-            "9caa6722e65a04ddbce1cda2238c9126",
+            "a66a22ee4158e0e5100f1d797155ad81",
         ),
         (
             "segments",
-            "37c9d9dc4f937a6e97ec72a080055e49",
+            "f69eb4a5d19da71e5da00a9b13beb3db",
         ),
         (
             "misc",
-            "3488c007d45b19e04e8fdbf000f0f04d",
+            "331f79758b195cb9b7d0e8889e830eb2",
         ),
     ],
 )