From cbeca0ffd91c056017638f1eefa238d3496e0cf2 Mon Sep 17 00:00:00 2001
From: Hagen Wierstorf <hwierstorf@audeering.com>
Date: Wed, 19 Jun 2024 14:02:30 +0200
Subject: [PATCH 01/10] Use numpy representation for hashing

---
 audformat/core/table.py | 15 +++++----------
 tests/test_table.py     |  6 +++---
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/audformat/core/table.py b/audformat/core/table.py
index d3732660..1f059d1e 100644
--- a/audformat/core/table.py
+++ b/audformat/core/table.py
@@ -1871,16 +1871,11 @@ def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes:
     # Idea for implementation from
     # https://github.com/streamlit/streamlit/issues/7086#issuecomment-1654504410
     md5 = hashlib.md5()
-    if max_rows is not None and len(df) > max_rows:  # pragma: nocover (not yet used)
-        df = df.sample(n=max_rows, random_state=0)
-        # Hash length of dataframe, as we have to track if this changes
-        md5.update(str(len(df)).encode("utf-8"))
-    try:
-        md5.update(bytes(str(pd.util.hash_pandas_object(df)), "utf-8"))
-    except TypeError:
-        # Use pickle if pandas cannot hash the object,
-        # e.g. if it contains numpy.arrays.
-        md5.update(f"{pickle.dumps(df, pickle.HIGHEST_PROTOCOL)}".encode("utf-8"))
+    df = df.copy().reset_index()
+    for column in df.columns:
+        # Convert every column to a numpy array,
+        # and hash its string representation
+        md5.update(bytes(str(df[column].to_numpy()), "utf-8"))
     return md5.digest()
 
 
diff --git a/tests/test_table.py b/tests/test_table.py
index 5900d7f3..421f831c 100644
--- a/tests/test_table.py
+++ b/tests/test_table.py
@@ -1312,15 +1312,15 @@ def get_md5(path: str) -> str:
     [
         (
             "files",
-            "9caa6722e65a04ddbce1cda2238c9126",
+            "a66a22ee4158e0e5100f1d797155ad81",
         ),
         (
             "segments",
-            "37c9d9dc4f937a6e97ec72a080055e49",
+            "f69eb4a5d19da71e5da00a9b13beb3db",
         ),
         (
             "misc",
-            "3488c007d45b19e04e8fdbf000f0f04d",
+            "d3bfb0271878be96c42d523ee4c491da",
         ),
     ],
 )

From a4ae33f7e988e80eff1f380bf53521707ec5d45b Mon Sep 17 00:00:00 2001
From: Hagen Wierstorf <hwierstorf@audeering.com>
Date: Wed, 19 Jun 2024 14:14:53 +0200
Subject: [PATCH 02/10] Enable tests and require pandas>=1.4.1

---
 .github/workflows/test.yml | 4 ++--
 pyproject.toml             | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 210a685f..9225c325 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -4,7 +4,7 @@ on:
   push:
     branches: [ main ]
   pull_request:
-    branches: [ main ]
+    branches: [ main, pyarrow ]
 
 jobs:
   build:
@@ -55,7 +55,7 @@ jobs:
       run: |
         pip install "audeer==2.0.0"
         pip install "audiofile==0.4.0"
-        pip install "pandas==2.2.0"
+        pip install "pandas==1.4.1"
         pip install "pyarrow==10.0.1"
         pip install "pyyaml==5.4.1"
       if: matrix.requirements == 'minimum'
diff --git a/pyproject.toml b/pyproject.toml
index b0f45140..0c6a4297 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,7 +35,7 @@ dependencies = [
     'iso-639',
     'iso3166',
     'oyaml',
-    'pandas >=2.2.0',  # hash values, see https://github.com/pandas-dev/pandas/issues/58999
+    'pandas >=1.4.1',
     'pyarrow >=10.0.1',  # for pyarrow strings in pandas
     'pyyaml >=5.4.1',
 ]

From ac8a3ef16bb740407ee07a9fccd69e1e030a610c Mon Sep 17 00:00:00 2001
From: Hagen Wierstorf <hwierstorf@audeering.com>
Date: Wed, 19 Jun 2024 14:18:49 +0200
Subject: [PATCH 03/10] Use numpy<2.0 in minimum test

---
 .github/workflows/test.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 9225c325..9faac468 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -55,6 +55,7 @@ jobs:
       run: |
         pip install "audeer==2.0.0"
         pip install "audiofile==0.4.0"
+        pip install "numpy<2.0"
         pip install "pandas==1.4.1"
         pip install "pyarrow==10.0.1"
         pip install "pyyaml==5.4.1"

From c0c71d42fa26c3969304688242833d4bac4e7a5f Mon Sep 17 00:00:00 2001
From: Hagen Wierstorf <hwierstorf@audeering.com>
Date: Wed, 19 Jun 2024 14:32:30 +0200
Subject: [PATCH 04/10] Skip doctests in minimum

---
 .github/workflows/test.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 9faac468..604297b3 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -64,6 +64,12 @@ jobs:
     - name: Test with pytest
       run: |
         python -m pytest
+      if: matrix.requirements != 'minimum'
+
+    - name: Test with pytest (skip doctests)
+      run: |
+        python -m pytest --ignore=audformat --cov-fail-under=99
+      if: matrix.requirements == 'minimum'
 
     - name: Upload coverage to Codecov
       uses: codecov/codecov-action@v4

From 5af2d7dac35abedfa1f25a4f38a779acfcdfbf56 Mon Sep 17 00:00:00 2001
From: Hagen Wierstorf <hwierstorf@audeering.com>
Date: Wed, 19 Jun 2024 15:04:15 +0200
Subject: [PATCH 05/10] Require pandas>=2.1.0

---
 .github/workflows/test.yml |  9 +--------
 audformat/core/table.py    | 14 +++++++++++++-
 pyproject.toml             |  4 ++--
 tests/test_table.py        |  2 +-
 4 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 604297b3..7e7d84a0 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -55,8 +55,7 @@ jobs:
       run: |
         pip install "audeer==2.0.0"
         pip install "audiofile==0.4.0"
-        pip install "numpy<2.0"
-        pip install "pandas==1.4.1"
+        pip install "pandas==2.1.0"
         pip install "pyarrow==10.0.1"
         pip install "pyyaml==5.4.1"
       if: matrix.requirements == 'minimum'
@@ -64,12 +63,6 @@ jobs:
     - name: Test with pytest
       run: |
         python -m pytest
-      if: matrix.requirements != 'minimum'
-
-    - name: Test with pytest (skip doctests)
-      run: |
-        python -m pytest --ignore=audformat --cov-fail-under=99
-      if: matrix.requirements == 'minimum'
 
     - name: Upload coverage to Codecov
       uses: codecov/codecov-action@v4
diff --git a/audformat/core/table.py b/audformat/core/table.py
index 1f059d1e..ad534b02 100644
--- a/audformat/core/table.py
+++ b/audformat/core/table.py
@@ -1875,7 +1875,19 @@ def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes:
     for column in df.columns:
         # Convert every column to a numpy array,
         # and hash its string representation
-        md5.update(bytes(str(df[column].to_numpy()), "utf-8"))
+        print(df[column].to_numpy())
+        print(df[column].dtype)
+        print(hashlib.md5(bytes(str(df[column].to_numpy()), "utf-8")).hexdigest())
+        if df[column].dtype == "Int64":
+            # Enforce consistent conversion to numpy.array
+            # for integers across different pandas versions
+            # (since pandas 2.2.x,
+            # Int64 is converted to float,
+            # if contains <NA>)
+            y = df[column].astype("float")
+        else:
+            y = df[column]
+        md5.update(bytes(str(y.to_numpy()), "utf-8"))
     return md5.digest()
 
 
diff --git a/pyproject.toml b/pyproject.toml
index 0c6a4297..13c329eb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,14 +28,14 @@ classifiers = [
     'Programming Language :: Python :: 3.11',
     'Topic :: Scientific/Engineering',
 ]
-requires-python = '>=3.9'  # pandas >=2.2.0
+requires-python = '>=3.9'  # pandas >=2.1.0
 dependencies = [
     'audeer >=2.0.0',
     'audiofile >=0.4.0',
     'iso-639',
     'iso3166',
     'oyaml',
-    'pandas >=1.4.1',
+    'pandas >=2.1.0',  # for pyarrow -> timedelta conversion
     'pyarrow >=10.0.1',  # for pyarrow strings in pandas
     'pyyaml >=5.4.1',
 ]
diff --git a/tests/test_table.py b/tests/test_table.py
index 421f831c..1af6bd85 100644
--- a/tests/test_table.py
+++ b/tests/test_table.py
@@ -1320,7 +1320,7 @@ def get_md5(path: str) -> str:
         ),
         (
             "misc",
-            "d3bfb0271878be96c42d523ee4c491da",
+            "331f79758b195cb9b7d0e8889e830eb2",
         ),
     ],
 )

From 0fcab74b7d02b2466fe4bfd4d504ce7c82966c1d Mon Sep 17 00:00:00 2001
From: Hagen Wierstorf <hwierstorf@audeering.com>
Date: Wed, 19 Jun 2024 15:06:40 +0200
Subject: [PATCH 06/10] Require numpy<=2.0.0 in minimum test

---
 .github/workflows/test.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 7e7d84a0..cc3cd32c 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -55,6 +55,7 @@ jobs:
       run: |
         pip install "audeer==2.0.0"
         pip install "audiofile==0.4.0"
+        pip install "numpy<=2.0.0"
         pip install "pandas==2.1.0"
         pip install "pyarrow==10.0.1"
         pip install "pyyaml==5.4.1"

From 94b934c4cf0faf12c43a9d86831399cfe2af8ba4 Mon Sep 17 00:00:00 2001
From: Hagen Wierstorf <hwierstorf@audeering.com>
Date: Wed, 19 Jun 2024 15:10:02 +0200
Subject: [PATCH 07/10] Remove print statements

---
 audformat/core/table.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/audformat/core/table.py b/audformat/core/table.py
index ad534b02..ad9d4541 100644
--- a/audformat/core/table.py
+++ b/audformat/core/table.py
@@ -1875,9 +1875,6 @@ def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes:
     for column in df.columns:
         # Convert every column to a numpy array,
         # and hash its string representation
-        print(df[column].to_numpy())
-        print(df[column].dtype)
-        print(hashlib.md5(bytes(str(df[column].to_numpy()), "utf-8")).hexdigest())
         if df[column].dtype == "Int64":
             # Enforce consistent conversion to numpy.array
             # for integers across different pandas versions

From a1d4922632d1c6c0704804d5bbb54fb66e95afa1 Mon Sep 17 00:00:00 2001
From: Hagen Wierstorf <hwierstorf@audeering.com>
Date: Wed, 19 Jun 2024 15:10:19 +0200
Subject: [PATCH 08/10] Fix numpy<2.0.0 for minimum test

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index cc3cd32c..f895bbf6 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -55,7 +55,7 @@ jobs:
       run: |
         pip install "audeer==2.0.0"
         pip install "audiofile==0.4.0"
-        pip install "numpy<=2.0.0"
+        pip install "numpy<2.0.0"
         pip install "pandas==2.1.0"
         pip install "pyarrow==10.0.1"
         pip install "pyyaml==5.4.1"

From 6bf2e05ac84b17f450f97262135adde8a4bb7f56 Mon Sep 17 00:00:00 2001
From: Hagen Wierstorf <hwierstorf@audeering.com>
Date: Wed, 19 Jun 2024 15:14:17 +0200
Subject: [PATCH 09/10] Remove max_rows argument

---
 audformat/core/table.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/audformat/core/table.py b/audformat/core/table.py
index ad9d4541..e7f4ac09 100644
--- a/audformat/core/table.py
+++ b/audformat/core/table.py
@@ -1844,7 +1844,7 @@ def _assert_table_index(
         )
 
 
-def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes:
+def _dataframe_hash(df: pd.DataFrame) -> bytes:
     """Hash a dataframe.
 
     The hash value takes into account:
@@ -1860,16 +1860,11 @@ def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes:
 
     Args:
         df: dataframe
-        max_rows: if not ``None``,
-            the maximum number of rows,
-            taken into account for hashing
 
     Returns:
         MD5 hash in bytes
 
     """
-    # Idea for implementation from
-    # https://github.com/streamlit/streamlit/issues/7086#issuecomment-1654504410
     md5 = hashlib.md5()
     df = df.copy().reset_index()
     for column in df.columns:
@@ -1878,9 +1873,7 @@ def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes:
         if df[column].dtype == "Int64":
             # Enforce consistent conversion to numpy.array
             # for integers across different pandas versions
-            # (since pandas 2.2.x,
-            # Int64 is converted to float,
-            # if contains <NA>)
+            # (since pandas 2.2.x, Int64 is converted to float if it contains <NA>)
             y = df[column].astype("float")
         else:
             y = df[column]

From 575d2fec73994112e177b5b8db588ad5b474489e Mon Sep 17 00:00:00 2001
From: Hagen Wierstorf <hwierstorf@audeering.com>
Date: Wed, 19 Jun 2024 15:17:22 +0200
Subject: [PATCH 10/10] Simplify code

---
 audformat/core/table.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/audformat/core/table.py b/audformat/core/table.py
index e7f4ac09..30924953 100644
--- a/audformat/core/table.py
+++ b/audformat/core/table.py
@@ -1866,17 +1866,14 @@ def _dataframe_hash(df: pd.DataFrame) -> bytes:
 
     """
     md5 = hashlib.md5()
-    df = df.copy().reset_index()
-    for column in df.columns:
+    for _, y in df.reset_index().items():
         # Convert every column to a numpy array,
         # and hash its string representation
-        if df[column].dtype == "Int64":
+        if y.dtype == "Int64":
             # Enforce consistent conversion to numpy.array
             # for integers across different pandas versions
             # (since pandas 2.2.x, Int64 is converted to float if it contains <NA>)
-            y = df[column].astype("float")
-        else:
-            y = df[column]
+            y = y.astype("float")
         md5.update(bytes(str(y.to_numpy()), "utf-8"))
     return md5.digest()