diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 210a685f..f895bbf6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -4,7 +4,7 @@ on: push: branches: [ main ] pull_request: - branches: [ main ] + branches: [ main, pyarrow ] jobs: build: @@ -55,7 +55,8 @@ jobs: run: | pip install "audeer==2.0.0" pip install "audiofile==0.4.0" - pip install "pandas==2.2.0" + pip install "numpy<2.0.0" + pip install "pandas==2.1.0" pip install "pyarrow==10.0.1" pip install "pyyaml==5.4.1" if: matrix.requirements == 'minimum' diff --git a/audformat/core/table.py b/audformat/core/table.py index d3732660..30924953 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1844,7 +1844,7 @@ def _assert_table_index( ) -def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes: +def _dataframe_hash(df: pd.DataFrame) -> bytes: """Hash a dataframe. The hash value takes into account: @@ -1860,27 +1860,21 @@ def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes: Args: df: dataframe - max_rows: if not ``None``, - the maximum number of rows, - taken into account for hashing Returns: MD5 hash in bytes """ - # Idea for implementation from - # https://github.com/streamlit/streamlit/issues/7086#issuecomment-1654504410 md5 = hashlib.md5() - if max_rows is not None and len(df) > max_rows: # pragma: nocover (not yet used) - df = df.sample(n=max_rows, random_state=0) - # Hash length of dataframe, as we have to track if this changes - md5.update(str(len(df)).encode("utf-8")) - try: - md5.update(bytes(str(pd.util.hash_pandas_object(df)), "utf-8")) - except TypeError: - # Use pickle if pandas cannot hash the object, - # e.g. if it contains numpy.arrays. 
- md5.update(f"{pickle.dumps(df, pickle.HIGHEST_PROTOCOL)}".encode("utf-8")) + for _, y in df.reset_index().items(): + # Convert every column to a numpy array, + # and hash its string representation + if y.dtype == "Int64": + # Enforce consistent conversion to numpy.array + # for integers across different pandas versions + # (since pandas 2.2.x, Int64 is converted to float if it contains <NA>) + y = y.astype("float") + md5.update(bytes(str(y.to_numpy()), "utf-8")) return md5.digest() diff --git a/pyproject.toml b/pyproject.toml index b0f45140..13c329eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,14 +28,14 @@ classifiers = [ 'Programming Language :: Python :: 3.11', 'Topic :: Scientific/Engineering', ] -requires-python = '>=3.9' # pandas >=2.2.0 +requires-python = '>=3.9' # pandas >=2.1.0 dependencies = [ 'audeer >=2.0.0', 'audiofile >=0.4.0', 'iso-639', 'iso3166', 'oyaml', - 'pandas >=2.2.0', # hash values, see https://github.com/pandas-dev/pandas/issues/58999 + 'pandas >=2.1.0', # for pyarrow -> timedelta conversion 'pyarrow >=10.0.1', # for pyarrow strings in pandas 'pyyaml >=5.4.1', ] diff --git a/tests/test_table.py b/tests/test_table.py index 5900d7f3..1af6bd85 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1312,15 +1312,15 @@ def get_md5(path: str) -> str: [ ( "files", - "9caa6722e65a04ddbce1cda2238c9126", + "a66a22ee4158e0e5100f1d797155ad81", ), ( "segments", - "37c9d9dc4f937a6e97ec72a080055e49", + "f69eb4a5d19da71e5da00a9b13beb3db", ), ( "misc", - "3488c007d45b19e04e8fdbf000f0f04d", + "331f79758b195cb9b7d0e8889e830eb2", ), ], )