Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
branches: [ main, pyarrow ]

jobs:
build:
Expand Down Expand Up @@ -55,7 +55,8 @@ jobs:
run: |
pip install "audeer==2.0.0"
pip install "audiofile==0.4.0"
pip install "pandas==2.2.0"
pip install "numpy<2.0.0"
pip install "pandas==2.1.0"
pip install "pyarrow==10.0.1"
pip install "pyyaml==5.4.1"
if: matrix.requirements == 'minimum'
Expand Down
26 changes: 10 additions & 16 deletions audformat/core/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -1844,7 +1844,7 @@ def _assert_table_index(
)


def _dataframe_hash(df: pd.DataFrame) -> bytes:
    """Hash a dataframe.

    The index is turned into regular columns
    with ``df.reset_index()``.
    Every resulting column is then converted to a numpy array,
    and the string representation of that array
    is fed into an MD5 hash,
    column by column.

    The hash value takes into account:

    * values of the index
    * values of the columns
    * order of rows and columns

    It does not take into account
    the names of the columns or index levels,
    as only the values of each column are hashed.

    The arrays are rendered without abbreviation,
    so every row contributes to the hash,
    independent of the length of the dataframe.
    For arrays that numpy would not abbreviate anyway,
    the rendered string,
    and hence the hash,
    is the same as for ``str(y.to_numpy())``.

    NOTE: the string rendering still follows
    the remaining numpy print options,
    e.g. the floating point precision.
    Differences between floats
    that are not visible at that precision
    do not change the hash.

    Args:
        df: dataframe

    Returns:
        MD5 hash in bytes

    """
    # Imported locally,
    # to not rely on module level imports
    # that might be missing in this file
    import sys

    import numpy as np

    md5 = hashlib.md5()
    for _, y in df.reset_index().items():
        if y.dtype == "Int64":
            # Enforce consistent conversion to numpy.array
            # for integers across different pandas versions
            # (since pandas 2.2.x, Int64 is converted to float if it contains <NA>)
            y = y.astype("float")
        # ``str(array)`` replaces the middle of a long array by "...",
        # which would exclude those rows from the hash.
        # ``threshold=sys.maxsize`` forces the full representation.
        values = np.array2string(y.to_numpy(), threshold=sys.maxsize)
        md5.update(bytes(values, "utf-8"))
    return md5.digest()


Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@ classifiers = [
'Programming Language :: Python :: 3.11',
'Topic :: Scientific/Engineering',
]
requires-python = '>=3.9' # pandas >=2.2.0
requires-python = '>=3.9' # pandas >=2.1.0
dependencies = [
'audeer >=2.0.0',
'audiofile >=0.4.0',
'iso-639',
'iso3166',
'oyaml',
'pandas >=2.2.0', # hash values, see https://github.com/pandas-dev/pandas/issues/58999
'pandas >=2.1.0', # for pyarrow -> timedelta conversion
'pyarrow >=10.0.1', # for pyarrow strings in pandas
'pyyaml >=5.4.1',
]
Expand Down
6 changes: 3 additions & 3 deletions tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -1312,15 +1312,15 @@ def get_md5(path: str) -> str:
[
(
"files",
"9caa6722e65a04ddbce1cda2238c9126",
"a66a22ee4158e0e5100f1d797155ad81",
),
(
"segments",
"37c9d9dc4f937a6e97ec72a080055e49",
"f69eb4a5d19da71e5da00a9b13beb3db",
),
(
"misc",
"3488c007d45b19e04e8fdbf000f0f04d",
"331f79758b195cb9b7d0e8889e830eb2",
),
],
)
Expand Down