From cbeca0ffd91c056017638f1eefa238d3496e0cf2 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 19 Jun 2024 14:02:30 +0200 Subject: [PATCH 01/10] Use numpy representation for hashing --- audformat/core/table.py | 15 +++++---------- tests/test_table.py | 6 +++--- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index d3732660..1f059d1e 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1871,16 +1871,11 @@ def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes: # Idea for implementation from # https://github.com/streamlit/streamlit/issues/7086#issuecomment-1654504410 md5 = hashlib.md5() - if max_rows is not None and len(df) > max_rows: # pragma: nocover (not yet used) - df = df.sample(n=max_rows, random_state=0) - # Hash length of dataframe, as we have to track if this changes - md5.update(str(len(df)).encode("utf-8")) - try: - md5.update(bytes(str(pd.util.hash_pandas_object(df)), "utf-8")) - except TypeError: - # Use pickle if pandas cannot hash the object, - # e.g. if it contains numpy.arrays. 
- md5.update(f"{pickle.dumps(df, pickle.HIGHEST_PROTOCOL)}".encode("utf-8")) + df = df.copy().reset_index() + for column in df.columns: + # Convert every column to a numpy array, + # and hash its string representation + md5.update(bytes(str(df[column].to_numpy()), "utf-8")) return md5.digest() diff --git a/tests/test_table.py b/tests/test_table.py index 5900d7f3..421f831c 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1312,15 +1312,15 @@ def get_md5(path: str) -> str: [ ( "files", - "9caa6722e65a04ddbce1cda2238c9126", + "a66a22ee4158e0e5100f1d797155ad81", ), ( "segments", - "37c9d9dc4f937a6e97ec72a080055e49", + "f69eb4a5d19da71e5da00a9b13beb3db", ), ( "misc", - "3488c007d45b19e04e8fdbf000f0f04d", + "d3bfb0271878be96c42d523ee4c491da", ), ], ) From a4ae33f7e988e80eff1f380bf53521707ec5d45b Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 19 Jun 2024 14:14:53 +0200 Subject: [PATCH 02/10] Enable tests and require pandas>=1.4.1 --- .github/workflows/test.yml | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 210a685f..9225c325 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -4,7 +4,7 @@ on: push: branches: [ main ] pull_request: - branches: [ main ] + branches: [ main, pyarrow ] jobs: build: @@ -55,7 +55,7 @@ jobs: run: | pip install "audeer==2.0.0" pip install "audiofile==0.4.0" - pip install "pandas==2.2.0" + pip install "pandas==1.4.1" pip install "pyarrow==10.0.1" pip install "pyyaml==5.4.1" if: matrix.requirements == 'minimum' diff --git a/pyproject.toml b/pyproject.toml index b0f45140..0c6a4297 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ dependencies = [ 'iso-639', 'iso3166', 'oyaml', - 'pandas >=2.2.0', # hash values, see https://github.com/pandas-dev/pandas/issues/58999 + 'pandas >=1.4.1', 'pyarrow >=10.0.1', # for pyarrow strings in pandas 'pyyaml >=5.4.1', ] From 
ac8a3ef16bb740407ee07a9fccd69e1e030a610c Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 19 Jun 2024 14:18:49 +0200 Subject: [PATCH 03/10] Use numpy<2.0 in minimum test --- .github/workflows/test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9225c325..9faac468 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -55,6 +55,7 @@ jobs: run: | pip install "audeer==2.0.0" pip install "audiofile==0.4.0" + pip install "numpy<2.0" pip install "pandas==1.4.1" pip install "pyarrow==10.0.1" pip install "pyyaml==5.4.1" From c0c71d42fa26c3969304688242833d4bac4e7a5f Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 19 Jun 2024 14:32:30 +0200 Subject: [PATCH 04/10] Skip doctests in minimum --- .github/workflows/test.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9faac468..604297b3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -64,6 +64,12 @@ jobs: - name: Test with pytest run: | python -m pytest + if: matrix.requirements != 'minimum' + + - name: Test with pytest (skip doctests) + run: | + python -m pytest --ignore=audformat --cov-fail-under=99 + if: matrix.requirements == 'minimum' - name: Upload coverage to Codecov uses: codecov/codecov-action@v4 From 5af2d7dac35abedfa1f25a4f38a779acfcdfbf56 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 19 Jun 2024 15:04:15 +0200 Subject: [PATCH 05/10] Require pandas>=2.1.0 --- .github/workflows/test.yml | 9 +-------- audformat/core/table.py | 14 +++++++++++++- pyproject.toml | 4 ++-- tests/test_table.py | 2 +- 4 files changed, 17 insertions(+), 12 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 604297b3..7e7d84a0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -55,8 +55,7 @@ jobs: run: | pip install "audeer==2.0.0" pip install "audiofile==0.4.0" - pip 
install "numpy<2.0" - pip install "pandas==1.4.1" + pip install "pandas==2.1.0" pip install "pyarrow==10.0.1" pip install "pyyaml==5.4.1" if: matrix.requirements == 'minimum' @@ -64,12 +63,6 @@ jobs: - name: Test with pytest run: | python -m pytest - if: matrix.requirements != 'minimum' - - - name: Test with pytest (skip doctests) - run: | - python -m pytest --ignore=audformat --cov-fail-under=99 - if: matrix.requirements == 'minimum' - name: Upload coverage to Codecov uses: codecov/codecov-action@v4 diff --git a/audformat/core/table.py b/audformat/core/table.py index 1f059d1e..ad534b02 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1875,7 +1875,19 @@ def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes: for column in df.columns: # Convert every column to a numpy array, # and hash its string representation - md5.update(bytes(str(df[column].to_numpy()), "utf-8")) + print(df[column].to_numpy()) + print(df[column].dtype) + print(hashlib.md5(bytes(str(df[column].to_numpy()), "utf-8")).hexdigest()) + if df[column].dtype == "Int64": + # Enforce consistent conversion to numpy.array + # for integers across different pandas versions + # (since pandas 2.2.x, + # Int64 is converted to float, + # if contains <NA>) + y = df[column].astype("float") + else: + y = df[column] + md5.update(bytes(str(y.to_numpy()), "utf-8")) return md5.digest() diff --git a/pyproject.toml b/pyproject.toml index 0c6a4297..13c329eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,14 +28,14 @@ classifiers = [ 'Programming Language :: Python :: 3.11', 'Topic :: Scientific/Engineering', ] -requires-python = '>=3.9' # pandas >=2.2.0 +requires-python = '>=3.9' # pandas >=2.1.0 dependencies = [ 'audeer >=2.0.0', 'audiofile >=0.4.0', 'iso-639', 'iso3166', 'oyaml', - 'pandas >=1.4.1', + 'pandas >=2.1.0', # for pyarrow -> timedelta conversion 'pyarrow >=10.0.1', # for pyarrow strings in pandas 'pyyaml >=5.4.1', ] diff --git a/tests/test_table.py b/tests/test_table.py 
index 421f831c..1af6bd85 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1320,7 +1320,7 @@ def get_md5(path: str) -> str: ), ( "misc", - "d3bfb0271878be96c42d523ee4c491da", + "331f79758b195cb9b7d0e8889e830eb2", ), ], ) From 0fcab74b7d02b2466fe4bfd4d504ce7c82966c1d Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 19 Jun 2024 15:06:40 +0200 Subject: [PATCH 06/10] Require numpy<=2.0.0 in minimum test --- .github/workflows/test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7e7d84a0..cc3cd32c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -55,6 +55,7 @@ jobs: run: | pip install "audeer==2.0.0" pip install "audiofile==0.4.0" + pip install "numpy<=2.0.0" pip install "pandas==2.1.0" pip install "pyarrow==10.0.1" pip install "pyyaml==5.4.1" From 94b934c4cf0faf12c43a9d86831399cfe2af8ba4 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 19 Jun 2024 15:10:02 +0200 Subject: [PATCH 07/10] Remove print statements --- audformat/core/table.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index ad534b02..ad9d4541 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1875,9 +1875,6 @@ def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes: for column in df.columns: # Convert every column to a numpy array, # and hash its string representation - print(df[column].to_numpy()) - print(df[column].dtype) - print(hashlib.md5(bytes(str(df[column].to_numpy()), "utf-8")).hexdigest()) if df[column].dtype == "Int64": # Enforce consistent conversion to numpy.array # for integers across different pandas versions From a1d4922632d1c6c0704804d5bbb54fb66e95afa1 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 19 Jun 2024 15:10:19 +0200 Subject: [PATCH 08/10] Fix numpy<2.0.0 for minimum test --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/.github/workflows/test.yml b/.github/workflows/test.yml index cc3cd32c..f895bbf6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -55,7 +55,7 @@ jobs: run: | pip install "audeer==2.0.0" pip install "audiofile==0.4.0" - pip install "numpy<=2.0.0" + pip install "numpy<2.0.0" pip install "pandas==2.1.0" pip install "pyarrow==10.0.1" pip install "pyyaml==5.4.1" From 6bf2e05ac84b17f450f97262135adde8a4bb7f56 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 19 Jun 2024 15:14:17 +0200 Subject: [PATCH 09/10] Remove max_rows argument --- audformat/core/table.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index ad9d4541..e7f4ac09 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1844,7 +1844,7 @@ def _assert_table_index( ) -def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes: +def _dataframe_hash(df: pd.DataFrame) -> bytes: """Hash a dataframe. 
The hash value takes into account: @@ -1860,16 +1860,11 @@ def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes: Args: df: dataframe - max_rows: if not ``None``, - the maximum number of rows, - taken into account for hashing Returns: MD5 hash in bytes """ - # Idea for implementation from - # https://github.com/streamlit/streamlit/issues/7086#issuecomment-1654504410 md5 = hashlib.md5() df = df.copy().reset_index() for column in df.columns: @@ -1878,9 +1873,7 @@ def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes: if df[column].dtype == "Int64": # Enforce consistent conversion to numpy.array # for integers across different pandas versions - # (since pandas 2.2.x, - # Int64 is converted to float, - # if contains <NA>) + # (since pandas 2.2.x, Int64 is converted to float if it contains <NA>) y = df[column].astype("float") else: y = df[column] From 575d2fec73994112e177b5b8db588ad5b474489e Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 19 Jun 2024 15:17:22 +0200 Subject: [PATCH 10/10] Simplify code --- audformat/core/table.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index e7f4ac09..30924953 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1866,17 +1866,14 @@ def _dataframe_hash(df: pd.DataFrame) -> bytes: """ md5 = hashlib.md5() - df = df.copy().reset_index() - for column in df.columns: + for _, y in df.reset_index().items(): # Convert every column to a numpy array, # and hash its string representation - if df[column].dtype == "Int64": + if y.dtype == "Int64": # Enforce consistent conversion to numpy.array # for integers across different pandas versions # (since pandas 2.2.x, Int64 is converted to float if it contains <NA>) - y = df[column].astype("float") - else: - y = df[column] + y = y.astype("float") md5.update(bytes(str(y.to_numpy()), "utf-8")) return md5.digest()