From f7df8af478861baff99083fcce1c53f405956657 Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Sat, 25 Oct 2025 23:05:51 -0400 Subject: [PATCH 01/16] try triggering exceptions before entering khash --- pandas/_libs/hashtable_class_helper.pxi.in | 10 +++ pandas/tests/libs/test_hashtable.py | 78 ++++++++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index eae393f33bfd3..7182f078720f0 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -1356,6 +1356,14 @@ cdef class PyObjectHashTable(HashTable): cdef: khiter_t k + # GH 57052 + # in khash_python.h, kh_python_hash_equal and kh_python_hash_func will be called repeatedly by khash in a loop. + # if object implements custom __hash__ and __eq__ methods that can raise exceptions, + # kh_python_hash_{equal,func} will suppress the exceptions without warnings. + # as a workaround: try triggering exceptions here, before starting the khash loop + hash(val) + val == val + k = kh_get_pymap(self.table, val) if k != self.table.n_buckets: return self.table.vals[k] @@ -1369,6 +1377,8 @@ cdef class PyObjectHashTable(HashTable): char* buf hash(key) + # GH 57052 + key == key k = kh_put_pymap(self.table, key, &ret) if kh_exist_pymap(self.table, k): diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 6a95cfc7355d8..5f8eb8c540951 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -1,6 +1,7 @@ from collections import namedtuple from collections.abc import Generator from contextlib import contextmanager +from itertools import product import re import struct import tracemalloc @@ -780,3 +781,80 @@ def test_float_complex_int_are_equal_as_objects(): result = isin(np.array(values, dtype=object), np.asarray(comps)) expected = np.array([False, True, True, True], dtype=np.bool_) tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize( + "throw1hash, throw2hash, throw1eq, throw2eq", + product([True, False], repeat=4), +) +def test_exceptions_thrown_from_custom_hash_and_eq_methods( + throw1hash, throw2hash, throw1eq, throw2eq +): + # GH 57052 + class testkey: + def __init__(self, value, throw_hash=False, throw_eq=False): + self.value = value + self.throw_hash = throw_hash + self.throw_eq = throw_eq + + def __hash__(self): + if self.throw_hash: + raise RuntimeError(f"exception in {self!r}.__hash__") + return hash(self.value) + + def __eq__(self, other): + if self.throw_eq: + raise RuntimeError(f"exception in {self!r}.__eq__") + return self.value == other.value + + def __repr__(self): + return f"{self.__class__.__name__}({self.value}, {self.throw_hash}, {self.throw_eq})" + + table = ht.PyObjectHashTable() + + key1 = testkey(value="hello1") + key2 = testkey(value="hello2") + + table.set_item(key1, 123) + table.set_item(key2, 456) + + key1.throw_hash = throw1hash + key2.throw_hash = throw2hash + key1.throw_eq = throw1eq + key2.throw_eq = throw2eq + + if throw1hash and throw1eq: + with pytest.raises( + RuntimeError, match=re.escape(f"exception in {key1!r}.") + "__(hash|eq)__" + ): + table.get_item(key1) + elif throw1hash: + with pytest.raises( + RuntimeError, match=re.escape(f"exception in {key1!r}.__hash__") + ): + table.get_item(key1) + elif throw1eq: + with pytest.raises( + RuntimeError, match=re.escape(f"exception in {key1!r}.__eq__") + ): + table.get_item(key1) + else: + assert table.get_item(key1) == 123 + + if throw2hash and throw2eq: + with pytest.raises( + RuntimeError, match=re.escape(f"exception in {key2!r}.") + "__(hash|eq)__" + ): + table.get_item(key2) + elif throw2hash: + with pytest.raises( + RuntimeError, match=re.escape(f"exception in {key2!r}.__hash__") + ): + table.get_item(key2) + elif throw2eq: + with pytest.raises( + RuntimeError, match=re.escape(f"exception in {key2!r}.__eq__") + ): + table.get_item(key2) + else: + assert table.get_item(key2) == 456 From cf1e29e531a6d3a194a483e77e3a27b7d988c257 Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Mon, 27 Oct 2025 22:18:09 -0400 Subject: [PATCH 02/16] update whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 995e7676afbca..19e159abf0e26 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1210,6 +1210,7 @@ Styler Other ^^^^^ +- Bug :class:``PyObjectHashTable`` that would silently suppress exceptions thrown from custom ``__hash__`` and ``__eq__`` methods during hashing (:issue:`57052`) - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) - Bug in :class:`Series` ignoring errors when trying to convert :class:`Series` input data to the given ``dtype`` (:issue:`60728`) - Bug in :func:`eval` on :class:`ExtensionArray` on including division ``/`` failed with a ``TypeError``. (:issue:`58748`) From 76f94fa5d899555dd92868cd055666a021216a16 Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Mon, 27 Oct 2025 22:19:25 -0400 Subject: [PATCH 03/16] fix precommit --- pandas/tests/libs/test_hashtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 5f8eb8c540951..efdf0ae3a18fb 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -808,7 +808,7 @@ def __eq__(self, other): return self.value == other.value def __repr__(self): - return f"{self.__class__.__name__}({self.value}, {self.throw_hash}, {self.throw_eq})" + return f"testkey({self.value}, {self.throw_hash}, {self.throw_eq})" table = ht.PyObjectHashTable() From 6729fe823d889707ba4ff4ab750f4760f2f0679d Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Mon, 27 Oct 2025 22:33:43 -0400 Subject: [PATCH 04/16] fix typo --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 19e159abf0e26..affd79468a37f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1210,7 +1210,7 @@ Styler Other ^^^^^ -- Bug :class:``PyObjectHashTable`` that would silently suppress exceptions thrown from custom ``__hash__`` and ``__eq__`` methods during hashing (:issue:`57052`) +- Bug in :class:``PyObjectHashTable`` that would silently suppress exceptions thrown from custom ``__hash__`` and ``__eq__`` methods during hashing (:issue:`57052`) - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) - Bug in :class:`Series` ignoring errors when trying to convert :class:`Series` input data to the given ``dtype`` (:issue:`60728`) - Bug in :func:`eval` on :class:`ExtensionArray` on including division ``/`` failed with a ``TypeError``. (:issue:`58748`) From be9d047a61da2d2df86ae9b02aafe5f5c33a209d Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Mon, 27 Oct 2025 22:56:42 -0400 Subject: [PATCH 05/16] fix precommit --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index affd79468a37f..b5049e734739b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1210,9 +1210,9 @@ Styler Other ^^^^^ -- Bug in :class:``PyObjectHashTable`` that would silently suppress exceptions thrown from custom ``__hash__`` and ``__eq__`` methods during hashing (:issue:`57052`) - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) - Bug in :class:`Series` ignoring errors when trying to convert :class:`Series` input data to the given ``dtype`` (:issue:`60728`) +- Bug in :class:``PyObjectHashTable`` that would silently suppress exceptions thrown from custom ``__hash__`` and ``__eq__`` methods during hashing (:issue:`57052`) - Bug in :func:`eval` on :class:`ExtensionArray` on including division ``/`` failed with a ``TypeError``. (:issue:`58748`) - Bug in :func:`eval` where method calls on binary operations like ``(x + y).dropna()`` would raise ``AttributeError: 'BinOp' object has no attribute 'value'`` (:issue:`61175`) - Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`) From 318fe86595e781810dbe32d4df6c655b8ecab599 Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Tue, 28 Oct 2025 22:41:06 -0400 Subject: [PATCH 06/16] rewrite hash fix, add tests --- pandas/_libs/hashtable_class_helper.pxi.in | 30 ++-- .../pandas/vendored/klib/khash_python.h | 8 +- pandas/tests/frame/indexing/test_indexing.py | 24 ++++ pandas/tests/libs/test_hashtable.py | 132 ++++++++++++------ 4 files changed, 138 insertions(+), 56 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 7182f078720f0..fd5b9cc8b7910 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -4,6 +4,8 @@ Template for each `dtype` helper function for hashtable WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ from cpython.unicode cimport PyUnicode_AsUTF8 +from cpython.exc cimport PyErr_Occurred, PyErr_Fetch +from cpython.ref cimport Py_XDECREF {{py: @@ -1309,6 +1311,22 @@ cdef class StringHashTable(HashTable): return labels +cdef raise_if_errors(): + cdef: + object exc + PyObject *type + PyObject *value + PyObject *traceback + + PyErr_Fetch(&type, &value, &traceback) + if value != NULL: + exc = value + Py_XDECREF(value) + Py_XDECREF(type) + Py_XDECREF(traceback) + raise exc + + cdef class PyObjectHashTable(HashTable): def __init__(self, int64_t size_hint=1): @@ -1356,15 +1374,8 @@ cdef class PyObjectHashTable(HashTable): cdef: khiter_t k - # GH 57052 - # in khash_python.h, kh_python_hash_equal and kh_python_hash_func will be called repeatedly by khash in a loop. - # if object implements custom __hash__ and __eq__ methods that can raise exceptions, - # kh_python_hash_{equal,func} will suppress the exceptions without warnings. - # as a workaround: try triggering exceptions here, before starting the khash loop - hash(val) - val == val - k = kh_get_pymap(self.table, val) + raise_if_errors() if k != self.table.n_buckets: return self.table.vals[k] else: @@ -1377,10 +1388,9 @@ cdef class PyObjectHashTable(HashTable): char* buf hash(key) - # GH 57052 - key == key k = kh_put_pymap(self.table, key, &ret) + raise_if_errors() if kh_exist_pymap(self.table, k): self.table.vals[k] = val else: diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h index e0bb96d57b9e1..fa1ea1430f917 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h @@ -193,6 +193,9 @@ static inline int tupleobject_cmp(PyTupleObject *a, PyTupleObject *b) { } static inline int pyobject_cmp(PyObject *a, PyObject *b) { + if (PyErr_Occurred() != NULL) { + return 0; + } if (a == b) { return 1; } @@ -215,7 +218,6 @@ static inline int pyobject_cmp(PyObject *a, PyObject *b) { int result = PyObject_RichCompareBool(a, b, Py_EQ); if (result < 0) { - PyErr_Clear(); return 0; } return result; @@ -292,6 +294,9 @@ static inline Py_hash_t tupleobject_hash(PyTupleObject *key) { } static inline khuint32_t kh_python_hash_func(PyObject *key) { + if (PyErr_Occurred() != NULL) { + return 0; + } Py_hash_t hash; // For PyObject_Hash holds: // hash(0.0) == 0 == hash(-0.0) @@ -315,7 +320,6 @@ static inline khuint32_t kh_python_hash_func(PyObject *key) { } if (hash == -1) { - PyErr_Clear(); return 0; } #if SIZEOF_PY_HASH_T == 4 diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 2b36c1135d36d..82d31889a6ffe 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1943,3 +1943,27 @@ def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, index def test_setitem_validation_scalar_float(self, invalid, float_numpy_dtype, indexer): df = DataFrame({"a": [1, 2, None]}, dtype=float_numpy_dtype) self._check_setitem_invalid(df, invalid, indexer) + + +def test_error_raised_from_custom_hash_method(): + # GH 57052 + class testkey: + def __init__(self, value): + self.value = value + + def __hash__(self): + raise RuntimeError(f"exception in {self!r}.__hash__") + + def __eq__(self, other): + return self.value == other.value + + def __repr__(self): + return f"testkey({self.value})" + + df = DataFrame({"i": map(testkey, range(10))}).set_index("i") + for i in range(len(df.index)): + key = testkey(i) + with pytest.raises( + RuntimeError, match=re.escape(f"exception in {key!r}.__hash__") + ): + df.loc[key] diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index efdf0ae3a18fb..42d486d6fd524 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -783,33 +783,57 @@ def test_float_complex_int_are_equal_as_objects(): tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize( - "throw1hash, throw2hash, throw1eq, throw2eq", - product([True, False], repeat=4), -) -def test_exceptions_thrown_from_custom_hash_and_eq_methods( - throw1hash, throw2hash, throw1eq, throw2eq -): +class testkey: # GH 57052 - class testkey: - def __init__(self, value, throw_hash=False, throw_eq=False): - self.value = value - self.throw_hash = throw_hash - self.throw_eq = throw_eq + def __init__(self, value, throw_hash=False, throw_eq=False): + self.value = value + self.throw_hash = throw_hash + self.throw_eq = throw_eq + + def __hash__(self): + if self.throw_hash: + raise RuntimeError(f"exception in {self!r}.__hash__") + return hash(self.value) + + def __eq__(self, other): + if self.throw_eq: + raise RuntimeError(f"exception in {self!r}.__eq__") + return self.value == other.value - def __hash__(self): - if self.throw_hash: - raise RuntimeError(f"exception in {self!r}.__hash__") - return hash(self.value) + def __repr__(self): + return f"testkey({self.value}, {self.throw_hash}, {self.throw_eq})" + + +@pytest.mark.parametrize("throw1, throw2", product([True, False], repeat=2)) +def test_error_raised_from_hash_method_in_set_item(throw1, throw2): + # GH 57052 + table = ht.PyObjectHashTable() + + key1 = testkey(value="hello1", throw_hash=throw1) + key2 = testkey(value="hello2", throw_hash=throw2) + + if throw1: + with pytest.raises( + RuntimeError, match=re.escape(f"exception in {key1!r}.__hash__") + ): + table.set_item(key1, 123) + else: + table.set_item(key1, 123) + assert table.get_item(key1) == 123 - def __eq__(self, other): - if self.throw_eq: - raise RuntimeError(f"exception in {self!r}.__eq__") - return self.value == other.value + if throw2: + with pytest.raises( + RuntimeError, match=re.escape(f"exception in {key2!r}.__hash__") + ): + table.set_item(key2, 456) + else: + table.set_item(key2, 456) + assert table.get_item(key2) == 456 - def __repr__(self): - return f"testkey({self.value}, {self.throw_hash}, {self.throw_eq})" +@pytest.mark.parametrize("throw1, throw2", product([True, False], repeat=2)) +def test_error_raised_from_hash_method_in_get_item(throw1, throw2): + # GH 57052 table = ht.PyObjectHashTable() key1 = testkey(value="hello1") @@ -818,43 +842,63 @@ def __repr__(self): table.set_item(key1, 123) table.set_item(key2, 456) - key1.throw_hash = throw1hash - key2.throw_hash = throw2hash - key1.throw_eq = throw1eq - key2.throw_eq = throw2eq + key1.throw_hash = throw1 + key2.throw_hash = throw2 - if throw1hash and throw1eq: - with pytest.raises( - RuntimeError, match=re.escape(f"exception in {key1!r}.") + "__(hash|eq)__" - ): - table.get_item(key1) - elif throw1hash: + if throw1: with pytest.raises( RuntimeError, match=re.escape(f"exception in {key1!r}.__hash__") ): table.get_item(key1) - elif throw1eq: - with pytest.raises( - RuntimeError, match=re.escape(f"exception in {key1!r}.__eq__") - ): - table.get_item(key1) else: assert table.get_item(key1) == 123 - if throw2hash and throw2eq: + if throw2: with pytest.raises( - RuntimeError, match=re.escape(f"exception in {key2!r}.") + "__(hash|eq)__" + RuntimeError, match=re.escape(f"exception in {key2!r}.__hash__") ): table.get_item(key2) - elif throw2hash: + else: + assert table.get_item(key2) == 456 + + +@pytest.mark.parametrize("throw", [True, False]) +def test_error_raised_from_eq_method_in_set_item(throw): + # GH 57052 + table = ht.PyObjectHashTable() + + key1 = testkey(value="hello", throw_eq=throw) + key2 = testkey(value=key1.value) + + if throw: + table.set_item(key1, 123) with pytest.raises( - RuntimeError, match=re.escape(f"exception in {key2!r}.__hash__") + RuntimeError, match=re.escape(f"exception in {key1!r}.__eq__") ): - table.get_item(key2) - elif throw2eq: + table.set_item(key2, 456) + else: + table.set_item(key2, 456) + assert table.get_item(key2) == 456 + + +@pytest.mark.parametrize("throw", [True, False]) +def test_error_raised_from_eq_method_in_get_item(throw): + # GH 57052 + table = ht.PyObjectHashTable() + + key1 = testkey(value="hello") + key2 = testkey(value=key1.value) + + table.set_item(key1, 123) + table.set_item(key2, 456) + + if throw: + key1.throw_eq = True with pytest.raises( - RuntimeError, match=re.escape(f"exception in {key2!r}.__eq__") + RuntimeError, match=re.escape(f"exception in {key1!r}.__eq__") ): table.get_item(key2) else: + # this looks odd but it is because key1.value == key2.value + assert table.get_item(key1) == 456 assert table.get_item(key2) == 456 From a7994c85a59480113ea02dd934c7d73bb6a736ab Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Tue, 28 Oct 2025 23:27:27 -0400 Subject: [PATCH 07/16] raise exceptions from map_locations --- pandas/_libs/hashtable_class_helper.pxi.in | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index fd5b9cc8b7910..0925b8e66529d 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -1409,6 +1409,7 @@ cdef class PyObjectHashTable(HashTable): hash(val) k = kh_put_pymap(self.table, val, &ret) + raise_if_errors() self.table.vals[k] = i def lookup(self, ndarray[object] values, object mask = None) -> ndarray: From c25ab5b1a814553104eaed4d9f692bb3931e4933 Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Thu, 30 Oct 2025 22:14:07 -0400 Subject: [PATCH 08/16] implement fully checked kh_pymap interface --- pandas/_libs/hashtable_class_helper.pxi.in | 45 +++-------- pandas/_libs/hashtable_func_helper.pxi.in | 74 +++++++++--------- .../pandas/vendored/klib/khash_python.h | 6 ++ pandas/_libs/khash.pxd | 8 ++ pandas/_libs/khash.pyx | 75 +++++++++++++++++++ pandas/_libs/meson.build | 1 + 6 files changed, 138 insertions(+), 71 deletions(-) create mode 100644 pandas/_libs/khash.pyx diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 0925b8e66529d..05a697fcc0c2c 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -4,8 +4,6 @@ Template for each `dtype` helper function for hashtable WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ from cpython.unicode cimport PyUnicode_AsUTF8 -from cpython.exc cimport PyErr_Occurred, PyErr_Fetch -from cpython.ref cimport Py_XDECREF {{py: @@ -75,7 +73,7 @@ cimported_types = ['complex64', 'int16', 'int32', 'int64', - 'pymap', + 'pymap_checked', 'str', 'strbox', 'uint8', @@ -1311,32 +1309,16 @@ cdef class StringHashTable(HashTable): return labels -cdef raise_if_errors(): - cdef: - object exc - PyObject *type - PyObject *value - PyObject *traceback - - PyErr_Fetch(&type, &value, &traceback) - if value != NULL: - exc = value - Py_XDECREF(value) - Py_XDECREF(type) - Py_XDECREF(traceback) - raise exc - - cdef class PyObjectHashTable(HashTable): def __init__(self, int64_t size_hint=1): - self.table = kh_init_pymap() + self.table = kh_init_pymap_checked() size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT) - kh_resize_pymap(self.table, size_hint) + kh_resize_pymap_checked(self.table, size_hint) def __dealloc__(self): if self.table is not NULL: - kh_destroy_pymap(self.table) + kh_destroy_pymap_checked(self.table) self.table = NULL def __len__(self) -> int: @@ -1347,7 +1329,7 @@ cdef class PyObjectHashTable(HashTable): khiter_t k hash(key) - k = kh_get_pymap(self.table, key) + k = kh_get_pymap_checked(self.table, key) return k != self.table.n_buckets def sizeof(self, deep: bool = False) -> int: @@ -1374,8 +1356,7 @@ cdef class PyObjectHashTable(HashTable): cdef: khiter_t k - k = kh_get_pymap(self.table, val) - raise_if_errors() + k = kh_get_pymap_checked(self.table, val) if k != self.table.n_buckets: return self.table.vals[k] else: @@ -1389,9 +1370,8 @@ cdef class PyObjectHashTable(HashTable): hash(key) - k = kh_put_pymap(self.table, key, &ret) - raise_if_errors() - if kh_exist_pymap(self.table, k): + k = kh_put_pymap_checked(self.table, key, &ret) + if kh_exist_pymap_checked(self.table, k): self.table.vals[k] = val else: raise KeyError(key) @@ -1408,8 +1388,7 @@ cdef class PyObjectHashTable(HashTable): val = values[i] hash(val) - k = kh_put_pymap(self.table, val, &ret) - raise_if_errors() + k = kh_put_pymap_checked(self.table, val, &ret) self.table.vals[k] = i def lookup(self, ndarray[object] values, object mask = None) -> ndarray: @@ -1426,7 +1405,7 @@ cdef class PyObjectHashTable(HashTable): val = values[i] hash(val) - k = kh_get_pymap(self.table, val) + k = kh_get_pymap_checked(self.table, val) if k != self.table.n_buckets: locs[i] = self.table.vals[k] else: @@ -1504,10 +1483,10 @@ cdef class PyObjectHashTable(HashTable): labels[i] = na_sentinel continue - k = kh_get_pymap(self.table, val) + k = kh_get_pymap_checked(self.table, val) if k == self.table.n_buckets: # k hasn't been seen yet - k = kh_put_pymap(self.table, val, &ret) + k = kh_put_pymap_checked(self.table, val, &ret) uniques.append(val) if return_inverse: self.table.vals[k] = count diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 3487f5ebd050d..6ffeff9b5f3ab 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -6,26 +6,24 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: -# name, dtype, ttype, c_type, to_c_type -dtypes = [('Complex128', 'complex128', 'complex128', - 'khcomplex128_t', 'to_khcomplex128_t'), - ('Complex64', 'complex64', 'complex64', - 'khcomplex64_t', 'to_khcomplex64_t'), - ('Float64', 'float64', 'float64', 'float64_t', ''), - ('Float32', 'float32', 'float32', 'float32_t', ''), - ('UInt64', 'uint64', 'uint64', 'uint64_t', ''), - ('UInt32', 'uint32', 'uint32', 'uint32_t', ''), - ('UInt16', 'uint16', 'uint16', 'uint16_t', ''), - ('UInt8', 'uint8', 'uint8', 'uint8_t', ''), - ('Object', 'object', 'pymap', 'object', ''), - ('Int64', 'int64', 'int64', 'int64_t', ''), - ('Int32', 'int32', 'int32', 'int32_t', ''), - ('Int16', 'int16', 'int16', 'int16_t', ''), - ('Int8', 'int8', 'int8', 'int8_t', '')] +# name, dtype, ttype, tfunc_type, c_type, to_c_type +dtypes = [('Complex128', 'complex128', 'complex128', 'complex128', 'khcomplex128_t', 'to_khcomplex128_t'), + ('Complex64', 'complex64', 'complex64', 'complex64', 'khcomplex64_t', 'to_khcomplex64_t'), + ('Float64', 'float64', 'float64', 'float64', 'float64_t', ''), + ('Float32', 'float32', 'float32', 'float32', 'float32_t', ''), + ('UInt64', 'uint64', 'uint64', 'uint64', 'uint64_t', ''), + ('UInt32', 'uint32', 'uint32', 'uint32', 'uint32_t', ''), + ('UInt16', 'uint16', 'uint16', 'uint16', 'uint16_t', ''), + ('UInt8', 'uint8', 'uint8', 'uint8', 'uint8_t', ''), + ('Object', 'object', 'pymap', 'pymap_checked', 'object', ''), + ('Int64', 'int64', 'int64', 'int64', 'int64_t', ''), + ('Int32', 'int32', 'int32', 'int32', 'int32_t', ''), + ('Int16', 'int16', 'int16', 'int16', 'int16_t', ''), + ('Int8', 'int8', 'int8', 'int8', 'int8_t', '')] }} -{{for name, dtype, ttype, c_type, to_c_type in dtypes}} +{{for name, dtype, ttype, tfunc_type, c_type, to_c_type in dtypes}} @cython.wraparound(False) @@ -55,26 +53,26 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 # result_keys remembers the original order of keys result_keys = {{name}}Vector() - table = kh_init_{{ttype}}() + table = kh_init_{{tfunc_type}}() {{if dtype == 'object'}} if uses_mask: raise NotImplementedError("uses_mask not implemented with object dtype") - kh_resize_{{ttype}}(table, n // 10) + kh_resize_{{tfunc_type}}(table, n // 10) for i in range(n): val = values[i] if not dropna or not checknull(val): - k = kh_get_{{ttype}}(table, {{to_c_type}}val) + k = kh_get_{{tfunc_type}}(table, {{to_c_type}}val) if k != table.n_buckets: table.vals[k] += 1 else: - k = kh_put_{{ttype}}(table, {{to_c_type}}val, &ret) + k = kh_put_{{tfunc_type}}(table, {{to_c_type}}val, &ret) table.vals[k] = 1 result_keys.append(val) {{else}} - kh_resize_{{ttype}}(table, n) + kh_resize_{{tfunc_type}}(table, n) for i in range(n): val = {{to_c_type}}(values[i]) @@ -90,11 +88,11 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 if uses_mask and isna_entry: na_counter += 1 else: - k = kh_get_{{ttype}}(table, val) + k = kh_get_{{tfunc_type}}(table, val) if k != table.n_buckets: table.vals[k] += 1 else: - k = kh_put_{{ttype}}(table, val, &ret) + k = kh_put_{{tfunc_type}}(table, val, &ret) table.vals[k] = 1 result_keys.append(val) {{endif}} @@ -107,9 +105,9 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 for i in range(table.size): {{if dtype == 'object'}} - k = kh_get_{{ttype}}(table, result_keys.data[i]) + k = kh_get_{{tfunc_type}}(table, result_keys.data[i]) {{else}} - k = kh_get_{{ttype}}(table, result_keys.data.data[i]) + k = kh_get_{{tfunc_type}}(table, result_keys.data.data[i]) {{endif}} result_counts[i] = table.vals[k] @@ -117,7 +115,7 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 result_counts[table.size] = na_counter result_keys.append(val) - kh_destroy_{{ttype}}(table) + kh_destroy_{{tfunc_type}}(table) return result_keys.to_array(), result_counts.base, na_counter @@ -138,12 +136,12 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons {{endif}} Py_ssize_t i, n = len(values), first_na = -1 khiter_t k - kh_{{ttype}}_t *table = kh_init_{{ttype}}() + kh_{{ttype}}_t *table = kh_init_{{tfunc_type}}() ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') bint seen_na = False, uses_mask = mask is not None bint seen_multiple_na = False - kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) + kh_resize_{{tfunc_type}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) if keep not in ('last', 'first', False): raise ValueError('keep must be either "first", "last" or False') @@ -168,7 +166,7 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons seen_na = True else: value = {{to_c_type}}(values[i]) - kh_put_{{ttype}}(table, value, &ret) + kh_put_{{tfunc_type}}(table, value, &ret) out[i] = ret == 0 {{endfor}} @@ -193,16 +191,16 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons else: value = {{to_c_type}}(values[i]) - k = kh_get_{{ttype}}(table, value) + k = kh_get_{{tfunc_type}}(table, value) if k != table.n_buckets: out[table.vals[k]] = 1 out[i] = 1 else: - k = kh_put_{{ttype}}(table, value, &ret) + k = kh_put_{{tfunc_type}}(table, value, &ret) table.vals[k] = i out[i] = 0 - kh_destroy_{{ttype}}(table) + kh_destroy_{{tfunc_type}}(table) return out @@ -243,11 +241,11 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): {{c_type}} val {{endif}} - kh_{{ttype}}_t *table = kh_init_{{ttype}}() + kh_{{ttype}}_t *table = kh_init_{{tfunc_type}}() # construct the table n = len(values) - kh_resize_{{ttype}}(table, n) + kh_resize_{{tfunc_type}}(table, n) {{if dtype == 'object'}} if True: @@ -256,7 +254,7 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): {{endif}} for i in range(n): val = {{to_c_type}}(values[i]) - kh_put_{{ttype}}(table, val, &ret) + kh_put_{{tfunc_type}}(table, val, &ret) # test membership n = len(arr) @@ -269,10 +267,10 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): {{endif}} for i in range(n): val = {{to_c_type}}(arr[i]) - k = kh_get_{{ttype}}(table, val) + k = kh_get_{{tfunc_type}}(table, val) result[i] = (k != table.n_buckets) - kh_destroy_{{ttype}}(table) + kh_destroy_{{tfunc_type}}(table) return result.view(np.bool_) # ---------------------------------------------------------------------- diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h index fa1ea1430f917..45e44dc66d135 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h @@ -218,6 +218,9 @@ static inline int pyobject_cmp(PyObject *a, PyObject *b) { int result = PyObject_RichCompareBool(a, b, Py_EQ); if (result < 0) { + if (PyErr_Occurred() != NULL) { + return 0; + } return 0; } return result; @@ -320,6 +323,9 @@ static inline khuint32_t kh_python_hash_func(PyObject *key) { } if (hash == -1) { + if (PyErr_Occurred() != NULL) { + return 0; + } return 0; } #if SIZEOF_PY_HASH_T == 4 diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index c439e1cca772b..f8c30eec113f7 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -125,5 +125,13 @@ cdef extern from "pandas/vendored/klib/khash_python.h": khuint_t kh_needed_n_buckets(khuint_t element_n) nogil +cdef kh_pymap_t* kh_init_pymap_checked() +cdef void kh_destroy_pymap_checked(kh_pymap_t*) +cdef void kh_clear_pymap_checked(kh_pymap_t*) +cdef khuint_t kh_get_pymap_checked(kh_pymap_t*, PyObject*) +cdef void kh_resize_pymap_checked(kh_pymap_t*, khuint_t) +cdef khuint_t kh_put_pymap_checked(kh_pymap_t*, PyObject*, int*) +cdef void kh_del_pymap_checked(kh_pymap_t*, khuint_t) +cdef bint kh_exist_pymap_checked(kh_pymap_t*, khiter_t) include "khash_for_primitive_helper.pxi" diff --git a/pandas/_libs/khash.pyx b/pandas/_libs/khash.pyx new file mode 100644 index 0000000000000..2c4d37d6f1e8a --- /dev/null +++ b/pandas/_libs/khash.pyx @@ -0,0 +1,75 @@ +from cpython.object cimport PyObject +from cpython.exc cimport PyErr_Occurred, PyErr_Fetch +from cpython.ref cimport Py_XDECREF + + +cdef inline raise_if_errors(): + cdef: + object exc_type + object exc_value + PyObject *type + PyObject *value + PyObject *traceback + + if PyErr_Occurred(): + PyErr_Fetch(&type, &value, &traceback) + Py_XDECREF(traceback) + if value != NULL: + exc_value = value + if isinstance(exc_value, str): + if type != NULL: + exc_type = type + else: + exc_type = RuntimeError + Py_XDECREF(type) + raise exc_type(exc_value) + else: + Py_XDECREF(type) + raise exc_value + + +cdef kh_pymap_t* kh_init_pymap_checked(): + cdef kh_pymap_t* table = kh_init_pymap() + if PyErr_Occurred(): + kh_destroy_pymap(table) + table = NULL + raise_if_errors() + return table + + +cdef void kh_destroy_pymap_checked(kh_pymap_t* table): + kh_destroy_pymap(table) + raise_if_errors() + + +cdef void kh_clear_pymap_checked(kh_pymap_t* table): + kh_clear_pymap(table) + raise_if_errors() + + +cdef khuint_t kh_get_pymap_checked(kh_pymap_t* table, PyObject* key): + cdef khuint_t k = kh_get_pymap(table, key) + raise_if_errors() + return k + + +cdef void kh_resize_pymap_checked(kh_pymap_t* table, khuint_t new_n_buckets): + kh_resize_pymap(table, new_n_buckets) + raise_if_errors() + + +cdef khuint_t kh_put_pymap_checked(kh_pymap_t* table, PyObject* key, int* ret): + cdef khuint_t result = kh_put_pymap(table, key, ret) + raise_if_errors() + return result + + +cdef void kh_del_pymap_checked(kh_pymap_t* table, khuint_t k): + kh_del_pymap(table, k) + raise_if_errors() + + +cdef bint kh_exist_pymap_checked(kh_pymap_t* table, khiter_t k): + cdef bint res = kh_exist_pymap(table, k) + raise_if_errors() + return res diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index 33fc65e5034d0..f6d6e74648cf3 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -97,6 +97,7 @@ libs_sources = { 'sources': ['join.pyx', _khash_primitive_helper], 'deps': _khash_primitive_helper_dep, }, + 'khash': {'sources': ['khash.pyx']}, 'lib': {'sources': ['lib.pyx', 'src/parser/tokenizer.c']}, 'missing': {'sources': ['missing.pyx']}, 'pandas_datetime': { From c87a388696a955f2da603327aadc1043d861645d Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Thu, 30 Oct 2025 22:16:54 -0400 Subject: [PATCH 09/16] isort --- pandas/_libs/khash.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/khash.pyx b/pandas/_libs/khash.pyx index 2c4d37d6f1e8a..c2b684100131f 100644 --- a/pandas/_libs/khash.pyx +++ b/pandas/_libs/khash.pyx @@ -1,5 +1,8 @@ +from cpython.exc cimport ( + PyErr_Fetch, + PyErr_Occurred, +) from cpython.object cimport PyObject -from cpython.exc cimport PyErr_Occurred, PyErr_Fetch from cpython.ref cimport Py_XDECREF From 0a4cba88755393afedefc20586651258056ec1bf Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Thu, 30 Oct 2025 22:22:22 -0400 Subject: [PATCH 10/16] cleanup --- pandas/_libs/hashtable_func_helper.pxi.in | 46 +++++++++---------- .../pandas/vendored/klib/khash_python.h | 7 --- 2 files changed, 23 insertions(+), 30 deletions(-) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 6ffeff9b5f3ab..0722fb65218b8 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -6,7 +6,7 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: -# name, dtype, ttype, tfunc_type, c_type, to_c_type +# name, dtype, ttype, tfunc, c_type, to_c_type dtypes = [('Complex128', 'complex128', 'complex128', 'complex128', 'khcomplex128_t', 'to_khcomplex128_t'), ('Complex64', 'complex64', 'complex64', 'complex64', 'khcomplex64_t', 'to_khcomplex64_t'), ('Float64', 'float64', 'float64', 'float64', 'float64_t', ''), @@ -23,7 +23,7 @@ dtypes = [('Complex128', 'complex128', 'complex128', 'complex128', 'khcomplex128 }} -{{for name, dtype, ttype, tfunc_type, c_type, to_c_type in dtypes}} +{{for name, dtype, ttype, tfunc, c_type, to_c_type in dtypes}} @cython.wraparound(False) @@ -53,26 +53,26 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 # result_keys remembers the original order of keys result_keys = {{name}}Vector() - table = kh_init_{{tfunc_type}}() + table = kh_init_{{tfunc}}() {{if dtype == 'object'}} if uses_mask: raise NotImplementedError("uses_mask not implemented with object dtype") - kh_resize_{{tfunc_type}}(table, n // 10) + kh_resize_{{tfunc}}(table, n // 10) for i in range(n): val = values[i] if not dropna or not checknull(val): - k = kh_get_{{tfunc_type}}(table, {{to_c_type}}val) + k = kh_get_{{tfunc}}(table, {{to_c_type}}val) if k != table.n_buckets: table.vals[k] += 1 else: - k = kh_put_{{tfunc_type}}(table, {{to_c_type}}val, &ret) + k = kh_put_{{tfunc}}(table, {{to_c_type}}val, &ret) table.vals[k] = 1 result_keys.append(val) {{else}} - kh_resize_{{tfunc_type}}(table, n) + kh_resize_{{tfunc}}(table, n) for i in range(n): val = {{to_c_type}}(values[i]) @@ -88,11 +88,11 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 if uses_mask and isna_entry: na_counter += 1 else: - k = kh_get_{{tfunc_type}}(table, val) + k = kh_get_{{tfunc}}(table, val) if k != table.n_buckets: table.vals[k] += 1 else: - k = kh_put_{{tfunc_type}}(table, val, &ret) + k = kh_put_{{tfunc}}(table, val, &ret) table.vals[k] = 1 result_keys.append(val) {{endif}} @@ -105,9 +105,9 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 for i in range(table.size): {{if dtype == 'object'}} - k = kh_get_{{tfunc_type}}(table, result_keys.data[i]) + k = kh_get_{{tfunc}}(table, result_keys.data[i]) {{else}} - k = kh_get_{{tfunc_type}}(table, result_keys.data.data[i]) + k = kh_get_{{tfunc}}(table, result_keys.data.data[i]) {{endif}} result_counts[i] = table.vals[k] @@ -115,7 +115,7 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 result_counts[table.size] = na_counter result_keys.append(val) - kh_destroy_{{tfunc_type}}(table) + kh_destroy_{{tfunc}}(table) return result_keys.to_array(), result_counts.base, na_counter @@ -136,12 +136,12 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons {{endif}} Py_ssize_t i, n = len(values), first_na = -1 khiter_t k - kh_{{ttype}}_t *table = kh_init_{{tfunc_type}}() + kh_{{ttype}}_t *table = kh_init_{{tfunc}}() ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') bint seen_na = False, uses_mask = mask is not None bint seen_multiple_na = False - kh_resize_{{tfunc_type}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) + kh_resize_{{tfunc}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) if keep not in ('last', 'first', False): raise ValueError('keep must be either "first", "last" or False') @@ -166,7 +166,7 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons seen_na = True else: value = {{to_c_type}}(values[i]) - kh_put_{{tfunc_type}}(table, value, &ret) + kh_put_{{tfunc}}(table, value, &ret) out[i] = ret == 0 {{endfor}} @@ -191,16 +191,16 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons else: value = {{to_c_type}}(values[i]) - k = kh_get_{{tfunc_type}}(table, value) + k = kh_get_{{tfunc}}(table, value) if k != table.n_buckets: out[table.vals[k]] = 1 out[i] = 1 else: - k = kh_put_{{tfunc_type}}(table, value, &ret) + k = kh_put_{{tfunc}}(table, value, &ret) table.vals[k] = i out[i] = 0 - kh_destroy_{{tfunc_type}}(table) + kh_destroy_{{tfunc}}(table) return out @@ -241,11 +241,11 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): {{c_type}} val {{endif}} - kh_{{ttype}}_t *table = kh_init_{{tfunc_type}}() + kh_{{ttype}}_t *table = kh_init_{{tfunc}}() # construct the table n = len(values) - kh_resize_{{tfunc_type}}(table, n) + kh_resize_{{tfunc}}(table, n) {{if dtype == 'object'}} if True: @@ -254,7 +254,7 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): {{endif}} for i in range(n): val = {{to_c_type}}(values[i]) - kh_put_{{tfunc_type}}(table, val, &ret) + kh_put_{{tfunc}}(table, val, &ret) # test membership n = len(arr) @@ -267,10 +267,10 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): {{endif}} for i in range(n): val = {{to_c_type}}(arr[i]) - k = kh_get_{{tfunc_type}}(table, val) + k = kh_get_{{tfunc}}(table, val) result[i] = (k != table.n_buckets) - kh_destroy_{{tfunc_type}}(table) + kh_destroy_{{tfunc}}(table) return result.view(np.bool_) # ---------------------------------------------------------------------- diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h index 45e44dc66d135..a18d41f6a0249 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h @@ -3,7 +3,6 @@ #pragma once #include - #include #include @@ -218,9 +217,6 @@ static inline int pyobject_cmp(PyObject *a, PyObject *b) { int result = PyObject_RichCompareBool(a, b, Py_EQ); if (result < 0) { - if (PyErr_Occurred() != NULL) { - return 0; - } return 0; } return result; @@ -323,9 +319,6 @@ static inline khuint32_t kh_python_hash_func(PyObject *key) { } if (hash == -1) { - if (PyErr_Occurred() != NULL) { - return 0; - } return 0; } #if SIZEOF_PY_HASH_T == 4 From ac99ea6f6a16b30c0cd3340d022e8d5ac7463f1c Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Sat, 1 Nov 2025 18:35:05 -0400 Subject: [PATCH 11/16] hash dict/list as 0, do not compare NA --- .../include/pandas/vendored/klib/khash_python.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h index a18d41f6a0249..53a3a9da7ce08 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h @@ -191,6 +191,12 @@ static inline int tupleobject_cmp(PyTupleObject *a, PyTupleObject *b) { return 1; } +static inline int _is_pandas_NA_type(PyObject *o) { + // TODO compare PyTypeObject* C_NA, not strings! + PyObject* type_name = PyType_GetName(Py_TYPE(o)); + return PyUnicode_CompareWithASCIIString(type_name, "NAType") == 0; +} + static inline int pyobject_cmp(PyObject *a, PyObject *b) { if (PyErr_Occurred() != NULL) { return 0; @@ -213,6 +219,8 @@ static inline int pyobject_cmp(PyObject *a, PyObject *b) { return tupleobject_cmp((PyTupleObject *)a, (PyTupleObject *)b); } // frozenset isn't yet supported + } else if (_is_pandas_NA_type(a) || _is_pandas_NA_type(b)) { + return 0; } int result = PyObject_RichCompareBool(a, b, Py_EQ); @@ -314,6 +322,13 @@ static inline khuint32_t kh_python_hash_func(PyObject *key) { } else if (PyTuple_Check(key)) { // hash tuple subclasses as builtin tuples hash = tupleobject_hash((PyTupleObject *)key); + } else if (PyDict_Check(key) || PyList_Check(key)) { + // before GH 57052 was fixed, all exceptions raised from PyObject_Hash were suppressed. + // some features rely on this behaviour, e.g. _libs.hashtable.value_count_object via DataFrame.describe, + // which counts generic objects using PyObjectHashTable. + // using hash = 0 for dict and list objects puts all of them in the same bucket, + // which is not optimal for performance but that is what the behaviour was before. + hash = 0; } else { hash = PyObject_Hash(key); } From eb120e9063fbf77f6b31d32ba144628a659486ea Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Sat, 1 Nov 2025 18:54:30 -0400 Subject: [PATCH 12/16] fix precommit --- pandas/_libs/include/pandas/vendored/klib/khash_python.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h index 53a3a9da7ce08..bc1cc30ed7905 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h @@ -191,7 +191,7 @@ static inline int tupleobject_cmp(PyTupleObject *a, PyTupleObject *b) { return 1; } -static inline int _is_pandas_NA_type(PyObject *o) { +static inline int _is_pandas_NA_type(PyObject* o) { // TODO compare PyTypeObject* C_NA, not strings! PyObject* type_name = PyType_GetName(Py_TYPE(o)); return PyUnicode_CompareWithASCIIString(type_name, "NAType") == 0; From 2b0fa82703102dcb621696731997e750eb599888 Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Sat, 1 Nov 2025 21:07:51 -0400 Subject: [PATCH 13/16] stop using hash=0 for list, cleanup comment --- .../_libs/include/pandas/vendored/klib/khash_python.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h index bc1cc30ed7905..8c46b2e5bafcf 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h @@ -322,12 +322,11 @@ static inline khuint32_t kh_python_hash_func(PyObject *key) { } else if (PyTuple_Check(key)) { // hash tuple subclasses as builtin tuples hash = tupleobject_hash((PyTupleObject *)key); - } else if (PyDict_Check(key) || PyList_Check(key)) { - // before GH 57052 was fixed, all exceptions raised from PyObject_Hash were suppressed. - // some features rely on this behaviour, e.g. _libs.hashtable.value_count_object via DataFrame.describe, - // which counts generic objects using PyObjectHashTable. - // using hash = 0 for dict and list objects puts all of them in the same bucket, - // which is not optimal for performance but that is what the behaviour was before. + } else if (PyDict_Check(key)) { + // Before GH 57052 was fixed, all exceptions raised from PyObject_Hash were suppressed. + // some features rely on this behaviour, e.g. _libs.hashtable.value_count_object via + // DataFrame.describe, which counts generic objects using PyObjectHashTable. + // Using hash = 0 puts all objects in the same bucket, which is bad for performance but that is how it worked before. hash = 0; } else { hash = PyObject_Hash(key); From c94b480e8754599865e7975e48ed523bd27a4296 Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Sat, 1 Nov 2025 21:08:43 -0400 Subject: [PATCH 14/16] fix unhashable UserDict in JSONArray.duplicated --- pandas/tests/extension/json/array.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 828e4415bd295..21736b24eb35e 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -25,6 +25,7 @@ from typing import ( TYPE_CHECKING, Any, + Literal, ) import numpy as np @@ -41,12 +42,16 @@ ExtensionArray, ExtensionDtype, ) +from pandas.core.algorithms import duplicated from pandas.core.indexers import unpack_tuple_and_ellipses if TYPE_CHECKING: from collections.abc import Mapping - from pandas._typing import type_t + from pandas._typing import ( + npt, + type_t, + ) class JSONDtype(ExtensionDtype): @@ -254,6 +259,17 @@ def _pad_or_backfill(self, *, method, limit=None, copy=True): # GH#56616 - test EA method without limit_area argument return super()._pad_or_backfill(method=method, limit=limit, copy=copy) + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + # pd.core.algorithms.duplicated is implemented with a hash table that + # does not support UserDict values. + # However, dict values are always hashed as 0 for backwards compatibility, + # see GH 57052 + mask = self.isna().astype(np.bool_, copy=False) + values = np.array([dict(x) for x in self], dtype="object") + return duplicated(values=values, keep=keep, mask=mask) + def make_data(n: int): # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer From 2462e30f2715650d165d1de7f232894d4fef1417 Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Sat, 1 Nov 2025 21:28:50 -0400 Subject: [PATCH 15/16] hash list as 0 again, Series.isin hashes lists --- .../_libs/include/pandas/vendored/klib/khash_python.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h index 8c46b2e5bafcf..505126f61e421 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h @@ -322,11 +322,13 @@ static inline khuint32_t kh_python_hash_func(PyObject *key) { } else if (PyTuple_Check(key)) { // hash tuple subclasses as builtin tuples hash = tupleobject_hash((PyTupleObject *)key); - } else if (PyDict_Check(key)) { + } else if (PyDict_Check(key) || PyList_Check(key)) { // Before GH 57052 was fixed, all exceptions raised from PyObject_Hash were suppressed. - // some features rely on this behaviour, e.g. _libs.hashtable.value_count_object via - // DataFrame.describe, which counts generic objects using PyObjectHashTable. - // Using hash = 0 puts all objects in the same bucket, which is bad for performance but that is how it worked before. + // Existing code that relies on this behaviour is for example: + // * _libs.hashtable.value_count_object via DataFrame.describe + // * _libs.hashtable.ismember_object via Series.isin + // Using hash = 0 puts all dict and list objects in the same bucket, + // which is bad for performance but that is how it worked before. hash = 0; } else { hash = PyObject_Hash(key); From 0fc752e86946e96287a0d02ab382d45ec1d03d8d Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Mon, 3 Nov 2025 22:32:00 -0500 Subject: [PATCH 16/16] fix precommit --- pandas/_libs/include/pandas/vendored/klib/khash_python.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h index 505126f61e421..e8abc4415dbe6 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h @@ -191,9 +191,9 @@ static inline int tupleobject_cmp(PyTupleObject *a, PyTupleObject *b) { return 1; } -static inline int _is_pandas_NA_type(PyObject* o) { +static inline int _is_pandas_NA_type(PyObject *o) { // TODO compare PyTypeObject* C_NA, not strings! - PyObject* type_name = PyType_GetName(Py_TYPE(o)); + PyObject *type_name = PyType_GetName(Py_TYPE(o)); return PyUnicode_CompareWithASCIIString(type_name, "NAType") == 0; } @@ -323,8 +323,8 @@ static inline khuint32_t kh_python_hash_func(PyObject *key) { // hash tuple subclasses as builtin tuples hash = tupleobject_hash((PyTupleObject *)key); } else if (PyDict_Check(key) || PyList_Check(key)) { - // Before GH 57052 was fixed, all exceptions raised from PyObject_Hash were suppressed. - // Existing code that relies on this behaviour is for example: + // Before GH 57052 was fixed, all exceptions raised from PyObject_Hash were + // suppressed. Existing code that relies on this behaviour is for example: // * _libs.hashtable.value_count_object via DataFrame.describe // * _libs.hashtable.ismember_object via Series.isin // Using hash = 0 puts all dict and list objects in the same bucket,