Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1212,6 +1212,7 @@ Other
^^^^^
- Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`)
- Bug in :class:`Series` ignoring errors when trying to convert :class:`Series` input data to the given ``dtype`` (:issue:`60728`)
- Bug in :class:`PyObjectHashTable` that would silently suppress exceptions raised from custom ``__hash__`` and ``__eq__`` methods during hashing (:issue:`57052`)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you able to add a test that uses a public API that would be fixed by your changes?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

- Bug in :func:`eval` where expressions including division ``/`` on :class:`ExtensionArray` failed with a ``TypeError``. (:issue:`58748`)
- Bug in :func:`eval` where method calls on binary operations like ``(x + y).dropna()`` would raise ``AttributeError: 'BinOp' object has no attribute 'value'`` (:issue:`61175`)
- Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`)
Expand Down
24 changes: 12 additions & 12 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ cimported_types = ['complex64',
'int16',
'int32',
'int64',
'pymap',
'pymap_checked',
'str',
'strbox',
'uint8',
Expand Down Expand Up @@ -1312,13 +1312,13 @@ cdef class StringHashTable(HashTable):
cdef class PyObjectHashTable(HashTable):

def __init__(self, int64_t size_hint=1):
self.table = kh_init_pymap()
self.table = kh_init_pymap_checked()
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
kh_resize_pymap(self.table, size_hint)
kh_resize_pymap_checked(self.table, size_hint)

def __dealloc__(self):
if self.table is not NULL:
kh_destroy_pymap(self.table)
kh_destroy_pymap_checked(self.table)
self.table = NULL

def __len__(self) -> int:
Expand All @@ -1329,7 +1329,7 @@ cdef class PyObjectHashTable(HashTable):
khiter_t k
hash(key)

k = kh_get_pymap(self.table, <PyObject*>key)
k = kh_get_pymap_checked(self.table, <PyObject*>key)
return k != self.table.n_buckets

def sizeof(self, deep: bool = False) -> int:
Expand All @@ -1356,7 +1356,7 @@ cdef class PyObjectHashTable(HashTable):
cdef:
khiter_t k

k = kh_get_pymap(self.table, <PyObject*>val)
k = kh_get_pymap_checked(self.table, <PyObject*>val)
if k != self.table.n_buckets:
return self.table.vals[k]
else:
Expand All @@ -1370,8 +1370,8 @@ cdef class PyObjectHashTable(HashTable):

hash(key)

k = kh_put_pymap(self.table, <PyObject*>key, &ret)
if kh_exist_pymap(self.table, k):
k = kh_put_pymap_checked(self.table, <PyObject*>key, &ret)
if kh_exist_pymap_checked(self.table, k):
self.table.vals[k] = val
else:
raise KeyError(key)
Expand All @@ -1388,7 +1388,7 @@ cdef class PyObjectHashTable(HashTable):
val = values[i]
hash(val)

k = kh_put_pymap(self.table, <PyObject*>val, &ret)
k = kh_put_pymap_checked(self.table, <PyObject*>val, &ret)
self.table.vals[k] = i

def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
Expand All @@ -1405,7 +1405,7 @@ cdef class PyObjectHashTable(HashTable):
val = values[i]
hash(val)

k = kh_get_pymap(self.table, <PyObject*>val)
k = kh_get_pymap_checked(self.table, <PyObject*>val)
if k != self.table.n_buckets:
locs[i] = self.table.vals[k]
else:
Expand Down Expand Up @@ -1483,10 +1483,10 @@ cdef class PyObjectHashTable(HashTable):
labels[i] = na_sentinel
continue

k = kh_get_pymap(self.table, <PyObject*>val)
k = kh_get_pymap_checked(self.table, <PyObject*>val)
if k == self.table.n_buckets:
# k hasn't been seen yet
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
k = kh_put_pymap_checked(self.table, <PyObject*>val, &ret)
uniques.append(val)
if return_inverse:
self.table.vals[k] = count
Expand Down
74 changes: 36 additions & 38 deletions pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -6,26 +6,24 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in

{{py:

# name, dtype, ttype, c_type, to_c_type
dtypes = [('Complex128', 'complex128', 'complex128',
'khcomplex128_t', 'to_khcomplex128_t'),
('Complex64', 'complex64', 'complex64',
'khcomplex64_t', 'to_khcomplex64_t'),
('Float64', 'float64', 'float64', 'float64_t', ''),
('Float32', 'float32', 'float32', 'float32_t', ''),
('UInt64', 'uint64', 'uint64', 'uint64_t', ''),
('UInt32', 'uint32', 'uint32', 'uint32_t', ''),
('UInt16', 'uint16', 'uint16', 'uint16_t', ''),
('UInt8', 'uint8', 'uint8', 'uint8_t', ''),
('Object', 'object', 'pymap', 'object', '<PyObject*>'),
('Int64', 'int64', 'int64', 'int64_t', ''),
('Int32', 'int32', 'int32', 'int32_t', ''),
('Int16', 'int16', 'int16', 'int16_t', ''),
('Int8', 'int8', 'int8', 'int8_t', '')]
# name, dtype, ttype, tfunc, c_type, to_c_type
dtypes = [('Complex128', 'complex128', 'complex128', 'complex128', 'khcomplex128_t', 'to_khcomplex128_t'),
('Complex64', 'complex64', 'complex64', 'complex64', 'khcomplex64_t', 'to_khcomplex64_t'),
('Float64', 'float64', 'float64', 'float64', 'float64_t', ''),
('Float32', 'float32', 'float32', 'float32', 'float32_t', ''),
('UInt64', 'uint64', 'uint64', 'uint64', 'uint64_t', ''),
('UInt32', 'uint32', 'uint32', 'uint32', 'uint32_t', ''),
('UInt16', 'uint16', 'uint16', 'uint16', 'uint16_t', ''),
('UInt8', 'uint8', 'uint8', 'uint8', 'uint8_t', ''),
('Object', 'object', 'pymap', 'pymap_checked', 'object', '<PyObject*>'),
('Int64', 'int64', 'int64', 'int64', 'int64_t', ''),
('Int32', 'int32', 'int32', 'int32', 'int32_t', ''),
('Int16', 'int16', 'int16', 'int16', 'int16_t', ''),
('Int8', 'int8', 'int8', 'int8', 'int8_t', '')]

}}

{{for name, dtype, ttype, c_type, to_c_type in dtypes}}
{{for name, dtype, ttype, tfunc, c_type, to_c_type in dtypes}}


@cython.wraparound(False)
Expand Down Expand Up @@ -55,26 +53,26 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
# result_keys remembers the original order of keys

result_keys = {{name}}Vector()
table = kh_init_{{ttype}}()
table = kh_init_{{tfunc}}()

{{if dtype == 'object'}}
if uses_mask:
raise NotImplementedError("uses_mask not implemented with object dtype")

kh_resize_{{ttype}}(table, n // 10)
kh_resize_{{tfunc}}(table, n // 10)

for i in range(n):
val = values[i]
if not dropna or not checknull(val):
k = kh_get_{{ttype}}(table, {{to_c_type}}val)
k = kh_get_{{tfunc}}(table, {{to_c_type}}val)
if k != table.n_buckets:
table.vals[k] += 1
else:
k = kh_put_{{ttype}}(table, {{to_c_type}}val, &ret)
k = kh_put_{{tfunc}}(table, {{to_c_type}}val, &ret)
table.vals[k] = 1
result_keys.append(val)
{{else}}
kh_resize_{{ttype}}(table, n)
kh_resize_{{tfunc}}(table, n)

for i in range(n):
val = {{to_c_type}}(values[i])
Expand All @@ -90,11 +88,11 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
if uses_mask and isna_entry:
na_counter += 1
else:
k = kh_get_{{ttype}}(table, val)
k = kh_get_{{tfunc}}(table, val)
if k != table.n_buckets:
table.vals[k] += 1
else:
k = kh_put_{{ttype}}(table, val, &ret)
k = kh_put_{{tfunc}}(table, val, &ret)
table.vals[k] = 1
result_keys.append(val)
{{endif}}
Expand All @@ -107,17 +105,17 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8

for i in range(table.size):
{{if dtype == 'object'}}
k = kh_get_{{ttype}}(table, result_keys.data[i])
k = kh_get_{{tfunc}}(table, result_keys.data[i])
{{else}}
k = kh_get_{{ttype}}(table, result_keys.data.data[i])
k = kh_get_{{tfunc}}(table, result_keys.data.data[i])
{{endif}}
result_counts[i] = table.vals[k]

if na_counter > 0:
result_counts[table.size] = na_counter
result_keys.append(val)

kh_destroy_{{ttype}}(table)
kh_destroy_{{tfunc}}(table)

return result_keys.to_array(), result_counts.base, na_counter

Expand All @@ -138,12 +136,12 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons
{{endif}}
Py_ssize_t i, n = len(values), first_na = -1
khiter_t k
kh_{{ttype}}_t *table = kh_init_{{ttype}}()
kh_{{ttype}}_t *table = kh_init_{{tfunc}}()
ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
bint seen_na = False, uses_mask = mask is not None
bint seen_multiple_na = False

kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
kh_resize_{{tfunc}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))

if keep not in ('last', 'first', False):
raise ValueError('keep must be either "first", "last" or False')
Expand All @@ -168,7 +166,7 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons
seen_na = True
else:
value = {{to_c_type}}(values[i])
kh_put_{{ttype}}(table, value, &ret)
kh_put_{{tfunc}}(table, value, &ret)
out[i] = ret == 0
{{endfor}}

Expand All @@ -193,16 +191,16 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons

else:
value = {{to_c_type}}(values[i])
k = kh_get_{{ttype}}(table, value)
k = kh_get_{{tfunc}}(table, value)
if k != table.n_buckets:
out[table.vals[k]] = 1
out[i] = 1
else:
k = kh_put_{{ttype}}(table, value, &ret)
k = kh_put_{{tfunc}}(table, value, &ret)
table.vals[k] = i
out[i] = 0

kh_destroy_{{ttype}}(table)
kh_destroy_{{tfunc}}(table)
return out


Expand Down Expand Up @@ -243,11 +241,11 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
{{c_type}} val
{{endif}}

kh_{{ttype}}_t *table = kh_init_{{ttype}}()
kh_{{ttype}}_t *table = kh_init_{{tfunc}}()

# construct the table
n = len(values)
kh_resize_{{ttype}}(table, n)
kh_resize_{{tfunc}}(table, n)

{{if dtype == 'object'}}
if True:
Expand All @@ -256,7 +254,7 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
{{endif}}
for i in range(n):
val = {{to_c_type}}(values[i])
kh_put_{{ttype}}(table, val, &ret)
kh_put_{{tfunc}}(table, val, &ret)

# test membership
n = len(arr)
Expand All @@ -269,10 +267,10 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
{{endif}}
for i in range(n):
val = {{to_c_type}}(arr[i])
k = kh_get_{{ttype}}(table, val)
k = kh_get_{{tfunc}}(table, val)
result[i] = (k != table.n_buckets)

kh_destroy_{{ttype}}(table)
kh_destroy_{{tfunc}}(table)
return result.view(np.bool_)

# ----------------------------------------------------------------------
Expand Down
25 changes: 22 additions & 3 deletions pandas/_libs/include/pandas/vendored/klib/khash_python.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
#pragma once

#include <Python.h>

#include <pymem.h>
#include <string.h>

Expand Down Expand Up @@ -192,7 +191,16 @@ static inline int tupleobject_cmp(PyTupleObject *a, PyTupleObject *b) {
return 1;
}

// Return 1 if *o* is an instance of pandas' NAType (the pd.NA singleton's
// type), 0 otherwise. Used by pyobject_cmp so NA never compares equal.
// TODO compare PyTypeObject* C_NA, not strings! The name check can match any
// unrelated type that happens to be called "NAType".
// NOTE(review): PyType_GetName requires CPython >= 3.11 -- confirm this
// matches the project's minimum supported Python version.
static inline int _is_pandas_NA_type(PyObject *o) {
  // PyType_GetName returns a NEW reference (or NULL on error); it must be
  // released, otherwise every hash-table probe on an NA leaks the name.
  PyObject *type_name = PyType_GetName(Py_TYPE(o));
  if (type_name == NULL) {
    PyErr_Clear();
    return 0;
  }
  const int is_na =
      PyUnicode_CompareWithASCIIString(type_name, "NAType") == 0;
  Py_DECREF(type_name);
  return is_na;
}

static inline int pyobject_cmp(PyObject *a, PyObject *b) {
if (PyErr_Occurred() != NULL) {
return 0;
}
if (a == b) {
return 1;
}
Expand All @@ -211,11 +219,12 @@ static inline int pyobject_cmp(PyObject *a, PyObject *b) {
return tupleobject_cmp((PyTupleObject *)a, (PyTupleObject *)b);
}
// frozenset isn't yet supported
} else if (_is_pandas_NA_type(a) || _is_pandas_NA_type(b)) {
return 0;
}

int result = PyObject_RichCompareBool(a, b, Py_EQ);
if (result < 0) {
PyErr_Clear();
return 0;
}
return result;
Expand Down Expand Up @@ -292,6 +301,9 @@ static inline Py_hash_t tupleobject_hash(PyTupleObject *key) {
}

static inline khuint32_t kh_python_hash_func(PyObject *key) {
if (PyErr_Occurred() != NULL) {
return 0;
}
Py_hash_t hash;
// For PyObject_Hash holds:
// hash(0.0) == 0 == hash(-0.0)
Expand All @@ -310,12 +322,19 @@ static inline khuint32_t kh_python_hash_func(PyObject *key) {
} else if (PyTuple_Check(key)) {
// hash tuple subclasses as builtin tuples
hash = tupleobject_hash((PyTupleObject *)key);
} else if (PyDict_Check(key) || PyList_Check(key)) {
// Before GH 57052 was fixed, all exceptions raised from PyObject_Hash were
// suppressed. Existing code that relies on this behaviour is for example:
// * _libs.hashtable.value_count_object via DataFrame.describe
// * _libs.hashtable.ismember_object via Series.isin
// Using hash = 0 puts all dict and list objects in the same bucket,
// which is bad for performance but that is how it worked before.
hash = 0;
} else {
hash = PyObject_Hash(key);
}

if (hash == -1) {
PyErr_Clear();
return 0;
}
#if SIZEOF_PY_HASH_T == 4
Expand Down
8 changes: 8 additions & 0 deletions pandas/_libs/khash.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -125,5 +125,13 @@ cdef extern from "pandas/vendored/klib/khash_python.h":

khuint_t kh_needed_n_buckets(khuint_t element_n) nogil

# "_checked" variants of the pymap hash-table API. They operate on the same
# kh_pymap_t as the plain kh_*_pymap functions, but their hash/equality hooks
# check PyErr_Occurred() so exceptions raised by custom __hash__/__eq__
# propagate instead of being silently suppressed (GH 57052).
cdef kh_pymap_t* kh_init_pymap_checked()
cdef void kh_destroy_pymap_checked(kh_pymap_t*)
cdef void kh_clear_pymap_checked(kh_pymap_t*)
cdef khuint_t kh_get_pymap_checked(kh_pymap_t*, PyObject*)
cdef void kh_resize_pymap_checked(kh_pymap_t*, khuint_t)
cdef khuint_t kh_put_pymap_checked(kh_pymap_t*, PyObject*, int*)
cdef void kh_del_pymap_checked(kh_pymap_t*, khuint_t)
cdef bint kh_exist_pymap_checked(kh_pymap_t*, khiter_t)

include "khash_for_primitive_helper.pxi"
Loading
Loading