Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,7 @@ ExtensionArray
Other
^^^^^
- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`)
- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`)
-

.. ---------------------------------------------------------------------------
Expand Down
94 changes: 94 additions & 0 deletions pandas/core/array_algos/replace.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""
Methods used by Block.replace and related methods.
"""
import operator
import re
from typing import Optional, Pattern, Union

import numpy as np

from pandas._typing import ArrayLike, Scalar

from pandas.core.dtypes.common import (
is_datetimelike_v_numeric,
is_numeric_v_string_like,
is_scalar,
)


def compare_or_regex_search(
a: ArrayLike,
b: Union[Scalar, Pattern],
regex: bool = False,
mask: Optional[ArrayLike] = None,
) -> Union[ArrayLike, bool]:
"""
Compare two array_like inputs of the same shape or two scalar values
Calls operator.eq or re.search, depending on regex argument. If regex is
True, perform an element-wise regex matching.
Parameters
----------
a : array_like
b : scalar or regex pattern
regex : bool, default False
mask : array_like or None (default)
Returns
-------
mask : array_like of bool
"""

def _check_comparison_types(
result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern]
):
"""
Raises an error if the two arrays (a,b) cannot be compared.
Otherwise, returns the comparison result as expected.
"""
if is_scalar(result) and isinstance(a, np.ndarray):
type_names = [type(a).__name__, type(b).__name__]

if isinstance(a, np.ndarray):
type_names[0] = f"ndarray(dtype={a.dtype})"

raise TypeError(
f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}"
)

if not regex:
op = lambda x: operator.eq(x, b)
else:
op = np.vectorize(
lambda x: bool(re.search(b, x))
if isinstance(x, str) and isinstance(b, (str, Pattern))
else False
)

# GH#32621 use mask to avoid comparing to NAs
if mask is None and isinstance(a, np.ndarray) and not isinstance(b, np.ndarray):
mask = np.reshape(~(isna(a)), a.shape)
if isinstance(a, np.ndarray):
a = a[mask]

if is_numeric_v_string_like(a, b):
# GH#29553 avoid deprecation warnings from numpy
return np.zeros(a.shape, dtype=bool)

elif is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b):
# GH#29553 avoid deprecation warnings from numpy
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Related to this PR?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, though there is also a mistake here (the second condition has been refactored to a few lines up, so this line should just be elif is_datetimelike_v_numeric(a, b):

In master this is where we incorrectly raise instead of just consider string==numeric not-equal

_check_comparison_types(False, a, b)
return False

result = op(a)

if isinstance(result, np.ndarray) and mask is not None:
# The shape of the mask can differ to that of the result
# since we may compare only a subset of a's or b's elements
tmp = np.zeros(mask.shape, dtype=np.bool_)
tmp[mask] = result
result = tmp

_check_comparison_types(result, a, b)
return result
28 changes: 24 additions & 4 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from pandas._libs.internals import BlockPlacement
from pandas._libs.tslibs import conversion
from pandas._libs.tslibs.timezones import tz_compare
from pandas._typing import ArrayLike
from pandas._typing import ArrayLike, Scalar
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.cast import (
Expand Down Expand Up @@ -59,8 +59,10 @@
from pandas.core.dtypes.missing import _isna_compat, is_valid_nat_for_dtype, isna

import pandas.core.algorithms as algos
from pandas.core.array_algos.replace import compare_or_regex_search
from pandas.core.array_algos.transforms import shift
from pandas.core.arrays import (
BooleanArray,
Categorical,
DatetimeArray,
ExtensionArray,
Expand Down Expand Up @@ -792,7 +794,6 @@ def _replace_list(
self,
src_list: List[Any],
dest_list: List[Any],
masks: List[np.ndarray],
inplace: bool = False,
regex: bool = False,
) -> List["Block"]:
Expand All @@ -801,11 +802,28 @@ def _replace_list(
"""
src_len = len(src_list) - 1

def comp(s: Scalar, mask: np.ndarray, regex: bool = False) -> np.ndarray:
"""
Generate a bool array by perform an equality check, or perform
an element-wise regular expression matching
"""
if isna(s):
return ~mask

s = com.maybe_box_datetimelike(s)
return compare_or_regex_search(self.values, s, regex, mask)

# Calculate the mask once, prior to the call of comp
# in order to avoid repeating the same computations
mask = ~isna(self.values)

masks = [comp(s, mask, regex) for s in src_list]

rb = [self if inplace else self.copy()]
for i, (src, dest) in enumerate(zip(src_list, dest_list)):
new_rb: List["Block"] = []
for blk in rb:
m = masks[i][blk.mgr_locs.indexer]
m = masks[i]
convert = i == src_len # only convert once at the end
result = blk._replace_coerce(
mask=m,
Expand Down Expand Up @@ -2906,7 +2924,9 @@ def _extract_bool_array(mask: ArrayLike) -> np.ndarray:
"""
If we have a SparseArray or BooleanArray, convert it to ndarray[bool].
"""
if isinstance(mask, ExtensionArray):
if isinstance(mask, BooleanArray):
mask = mask.to_numpy(dtype=bool, na_value=False)
elif isinstance(mask, ExtensionArray):
# We could have BooleanArray, Sparse[bool], ...
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think need to update this comment now though - so is there no way to keep this in the same branch as the ExtensionArray check? Would be nice to stay as generic as possible

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll see if we can use to_numpy in the general case

mask = np.asarray(mask, dtype=np.bool_)

Expand Down
104 changes: 1 addition & 103 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
from collections import defaultdict
import itertools
import operator
import re
from typing import (
Any,
DefaultDict,
Dict,
List,
Optional,
Pattern,
Sequence,
Tuple,
TypeVar,
Expand All @@ -19,7 +16,7 @@
import numpy as np

from pandas._libs import internals as libinternals, lib
from pandas._typing import ArrayLike, DtypeObj, Label, Scalar
from pandas._typing import ArrayLike, DtypeObj, Label
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.cast import (
Expand All @@ -29,12 +26,9 @@
)
from pandas.core.dtypes.common import (
DT64NS_DTYPE,
is_datetimelike_v_numeric,
is_dtype_equal,
is_extension_array_dtype,
is_list_like,
is_numeric_v_string_like,
is_scalar,
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.dtypes import ExtensionDtype
Expand All @@ -44,7 +38,6 @@
import pandas.core.algorithms as algos
from pandas.core.arrays.sparse import SparseDtype
from pandas.core.base import PandasObject
import pandas.core.common as com
from pandas.core.construction import extract_array
from pandas.core.indexers import maybe_convert_indices
from pandas.core.indexes.api import Index, ensure_index
Expand Down Expand Up @@ -628,31 +621,10 @@ def replace_list(
""" do a list replace """
inplace = validate_bool_kwarg(inplace, "inplace")

# figure out our mask apriori to avoid repeated replacements
values = self.as_array()

def comp(s: Scalar, mask: np.ndarray, regex: bool = False):
"""
Generate a bool array by perform an equality check, or perform
an element-wise regular expression matching
"""
if isna(s):
return ~mask

s = com.maybe_box_datetimelike(s)
return _compare_or_regex_search(values, s, regex, mask)

# Calculate the mask once, prior to the call of comp
# in order to avoid repeating the same computations
mask = ~isna(values)

masks = [comp(s, mask, regex) for s in src_list]

bm = self.apply(
"_replace_list",
src_list=src_list,
dest_list=dest_list,
masks=masks,
inplace=inplace,
regex=regex,
)
Expand Down Expand Up @@ -1900,80 +1872,6 @@ def _merge_blocks(
return blocks


def _compare_or_regex_search(
a: ArrayLike,
b: Union[Scalar, Pattern],
regex: bool = False,
mask: Optional[ArrayLike] = None,
) -> Union[ArrayLike, bool]:
"""
Compare two array_like inputs of the same shape or two scalar values

Calls operator.eq or re.search, depending on regex argument. If regex is
True, perform an element-wise regex matching.

Parameters
----------
a : array_like
b : scalar or regex pattern
regex : bool, default False
mask : array_like or None (default)

Returns
-------
mask : array_like of bool
"""

def _check_comparison_types(
result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern]
):
"""
Raises an error if the two arrays (a,b) cannot be compared.
Otherwise, returns the comparison result as expected.
"""
if is_scalar(result) and isinstance(a, np.ndarray):
type_names = [type(a).__name__, type(b).__name__]

if isinstance(a, np.ndarray):
type_names[0] = f"ndarray(dtype={a.dtype})"

raise TypeError(
f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}"
)

if not regex:
op = lambda x: operator.eq(x, b)
else:
op = np.vectorize(
lambda x: bool(re.search(b, x))
if isinstance(x, str) and isinstance(b, (str, Pattern))
else False
)

# GH#32621 use mask to avoid comparing to NAs
if mask is None and isinstance(a, np.ndarray) and not isinstance(b, np.ndarray):
mask = np.reshape(~(isna(a)), a.shape)
if isinstance(a, np.ndarray):
a = a[mask]

if is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b):
# GH#29553 avoid deprecation warnings from numpy
_check_comparison_types(False, a, b)
return False

result = op(a)

if isinstance(result, np.ndarray) and mask is not None:
# The shape of the mask can differ to that of the result
# since we may compare only a subset of a's or b's elements
tmp = np.zeros(mask.shape, dtype=np.bool_)
tmp[mask] = result
result = tmp

_check_comparison_types(result, a, b)
return result


def _fast_count_smallints(arr: np.ndarray) -> np.ndarray:
"""Faster version of set(arr) for sequences of small numbers."""
counts = np.bincount(arr.astype(np.int_))
Expand Down
15 changes: 13 additions & 2 deletions pandas/tests/frame/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -1131,8 +1131,19 @@ def test_replace_bool_with_bool(self):

def test_replace_with_dict_with_bool_keys(self):
df = DataFrame({0: [True, False], 1: [False, True]})
with pytest.raises(TypeError, match="Cannot compare types .+"):
df.replace({"asdf": "asdb", True: "yes"})
result = df.replace({"asdf": "asdb", True: "yes"})
expected = DataFrame({0: ["yes", False], 1: [False, "yes"]})
tm.assert_frame_equal(result, expected)

def test_replace_dict_strings_vs_ints(self):
# GH#34789
df = pd.DataFrame({"Y0": [1, 2], "Y1": [3, 4]})
result = df.replace({"replace_string": "test"})

tm.assert_frame_equal(result, df)

result = df["Y0"].replace({"replace_string": "test"})
tm.assert_series_equal(result, df["Y0"])

def test_replace_truthy(self):
df = DataFrame({"a": [True, True]})
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/series/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,8 +218,9 @@ def test_replace_bool_with_bool(self):

def test_replace_with_dict_with_bool_keys(self):
s = pd.Series([True, False, True])
with pytest.raises(TypeError, match="Cannot compare types .+"):
s.replace({"asdf": "asdb", True: "yes"})
result = s.replace({"asdf": "asdb", True: "yes"})
expected = pd.Series(["yes", False, "yes"])
tm.assert_series_equal(result, expected)

def test_replace2(self):
N = 100
Expand Down