Skip to content
Open
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ Other enhancements
- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support Python's new-style format strings (e.g., ``"{:.6f}"``) for the ``float_format`` parameter, in addition to old-style ``%`` format strings and callables. This allows for more flexible and modern formatting of floating point numbers when exporting to CSV. (:issue:`49580`)
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
- :meth:`Series.corr`, :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith` with ``method="kendall"`` and ``method="spearman"`` now work with ordered categorical data types (:issue:`60306`)
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
- :meth:`Series.map` now accepts an ``engine`` parameter to allow execution with a third-party execution engine (:issue:`61125`)
- :meth:`Series.rank` and :meth:`DataFrame.rank` with numpy-nullable dtypes preserve ``NA`` values and return ``UInt64`` dtype where appropriate instead of casting ``NA`` to ``NaN`` with ``float64`` dtype (:issue:`62043`)
Expand Down
25 changes: 25 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -11680,6 +11680,10 @@ def corr(
data = self._get_numeric_data() if numeric_only else self
cols = data.columns
idx = cols.copy()

if method in ("spearman", "kendall"):
data = data._transform_ord_cat_cols_to_coded_cols()

mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)

if method == "pearson":
Expand Down Expand Up @@ -11973,6 +11977,8 @@ def corrwith(
correl = num / dom

elif method in ["kendall", "spearman"] or callable(method):
left = left._transform_ord_cat_cols_to_coded_cols()
right = right._transform_ord_cat_cols_to_coded_cols()

def c(x):
return nanops.nancorr(x[0], x[1], method=method)
Expand Down Expand Up @@ -12004,6 +12010,25 @@ def c(x):

return correl

def _transform_ord_cat_cols_to_coded_cols(self) -> DataFrame:
"""
any ordered categorical columns are transformed to the respective
categorical codes while other columns remain untouched
"""
categ = self.select_dtypes("category")
if len(categ.columns) == 0:
return self

cols_convert = categ.loc[:, categ.agg(lambda x: x.cat.ordered)].columns

if len(cols_convert) > 0:
data = self.copy(deep=False)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a bit wary of taking an entire copy of the dataframe in instances where there might be ordered categoricals; that's a potentially large performance hit, and the usage of this seems pretty niche

I see @rhshadrach commented on the original issue, so let's see what his thoughts are

data[cols_convert] = data[cols_convert].transform(
lambda x: x.cat.codes.replace(-1, np.nan)
)
return data
return self

# ----------------------------------------------------------------------
# ndarray-like stats methods

Expand Down
6 changes: 6 additions & 0 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2686,6 +2686,12 @@ def corr(
if len(this) == 0:
return np.nan

if method in ("spearman", "kendall"):
if this.dtype == "category" and this.cat.ordered:
this = this.cat.codes.replace(-1, np.nan)
if other.dtype == "category" and other.cat.ordered:
other = other.cat.codes.replace(-1, np.nan)

this_values = this.to_numpy(dtype=float, na_value=np.nan, copy=False)
other_values = other.to_numpy(dtype=float, na_value=np.nan, copy=False)

Expand Down
89 changes: 89 additions & 0 deletions pandas/tests/frame/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from itertools import combinations

import numpy as np
import pytest

Expand Down Expand Up @@ -252,6 +254,46 @@ def test_corr_numeric_only(self, meth, numeric_only):
with pytest.raises(ValueError, match="could not convert string to float"):
df.corr(meth, numeric_only=numeric_only)

@pytest.mark.parametrize("method", ["kendall", "spearman"])
def test_corr_rank_ordered_categorical(
self,
method,
):
pytest.importorskip("scipy")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unless you are going to use the import, you can just add this as a @td.skip_if_no("scipy") decorator to the test

df = DataFrame(
{
"ord_cat": Series(
pd.Categorical(
["low", "m", "h", "vh"],
categories=["low", "m", "h", "vh"],
ordered=True,
)
),
"ord_cat_none": Series(
pd.Categorical(
["low", "m", "h", None],
categories=["low", "m", "h"],
ordered=True,
)
),
"ord_int": Series([0, 1, 2, 3]),
"ord_float": Series([2.0, 3.0, 4.5, 6.5]),
"ord_float_nan": Series([2.0, 3.0, 4.5, np.nan]),
"ord_cat_shuff": Series(
pd.Categorical(
["m", "h", "vh", "low"],
categories=["low", "m", "h", "vh"],
ordered=True,
)
),
"ord_int_shuff": Series([2, 3, 0, 1]),
}
)
corr_calc = df.corr(method=method)
for col1, col2 in combinations(df.columns, r=2):
corr_expected = df[col1].corr(df[col2], method=method)
tm.assert_almost_equal(corr_calc[col1][col2], corr_expected)


class TestDataFrameCorrWith:
@pytest.mark.parametrize(
Expand Down Expand Up @@ -493,3 +535,50 @@ def test_cov_with_missing_values(self):
result2 = df.dropna().cov()
tm.assert_frame_equal(result1, expected)
tm.assert_frame_equal(result2, expected)

@pytest.mark.parametrize("method", ["kendall", "spearman"])
def test_corr_rank_ordered_categorical(self, method):
    """
    Check ``DataFrame.corrwith`` against column-wise ``Series.corr`` for the
    rank-based methods.

    The two frames pair ordered categoricals (with and without a missing
    entry) and plain numeric columns; for every shared column label the
    ``corrwith`` result must match ``Series.corr`` on that column pair.
    """
    pytest.importorskip("scipy")

    four_levels = ["low", "m", "h", "vh"]
    three_levels = ["low", "m", "h"]

    left = DataFrame(
        {
            "a": Series(
                pd.Categorical(
                    ["low", "m", "h", "vh"],
                    categories=four_levels,
                    ordered=True,
                )
            ),
            # the trailing None has no category and is a missing value
            "b": Series(
                pd.Categorical(
                    ["low", "m", "h", None],
                    categories=three_levels,
                    ordered=True,
                )
            ),
            "c": Series([0, 1, 2, 3]),
            "d": Series([2.0, 3.0, 4.5, 6.5]),
        }
    )
    right = DataFrame(
        {
            "a": Series([2.0, 3.0, 4.5, np.nan]),
            "b": Series(
                pd.Categorical(
                    ["m", "h", "vh", "low"],
                    categories=four_levels,
                    ordered=True,
                )
            ),
            "c": Series([2, 3, 0, 1]),
            "d": Series([2.0, 3.0, 4.5, 6.5]),
        }
    )

    result = left.corrwith(right, method=method)
    for label in left.columns:
        # reference value: the same pair correlated through Series.corr
        expected = left[label].corr(right[label], method=method)
        tm.assert_almost_equal(result.get(label), expected)
74 changes: 74 additions & 0 deletions pandas/tests/series/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,3 +184,77 @@ def test_corr_callable_method(self, datetime_series):
df = pd.DataFrame([s1, s2])
expected = pd.DataFrame([{0: 1.0, 1: 0}, {0: 0, 1: 1.0}])
tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected)

@pytest.mark.parametrize("method", ["kendall", "spearman"])
def test_corr_rank_ordered_categorical(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test is pretty long, to the point where it's unclear what its intent is. Maybe it's worth breaking up into a few tests? Or adding parameterization?

self,
method,
):
stats = pytest.importorskip("scipy.stats")
method_scipy_func = {"kendall": stats.kendalltau, "spearman": stats.spearmanr}
ser_ord_cat = Series(
pd.Categorical(
["low", "med", "high", "very_high"],
categories=["low", "med", "high", "very_high"],
ordered=True,
)
)
ser_ord_cat_codes = ser_ord_cat.cat.codes.replace(-1, np.nan)
ser_ord_int = Series([0, 1, 2, 3])
ser_ord_float = Series([2.0, 3.0, 4.5, 6.5])

corr_calc = ser_ord_cat.corr(ser_ord_int, method=method)
corr_expected = method_scipy_func[method](
ser_ord_cat_codes, ser_ord_int, nan_policy="omit"
)[0]
tm.assert_almost_equal(corr_calc, corr_expected)

corr_calc = ser_ord_cat.corr(ser_ord_float, method=method)
corr_expected = method_scipy_func[method](
ser_ord_cat_codes, ser_ord_float, nan_policy="omit"
)[0]
tm.assert_almost_equal(corr_calc, corr_expected)

corr_calc = ser_ord_cat.corr(ser_ord_cat, method=method)
corr_expected = method_scipy_func[method](
ser_ord_cat_codes, ser_ord_cat_codes, nan_policy="omit"
)[0]
tm.assert_almost_equal(corr_calc, corr_expected)

ser_ord_cat_shuff = Series(
pd.Categorical(
["high", "low", "very_high", "med"],
categories=["low", "med", "high", "very_high"],
ordered=True,
)
)
ser_ord_cat_shuff_codes = ser_ord_cat_shuff.cat.codes.replace(-1, np.nan)

corr_calc = ser_ord_cat_shuff.corr(ser_ord_cat, method=method)
corr_expected = method_scipy_func[method](
ser_ord_cat_shuff_codes, ser_ord_cat_codes, nan_policy="omit"
)[0]
tm.assert_almost_equal(corr_calc, corr_expected)

corr_calc = ser_ord_cat_shuff.corr(ser_ord_cat_shuff, method=method)
corr_expected = method_scipy_func[method](
ser_ord_cat_shuff_codes, ser_ord_cat_shuff_codes, nan_policy="omit"
)[0]
tm.assert_almost_equal(corr_calc, corr_expected)

ser_ord_cat_with_nan = Series(
pd.Categorical(
["h", "low", "vh", None, "m"],
categories=["low", "m", "h", "vh"],
ordered=True,
)
)
ser_ord_cat_shuff_with_nan_codes = ser_ord_cat_with_nan.cat.codes.replace(
-1, np.nan
)
ser_ord_int = Series([2, 0, 1, 3, None])
corr_calc = ser_ord_cat_with_nan.corr(ser_ord_int, method=method)
corr_expected = method_scipy_func[method](
ser_ord_cat_shuff_with_nan_codes, ser_ord_int, nan_policy="omit"
)[0]
tm.assert_almost_equal(corr_calc, corr_expected)
Loading