-
-
Notifications
You must be signed in to change notification settings - Fork 19.2k
BUG: for ordered categorical data implements correct computation of kendall/spearman correlations #62880
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
BUG: for ordered categorical data implements correct computation of kendall/spearman correlations #62880
Changes from all commits
1f8c628
906f1e4
497dc7e
583aca6
e069810
b90726f
65a506c
ab3b8b9
e93ed83
ec4d97e
ebfc3b0
8cfacef
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,5 @@ | ||
| from itertools import combinations | ||
|
|
||
| import numpy as np | ||
| import pytest | ||
|
|
||
|
|
@@ -252,6 +254,46 @@ def test_corr_numeric_only(self, meth, numeric_only): | |
| with pytest.raises(ValueError, match="could not convert string to float"): | ||
| df.corr(meth, numeric_only=numeric_only) | ||
|
|
||
| @pytest.mark.parametrize("method", ["kendall", "spearman"]) | ||
| def test_corr_rank_ordered_categorical( | ||
| self, | ||
| method, | ||
| ): | ||
| pytest.importorskip("scipy") | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unless you are going to use the import, you can just add this as a |
||
| df = DataFrame( | ||
| { | ||
| "ord_cat": Series( | ||
| pd.Categorical( | ||
| ["low", "m", "h", "vh"], | ||
| categories=["low", "m", "h", "vh"], | ||
| ordered=True, | ||
| ) | ||
| ), | ||
| "ord_cat_none": Series( | ||
| pd.Categorical( | ||
| ["low", "m", "h", None], | ||
| categories=["low", "m", "h"], | ||
| ordered=True, | ||
| ) | ||
| ), | ||
| "ord_int": Series([0, 1, 2, 3]), | ||
| "ord_float": Series([2.0, 3.0, 4.5, 6.5]), | ||
| "ord_float_nan": Series([2.0, 3.0, 4.5, np.nan]), | ||
| "ord_cat_shuff": Series( | ||
| pd.Categorical( | ||
| ["m", "h", "vh", "low"], | ||
| categories=["low", "m", "h", "vh"], | ||
| ordered=True, | ||
| ) | ||
| ), | ||
| "ord_int_shuff": Series([2, 3, 0, 1]), | ||
| } | ||
| ) | ||
| corr_calc = df.corr(method=method) | ||
| for col1, col2 in combinations(df.columns, r=2): | ||
| corr_expected = df[col1].corr(df[col2], method=method) | ||
| tm.assert_almost_equal(corr_calc[col1][col2], corr_expected) | ||
|
|
||
|
|
||
| class TestDataFrameCorrWith: | ||
| @pytest.mark.parametrize( | ||
|
|
@@ -493,3 +535,50 @@ def test_cov_with_missing_values(self): | |
| result2 = df.dropna().cov() | ||
| tm.assert_frame_equal(result1, expected) | ||
| tm.assert_frame_equal(result2, expected) | ||
|
|
||
| @pytest.mark.parametrize("method", ["kendall", "spearman"]) | ||
| def test_corr_rank_ordered_categorical( | ||
| self, | ||
| method, | ||
| ): | ||
| pytest.importorskip("scipy") | ||
| df1 = DataFrame( | ||
| { | ||
| "a": Series( | ||
| pd.Categorical( | ||
| ["low", "m", "h", "vh"], | ||
| categories=["low", "m", "h", "vh"], | ||
| ordered=True, | ||
| ) | ||
| ), | ||
| "b": Series( | ||
| pd.Categorical( | ||
| ["low", "m", "h", None], | ||
| categories=["low", "m", "h"], | ||
| ordered=True, | ||
| ) | ||
| ), | ||
| "c": Series([0, 1, 2, 3]), | ||
| "d": Series([2.0, 3.0, 4.5, 6.5]), | ||
| } | ||
| ) | ||
|
|
||
| df2 = DataFrame( | ||
| { | ||
| "a": Series([2.0, 3.0, 4.5, np.nan]), | ||
| "b": Series( | ||
| pd.Categorical( | ||
| ["m", "h", "vh", "low"], | ||
| categories=["low", "m", "h", "vh"], | ||
| ordered=True, | ||
| ) | ||
| ), | ||
| "c": Series([2, 3, 0, 1]), | ||
| "d": Series([2.0, 3.0, 4.5, 6.5]), | ||
| } | ||
| ) | ||
|
|
||
| corr_calc = df1.corrwith(df2, method=method) | ||
| for col in df1.columns: | ||
| corr_expected = df1[col].corr(df2[col], method=method) | ||
| tm.assert_almost_equal(corr_calc.get(col), corr_expected) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -184,3 +184,77 @@ def test_corr_callable_method(self, datetime_series): | |
| df = pd.DataFrame([s1, s2]) | ||
| expected = pd.DataFrame([{0: 1.0, 1: 0}, {0: 0, 1: 1.0}]) | ||
| tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected) | ||
|
|
||
| @pytest.mark.parametrize("method", ["kendall", "spearman"]) | ||
| def test_corr_rank_ordered_categorical( | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test is pretty long, to the point where its unclear what its intent is. Maybe its worth breaking up into a few tests? Or adding parameterization? |
||
| self, | ||
| method, | ||
| ): | ||
| stats = pytest.importorskip("scipy.stats") | ||
| method_scipy_func = {"kendall": stats.kendalltau, "spearman": stats.spearmanr} | ||
| ser_ord_cat = Series( | ||
| pd.Categorical( | ||
| ["low", "med", "high", "very_high"], | ||
| categories=["low", "med", "high", "very_high"], | ||
| ordered=True, | ||
| ) | ||
| ) | ||
| ser_ord_cat_codes = ser_ord_cat.cat.codes.replace(-1, np.nan) | ||
| ser_ord_int = Series([0, 1, 2, 3]) | ||
| ser_ord_float = Series([2.0, 3.0, 4.5, 6.5]) | ||
|
|
||
| corr_calc = ser_ord_cat.corr(ser_ord_int, method=method) | ||
| corr_expected = method_scipy_func[method]( | ||
| ser_ord_cat_codes, ser_ord_int, nan_policy="omit" | ||
| )[0] | ||
| tm.assert_almost_equal(corr_calc, corr_expected) | ||
|
|
||
| corr_calc = ser_ord_cat.corr(ser_ord_float, method=method) | ||
| corr_expected = method_scipy_func[method]( | ||
| ser_ord_cat_codes, ser_ord_float, nan_policy="omit" | ||
| )[0] | ||
| tm.assert_almost_equal(corr_calc, corr_expected) | ||
|
|
||
| corr_calc = ser_ord_cat.corr(ser_ord_cat, method=method) | ||
| corr_expected = method_scipy_func[method]( | ||
| ser_ord_cat_codes, ser_ord_cat_codes, nan_policy="omit" | ||
| )[0] | ||
| tm.assert_almost_equal(corr_calc, corr_expected) | ||
|
|
||
| ser_ord_cat_shuff = Series( | ||
| pd.Categorical( | ||
| ["high", "low", "very_high", "med"], | ||
| categories=["low", "med", "high", "very_high"], | ||
| ordered=True, | ||
| ) | ||
| ) | ||
| ser_ord_cat_shuff_codes = ser_ord_cat_shuff.cat.codes.replace(-1, np.nan) | ||
|
|
||
| corr_calc = ser_ord_cat_shuff.corr(ser_ord_cat, method=method) | ||
| corr_expected = method_scipy_func[method]( | ||
| ser_ord_cat_shuff_codes, ser_ord_cat_codes, nan_policy="omit" | ||
| )[0] | ||
| tm.assert_almost_equal(corr_calc, corr_expected) | ||
|
|
||
| corr_calc = ser_ord_cat_shuff.corr(ser_ord_cat_shuff, method=method) | ||
| corr_expected = method_scipy_func[method]( | ||
| ser_ord_cat_shuff_codes, ser_ord_cat_shuff_codes, nan_policy="omit" | ||
| )[0] | ||
| tm.assert_almost_equal(corr_calc, corr_expected) | ||
|
|
||
| ser_ord_cat_with_nan = Series( | ||
| pd.Categorical( | ||
| ["h", "low", "vh", None, "m"], | ||
| categories=["low", "m", "h", "vh"], | ||
| ordered=True, | ||
| ) | ||
| ) | ||
| ser_ord_cat_shuff_with_nan_codes = ser_ord_cat_with_nan.cat.codes.replace( | ||
| -1, np.nan | ||
| ) | ||
| ser_ord_int = Series([2, 0, 1, 3, None]) | ||
| corr_calc = ser_ord_cat_with_nan.corr(ser_ord_int, method=method) | ||
| corr_expected = method_scipy_func[method]( | ||
| ser_ord_cat_shuff_with_nan_codes, ser_ord_int, nan_policy="omit" | ||
| )[0] | ||
| tm.assert_almost_equal(corr_calc, corr_expected) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm a bit wary of taking an entire copy of the dataframe in instances where there might be ordered categoricals; that's a potentially large performance hit, and the usage of this seems pretty niche
I see @rhshadrach commented on the original issue, so lets see what his thoughts are