From 23aa09575bd043dedc4d1f4037131346a316de26 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Mon, 22 Jan 2024 23:33:50 +0100 Subject: [PATCH 1/7] Raise error on unseen levels when materializing --- src/tabmat/formula.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index c1198c72..bbb70aee 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -724,6 +724,14 @@ def encode_contrasts( """ levels = levels if levels is not None else _state.get("categories") force_convert = _state.get("force_convert", False) + + if levels is not None: + unseen_categories = set(data._values.unique()) - set(levels) + if unseen_categories: + raise ValueError( + f"Column {data.name} contains unseen categories: {unseen_categories}." + ) + cat = pandas.Categorical(data._values, categories=levels) _state["categories"] = cat.categories _state["force_convert"] = missing_method == "convert" and cat.isna().any() From 06f0f4aa97a2d6796e95fdbab459600e6e643a79 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Mon, 22 Jan 2024 23:59:10 +0100 Subject: [PATCH 2/7] Fix test for unseen categories --- src/tabmat/formula.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index bbb70aee..06df9512 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -726,7 +726,7 @@ def encode_contrasts( force_convert = _state.get("force_convert", False) if levels is not None: - unseen_categories = set(data._values.unique()) - set(levels) + unseen_categories = set(data.unique()) - set(levels) if unseen_categories: raise ValueError( f"Column {data.name} contains unseen categories: {unseen_categories}." From 4d829a4d994a62ae28168b0bf57a47e9df911889 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 23 Jan 2024 00:18:53 +0100 Subject: [PATCH 3/7] Add test for raising on unseen categories --- src/tabmat/formula.py | 2 +- tests/test_formula.py | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index 06df9512..07d28deb 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -726,7 +726,7 @@ def encode_contrasts( force_convert = _state.get("force_convert", False) if levels is not None: - unseen_categories = set(data.unique()) - set(levels) + unseen_categories = set(data.dropna().unique()) - set(levels) if unseen_categories: raise ValueError( f"Column {data.name} contains unseen categories: {unseen_categories}." diff --git a/tests/test_formula.py b/tests/test_formula.py index 44213964..1319bdda 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -746,6 +746,28 @@ def test_cat_missing_interactions(): assert tm.from_formula(formula, df).column_names == expected_names +@pytest.mark.parametrize( + "cat_missing_method", ["zero", "convert"], ids=["zero", "convert"] +) +def test_unseen_category(cat_missing_method): + df = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b"]), + } + ) + df_unseen = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b", "c"]), + } + ) + result_seen = tm.from_formula( + "cat_1 - 1", df, cat_missing_method=cat_missing_method + ) + + with pytest.raises(ValueError, match="contains unseen categories"): + result_seen.model_spec.get_model_matrix(df_unseen) + + # Tests from formulaic's test suite # --------------------------------- From 3a30eb092870f6a094da541bafd0669dba036210 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 23 Jan 2024 04:18:48 +0100 Subject: [PATCH 4/7] Properly handle missings when checking for unseen --- src/tabmat/formula.py | 19 +++++++++++++------ tests/test_formula.py | 19 ++++++++++++++++++- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index 07d28deb..f6db3586 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -429,7 +429,7 @@ def from_categorical( reduced_rank: bool, missing_method: str = "fail", missing_name: str = "(MISSING)", - force_convert: bool = False, + add_category_for_nan: bool = False, ) -> "_InteractableCategoricalVector": """Create an interactable categorical vector from a pandas categorical.""" categories = list(cat.categories) @@ -446,7 +446,7 @@ def from_categorical( "if cat_missing_method='fail'." ) - if missing_method == "convert" and (-1 in codes or force_convert): + if missing_method == "convert" and (-1 in codes or add_category_for_nan): codes[codes == -1] = len(categories) categories.append(missing_name) @@ -723,10 +723,15 @@ def encode_contrasts( order to avoid spanning the intercept. """ levels = levels if levels is not None else _state.get("categories") - force_convert = _state.get("force_convert", False) + add_category_for_nan = _state.get("add_category_for_nan", False) + # Check for unseen categories when levels are specified if levels is not None: - unseen_categories = set(data.dropna().unique()) - set(levels) + if missing_method == "convert" and not add_category_for_nan: + unseen_categories = set(data.unique()) - set(levels) + else: + unseen_categories = set(data.dropna().unique()) - set(levels) + if unseen_categories: raise ValueError( f"Column {data.name} contains unseen categories: {unseen_categories}." @@ -734,14 +739,16 @@ def encode_contrasts( cat = pandas.Categorical(data._values, categories=levels) _state["categories"] = cat.categories - _state["force_convert"] = missing_method == "convert" and cat.isna().any() + _state["add_category_for_nan"] = add_category_for_nan or ( + missing_method == "convert" and cat.isna().any() + ) return _InteractableCategoricalVector.from_categorical( cat, reduced_rank=reduced_rank, missing_method=missing_method, missing_name=missing_name, - force_convert=force_convert, + add_category_for_nan=add_category_for_nan, ) diff --git a/tests/test_formula.py b/tests/test_formula.py index 1319bdda..a7b849d9 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -747,7 +747,7 @@ def test_cat_missing_interactions(): @pytest.mark.parametrize( - "cat_missing_method", ["zero", "convert"], ids=["zero", "convert"] + "cat_missing_method", ["zero", "convert", "fail"], ids=["zero", "convert", "fail"] ) def test_unseen_category(cat_missing_method): df = pd.DataFrame( @@ -768,6 +768,23 @@ def test_unseen_category(cat_missing_method): result_seen.model_spec.get_model_matrix(df_unseen) +def test_unseen_missing_convert(): + df = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b"]), + } + ) + df_unseen = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b", pd.NA]), + } + ) + result_seen = tm.from_formula("cat_1 - 1", df, cat_missing_method="convert") + + with pytest.raises(ValueError, match="contains unseen categories"): + result_seen.model_spec.get_model_matrix(df_unseen) + + # Tests from formulaic's test suite # --------------------------------- From 798cc27ea4e5ca16950ca1b7cb6a3a93f02892e1 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 25 Jan 2024 10:38:57 +0100 Subject: [PATCH 5/7] Expand test for unseen missings --- tests/test_formula.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/tests/test_formula.py b/tests/test_formula.py index a7b849d9..2cfefa02 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -768,7 +768,8 @@ def test_unseen_category(cat_missing_method): result_seen.model_spec.get_model_matrix(df_unseen) -def test_unseen_missing_convert(): +@pytest.mark.parametrize("cat_missing_method", ["zero", "convert", "fail"]) +def test_unseen_missing(cat_missing_method): df = pd.DataFrame( { "cat_1": pd.Categorical(["a", "b"]), @@ -779,10 +780,25 @@ def test_unseen_missing_convert(): "cat_1": pd.Categorical(["a", "b", pd.NA]), } ) - result_seen = tm.from_formula("cat_1 - 1", df, cat_missing_method="convert") + result_seen = tm.from_formula( + "cat_1 - 1", df, cat_missing_method=cat_missing_method + ) - with pytest.raises(ValueError, match="contains unseen categories"): - result_seen.model_spec.get_model_matrix(df_unseen) + if cat_missing_method == "convert": + with pytest.raises(ValueError, match="contains unseen categories"): + result_seen.model_spec.get_model_matrix(df_unseen) + elif cat_missing_method == "fail": + with pytest.raises( + ValueError, match="Categorical data can't have missing values" + ): + result_seen.model_spec.get_model_matrix(df_unseen) + elif cat_missing_method == "zero": + result_unseen = result_seen.model_spec.get_model_matrix(df_unseen) + assert result_unseen.A.shape == (3, 2) + np.testing.assert_array_equal( + result_unseen.A, np.array([[1, 0], [0, 1], [0, 0]]) + ) + assert result_unseen.column_names == ["cat_1[a]", "cat_1[b]"] # Tests from formulaic's test suite From c0d4deafa23f255e193f86df45dc0885cb049561 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 25 Jan 2024 10:40:40 +0100 Subject: [PATCH 6/7] Improve attribute name --- src/tabmat/formula.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index f6db3586..7acc821b 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -429,7 +429,7 @@ def from_categorical( reduced_rank: bool, missing_method: str = "fail", missing_name: str = "(MISSING)", - add_category_for_nan: bool = False, + add_missing_category: bool = False, ) -> "_InteractableCategoricalVector": """Create an interactable categorical vector from a pandas categorical.""" categories = list(cat.categories) @@ -446,7 +446,7 @@ def from_categorical( "if cat_missing_method='fail'." ) - if missing_method == "convert" and (-1 in codes or add_category_for_nan): + if missing_method == "convert" and (-1 in codes or add_missing_category): codes[codes == -1] = len(categories) categories.append(missing_name) @@ -723,11 +723,11 @@ def encode_contrasts( order to avoid spanning the intercept. """ levels = levels if levels is not None else _state.get("categories") - add_category_for_nan = _state.get("add_category_for_nan", False) + add_missing_category = _state.get("add_missing_category", False) # Check for unseen categories when levels are specified if levels is not None: - if missing_method == "convert" and not add_category_for_nan: + if missing_method == "convert" and not add_missing_category: unseen_categories = set(data.unique()) - set(levels) else: unseen_categories = set(data.dropna().unique()) - set(levels) @@ -739,7 +739,7 @@ def encode_contrasts( cat = pandas.Categorical(data._values, categories=levels) _state["categories"] = cat.categories - _state["add_category_for_nan"] = add_category_for_nan or ( + _state["add_missing_category"] = add_missing_category or ( missing_method == "convert" and cat.isna().any() ) @@ -748,7 +748,7 @@ def encode_contrasts( reduced_rank=reduced_rank, missing_method=missing_method, missing_name=missing_name, - add_category_for_nan=add_category_for_nan, + add_missing_category=add_missing_category, ) From 2644d314372fe3f979c84e104066a0e41f762cdb Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 25 Jan 2024 10:43:38 +0100 Subject: [PATCH 7/7] Add comment about dropping missings in tests for new levels --- src/tabmat/formula.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index 7acc821b..0afe6df1 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -728,6 +728,9 @@ def encode_contrasts( # Check for unseen categories when levels are specified if levels is not None: if missing_method == "convert" and not add_missing_category: + # We only need to include NAs in the check in this case because: + # - missing_method == "fail" raises a more appropriate error later + # - missings are no problem in the other cases unseen_categories = set(data.unique()) - set(levels) else: unseen_categories = set(data.dropna().unique()) - set(levels)