From 96520101e7f3d9d5e23ad6f8a6a517505be508d9 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 16 Aug 2023 15:34:23 +0200 Subject: [PATCH] Add missing categorical functionality to formulas --- src/tabmat/constructor.py | 12 +++++++++++ src/tabmat/formula.py | 42 ++++++++++++++++++++++++++++++++++++--- tests/test_formula.py | 32 +++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 3 deletions(-) diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index 46efe83b..9849cbde 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -219,6 +219,8 @@ def from_formula( cat_threshold: int = 4, interaction_separator: str = ":", categorical_format: str = "{name}[{category}]", + cat_missing_method: str = "fail", + cat_missing_name: str = "(MISSING)", intercept_name: str = "Intercept", include_intercept: bool = False, add_column_for_intercept: bool = True, @@ -249,6 +251,14 @@ def from_formula( categorical_format: str, default "{name}[T.{category}]" The format string used to generate the names of categorical variables. Has to include the placeholders ``{name}`` and ``{category}``. + cat_missing_method: str {'fail'|'zero'|'convert'}, default 'fail' + How to handle missing values in categorical columns: + - if 'fail', raise an error if there are missing values + - if 'zero', missing values will represent all-zero indicator columns. + - if 'convert', missing values will be converted to the '(MISSING)' category. + cat_missing_name: str, default '(MISSING)' + Name of the category to which missing values will be converted if + ``cat_missing_method='convert'``. intercept_name: str, default "Intercept" The name of the intercept column. 
include_intercept: bool, default False @@ -286,6 +296,8 @@ def from_formula( sparse_threshold=sparse_threshold, cat_threshold=cat_threshold, add_column_for_intercept=add_column_for_intercept, + cat_missing_method=cat_missing_method, + cat_missing_name=cat_missing_name, ) result = materializer.get_model_matrix(spec) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index 2bb81f9d..6d4798e0 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -44,6 +44,8 @@ def _init(self): self.add_column_for_intercept = self.params.get( "add_column_for_intercept", True ) + self.cat_missing_method = self.params.get("cat_missing_method", "fail") + self.cat_missing_name = self.params.get("cat_missing_name", "(MISSING)") # We can override formulaic's C() function here self.context["C"] = _C @@ -115,7 +117,12 @@ def _combine_columns(self, cols, spec, drop_rows): # Otherwise, concatenate columns into SplitMatrix return SplitMatrix( [ - col[1].to_tabmat(self.dtype, self.sparse_threshold, self.cat_threshold) + col[1].to_tabmat( + self.dtype, + self.sparse_threshold, + self.cat_threshold, + self.cat_missing_method, + ) for col in cols ] ) @@ -292,6 +299,7 @@ def to_tabmat( dtype: numpy.dtype, sparse_threshold: float, cat_threshold: int, + cat_missing_method: str, ) -> MatrixBase: """Convert to an actual tabmat matrix.""" pass @@ -345,6 +353,7 @@ def to_tabmat( dtype: numpy.dtype = numpy.float64, sparse_threshold: float = 0.1, cat_threshold: int = 4, + cat_missing_method: str = "fail", ) -> Union[SparseMatrix, DenseMatrix]: if (self.values != 0).mean() > sparse_threshold: return DenseMatrix(self.values, column_names=[self.name]) @@ -381,6 +390,8 @@ def to_tabmat( dtype: numpy.dtype = numpy.float64, sparse_threshold: float = 0.1, cat_threshold: int = 4, + cat_missing_method: str = "fail", + cat_missing_name: str = "(MISSING)", ) -> SparseMatrix: return SparseMatrix(self.values, column_names=[self.name]) @@ -412,15 +423,25 @@ def __init__( @classmethod def 
from_categorical( - cls, cat: pandas.Categorical, reduced_rank: bool + cls, + cat: pandas.Categorical, + reduced_rank: bool, + convert_missing: bool = False, + missing_name: str = "(MISSING)", ) -> "_InteractableCategoricalVector": """Create an interactable categorical vector from a pandas categorical.""" categories = list(cat.categories) codes = cat.codes.copy().astype(numpy.int64) + if reduced_rank: codes[codes == 0] = -2 codes[codes > 0] -= 1 categories = categories[1:] + + if convert_missing: + codes[codes == -1] = codes.max() + 1 + categories.append(missing_name) + return cls( codes=codes, categories=categories, @@ -441,6 +462,7 @@ def to_tabmat( dtype: numpy.dtype = numpy.float64, sparse_threshold: float = 0.1, cat_threshold: int = 4, + cat_missing_method: str = "fail", ) -> Union[DenseMatrix, CategoricalMatrix, SplitMatrix]: codes = self.codes.copy() categories = self.categories.copy() @@ -464,6 +486,7 @@ def to_tabmat( dtype=dtype, column_name=self.name, column_name_format="{category}", + cat_missing_method=cat_missing_method, ) if (self.codes == -2).all(): @@ -689,8 +712,21 @@ def encode_contrasts( levels = levels if levels is not None else _state.get("categories") cat = pandas.Categorical(data._values, categories=levels) _state["categories"] = cat.categories + + if _spec is not None and _spec.materializer_params is not None: + convert_missing = ( + _spec.materializer_params.get("cat_missing_method", "fail") == "convert" + ) + missing_name = _spec.materializer_params.get("cat_missing_name", "(MISSING)") + else: + convert_missing = False + missing_name = "(MISSING)" + return _InteractableCategoricalVector.from_categorical( - cat, reduced_rank=reduced_rank + cat, + reduced_rank=reduced_rank, + convert_missing=convert_missing, + missing_name=missing_name, ) diff --git a/tests/test_formula.py b/tests/test_formula.py index 88a55c71..e394b46b 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -630,6 +630,38 @@ def test_interactable_vectors(left, 
right, reverse): assert result_vec.name == right.name + ":" + left.name +@pytest.mark.parametrize("cat_missing_method", ["zero", "convert"]) +@pytest.mark.parametrize( + "cat_missing_name", + ["__missing__", "(MISSING)"], +) +def test_cat_missing_handling(cat_missing_method, cat_missing_name): + df = pd.DataFrame( + { + "cat_1": pd.Categorical(["a", "b", None, "b", "a"]), + } + ) + + mat_from_pandas = tm.from_pandas( + df, + cat_threshold=0, + cat_missing_method=cat_missing_method, + cat_missing_name=cat_missing_name, + ) + + mat_from_formula = tm.from_formula( + "cat_1 - 1", + df, + cat_threshold=0, + cat_missing_method=cat_missing_method, + cat_missing_name=cat_missing_name, + ) + + assert mat_from_pandas.column_names == mat_from_formula.column_names + assert mat_from_pandas.term_names == mat_from_formula.term_names + np.testing.assert_array_equal(mat_from_pandas.A, mat_from_formula.A) + + # Tests from formulaic's test suite # ---------------------------------