From 96520101e7f3d9d5e23ad6f8a6a517505be508d9 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@quantco.com>
Date: Wed, 16 Aug 2023 15:34:23 +0200
Subject: [PATCH] Add missing categorical functionality to formulas

---
 src/tabmat/constructor.py | 12 +++++++++++
 src/tabmat/formula.py     | 42 ++++++++++++++++++++++++++++++++++++---
 tests/test_formula.py     | 32 +++++++++++++++++++++++++++++
 3 files changed, 83 insertions(+), 3 deletions(-)

diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py
index 46efe83b..9849cbde 100644
--- a/src/tabmat/constructor.py
+++ b/src/tabmat/constructor.py
@@ -219,6 +219,8 @@ def from_formula(
     cat_threshold: int = 4,
     interaction_separator: str = ":",
     categorical_format: str = "{name}[{category}]",
+    cat_missing_method: str = "fail",
+    cat_missing_name: str = "(MISSING)",
     intercept_name: str = "Intercept",
     include_intercept: bool = False,
     add_column_for_intercept: bool = True,
@@ -249,6 +251,14 @@ def from_formula(
     categorical_format: str, default "{name}[T.{category}]"
         The format string used to generate the names of categorical variables.
         Has to include the placeholders ``{name}`` and ``{category}``.
+    cat_missing_method: str {'fail'|'zero'|'convert'}, default 'fail'
+        How to handle missing values in categorical columns:
+        - if 'fail', raise an error if there are missing values
+        - if 'zero', missing values will have zeros in all indicator columns.
+        - if 'convert', missing values become the ``cat_missing_name`` category.
+    cat_missing_name: str, default '(MISSING)'
+        Name of the category to which missing values will be converted if
+        ``cat_missing_method='convert'``.
     intercept_name: str, default "Intercept"
         The name of the intercept column.
     include_intercept: bool, default False
@@ -286,6 +296,8 @@ def from_formula(
         sparse_threshold=sparse_threshold,
         cat_threshold=cat_threshold,
         add_column_for_intercept=add_column_for_intercept,
+        cat_missing_method=cat_missing_method,
+        cat_missing_name=cat_missing_name,
     )
     result = materializer.get_model_matrix(spec)
 
diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py
index 2bb81f9d..6d4798e0 100644
--- a/src/tabmat/formula.py
+++ b/src/tabmat/formula.py
@@ -44,6 +44,8 @@ def _init(self):
         self.add_column_for_intercept = self.params.get(
             "add_column_for_intercept", True
         )
+        self.cat_missing_method = self.params.get("cat_missing_method", "fail")
+        self.cat_missing_name = self.params.get("cat_missing_name", "(MISSING)")
 
         # We can override formulaic's C() function here
         self.context["C"] = _C
@@ -115,7 +117,12 @@ def _combine_columns(self, cols, spec, drop_rows):
         # Otherwise, concatenate columns into SplitMatrix
         return SplitMatrix(
             [
-                col[1].to_tabmat(self.dtype, self.sparse_threshold, self.cat_threshold)
+                col[1].to_tabmat(
+                    self.dtype,
+                    self.sparse_threshold,
+                    self.cat_threshold,
+                    self.cat_missing_method,
+                )
                 for col in cols
             ]
         )
@@ -292,6 +299,7 @@ def to_tabmat(
         dtype: numpy.dtype,
         sparse_threshold: float,
         cat_threshold: int,
+        cat_missing_method: str,
     ) -> MatrixBase:
         """Convert to an actual tabmat matrix."""
         pass
@@ -345,6 +353,7 @@ def to_tabmat(
         dtype: numpy.dtype = numpy.float64,
         sparse_threshold: float = 0.1,
         cat_threshold: int = 4,
+        cat_missing_method: str = "fail",
     ) -> Union[SparseMatrix, DenseMatrix]:
         if (self.values != 0).mean() > sparse_threshold:
             return DenseMatrix(self.values, column_names=[self.name])
@@ -381,6 +390,8 @@ def to_tabmat(
         dtype: numpy.dtype = numpy.float64,
         sparse_threshold: float = 0.1,
         cat_threshold: int = 4,
+        cat_missing_method: str = "fail",
+        cat_missing_name: str = "(MISSING)",
     ) -> SparseMatrix:
         return SparseMatrix(self.values, column_names=[self.name])
 
@@ -412,15 +423,25 @@ def __init__(
 
     @classmethod
     def from_categorical(
-        cls, cat: pandas.Categorical, reduced_rank: bool
+        cls,
+        cat: pandas.Categorical,
+        reduced_rank: bool,
+        convert_missing: bool = False,
+        missing_name: str = "(MISSING)",
     ) -> "_InteractableCategoricalVector":
         """Create an interactable categorical vector from a pandas categorical."""
         categories = list(cat.categories)
         codes = cat.codes.copy().astype(numpy.int64)
+
         if reduced_rank:
             codes[codes == 0] = -2
             codes[codes > 0] -= 1
             categories = categories[1:]
+
+        if convert_missing:
+            codes[codes == -1] = codes.max() + 1
+            categories.append(missing_name)
+
         return cls(
             codes=codes,
             categories=categories,
@@ -441,6 +462,7 @@ def to_tabmat(
         dtype: numpy.dtype = numpy.float64,
         sparse_threshold: float = 0.1,
         cat_threshold: int = 4,
+        cat_missing_method: str = "fail",
     ) -> Union[DenseMatrix, CategoricalMatrix, SplitMatrix]:
         codes = self.codes.copy()
         categories = self.categories.copy()
@@ -464,6 +486,7 @@ def to_tabmat(
             dtype=dtype,
             column_name=self.name,
             column_name_format="{category}",
+            cat_missing_method=cat_missing_method,
         )
 
         if (self.codes == -2).all():
@@ -689,8 +712,21 @@ def encode_contrasts(
     levels = levels if levels is not None else _state.get("categories")
     cat = pandas.Categorical(data._values, categories=levels)
     _state["categories"] = cat.categories
+
+    if _spec is not None and _spec.materializer_params is not None:
+        convert_missing = (
+            _spec.materializer_params.get("cat_missing_method", "fail") == "convert"
+        )
+        missing_name = _spec.materializer_params.get("cat_missing_name", "(MISSING)")
+    else:
+        convert_missing = False
+        missing_name = "(MISSING)"
+
     return _InteractableCategoricalVector.from_categorical(
-        cat, reduced_rank=reduced_rank
+        cat,
+        reduced_rank=reduced_rank,
+        convert_missing=convert_missing,
+        missing_name=missing_name,
     )
 
 
diff --git a/tests/test_formula.py b/tests/test_formula.py
index 88a55c71..e394b46b 100644
--- a/tests/test_formula.py
+++ b/tests/test_formula.py
@@ -630,6 +630,38 @@ def test_interactable_vectors(left, right, reverse):
         assert result_vec.name == right.name + ":" + left.name
 
 
+@pytest.mark.parametrize("cat_missing_method", ["zero", "convert"])
+@pytest.mark.parametrize(
+    "cat_missing_name",
+    ["__missing__", "(MISSING)"],
+)
+def test_cat_missing_handling(cat_missing_method, cat_missing_name):
+    df = pd.DataFrame(
+        {
+            "cat_1": pd.Categorical(["a", "b", None, "b", "a"]),
+        }
+    )
+
+    mat_from_pandas = tm.from_pandas(
+        df,
+        cat_threshold=0,
+        cat_missing_method=cat_missing_method,
+        cat_missing_name=cat_missing_name,
+    )
+
+    mat_from_formula = tm.from_formula(
+        "cat_1 - 1",
+        df,
+        cat_threshold=0,
+        cat_missing_method=cat_missing_method,
+        cat_missing_name=cat_missing_name,
+    )
+
+    assert mat_from_pandas.column_names == mat_from_formula.column_names
+    assert mat_from_pandas.term_names == mat_from_formula.term_names
+    np.testing.assert_array_equal(mat_from_pandas.A, mat_from_formula.A)
+
+
 # Tests from formulaic's test suite
 # ---------------------------------