Skip to content

Commit

Permalink
Add missing categorical functionality to formulas
Browse files Browse the repository at this point in the history
  • Loading branch information
stanmart committed Aug 16, 2023
1 parent c2c83b3 commit 9652010
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 3 deletions.
12 changes: 12 additions & 0 deletions src/tabmat/constructor.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,8 @@ def from_formula(
cat_threshold: int = 4,
interaction_separator: str = ":",
categorical_format: str = "{name}[{category}]",
cat_missing_method: str = "fail",
cat_missing_name: str = "(MISSING)",
intercept_name: str = "Intercept",
include_intercept: bool = False,
add_column_for_intercept: bool = True,
Expand Down Expand Up @@ -249,6 +251,14 @@ def from_formula(
categorical_format: str, default "{name}[{category}]"
The format string used to generate the names of categorical variables.
Has to include the placeholders ``{name}`` and ``{category}``.
cat_missing_method: str {'fail'|'zero'|'convert'}, default 'fail'
How to handle missing values in categorical columns:
- if 'fail', raise an error if there are missing values.
- if 'zero', missing values will be encoded as zeros in all indicator columns.
- if 'convert', missing values will be converted to a separate category whose
name is given by ``cat_missing_name``.
cat_missing_name: str, default '(MISSING)'
Name of the category to which missing values will be converted if
``cat_missing_method='convert'``.
intercept_name: str, default "Intercept"
The name of the intercept column.
include_intercept: bool, default False
Expand Down Expand Up @@ -286,6 +296,8 @@ def from_formula(
sparse_threshold=sparse_threshold,
cat_threshold=cat_threshold,
add_column_for_intercept=add_column_for_intercept,
cat_missing_method=cat_missing_method,
cat_missing_name=cat_missing_name,
)
result = materializer.get_model_matrix(spec)

Expand Down
42 changes: 39 additions & 3 deletions src/tabmat/formula.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ def _init(self):
self.add_column_for_intercept = self.params.get(
"add_column_for_intercept", True
)
self.cat_missing_method = self.params.get("cat_missing_method", "fail")
self.cat_missing_name = self.params.get("cat_missing_name", "(MISSING)")

# We can override formulaic's C() function here
self.context["C"] = _C
Expand Down Expand Up @@ -115,7 +117,12 @@ def _combine_columns(self, cols, spec, drop_rows):
# Otherwise, concatenate columns into SplitMatrix
return SplitMatrix(
[
col[1].to_tabmat(self.dtype, self.sparse_threshold, self.cat_threshold)
col[1].to_tabmat(
self.dtype,
self.sparse_threshold,
self.cat_threshold,
self.cat_missing_method,
)
for col in cols
]
)
Expand Down Expand Up @@ -292,6 +299,7 @@ def to_tabmat(
dtype: numpy.dtype,
sparse_threshold: float,
cat_threshold: int,
cat_missing_method: str,
) -> MatrixBase:
"""Convert to an actual tabmat matrix."""
pass
Expand Down Expand Up @@ -345,6 +353,7 @@ def to_tabmat(
dtype: numpy.dtype = numpy.float64,
sparse_threshold: float = 0.1,
cat_threshold: int = 4,
cat_missing_method: str = "fail",
) -> Union[SparseMatrix, DenseMatrix]:
if (self.values != 0).mean() > sparse_threshold:
return DenseMatrix(self.values, column_names=[self.name])
Expand Down Expand Up @@ -381,6 +390,8 @@ def to_tabmat(
dtype: numpy.dtype = numpy.float64,
sparse_threshold: float = 0.1,
cat_threshold: int = 4,
cat_missing_method: str = "fail",
cat_missing_name: str = "(MISSING)",
) -> SparseMatrix:
return SparseMatrix(self.values, column_names=[self.name])

Expand Down Expand Up @@ -412,15 +423,25 @@ def __init__(

@classmethod
def from_categorical(
cls, cat: pandas.Categorical, reduced_rank: bool
cls,
cat: pandas.Categorical,
reduced_rank: bool,
convert_missing: bool = False,
missing_name: str = "(MISSING)",
) -> "_InteractableCategoricalVector":
"""Create an interactable categorical vector from a pandas categorical."""
categories = list(cat.categories)
codes = cat.codes.copy().astype(numpy.int64)

if reduced_rank:
codes[codes == 0] = -2
codes[codes > 0] -= 1
categories = categories[1:]

if convert_missing:
codes[codes == -1] = codes.max() + 1
categories.append(missing_name)

return cls(
codes=codes,
categories=categories,
Expand All @@ -441,6 +462,7 @@ def to_tabmat(
dtype: numpy.dtype = numpy.float64,
sparse_threshold: float = 0.1,
cat_threshold: int = 4,
cat_missing_method: str = "fail",
) -> Union[DenseMatrix, CategoricalMatrix, SplitMatrix]:
codes = self.codes.copy()
categories = self.categories.copy()
Expand All @@ -464,6 +486,7 @@ def to_tabmat(
dtype=dtype,
column_name=self.name,
column_name_format="{category}",
cat_missing_method=cat_missing_method,
)

if (self.codes == -2).all():
Expand Down Expand Up @@ -689,8 +712,21 @@ def encode_contrasts(
levels = levels if levels is not None else _state.get("categories")
cat = pandas.Categorical(data._values, categories=levels)
_state["categories"] = cat.categories

if _spec is not None and _spec.materializer_params is not None:
convert_missing = (
_spec.materializer_params.get("cat_missing_method", "fail") == "convert"
)
missing_name = _spec.materializer_params.get("cat_missing_name", "(MISSING)")
else:
convert_missing = False
missing_name = "(MISSING)"

return _InteractableCategoricalVector.from_categorical(
cat, reduced_rank=reduced_rank
cat,
reduced_rank=reduced_rank,
convert_missing=convert_missing,
missing_name=missing_name,
)


Expand Down
32 changes: 32 additions & 0 deletions tests/test_formula.py
Original file line number Diff line number Diff line change
Expand Up @@ -630,6 +630,38 @@ def test_interactable_vectors(left, right, reverse):
assert result_vec.name == right.name + ":" + left.name


@pytest.mark.parametrize("cat_missing_method", ["zero", "convert"])
@pytest.mark.parametrize(
    "cat_missing_name",
    ["__missing__", "(MISSING)"],
)
def test_cat_missing_handling(cat_missing_method, cat_missing_name):
    """Check that ``from_formula`` and ``from_pandas`` agree on missing categories.

    The same data frame, holding one categorical column with a missing entry,
    is converted through both constructors using identical missing-value
    settings. The two results must have the same column names, the same term
    names and the same dense values.
    """
    # A single categorical column with one missing value in the middle.
    frame = pd.DataFrame({"cat_1": pd.Categorical(["a", "b", None, "b", "a"])})

    # Both constructors receive exactly the same keyword settings.
    shared_options = {
        "cat_threshold": 0,
        "cat_missing_method": cat_missing_method,
        "cat_missing_name": cat_missing_name,
    }

    expected = tm.from_pandas(frame, **shared_options)
    # "- 1" removes the intercept from the formula.
    actual = tm.from_formula("cat_1 - 1", frame, **shared_options)

    assert expected.column_names == actual.column_names
    assert expected.term_names == actual.term_names
    np.testing.assert_array_equal(expected.A, actual.A)


# Tests from formulaic's test suite
# ---------------------------------

Expand Down

0 comments on commit 9652010

Please sign in to comment.