Skip to content

Commit

Permalink
Add an experimental tabmat materializer class
Browse files Browse the repository at this point in the history
  • Loading branch information
stanmart committed Jun 15, 2023
1 parent 0d3835d commit bd2a2d3
Show file tree
Hide file tree
Showing 2 changed files with 263 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/tabmat/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .categorical_matrix import CategoricalMatrix
from .constructor import from_csc, from_pandas
from .dense_matrix import DenseMatrix
from .formula import TabmatMaterializer
from .matrix_base import MatrixBase
from .sparse_matrix import SparseMatrix
from .split_matrix import SplitMatrix
Expand All @@ -15,4 +16,5 @@
"CategoricalMatrix",
"from_csc",
"from_pandas",
"TabmatMaterializer",
]
261 changes: 261 additions & 0 deletions src/tabmat/formula.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
import functools
import itertools
from collections import OrderedDict

import numpy
import pandas
from formulaic import ModelMatrix, ModelSpec
from formulaic.materializers import FormulaMaterializer
from formulaic.materializers.base import EncodedTermStructure
from formulaic.materializers.types import NAAction
from interface_meta import override

from .categorical_matrix import CategoricalMatrix
from .dense_matrix import DenseMatrix
from .matrix_base import MatrixBase
from .sparse_matrix import SparseMatrix
from .split_matrix import SplitMatrix


class TabmatMaterializer(FormulaMaterializer):
    """Materializer for pandas input and tabmat output.

    Registered under the name ``"tabmat"`` for ``pandas.DataFrame`` inputs.
    Individual factors are encoded as ``DenseMatrix`` / ``CategoricalMatrix``
    objects and the resulting columns are collated into a ``SplitMatrix``.

    The methods below are explicitly decorated with ``@override`` because
    formulaic uses the ``interface_meta`` package, which requires subclasses
    to be explicit about overridden methods; this follows the way formulaic's
    built-in ``PandasMaterializer`` subclasses ``FormulaMaterializer``.
    """

    REGISTER_NAME = "tabmat"
    REGISTER_INPUTS = ("pandas.core.frame.DataFrame",)
    REGISTER_OUTPUTS = "tabmat"

    @override
    def _is_categorical(self, values):
        """Return whether ``values`` should be treated as categorical.

        pandas Series / Categorical objects count as categorical when their
        dtype is ``object`` or a ``pandas.CategoricalDtype``. Anything else is
        delegated to the parent class.
        """
        if isinstance(values, (pandas.Series, pandas.Categorical)):
            return values.dtype == object or isinstance(
                values.dtype, pandas.CategoricalDtype
            )
        # Fallback for non-pandas objects. NOTE(review): possibly not needed,
        # as only pandas input is supported by this materializer.
        return super()._is_categorical(values)

    @override
    def _check_for_nulls(self, name, values, na_action, drop_rows):
        """Apply ``na_action`` to the evaluated factor ``values``.

        * ``NAAction.IGNORE``: do nothing.
        * ``NAAction.RAISE``: raise ``ValueError`` if a pandas Series
          contains nulls.
        * ``NAAction.DROP``: add the positions of null entries of a pandas
          Series to ``drop_rows`` (mutated in place).

        Values that are not pandas Series (or dicts of them) are not checked.

        Raises
        ------
        ValueError
            If nulls are found under ``NAAction.RAISE`` or if ``na_action``
            is not recognised.
        """
        if na_action is NAAction.IGNORE:
            return

        # This branch was carried over from formulaic's PandasMaterializer.
        # NOTE(review): it can probably be removed once a formulaic version
        # is pinned in which transforms never return dictionaries.
        if isinstance(
            values, dict
        ):  # pragma: no cover; no formulaic transforms return dictionaries any more
            for key, vs in values.items():
                self._check_for_nulls(f"{name}[{key}]", vs, na_action, drop_rows)

        elif na_action is NAAction.RAISE:
            if isinstance(values, pandas.Series) and values.isnull().values.any():
                raise ValueError(f"`{name}` contains null values after evaluation.")

        elif na_action is NAAction.DROP:
            if isinstance(values, pandas.Series):
                drop_rows.update(numpy.flatnonzero(values.isnull().values))

        else:
            raise ValueError(
                f"Do not know how to interpret `na_action` = {repr(na_action)}."
            )  # pragma: no cover; this is currently impossible to reach

    @override
    def _encode_constant(self, value, metadata, encoder_state, spec, drop_rows):
        """Return a ``DenseMatrix`` filled with ``value``.

        The result has one entry per row that remains after removing
        ``drop_rows`` from the ``self.nrows`` input rows.
        """
        series = value * numpy.ones(self.nrows - len(drop_rows))
        return DenseMatrix(series)

    @override
    def _encode_numerical(self, values, metadata, encoder_state, spec, drop_rows):
        """Return ``values`` as a ``DenseMatrix`` with ``drop_rows`` removed.

        ``drop_rows`` holds row positions; pandas Series are converted to
        numpy arrays before being wrapped.
        """
        if drop_rows:
            values = values.drop(index=values.index[drop_rows])
        if isinstance(values, pandas.Series):
            values = values.to_numpy()
        return DenseMatrix(values)

    @override
    def _encode_categorical(
        self, values, metadata, encoder_state, spec, drop_rows, reduced_rank=False
    ):
        """Return ``values`` as a ``CategoricalMatrix`` with ``drop_rows`` removed.

        ``reduced_rank`` is forwarded as the matrix's ``drop_first`` flag.
        """
        # We do not do any encoding here as it is handled by tabmat
        if drop_rows:
            values = values.drop(index=values.index[drop_rows])
        return CategoricalMatrix(values._values, drop_first=reduced_rank)

    @override
    def _get_columns_for_term(self, factors, spec, scale=1):
        """Build the named columns of one term from its encoded factors.

        ``factors`` is a sequence of mappings from column name to column.
        One output column is produced per element of the Cartesian product
        of the factors' columns: names are joined with ``":"`` and the
        columns are combined pairwise with ``_interact_columns``. The
        product is taken over the reversed factor list, so the first
        factor's columns vary fastest in the output order.

        Returns an ``OrderedDict`` mapping column name to column.
        """
        out = OrderedDict()

        for reversed_product in itertools.product(
            *(factor.items() for factor in reversed(factors))
        ):
            # Restore the original factor order for naming and interaction.
            ordered_items = tuple(reversed(reversed_product))
            name = ":".join(key for key, _ in ordered_items)
            # TODO: implement this
            column = functools.reduce(
                _interact_columns,
                (col for _, col in ordered_items),
            )
            if scale != 1:
                # TODO: do we need this? Maybe raise?
                column = scale * column
            out[name] = column
        return out

    @override
    def _combine_columns(self, cols, spec, drop_rows):
        """Collate ``(name, column)`` pairs into a single ``SplitMatrix``.

        With no columns, a matrix with zero columns is returned. Its row
        count excludes ``drop_rows`` so that it matches the height of every
        other encoded column (see ``_encode_constant``).
        """
        # Special case no columns
        if not cols:
            n_rows = self.data.shape[0] - len(drop_rows)
            values = numpy.empty((n_rows, 0))
            return SplitMatrix([DenseMatrix(values)])

        # Otherwise, concatenate columns into SplitMatrix
        return SplitMatrix([col[1] for col in cols])

    # Have to override _build_model_matrix, too, because of tabmat/glum's way
    # of handling intercepts and categorical variables.
    @override
    def _build_model_matrix(self, spec: ModelSpec, drop_rows):
        """Materialize the model matrix described by ``spec``.

        Returns a ``ModelMatrix`` wrapping the combined columns together
        with the (possibly updated) model spec.
        """
        # Step 0: Apply any requested column/term clustering
        # This must happen before Step 1 otherwise the greedy rank reduction
        # below would result in a different outcome than if the columns had
        # always been in the generated order.
        terms = self._cluster_terms(spec.formula, cluster_by=spec.cluster_by)

        # Step 1: Determine strategy to maintain structural full-rankness of output matrix
        scoped_terms_for_terms = self._get_scoped_terms(
            terms,
            ensure_full_rank=spec.ensure_full_rank,
        )

        # Step 2: Generate the columns which will be collated into the full matrix
        cols = []
        for term, scoped_terms in scoped_terms_for_terms:
            scoped_cols = OrderedDict()
            for scoped_term in scoped_terms:
                if not scoped_term.factors:
                    # A scoped term without factors is the intercept.
                    scoped_cols[
                        "Intercept"
                    ] = scoped_term.scale * self._encode_constant(
                        1, None, {}, spec, drop_rows
                    )
                else:
                    scoped_cols.update(
                        self._get_columns_for_term(
                            [
                                self._encode_evaled_factor(
                                    scoped_factor.factor,
                                    spec,
                                    drop_rows,
                                    reduced_rank=scoped_factor.reduced,
                                )
                                for scoped_factor in scoped_term.factors
                            ],
                            spec=spec,
                            scale=scoped_term.scale,
                        )
                    )
            cols.append((term, scoped_terms, scoped_cols))

        # Step 3: Populate remaining model spec fields
        if spec.structure:
            cols = self._enforce_structure(cols, spec, drop_rows)
        else:
            spec = spec.update(
                structure=[
                    EncodedTermStructure(
                        term,
                        [st.copy(without_values=True) for st in scoped_terms],
                        list(scoped_cols),
                    )
                    for term, scoped_terms, scoped_cols in cols
                ],
            )

        # Step 4: Collate factors into one ModelMatrix
        return ModelMatrix(
            self._combine_columns(
                [
                    (name, values)
                    for term, scoped_terms, scoped_cols in cols
                    for name, values in scoped_cols.items()
                ],
                spec=spec,
                drop_rows=drop_rows,
            ),
            spec=spec,
        )


# There should be a better place for this:
def _interact_columns(
    left: MatrixBase, right: MatrixBase, dense_threshold: float = 0.1
) -> MatrixBase:
    """Return the interaction (element-wise product) of two columns.

    The strategy depends on the matrix types of the operands:

    * dense x dense, or a sparse operand with a non-categorical partner:
      delegate to the ``multiply`` method of the dense/sparse operand;
    * categorical x non-categorical: multiply the categorical's CSR
      representation by the other operand and wrap it in a ``SparseMatrix``;
    * categorical x categorical: ``_interact_categorical_categorical``.

    ``dense_threshold`` is currently unused.

    Raises
    ------
    RuntimeError
        If no rule matches the operand types.
    """
    left_is_categorical = isinstance(left, CategoricalMatrix)
    right_is_categorical = isinstance(right, CategoricalMatrix)

    if isinstance(left, DenseMatrix) and isinstance(right, DenseMatrix):
        return left.multiply(right)

    if isinstance(left, SparseMatrix) and not right_is_categorical:
        return left.multiply(right)

    if isinstance(right, SparseMatrix) and not left_is_categorical:
        return right.multiply(left)

    if left_is_categorical and right_is_categorical:
        return _interact_categorical_categorical(left, right)

    if left_is_categorical or right_is_categorical:
        # Exactly one operand is categorical at this point.
        if left_is_categorical:
            categorical, other = left, right
        else:
            categorical, other = right, left
        # NOTE(review): this is true for any array with at least one
        # dimension; presumably meant to turn 1-D input into a column.
        if len(other.shape):
            other = other.reshape(-1, 1)  # type: ignore
        # TODO: we could do better by making it dense above a threshold
        return SparseMatrix(categorical.tocsr().multiply(other))

    # Should be unreachable
    raise RuntimeError(
        f"_interact_columns not implemented for {type(left)} and {type(right)}"
    )


def _interact_categorical_categorical(
    left: CategoricalMatrix, right: CategoricalMatrix
) -> CategoricalMatrix:
    """Return the interaction of two categorical columns as one categorical.

    The levels of the result are the Cartesian product of the retained
    levels of ``left`` and ``right`` (left-major order), named
    ``f"{left_level}__{right_level}"``. If either input has ``drop_first``
    set, its first level is excluded from the product, a leading
    ``"__drop__"`` level is added, and every row in which a dropped level
    occurs is mapped to it; the result then has ``drop_first=True``.

    Rows in which either input has a missing value (code ``-1``) get code
    ``-1`` in the result. NOTE(review): whether ``CategoricalMatrix``
    accepts missing values is not visible here -- confirm.

    The result is ordered only if both inputs are ordered.
    """
    # Review notes carried over from the original commit discussion:
    #   * This function needs tests for all four combinations of the
    #     ``drop_first`` attribute, zero-width matrices and missing values.
    #   * Level names are joined with "__"; a "__x__" separator (possibly
    #     configurable through a keyword argument) was suggested instead.
    cardinality_right = len(right.cat.categories)

    # Categorical codes use the smallest integer dtype that fits the number
    # of categories, so they are widened before doing arithmetic to prevent
    # silent overflow.
    left_codes = numpy.asarray(left.cat.codes, dtype=numpy.int64)
    right_codes = numpy.asarray(right.cat.codes, dtype=numpy.int64)

    missing = (left_codes < 0) | (right_codes < 0)
    dropped = numpy.zeros(left_codes.shape, dtype=bool)

    if right.drop_first:
        dropped |= right_codes == 0
        right_index = right_codes - 1
        n_right_kept = cardinality_right - 1
        right_slice = slice(1, None)
    else:
        right_index = right_codes
        n_right_kept = cardinality_right
        right_slice = slice(None)

    if left.drop_first:
        dropped |= left_codes == 0
        left_index = left_codes - 1
        left_slice = slice(1, None)
    else:
        left_index = left_codes
        left_slice = slice(None)

    # Position of each row's level pair within ``new_categories`` below.
    new_codes = left_index * n_right_kept + right_index

    new_categories = [
        f"{left_cat}__{right_cat}"
        for left_cat, right_cat in itertools.product(
            left.cat.categories[left_slice], right.cat.categories[right_slice]
        )
    ]

    new_drop_first = left.drop_first or right.drop_first
    if new_drop_first:
        # "__drop__" occupies code 0, so all real levels move up by one.
        new_categories = ["__drop__"] + new_categories
        new_codes += 1
        new_codes[dropped] = 0

    # Applied last so that missing values are never mistaken for a level.
    new_codes[missing] = -1

    new_col = pandas.Categorical.from_codes(
        new_codes,
        new_categories,
        ordered=left.cat.ordered and right.cat.ordered,
    )

    return CategoricalMatrix(new_col, drop_first=new_drop_first)

0 comments on commit bd2a2d3

Please sign in to comment.