From bd2a2d329c36e41ea69502ac1065821edb09e79b Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 15 Jun 2023 16:55:33 +0200 Subject: [PATCH 01/72] Add an experimental tabmat materializer class --- src/tabmat/__init__.py | 2 + src/tabmat/formula.py | 261 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 263 insertions(+) create mode 100644 src/tabmat/formula.py diff --git a/src/tabmat/__init__.py b/src/tabmat/__init__.py index 9f4a8889..6aab6757 100644 --- a/src/tabmat/__init__.py +++ b/src/tabmat/__init__.py @@ -1,6 +1,7 @@ from .categorical_matrix import CategoricalMatrix from .constructor import from_csc, from_pandas from .dense_matrix import DenseMatrix +from .formula import TabmatMaterializer from .matrix_base import MatrixBase from .sparse_matrix import SparseMatrix from .split_matrix import SplitMatrix @@ -15,4 +16,5 @@ "CategoricalMatrix", "from_csc", "from_pandas", + "TabmatMaterializer", ] diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py new file mode 100644 index 00000000..906ae548 --- /dev/null +++ b/src/tabmat/formula.py @@ -0,0 +1,261 @@ +import functools +import itertools +from collections import OrderedDict + +import numpy +import pandas +from formulaic import ModelMatrix, ModelSpec +from formulaic.materializers import FormulaMaterializer +from formulaic.materializers.base import EncodedTermStructure +from formulaic.materializers.types import NAAction +from interface_meta import override + +from .categorical_matrix import CategoricalMatrix +from .dense_matrix import DenseMatrix +from .matrix_base import MatrixBase +from .sparse_matrix import SparseMatrix +from .split_matrix import SplitMatrix + + +class TabmatMaterializer(FormulaMaterializer): + """Materializer for pandas input and tabmat output.""" + + REGISTER_NAME = "tabmat" + REGISTER_INPUTS = ("pandas.core.frame.DataFrame",) + REGISTER_OUTPUTS = "tabmat" + + @override + def _is_categorical(self, values): + if isinstance(values, (pandas.Series, pandas.Categorical)): + return values.dtype == object or isinstance( + values.dtype, pandas.CategoricalDtype + ) + return super()._is_categorical(values) + + @override + def _check_for_nulls(self, name, values, na_action, drop_rows): + if na_action is NAAction.IGNORE: + return + + if isinstance( + values, dict + ): # pragma: no cover; no formulaic transforms return dictionaries any more + for key, vs in values.items(): + self._check_for_nulls(f"{name}[{key}]", vs, na_action, drop_rows) + + elif na_action is NAAction.RAISE: + if isinstance(values, pandas.Series) and values.isnull().values.any(): + raise ValueError(f"`{name}` contains null values after evaluation.") + + elif na_action is NAAction.DROP: + if isinstance(values, pandas.Series): + drop_rows.update(numpy.flatnonzero(values.isnull().values)) + + else: + raise ValueError( + f"Do not know how to interpret `na_action` = {repr(na_action)}." + ) # pragma: no cover; this is currently impossible to reach + + @override + def _encode_constant(self, value, metadata, encoder_state, spec, drop_rows): + series = value * numpy.ones(self.nrows - len(drop_rows)) + return DenseMatrix(series) + + @override + def _encode_numerical(self, values, metadata, encoder_state, spec, drop_rows): + if drop_rows: + values = values.drop(index=values.index[drop_rows]) + if isinstance(values, pandas.Series): + values = values.to_numpy() + return DenseMatrix(values) + + @override + def _encode_categorical( + self, values, metadata, encoder_state, spec, drop_rows, reduced_rank=False + ): + # We do not do any encoding here as it is handled by tabmat + if drop_rows: + values = values.drop(index=values.index[drop_rows]) + return CategoricalMatrix(values._values, drop_first=reduced_rank) + + @override + def _get_columns_for_term(self, factors, spec, scale=1): + out = OrderedDict() + + names = [ + ":".join(reversed(product)) + for product in itertools.product(*reversed(factors)) + ] + + for i, reversed_product in enumerate( + itertools.product(*(factor.items() for factor in reversed(factors))) + ): + # TODO: implement this + out[names[i]] = functools.reduce( + _interact_columns, + (p[1] for p in reversed(reversed_product)), + ) + if scale != 1: + # TODO: do we need this? Maybe raise? + out[names[i]] = scale * out[names[i]] + return out + + @override + def _combine_columns(self, cols, spec, drop_rows): + # Special case no columns + if not cols: + values = numpy.empty((self.data.shape[0], 0)) + return SplitMatrix([DenseMatrix(values)]) + + # Otherwise, concatenate columns into SplitMatrix + return SplitMatrix([col[1] for col in cols]) + + # Have to override _build_model_matrix, too, because of tabmat/glum's way + # of handling intercepts and categorical variables. + @override + def _build_model_matrix(self, spec: ModelSpec, drop_rows): + # Step 0: Apply any requested column/term clustering + # This must happen before Step 1 otherwise the greedy rank reduction + # below would result in a different outcome than if the columns had + # always been in the generated order. + terms = self._cluster_terms(spec.formula, cluster_by=spec.cluster_by) + + # Step 1: Determine strategy to maintain structural full-rankness of output matrix + scoped_terms_for_terms = self._get_scoped_terms( + terms, + ensure_full_rank=spec.ensure_full_rank, + ) + + # Step 2: Generate the columns which will be collated into the full matrix + cols = [] + for term, scoped_terms in scoped_terms_for_terms: + scoped_cols = OrderedDict() + for scoped_term in scoped_terms: + if not scoped_term.factors: + scoped_cols[ + "Intercept" + ] = scoped_term.scale * self._encode_constant( + 1, None, {}, spec, drop_rows + ) + else: + scoped_cols.update( + self._get_columns_for_term( + [ + self._encode_evaled_factor( + scoped_factor.factor, + spec, + drop_rows, + reduced_rank=scoped_factor.reduced, + ) + for scoped_factor in scoped_term.factors + ], + spec=spec, + scale=scoped_term.scale, + ) + ) + cols.append((term, scoped_terms, scoped_cols)) + + # Step 3: Populate remaining model spec fields + if spec.structure: + cols = self._enforce_structure(cols, spec, drop_rows) + else: + spec = spec.update( + structure=[ + EncodedTermStructure( + term, + [st.copy(without_values=True) for st in scoped_terms], + list(scoped_cols), + ) + for term, scoped_terms, scoped_cols in cols + ], + ) + + # Step 4: Collate factors into one ModelMatrix + return ModelMatrix( + self._combine_columns( + [ + (name, values) + for term, scoped_terms, scoped_cols in cols + for name, values in scoped_cols.items() + ], + spec=spec, + drop_rows=drop_rows, + ), + spec=spec, + ) + + +# There should be a better palce for this: +def _interact_columns( + left: MatrixBase, right: MatrixBase, dense_threshold: float = 0.1 +) -> MatrixBase: + if isinstance(left, DenseMatrix) and isinstance(right, DenseMatrix): + return left.multiply(right) + + if isinstance(left, SparseMatrix) and not isinstance(right, CategoricalMatrix): + return left.multiply(right) + + if isinstance(right, SparseMatrix) and not isinstance(left, CategoricalMatrix): + return right.multiply(left) + + if isinstance(left, CategoricalMatrix) and not isinstance(right, CategoricalMatrix): + if len(right.shape): + right = right.reshape(-1, 1) # type: ignore + return SparseMatrix(left.tocsr().multiply(right)) + # TODO: we could do better by making it dense above a threshold + + if isinstance(right, CategoricalMatrix) and not isinstance(left, CategoricalMatrix): + if len(left.shape): + left = left.reshape(-1, 1) # type: ignore + return SparseMatrix(right.tocsr().multiply(left)) + + if isinstance(left, CategoricalMatrix) and isinstance(right, CategoricalMatrix): + return _interact_categorical_categorical(left, right) + + # Should be unreachable + raise RuntimeError( + f"_interact_columns not implemented for {type(left)} and {type(right)}" + ) + + +def _interact_categorical_categorical( + left: CategoricalMatrix, right: CategoricalMatrix +) -> CategoricalMatrix: + card_right = len(right.cat.categories) + + new_codes = left.cat.codes * card_right + right.cat.codes + + if right.drop_first: + new_codes[new_codes % card_right == 0] = 0 + new_codes -= new_codes // card_right + left_shift = card_right - 1 + right_slice = slice(1, None) + else: + left_shift = card_right + right_slice = slice(None) + + if left.drop_first: + new_codes -= left_shift + new_codes[new_codes < 0] = 0 + left_slice = slice(1, None) + else: + left_slice = slice(None) + + new_categories = [ + f"{left_cat}__{right_cat}" + for left_cat, right_cat in itertools.product( + left.cat.categories[left_slice], right.cat.categories[right_slice] + ) + ] + + new_drop_first = left.drop_first or right.drop_first + if new_drop_first: + new_categories = ["__drop__"] + new_categories + + new_col = pandas.Categorical.from_codes( + new_codes, + new_categories, + ordered=left.cat.ordered and right.cat.ordered, + ) + + return CategoricalMatrix(new_col, drop_first=new_drop_first) From 100bdb87d0570f675015e28305446df276977aad Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Mon, 19 Jun 2023 11:22:01 +0200 Subject: [PATCH 02/72] Nicer way of handling interactions --- src/tabmat/formula.py | 240 ++++++++++++++++++++++++------------------ 1 file changed, 140 insertions(+), 100 deletions(-) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index 906ae548..183ed5a6 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -1,4 +1,4 @@ -import functools +import copy import itertools from collections import OrderedDict @@ -12,7 +12,6 @@ from .categorical_matrix import CategoricalMatrix from .dense_matrix import DenseMatrix -from .matrix_base import MatrixBase from .sparse_matrix import SparseMatrix from .split_matrix import SplitMatrix @@ -59,7 +58,7 @@ def _check_for_nulls(self, name, values, na_action, drop_rows): @override def _encode_constant(self, value, metadata, encoder_state, spec, drop_rows): series = value * numpy.ones(self.nrows - len(drop_rows)) - return DenseMatrix(series) + return InteractableDenseMatrix(series) @override def _encode_numerical(self, values, metadata, encoder_state, spec, drop_rows): @@ -67,7 +66,7 @@ def _encode_numerical(self, values, metadata, encoder_state, spec, drop_rows): values = values.drop(index=values.index[drop_rows]) if isinstance(values, pandas.Series): values = values.to_numpy() - return DenseMatrix(values) + return InteractableDenseMatrix(values) @override def _encode_categorical( @@ -76,29 +75,7 @@ def _encode_categorical( # We do not do any encoding here as it is handled by tabmat if drop_rows: values = values.drop(index=values.index[drop_rows]) - return CategoricalMatrix(values._values, drop_first=reduced_rank) - - @override - def _get_columns_for_term(self, factors, spec, scale=1): - out = OrderedDict() - - names = [ - ":".join(reversed(product)) - for product in itertools.product(*reversed(factors)) - ] - - for i, reversed_product in enumerate( - itertools.product(*(factor.items() for factor in reversed(factors))) - ): - # TODO: implement this - out[names[i]] = functools.reduce( - _interact_columns, - (p[1] for p in reversed(reversed_product)), - ) - if scale != 1: - # TODO: do we need this? Maybe raise? - out[names[i]] = scale * out[names[i]] - return out + return InteractableCategoricalMatrix(values._values, drop_first=reduced_rank) @override def _combine_columns(self, cols, spec, drop_rows): @@ -108,7 +85,7 @@ def _combine_columns(self, cols, spec, drop_rows): return SplitMatrix([DenseMatrix(values)]) # Otherwise, concatenate columns into SplitMatrix - return SplitMatrix([col[1] for col in cols]) + return SplitMatrix([col[1].to_non_interactable() for col in cols]) # Have to override _build_model_matrix, too, because of tabmat/glum's way # of handling intercepts and categorical variables. @@ -164,7 +141,7 @@ def _build_model_matrix(self, spec: ModelSpec, drop_rows): EncodedTermStructure( term, [st.copy(without_values=True) for st in scoped_terms], - list(scoped_cols), + _colnames_from_scoped_cols(scoped_cols), ) for term, scoped_terms, scoped_cols in cols ], @@ -185,77 +162,140 @@ def _build_model_matrix(self, spec: ModelSpec, drop_rows): ) -# There should be a better palce for this: -def _interact_columns( - left: MatrixBase, right: MatrixBase, dense_threshold: float = 0.1 -) -> MatrixBase: - if isinstance(left, DenseMatrix) and isinstance(right, DenseMatrix): - return left.multiply(right) - - if isinstance(left, SparseMatrix) and not isinstance(right, CategoricalMatrix): - return left.multiply(right) - - if isinstance(right, SparseMatrix) and not isinstance(left, CategoricalMatrix): - return right.multiply(left) - - if isinstance(left, CategoricalMatrix) and not isinstance(right, CategoricalMatrix): - if len(right.shape): - right = right.reshape(-1, 1) # type: ignore - return SparseMatrix(left.tocsr().multiply(right)) - # TODO: we could do better by making it dense above a threshold - - if isinstance(right, CategoricalMatrix) and not isinstance(left, CategoricalMatrix): - if len(left.shape): - left = left.reshape(-1, 1) # type: ignore - return SparseMatrix(right.tocsr().multiply(left)) - - if isinstance(left, CategoricalMatrix) and isinstance(right, CategoricalMatrix): - return _interact_categorical_categorical(left, right) - - # Should be unreachable - raise RuntimeError( - f"_interact_columns not implemented for {type(left)} and {type(right)}" - ) - - -def _interact_categorical_categorical( - left: CategoricalMatrix, right: CategoricalMatrix -) -> CategoricalMatrix: - card_right = len(right.cat.categories) - - new_codes = left.cat.codes * card_right + right.cat.codes - - if right.drop_first: - new_codes[new_codes % card_right == 0] = 0 - new_codes -= new_codes // card_right - left_shift = card_right - 1 - right_slice = slice(1, None) - else: - left_shift = card_right - right_slice = slice(None) - - if left.drop_first: - new_codes -= left_shift - new_codes[new_codes < 0] = 0 - left_slice = slice(1, None) - else: - left_slice = slice(None) - - new_categories = [ - f"{left_cat}__{right_cat}" - for left_cat, right_cat in itertools.product( - left.cat.categories[left_slice], right.cat.categories[right_slice] - ) - ] +def _colnames_from_scoped_cols(scoped_cols): + colnames = [] + for name, col in scoped_cols.items(): + if isinstance(col, CategoricalMatrix): + if col.drop_first: + colnames.extend([f"{name}__{cat}" for cat in col.cat.categories[1:]]) + else: + colnames.extend([f"{name}__{cat}" for cat in col.cat.categories]) + else: + colnames.append(name) + return colnames - new_drop_first = left.drop_first or right.drop_first - if new_drop_first: - new_categories = ["__drop__"] + new_categories - new_col = pandas.Categorical.from_codes( - new_codes, - new_categories, - ordered=left.cat.ordered and right.cat.ordered, - ) +class InteractableDenseMatrix(DenseMatrix): + def __mul__(self, other): + if isinstance(other, (InteractableDenseMatrix, int, float)): + return self.multiply(other) + elif isinstance( + other, (InteractableSparseMatrix, InteractableCategoricalMatrix) + ): + return other.__mul__(self) + else: + raise TypeError(f"Cannot multiply {type(self)} and {type(other)}") + # Multiplication with sparse and categorical is handled by the other classes - return CategoricalMatrix(new_col, drop_first=new_drop_first) + def __rmul__(self, other): + return self.__mul__(other) + + def to_non_interactable(self): + return DenseMatrix(self) + + +class InteractableSparseMatrix(SparseMatrix): + def __mul__(self, other): + if isinstance(other, (InteractableDenseMatrix, InteractableSparseMatrix)): + return self.multiply(other) + elif isinstance(other, InteractableCategoricalMatrix): + return other.__mul__(self) + elif isinstance(other, (int, float)): + return self.multiply(numpy.array(other)) + else: + raise TypeError(f"Cannot multiply {type(self)} and {type(other)}") + # Multiplication with categorical is handled by the categorical + + def __rmul__(self, other): + return self.__mul__(other) + + def to_non_interactable(self): + return SparseMatrix(self) + + +class InteractableCategoricalMatrix(CategoricalMatrix): + def __init__(self, *args, **kwargs): + multipliers = kwargs.pop("multipliers", None) + super().__init__(*args, **kwargs) + if multipliers is None: + self.multipliers = numpy.ones_like(self.cat, dtype=numpy.float_) + else: + self.multipliers = multipliers + + def __mul__(self, other): + if isinstance(other, (InteractableDenseMatrix, float, int)): + result = copy.copy(self) + result.multipliers = result.multipliers * numpy.array(other) + return result + elif isinstance(other, InteractableSparseMatrix): + result = copy.copy(self) + result.multipliers = result.multipliers * other.todense() + return result + elif isinstance(other, InteractableCategoricalMatrix): + return self._interact_categorical(other) + else: + raise TypeError( + f"Can't multiply InteractableCategoricalMatrix with {type(other)}" + ) + + def __rmul__(self, other): + if isinstance(other, InteractableCategoricalMatrix): + other._interact_categorical(self) # order matters + else: + return self.__mul__(other) + + def to_non_interactable(self): + if numpy.all(self.multipliers == 1): + return CategoricalMatrix( + self.cat, + drop_first=self.drop_first, + dtype=self.dtype, + ) + else: + return SparseMatrix( + self.tocsr().multiply(self.multipliers[:, numpy.newaxis]) + ) + + def _interact_categorical(self, other): + cardinality_right = len(other.cat.categories) + + new_codes = self.cat.codes * cardinality_right + other.cat.codes + + if other.drop_first: + new_codes[new_codes % cardinality_right == 0] = 0 + new_codes -= new_codes // cardinality_right + left_shift = cardinality_right - 1 + right_slice = slice(1, None) + else: + left_shift = cardinality_right + right_slice = slice(None) + + if self.drop_first: + new_codes -= left_shift + new_codes[new_codes < 0] = 0 + left_slice = slice(1, None) + else: + left_slice = slice(None) + + new_categories = [ + f"{left_cat}__{right_cat}" + for left_cat, right_cat in itertools.product( + self.cat.categories[left_slice], other.cat.categories[right_slice] + ) + ] + + new_drop_first = self.drop_first or other.drop_first + if new_drop_first: + new_categories = ["__drop__"] + new_categories + + cat = pandas.Categorical.from_codes( + categories=new_categories, + codes=new_codes, + ordered=self.cat.ordered and other.cat.ordered, + ) + + return InteractableCategoricalMatrix( + cat, + multipliers=self.multipliers * other.multipliers, + drop_first=new_drop_first, + ) From 85da52ea5cec9eb683a87e428ad5c7b00e1974b3 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Mon, 19 Jun 2023 12:14:57 +0200 Subject: [PATCH 03/72] Have proper column names [skip ci] --- src/tabmat/formula.py | 48 ++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index 183ed5a6..d0868f38 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -87,8 +87,8 @@ def _combine_columns(self, cols, spec, drop_rows): # Otherwise, concatenate columns into SplitMatrix return SplitMatrix([col[1].to_non_interactable() for col in cols]) - # Have to override _build_model_matrix, too, because of tabmat/glum's way - # of handling intercepts and categorical variables. + # Have to override this because of culumn names + # (and possibly intercept later on) @override def _build_model_matrix(self, spec: ModelSpec, drop_rows): # Step 0: Apply any requested column/term clustering @@ -136,12 +136,26 @@ def _build_model_matrix(self, spec: ModelSpec, drop_rows): if spec.structure: cols = self._enforce_structure(cols, spec, drop_rows) else: + # for term, scoped_terms, columns in spec.structure: + # expanded_columns = list(itertools.chain(colname_dict[col] for col in columns)) + # expanded_structure.append( + # EncodedTermStructure(term, scoped_terms, expanded_columns) + # ) + spec = spec.update( structure=[ EncodedTermStructure( term, [st.copy(without_values=True) for st in scoped_terms], - _colnames_from_scoped_cols(scoped_cols), + # This is the only line that is different from the original: + list( + itertools.chain( + *( + mat.get_names(col) + for col, mat in scoped_cols.items() + ) + ) + ), ) for term, scoped_terms, scoped_cols in cols ], @@ -162,19 +176,6 @@ def _build_model_matrix(self, spec: ModelSpec, drop_rows): ) -def _colnames_from_scoped_cols(scoped_cols): - colnames = [] - for name, col in scoped_cols.items(): - if isinstance(col, CategoricalMatrix): - if col.drop_first: - colnames.extend([f"{name}__{cat}" for cat in col.cat.categories[1:]]) - else: - colnames.extend([f"{name}__{cat}" for cat in col.cat.categories]) - else: - colnames.append(name) - return colnames - - class InteractableDenseMatrix(DenseMatrix): def __mul__(self, other): if isinstance(other, (InteractableDenseMatrix, int, float)): @@ -193,6 +194,9 @@ def __rmul__(self, other): def to_non_interactable(self): return DenseMatrix(self) + def get_names(self, col): + return [col] + class InteractableSparseMatrix(SparseMatrix): def __mul__(self, other): @@ -212,6 +216,9 @@ def __rmul__(self, other): def to_non_interactable(self): return SparseMatrix(self) + def get_names(self, col): + return [col] + class InteractableCategoricalMatrix(CategoricalMatrix): def __init__(self, *args, **kwargs): @@ -278,7 +285,7 @@ def _interact_categorical(self, other): left_slice = slice(None) new_categories = [ - f"{left_cat}__{right_cat}" + f"{left_cat}:{right_cat}" for left_cat, right_cat in itertools.product( self.cat.categories[left_slice], other.cat.categories[right_slice] ) @@ -299,3 +306,10 @@ def _interact_categorical(self, other): multipliers=self.multipliers * other.multipliers, drop_first=new_drop_first, ) + + def get_names(self, col): + if self.drop_first: + categories = self.cat.categories[1:] + else: + categories = self.cat.categories + return [f"{col}[{cat}]" for cat in categories] From ce7dfaa7a9de05f1a95a1700cfa510f218b2c6fc Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Mon, 19 Jun 2023 13:45:54 +0200 Subject: [PATCH 04/72] Make dummy ordering consistent with pandas [skip ci] --- src/tabmat/formula.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index d0868f38..dae73acd 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -264,20 +264,20 @@ def to_non_interactable(self): ) def _interact_categorical(self, other): - cardinality_right = len(other.cat.categories) + cardinality_self = len(self.cat.categories) - new_codes = self.cat.codes * cardinality_right + other.cat.codes + new_codes = other.cat.codes * cardinality_self + self.cat.codes - if other.drop_first: - new_codes[new_codes % cardinality_right == 0] = 0 - new_codes -= new_codes // cardinality_right - left_shift = cardinality_right - 1 + if self.drop_first: + new_codes[new_codes % cardinality_self == 0] = 0 + new_codes -= new_codes // cardinality_self + left_shift = cardinality_self - 1 right_slice = slice(1, None) else: - left_shift = cardinality_right + left_shift = cardinality_self right_slice = slice(None) - if self.drop_first: + if other.drop_first: new_codes -= left_shift new_codes[new_codes < 0] = 0 left_slice = slice(1, None) @@ -285,9 +285,9 @@ def _interact_categorical(self, other): left_slice = slice(None) new_categories = [ - f"{left_cat}:{right_cat}" - for left_cat, right_cat in itertools.product( - self.cat.categories[left_slice], other.cat.categories[right_slice] + f"{self_cat}:{other_cat}" + for other_cat, self_cat in itertools.product( + other.cat.categories[left_slice], self.cat.categories[right_slice] ) ] From d23cca572402da50411876e4d16629a366113a80 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Mon, 19 Jun 2023 13:58:25 +0200 Subject: [PATCH 05/72] Fix mistake in categorical interactions [skip ci] --- src/tabmat/formula.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index dae73acd..83c58560 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -271,23 +271,21 @@ def _interact_categorical(self, other): if self.drop_first: new_codes[new_codes % cardinality_self == 0] = 0 new_codes -= new_codes // cardinality_self - left_shift = cardinality_self - 1 - right_slice = slice(1, None) + self_slice = slice(1, None) else: - left_shift = cardinality_self - right_slice = slice(None) + self_slice = slice(None) if other.drop_first: - new_codes -= left_shift + new_codes -= (cardinality_self - 1) new_codes[new_codes < 0] = 0 - left_slice = slice(1, None) + other_slice = slice(1, None) else: - left_slice = slice(None) + other_slice = slice(None) new_categories = [ f"{self_cat}:{other_cat}" for other_cat, self_cat in itertools.product( - other.cat.categories[left_slice], self.cat.categories[right_slice] + other.cat.categories[other_slice], self.cat.categories[self_slice] ) ] From 55b01bf25c06aff572778e802727ba590d5abfd3 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Mon, 19 Jun 2023 14:00:17 +0200 Subject: [PATCH 06/72] Add formulaic to environment files Have not added to the conda recipe yet. Should probably be optional. --- environment-win.yml | 1 + environment.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/environment-win.yml b/environment-win.yml index 79e9f4e4..dadb5a1a 100644 --- a/environment-win.yml +++ b/environment-win.yml @@ -5,6 +5,7 @@ channels: dependencies: - libblas>=0=*mkl - pandas + - formulaic # development tools - black diff --git a/environment.yml b/environment.yml index 1f4d34a2..bdc6b749 100644 --- a/environment.yml +++ b/environment.yml @@ -4,6 +4,7 @@ channels: - nodefaults dependencies: - pandas + - formulaic # development tools - black From 5b7da3c1af6095243a7e79cde31c08e653d35e86 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Mon, 19 Jun 2023 15:04:55 +0200 Subject: [PATCH 07/72] Add from_formula constructor --- src/tabmat/__init__.py | 5 ++--- src/tabmat/constructor.py | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/src/tabmat/__init__.py b/src/tabmat/__init__.py index 6aab6757..1d295c5e 100644 --- a/src/tabmat/__init__.py +++ b/src/tabmat/__init__.py @@ -1,7 +1,6 @@ from .categorical_matrix import CategoricalMatrix -from .constructor import from_csc, from_pandas +from .constructor import from_csc, from_formula, from_pandas from .dense_matrix import DenseMatrix -from .formula import TabmatMaterializer from .matrix_base import MatrixBase from .sparse_matrix import SparseMatrix from .split_matrix import SplitMatrix @@ -15,6 +14,6 @@ "SplitMatrix", "CategoricalMatrix", "from_csc", + "from_formula", "from_pandas", - "TabmatMaterializer", ] diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index b782f2da..dea7d879 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -1,13 +1,17 @@ +import sys import warnings from typing import List, Tuple, Union import numpy as np import pandas as pd +from formulaic import Formula, ModelSpec +from formulaic.utils.layered_mapping import LayeredMapping from pandas.api.types import is_numeric_dtype from scipy import sparse as sps from .categorical_matrix import CategoricalMatrix from .dense_matrix import DenseMatrix +from .formula import TabmatMaterializer from .matrix_base import MatrixBase from .sparse_matrix import SparseMatrix from .split_matrix import SplitMatrix @@ -198,3 +202,35 @@ def from_csc(mat: sps.csc_matrix, threshold=0.1): """ dense, sparse, dense_idx, sparse_idx = _split_sparse_and_dense_parts(mat, threshold) return SplitMatrix([dense, sparse], [dense_idx, sparse_idx]) + + +def from_formula( + formula: Union[str, Formula], + df: pd.DataFrame, + ensure_full_rank: bool = False, + context=0, +): + """ + Transform a pandas DataFrame to a SplitMatrix using a Wilkinson formula. + + Parameters + ---------- + formula: str + A formula accepted by formulaic. + df: pd.DataFrame + pandas DataFrame to be converted. + ensure_full_rank: bool, default False + If True, ensure that the matrix has full structural rank by categories. + """ + if isinstance(context, int): + if hasattr(sys, "_getframe"): + frame = sys._getframe(context + 1) + context = LayeredMapping(frame.f_locals, frame.f_globals) + else: + context = None # pragma: no cover + spec = ModelSpec( + formula=Formula(formula), + ensure_full_rank=ensure_full_rank, + ) + materializer = TabmatMaterializer(df, context=context) + return materializer.get_model_matrix(spec) From 51ecfc2ee624ceb8f433b4c3ed6cc51ea32c656f Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Mon, 19 Jun 2023 16:12:37 +0200 Subject: [PATCH 08/72] Add some tests --- tests/test_formula.py | 48 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 tests/test_formula.py diff --git a/tests/test_formula.py b/tests/test_formula.py new file mode 100644 index 00000000..6b46104d --- /dev/null +++ b/tests/test_formula.py @@ -0,0 +1,48 @@ +import formulaic +import numpy as np +import pandas as pd +import pytest + +import tabmat as tm + + +@pytest.fixture +def df(): + df = pd.DataFrame( + { + "num_1": [1.0, 2.0, 3.0, 4.0, 5.0], + "num_2": [5.0, 4.0, 3.0, 2.0, 1.0], + "cat_1": pd.Categorical(["a", "b", "c", "b", "a"]), + "cat_2": pd.Categorical(["x", "y", "z", "x", "y"]), + "cat_3": pd.Categorical(["1", "2", "1", "2", "1"]), + } + ) + return df + + +@pytest.mark.parametrize("ensure_full_rank", [True, False]) +@pytest.mark.parametrize( + "formula", + [ + pytest.param("num_1 + num_2", id="numeric"), + pytest.param("cat_1 + cat_2", id="categorical"), + pytest.param("cat_1 * cat_2 * cat_3", id="interaction"), + pytest.param("num_1 + cat_1 * num_2 * cat_2", id="mixed"), + pytest.param("{np.log(num_1)} + {num_in_scope * num_2}", id="functions"), + pytest.param("{num_1 * num_in_scope}", id="variable_in_scope"), + pytest.param("bs(num_1, 3)", id="spline"), + pytest.param( + "poly(num_1, 3, raw=True) + poly(num_2, 3, raw=False)", id="polynomial" + ), + pytest.param( + "C(cat_1, spans_intercept=False) * cat_2 * cat_3", + id="custom_contrasts", + marks=pytest.mark.xfail, + ), + ], +) +def test_against_pandas(df, formula, ensure_full_rank): + num_in_scope = 2 # noqa + model_df = formulaic.model_matrix(formula, df, ensure_full_rank=ensure_full_rank) + model_tabmat = tm.from_formula(formula, df, ensure_full_rank=ensure_full_rank) + np.testing.assert_array_equal(model_df.to_numpy(), model_tabmat.A) From ffa4955ed2acb0ab0771dde7144b7370d623ba6f Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Mon, 19 Jun 2023 17:47:04 +0200 Subject: [PATCH 09/72] Add more tests --- tests/test_formula.py | 149 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 147 insertions(+), 2 deletions(-) diff --git a/tests/test_formula.py b/tests/test_formula.py index 6b46104d..b16aa18e 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd import pytest +from scipy import sparse as sps import tabmat as tm @@ -20,7 +21,91 @@ def df(): return df -@pytest.mark.parametrize("ensure_full_rank", [True, False]) +@pytest.mark.parametrize( + "formula, expected", + [ + pytest.param( + "num_1", + tm.SplitMatrix( + [ + tm.DenseMatrix( + np.array( + [[1.0, 1.0, 1.0, 1.0, 1.0], [1.0, 2.0, 3.0, 4.0, 5.0]] + ).T + ) + ] + ), + id="numeric", + ), + pytest.param( + "cat_1", + tm.SplitMatrix( + [ + tm.DenseMatrix(np.array([[1.0, 1.0, 1.0, 1.0, 1.0]]).T), + tm.CategoricalMatrix( + pd.Categorical( + ["a", "b", "c", "b", "a"], categories=["a", "b", "c"] + ), + drop_first=True, + ), + ] + ), + id="categorical", + ), + pytest.param( + "num_1 : cat_1", + tm.SplitMatrix( + [ + tm.DenseMatrix(np.array([[1.0, 1.0, 1.0, 1.0, 1.0]]).T), + tm.SparseMatrix( + sps.csc_matrix( + np.array( + [ + [1.0, 0.0, 0.0, 0.0, 5.0], + [0.0, 2.0, 0.0, 4.0, 0.0], + [0.0, 0.0, 3.0, 0.0, 0.0], + ] + ).T + ) + ), + ] + ), + id="interaction_cat_num", + ), + pytest.param( + "cat_1 : cat_3 - 1", + tm.SplitMatrix( + [ + tm.CategoricalMatrix( + pd.Categorical( + ["a:1", "b:2", "c:1", "b:2", "a:1"], + categories=["a:1", "b:1", "c:1", "a:2", "c:2", "b:2"], + ), + drop_first=False, + ), + ] + ), + id="interaction_cat_cat", + ), + ], +) +def test_matrix_against_expectation(df, formula, expected): + model_df = tm.from_formula(formula, df, ensure_full_rank=True) + assert len(model_df.matrices) == len(expected.matrices) + for res, exp in zip(model_df.matrices, expected.matrices): + assert type(res) == type(exp) + if isinstance(res, tm.DenseMatrix): + np.testing.assert_array_equal(res, exp) + elif isinstance(res, tm.SparseMatrix): + np.testing.assert_array_equal(res.A, res.A) + elif isinstance(res, tm.CategoricalMatrix): + assert (exp.cat == res.cat).all() + assert exp.drop_first == res.drop_first + + +@pytest.mark.parametrize( + "ensure_full_rank", [True, False], ids=["full_rank", "all_levels"] +) @pytest.mark.parametrize( "formula", [ @@ -41,8 +126,68 @@ def df(): ), ], ) -def test_against_pandas(df, formula, ensure_full_rank): +def test_matrix_against_pandas(df, formula, ensure_full_rank): num_in_scope = 2 # noqa model_df = formulaic.model_matrix(formula, df, ensure_full_rank=ensure_full_rank) model_tabmat = tm.from_formula(formula, df, ensure_full_rank=ensure_full_rank) np.testing.assert_array_equal(model_df.to_numpy(), model_tabmat.A) + + +@pytest.mark.parametrize( + "formula, expected_names", + [ + pytest.param("num_1 + num_2", ("Intercept", "num_1", "num_2"), id="numeric"), + pytest.param("num_1 + num_2 - 1", ("num_1", "num_2"), id="no_intercept"), + pytest.param("cat_1", ("Intercept", "cat_1[b]", "cat_1[c]"), id="categorical"), + pytest.param( + "cat_2 * cat_3", + ( + "Intercept", + "cat_2[y]", + "cat_2[z]", + "cat_3[2]", + "cat_2:cat_3[y:2]", + "cat_2:cat_3[z:2]", + ), + id="interaction", + ), + pytest.param( + "poly(num_1, 3) - 1", + ("poly(num_1, 3)[1]", "poly(num_1, 3)[2]", "poly(num_1, 3)[3]"), + id="polynomial", + ), + pytest.param( + "{np.log(num_1 ** 2)}", ("Intercept", "np.log(num_1 ** 2)"), id="functions" + ), + ], +) +def test_names_against_expectation(df, formula, expected_names): + model_tabmat = tm.from_formula(formula, df, ensure_full_rank=True) + assert model_tabmat.model_spec.column_names == expected_names + + +@pytest.mark.parametrize( + "ensure_full_rank", [True, False], ids=["full_rank", "all_levels"] +) +@pytest.mark.parametrize( + "formula", + [ + pytest.param("num_1 + num_2", id="numeric"), + pytest.param("cat_1 + cat_2", id="categorical", marks=pytest.mark.xfail), + pytest.param( + "cat_1 * cat_2 * cat_3", id="interaction", marks=pytest.mark.xfail + ), + pytest.param("{np.log(num_1)} + {num_in_scope * num_2}", id="functions"), + pytest.param("{num_1 * num_in_scope}", id="variable_in_scope"), + pytest.param("bs(num_1, 3)", id="spline"), + pytest.param( + "poly(num_1, 3, raw=True) + poly(num_2, 3, raw=False)", id="polynomial" + ), + ], +) +def test_names_against_pandas(df, formula, ensure_full_rank): + num_in_scope = 2 # noqa + model_df = formulaic.model_matrix(formula, df, ensure_full_rank=ensure_full_rank) + model_tabmat = tm.from_formula(formula, df, ensure_full_rank=ensure_full_rank) + assert model_tabmat.model_spec.column_names == model_df.model_spec.column_names + assert model_tabmat.model_spec.column_names == tuple(model_df.columns) From a54a1a3b1202ebef608f78b7c553f6175a62ccec Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 21 Jun 2023 17:52:09 +0200 Subject: [PATCH 10/72] Major refactoring - simplify categorical interactions - NaNs in categoricals should be handled correctly - parity with formulaic in categorical names --- src/tabmat/formula.py | 307 ++++++++++++++++++++++++++++-------------- tests/test_formula.py | 47 +++++-- 2 files changed, 238 insertions(+), 116 deletions(-) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index 83c58560..88547a80 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -1,6 +1,8 @@ -import copy +import functools import itertools +from abc import ABC, abstractmethod from collections import OrderedDict +from typing import List, Optional import numpy import pandas @@ -9,6 +11,7 @@ from formulaic.materializers.base import EncodedTermStructure from formulaic.materializers.types import NAAction from interface_meta import override +from scipy import sparse from .categorical_matrix import CategoricalMatrix from .dense_matrix import DenseMatrix @@ -58,7 +61,7 @@ def _check_for_nulls(self, name, values, na_action, drop_rows): @override def _encode_constant(self, value, metadata, encoder_state, spec, drop_rows): series = value * numpy.ones(self.nrows - len(drop_rows)) - return InteractableDenseMatrix(series) + return _InteractableDenseColumn(series) @override def _encode_numerical(self, values, metadata, encoder_state, spec, drop_rows): @@ -66,7 +69,7 @@ def _encode_numerical(self, values, metadata, encoder_state, spec, drop_rows): values = values.drop(index=values.index[drop_rows]) if isinstance(values, pandas.Series): values = values.to_numpy() - return InteractableDenseMatrix(values) + return _InteractableDenseColumn(values) @override def _encode_categorical( @@ -75,7 +78,18 @@ def _encode_categorical( # We do not do any encoding here as it is handled by tabmat if drop_rows: values = values.drop(index=values.index[drop_rows]) - return InteractableCategoricalMatrix(values._values, drop_first=reduced_rank) + cat = values._values + categories = list(cat.categories) + codes = cat.codes.copy().astype(numpy.int64) + if reduced_rank: + codes[codes == 0] = -2 + codes[codes > 0] -= 1 + categories = categories[1:] + return _InteractableCategoricalColumn( + codes=codes, + categories=categories, + multipliers=numpy.ones(len(cat.codes)), + ) @override def _combine_columns(self, cols, spec, drop_rows): @@ -175,139 +189,226 @@ def _build_model_matrix(self, spec: ModelSpec, drop_rows): spec=spec, ) - -class InteractableDenseMatrix(DenseMatrix): - def __mul__(self, other): - if isinstance(other, (InteractableDenseMatrix, int, float)): - return self.multiply(other) - elif isinstance( - other, (InteractableSparseMatrix, InteractableCategoricalMatrix) + @override + def _get_columns_for_term(self, factors, spec, scale=1): + """Assemble the columns for a model matrix given factors and a scale.""" + out = OrderedDict() + for reverse_product in itertools.product( + *(factor.items() for factor in reversed(factors)) ): - return other.__mul__(self) - else: - raise TypeError(f"Cannot multiply {type(self)} and {type(other)}") - # Multiplication with sparse and categorical is handled by the other classes + product = reverse_product[::-1] + out[":".join(p[0] for p in product)] = scale * functools.reduce( + _interact, (p[1].set_name(p[0]) for p in product) + ) + return out + + +class _InteractableColumn(ABC): + name: Optional[str] + + @abstractmethod + def to_non_interactable(self): + pass + + @abstractmethod + def get_names(self, col): + pass + + @abstractmethod + def set_name(self, name): + pass + + +class _InteractableDenseColumn(_InteractableColumn): + def __init__(self, values: numpy.ndarray, name: Optional[str] = None): + self.values = values + self.name = name def __rmul__(self, other): - return self.__mul__(other) + if isinstance(other, (int, float)): + return _InteractableDenseColumn( + values=self.values * other, + name=self.name, + ) def to_non_interactable(self): - return DenseMatrix(self) + return DenseMatrix(self.values) def get_names(self, col): return [col] + def set_name(self, name): + self.name = name + return self -class InteractableSparseMatrix(SparseMatrix): - def __mul__(self, other): - if isinstance(other, (InteractableDenseMatrix, InteractableSparseMatrix)): - return self.multiply(other) - elif isinstance(other, InteractableCategoricalMatrix): - return other.__mul__(self) - elif isinstance(other, (int, float)): - return self.multiply(numpy.array(other)) - else: - raise TypeError(f"Cannot multiply {type(self)} and {type(other)}") - # Multiplication with categorical is handled by the categorical + +class _InteractableSparseColumn(_InteractableColumn): + def __init__(self, values: sparse.csc_matrix, name: Optional[str] = None): + self.values = values + self.name = name def __rmul__(self, other): - return self.__mul__(other) + if isinstance(other, (int, float)): + return _InteractableSparseColumn( + values=self.values * other, + name=self.name, + ) def to_non_interactable(self): - return SparseMatrix(self) + return SparseMatrix(self.values) def get_names(self, col): return [col] + def set_name(self, name): + self.name = name + return self -class InteractableCategoricalMatrix(CategoricalMatrix): - def __init__(self, *args, **kwargs): - multipliers = kwargs.pop("multipliers", None) - super().__init__(*args, **kwargs) - if multipliers is None: - self.multipliers = numpy.ones_like(self.cat, dtype=numpy.float_) - else: - self.multipliers = multipliers - - def __mul__(self, other): - if isinstance(other, (InteractableDenseMatrix, float, int)): - result = copy.copy(self) - result.multipliers = result.multipliers * numpy.array(other) - return result - elif isinstance(other, InteractableSparseMatrix): - result = copy.copy(self) - result.multipliers = result.multipliers * other.todense() - return result - elif isinstance(other, InteractableCategoricalMatrix): - return self._interact_categorical(other) - else: - raise TypeError( - f"Can't multiply InteractableCategoricalMatrix with {type(other)}" - ) + +class _InteractableCategoricalColumn(_InteractableColumn): + def __init__( + self, + codes: numpy.ndarray, + categories: List[str], + multipliers: numpy.ndarray, + name: Optional[str] = None, + ): + # sentinel values for codes: + # -1: missing + # -2: drop + self.codes = codes + self.categories = categories + self.multipliers = multipliers + self.name = None def __rmul__(self, other): - if isinstance(other, InteractableCategoricalMatrix): - other._interact_categorical(self) # order matters - else: - return self.__mul__(other) + if isinstance(other, (int, float)): + return _InteractableCategoricalColumn( + categories=self.categories, + codes=self.codes, + multipliers=self.multipliers * other, + ) def to_non_interactable(self): - if numpy.all(self.multipliers == 1): - return CategoricalMatrix( - self.cat, - drop_first=self.drop_first, - dtype=self.dtype, - ) + codes = self.codes.copy() + categories = self.categories.copy() + if -2 in self.codes: + codes[codes >= 0] += 1 + codes[codes == -2] = 0 + categories.insert(0, "__drop__") + drop_first = True else: - return SparseMatrix( - self.tocsr().multiply(self.multipliers[:, numpy.newaxis]) - ) + drop_first = False - def _interact_categorical(self, other): - cardinality_self = len(self.cat.categories) + cat = pandas.Categorical.from_codes( + codes=codes, + categories=categories, + ordered=False, + ) - new_codes = other.cat.codes * cardinality_self + self.cat.codes + categorical_part = CategoricalMatrix(cat, drop_first=drop_first) - if self.drop_first: - new_codes[new_codes % cardinality_self == 0] = 0 - new_codes -= new_codes // cardinality_self - self_slice = slice(1, None) + if (self.multipliers == 1).all(): + return categorical_part else: - self_slice = slice(None) + return SparseMatrix( + sparse.csc_matrix( + categorical_part.tocsr().multiply( + self.multipliers[:, numpy.newaxis] + ) + ) + ) + + def get_names(self, col): + return self.categories + + def set_name(self, name, name_format="{name}[T.{cat}]"): + if self.name is None: + # Make sure to only format the name once + self.name = name + self.categories = [ + name_format.format(name=name, cat=cat) for cat in self.categories + ] + return self + + +def _interact( + left: _InteractableColumn, right: _InteractableColumn, reverse=False, separator=":" +): + if isinstance(left, _InteractableDenseColumn): + if isinstance(right, _InteractableDenseColumn): + if not reverse: + new_name = f"{left.name}{separator}{right.name}" + else: + new_name = f"{right.name}{separator}{left.name}" + return _InteractableDenseColumn(left.values * right.values, name=new_name) - if other.drop_first: - new_codes -= (cardinality_self - 1) - new_codes[new_codes < 0] = 0 - other_slice = slice(1, None) else: - other_slice = slice(None) + return _interact(right, left, reverse=True, separator=separator) + + if isinstance(left, _InteractableSparseColumn): + if isinstance(right, (_InteractableDenseColumn, _InteractableSparseColumn)): + if not reverse: + new_name = f"{left.name}{separator}{right.name}" + else: + new_name = f"{right.name}{separator}{left.name}" + return _InteractableSparseColumn( + left.values.multiply(right.values), + name=new_name, + ) - new_categories = [ - f"{self_cat}:{other_cat}" - for other_cat, self_cat in itertools.product( - other.cat.categories[other_slice], self.cat.categories[self_slice] + else: + return _interact(right, left, reverse=True, separator=separator) + + if isinstance(left, _InteractableCategoricalColumn): + if isinstance(right, (_InteractableDenseColumn, _InteractableSparseColumn)): + if isinstance(right, _InteractableDenseColumn): + right_values = right.values + else: + right_values = right.values.todense() + if not reverse: + new_categories = [ + f"{cat}{separator}{right.name}" for cat in left.categories + ] + else: + new_categories = [ + f"{right.name}{separator}{cat}" for cat in left.categories + ] + return _InteractableCategoricalColumn( + left.codes, + new_categories, + left.multipliers * right_values, ) - ] - new_drop_first = self.drop_first or other.drop_first - if new_drop_first: - new_categories = ["__drop__"] + new_categories + elif isinstance(right, _InteractableCategoricalColumn): + return _interact_categoricals(left, right) - cat = pandas.Categorical.from_codes( - categories=new_categories, - codes=new_codes, - ordered=self.cat.ordered and other.cat.ordered, + raise TypeError( + f"Cannot interact {type(left).__name__} with {type(right).__name__}" ) - return InteractableCategoricalMatrix( - cat, - multipliers=self.multipliers * other.multipliers, - drop_first=new_drop_first, - ) - def get_names(self, col): - if self.drop_first: - categories = self.cat.categories[1:] - else: - categories = self.cat.categories - return [f"{col}[{cat}]" for cat in categories] +def _interact_categoricals( + left: _InteractableCategoricalColumn, + right: _InteractableCategoricalColumn, + separator=":", +): + cardinality_left = len(left.categories) + new_codes = right.codes * cardinality_left + left.codes + + na_mask = (left.codes == -1) | (right.codes == -1) + drop_mask = (left.codes == -2) | (right.codes == -2) + + new_codes[drop_mask] = -2 + new_codes[na_mask] = -1 + + new_categories = [ + f"{left_cat}{separator}{right_cat}" + for right_cat, left_cat in itertools.product(right.categories, left.categories) + ] + + return _InteractableCategoricalColumn( + codes=new_codes, + categories=new_categories, + multipliers=left.multipliers * right.multipliers, + ) diff --git a/tests/test_formula.py b/tests/test_formula.py index b16aa18e..2dff11a1 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -44,7 +44,14 @@ def df(): tm.DenseMatrix(np.array([[1.0, 1.0, 1.0, 1.0, 1.0]]).T), tm.CategoricalMatrix( pd.Categorical( - ["a", "b", "c", "b", "a"], categories=["a", "b", "c"] + [ + "__drop__", + "cat_1[T.b]", + "cat_1[T.c]", + "cat_1[T.b]", + "__drop__", + ], + categories=["__drop__", "cat_1[T.b]", "cat_1[T.c]"], ), drop_first=True, ), @@ -78,8 +85,21 @@ def df(): [ tm.CategoricalMatrix( pd.Categorical( - ["a:1", "b:2", "c:1", "b:2", "a:1"], - categories=["a:1", "b:1", "c:1", "a:2", "c:2", "b:2"], + [ + "cat_1[T.a]:cat_3[T.1]", + "cat_1[T.b]:cat_3[T.2]", + "cat_1[T.c]:cat_3[T.1]", + "cat_1[T.b]:cat_3[T.2]", + "cat_1[T.a]:cat_3[T.1]", + ], + categories=[ + "cat_1[T.a]:cat_3[T.1]", + "cat_1[T.b]:cat_3[T.1]", + "cat_1[T.c]:cat_3[T.1]", + "cat_1[T.a]:cat_3[T.2]", + "cat_1[T.c]:cat_3[T.2]", + "cat_1[T.b]:cat_3[T.2]", + ], ), drop_first=False, ), @@ -138,16 +158,18 @@ def test_matrix_against_pandas(df, formula, ensure_full_rank): [ pytest.param("num_1 + num_2", ("Intercept", "num_1", "num_2"), id="numeric"), pytest.param("num_1 + num_2 - 1", ("num_1", "num_2"), id="no_intercept"), - pytest.param("cat_1", ("Intercept", "cat_1[b]", "cat_1[c]"), id="categorical"), + pytest.param( + "cat_1", ("Intercept", "cat_1[T.b]", "cat_1[T.c]"), id="categorical" + ), pytest.param( "cat_2 * cat_3", ( "Intercept", - "cat_2[y]", - "cat_2[z]", - "cat_3[2]", - "cat_2:cat_3[y:2]", - "cat_2:cat_3[z:2]", + "cat_2[T.y]", + "cat_2[T.z]", + "cat_3[T.2]", + "cat_2[T.y]:cat_3[T.2]", + "cat_2[T.z]:cat_3[T.2]", ), id="interaction", ), @@ -173,10 +195,9 @@ def test_names_against_expectation(df, formula, expected_names): "formula", [ pytest.param("num_1 + num_2", id="numeric"), - pytest.param("cat_1 + cat_2", id="categorical", marks=pytest.mark.xfail), - pytest.param( - "cat_1 * cat_2 * cat_3", id="interaction", marks=pytest.mark.xfail - ), + pytest.param("cat_1 + cat_2", id="categorical"), + pytest.param("cat_1 * cat_2 * cat_3", id="interaction"), + pytest.param("num_1 + cat_1 * num_2 * cat_2", id="mixed"), pytest.param("{np.log(num_1)} + {num_in_scope * num_2}", id="functions"), pytest.param("{num_1 * num_in_scope}", id="variable_in_scope"), pytest.param("bs(num_1, 3)", id="spline"), From 6157acde910529e7e4c2b7a7870bea65100c3f4c Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 22 Jun 2023 11:28:52 +0200 Subject: [PATCH 11/72] Make name formatting custommizable - interaction_separator - categorical_format - intercept_name --- src/tabmat/constructor.py | 11 ++- src/tabmat/formula.py | 39 ++++++---- tests/test_formula.py | 147 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 181 insertions(+), 16 deletions(-) diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index dea7d879..9e2bad19 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -208,6 +208,9 @@ def from_formula( formula: Union[str, Formula], df: pd.DataFrame, ensure_full_rank: bool = False, + interaction_separator: str = ":", + categorical_format: str = "{name}[T.{category}]", + intercept_name: str = "Intercept", context=0, ): """ @@ -232,5 +235,11 @@ def from_formula( formula=Formula(formula), ensure_full_rank=ensure_full_rank, ) - materializer = TabmatMaterializer(df, context=context) + materializer = TabmatMaterializer( + df, + context=context, + interaction_separator=interaction_separator, + categorical_format=categorical_format, + intercept_name=intercept_name, + ) return materializer.get_model_matrix(spec) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index 88547a80..c9ceba17 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -26,6 +26,14 @@ class TabmatMaterializer(FormulaMaterializer): REGISTER_INPUTS = ("pandas.core.frame.DataFrame",) REGISTER_OUTPUTS = "tabmat" + @override + def _init(self): + self.interaction_separator = self.params.get("interaction_separator", ":") + self.categorical_format = self.params.get( + "categorical_format", "{name}:[{category}]" + ) + self.intercept_name = self.params.get("intercept_name", "Intercept") + @override def _is_categorical(self, values): if isinstance(values, (pandas.Series, pandas.Categorical)): @@ -61,7 +69,7 @@ def _check_for_nulls(self, name, values, na_action, drop_rows): @override def _encode_constant(self, value, metadata, encoder_state, spec, drop_rows): series = value * numpy.ones(self.nrows - len(drop_rows)) - return _InteractableDenseColumn(series) + return _InteractableDenseColumn(series, name=self.intercept_name) @override def _encode_numerical(self, values, metadata, encoder_state, spec, drop_rows): @@ -164,10 +172,7 @@ def _build_model_matrix(self, spec: ModelSpec, drop_rows): # This is the only line that is different from the original: list( itertools.chain( - *( - mat.get_names(col) - for col, mat in scoped_cols.items() - ) + *(mat.get_names() for mat in scoped_cols.values()) ) ), ) @@ -198,7 +203,11 @@ def _get_columns_for_term(self, factors, spec, scale=1): ): product = reverse_product[::-1] out[":".join(p[0] for p in product)] = scale * functools.reduce( - _interact, (p[1].set_name(p[0]) for p in product) + functools.partial(_interact, separator=self.interaction_separator), + ( + p[1].set_name(p[0], name_format=self.categorical_format) + for p in product + ), ) return out @@ -234,10 +243,10 @@ def __rmul__(self, other): def to_non_interactable(self): return DenseMatrix(self.values) - def get_names(self, col): - return [col] + def get_names(self): + return [self.name] - def set_name(self, name): + def set_name(self, name, name_format=None): self.name = name return self @@ -257,10 +266,10 @@ def __rmul__(self, other): def to_non_interactable(self): return SparseMatrix(self.values) - def get_names(self, col): - return [col] + def get_names(self): + return [self.name] - def set_name(self, name): + def set_name(self, name, name_format=None): self.name = name return self @@ -319,7 +328,7 @@ def to_non_interactable(self): ) ) - def get_names(self, col): + def get_names(self): return self.categories def set_name(self, name, name_format="{name}[T.{cat}]"): @@ -327,7 +336,7 @@ def set_name(self, name, name_format="{name}[T.{cat}]"): # Make sure to only format the name once self.name = name self.categories = [ - name_format.format(name=name, cat=cat) for cat in self.categories + name_format.format(name=name, category=cat) for cat in self.categories ] return self @@ -381,7 +390,7 @@ def _interact( ) elif isinstance(right, _InteractableCategoricalColumn): - return _interact_categoricals(left, right) + return _interact_categoricals(left, right, separator=separator) raise TypeError( f"Cannot interact {type(left).__name__} with {type(right).__name__}" diff --git a/tests/test_formula.py b/tests/test_formula.py index 2dff11a1..0a7235c1 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -123,6 +123,115 @@ def test_matrix_against_expectation(df, formula, expected): assert exp.drop_first == res.drop_first +@pytest.mark.parametrize( + "formula, expected", + [ + pytest.param( + "num_1", + tm.SplitMatrix( + [ + tm.DenseMatrix( + np.array( + [[1.0, 1.0, 1.0, 1.0, 1.0], [1.0, 2.0, 3.0, 4.0, 5.0]] + ).T + ) + ] + ), + id="numeric", + ), + pytest.param( + "cat_1", + tm.SplitMatrix( + [ + tm.DenseMatrix(np.array([[1.0, 1.0, 1.0, 1.0, 1.0]]).T), + tm.CategoricalMatrix( + pd.Categorical( + [ + "__drop__", + "cat_1__b", + "cat_1__c", + "cat_1__b", + "__drop__", + ], + categories=["__drop__", "cat_1__b", "cat_1__c"], + ), + drop_first=True, + ), + ] + ), + id="categorical", + ), + pytest.param( + "num_1 : cat_1", + tm.SplitMatrix( + [ + tm.DenseMatrix(np.array([[1.0, 1.0, 1.0, 1.0, 1.0]]).T), + tm.SparseMatrix( + sps.csc_matrix( + np.array( + [ + [1.0, 0.0, 0.0, 0.0, 5.0], + [0.0, 2.0, 0.0, 4.0, 0.0], + [0.0, 0.0, 3.0, 0.0, 0.0], + ] + ).T + ) + ), + ] + ), + id="interaction_cat_num", + ), + pytest.param( + "cat_1 : cat_3 - 1", + tm.SplitMatrix( + [ + tm.CategoricalMatrix( + pd.Categorical( + [ + "cat_1__a__x__cat_3__1", + "cat_1__b__x__cat_3__2", + "cat_1__c__x__cat_3__1", + "cat_1__b__x__cat_3__2", + "cat_1__a__x__cat_3__1", + ], + categories=[ + "cat_1__a__x__cat_3__1", + "cat_1__b__x__cat_3__1", + "cat_1__c__x__cat_3__1", + "cat_1__a__x__cat_3__2", + "cat_1__c__x__cat_3__2", + "cat_1__b__x__cat_3__2", + ], + ), + drop_first=False, + ), + ] + ), + id="interaction_cat_cat", + ), + ], +) +def test_matrix_against_expectation_qcl(df, formula, expected): + model_df = tm.from_formula( + formula, + df, + ensure_full_rank=True, + interaction_separator="__x__", + categorical_format="{name}__{category}", + intercept_name="intercept", + ) + assert len(model_df.matrices) == len(expected.matrices) + for res, exp in zip(model_df.matrices, expected.matrices): + assert type(res) == type(exp) + if isinstance(res, tm.DenseMatrix): + np.testing.assert_array_equal(res, exp) + elif isinstance(res, tm.SparseMatrix): + np.testing.assert_array_equal(res.A, res.A) + elif isinstance(res, tm.CategoricalMatrix): + assert (exp.cat == res.cat).all() + assert exp.drop_first == res.drop_first + + @pytest.mark.parametrize( "ensure_full_rank", [True, False], ids=["full_rank", "all_levels"] ) @@ -188,6 +297,44 @@ def test_names_against_expectation(df, formula, expected_names): assert model_tabmat.model_spec.column_names == expected_names +@pytest.mark.parametrize( + "formula, expected_names", + [ + pytest.param("cat_1", ("intercept", "cat_1__b", "cat_1__c"), id="categorical"), + pytest.param( + "cat_2 * cat_3", + ( + "intercept", + "cat_2__y", + "cat_2__z", + "cat_3__2", + "cat_2__y__x__cat_3__2", + "cat_2__z__x__cat_3__2", + ), + id="interaction", + ), + pytest.param( + "poly(num_1, 3) - 1", + ("poly(num_1, 3)[1]", "poly(num_1, 3)[2]", "poly(num_1, 3)[3]"), + id="polynomial", + ), + pytest.param( + "{np.log(num_1 ** 2)}", ("intercept", "np.log(num_1 ** 2)"), id="functions" + ), + ], +) +def test_names_against_expectation_qcl(df, formula, expected_names): + model_tabmat = tm.from_formula( + formula, + df, + ensure_full_rank=True, + categorical_format="{name}__{category}", + interaction_separator="__x__", + intercept_name="intercept", + ) + assert model_tabmat.model_spec.column_names == expected_names + + @pytest.mark.parametrize( "ensure_full_rank", [True, False], ids=["full_rank", "all_levels"] ) From c9959cca6119648fec9fb68634bf03c02a6a817a Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 22 Jun 2023 11:29:24 +0200 Subject: [PATCH 12/72] Add formulaic to conda recipe --- conda.recipe/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index de1ec993..6855671a 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -38,6 +38,7 @@ requirements: - {{ pin_compatible('numpy') }} - pandas - scipy + - formulaic test: requires: From 64af94406650294b334818a5761cda61ec832206 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 22 Jun 2023 13:46:01 +0200 Subject: [PATCH 13/72] Implement `C()` function to convert to categoricals --- src/tabmat/constructor.py | 6 ++-- src/tabmat/formula.py | 64 ++++++++++++++++++++++++++++++++------- tests/test_formula.py | 13 +++++++- 3 files changed, 69 insertions(+), 14 deletions(-) diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index 9e2bad19..f02e50a6 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -11,7 +11,7 @@ from .categorical_matrix import CategoricalMatrix from .dense_matrix import DenseMatrix -from .formula import TabmatMaterializer +from .formula import _C, TabmatMaterializer from .matrix_base import MatrixBase from .sparse_matrix import SparseMatrix from .split_matrix import SplitMatrix @@ -229,8 +229,10 @@ def from_formula( if hasattr(sys, "_getframe"): frame = sys._getframe(context + 1) context = LayeredMapping(frame.f_locals, frame.f_globals) + # We can override the built-in C function here + context["C"] = _C else: - context = None # pragma: no cover + context = {"C": _C} # pragma: no cover spec = ModelSpec( formula=Formula(formula), ensure_full_rank=ensure_full_rank, diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index c9ceba17..19800936 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -9,7 +9,7 @@ from formulaic import ModelMatrix, ModelSpec from formulaic.materializers import FormulaMaterializer from formulaic.materializers.base import EncodedTermStructure -from formulaic.materializers.types import NAAction +from formulaic.materializers.types import FactorValues, NAAction from interface_meta import override from scipy import sparse @@ -87,16 +87,8 @@ def _encode_categorical( if drop_rows: values = values.drop(index=values.index[drop_rows]) cat = values._values - categories = list(cat.categories) - codes = cat.codes.copy().astype(numpy.int64) - if reduced_rank: - codes[codes == 0] = -2 - codes[codes > 0] -= 1 - categories = categories[1:] - return _InteractableCategoricalColumn( - codes=codes, - categories=categories, - multipliers=numpy.ones(len(cat.codes)), + return _InteractableCategoricalColumn.from_categorical( + cat, reduced_rank=reduced_rank ) @override @@ -290,6 +282,20 @@ def __init__( self.multipliers = multipliers self.name = None + @classmethod + def from_categorical(cls, cat: pandas.Categorical, reduced_rank: bool): + categories = list(cat.categories) + codes = cat.codes.copy().astype(numpy.int64) + if reduced_rank: + codes[codes == 0] = -2 + codes[codes > 0] -= 1 + categories = categories[1:] + return cls( + codes=codes, + categories=categories, + multipliers=numpy.ones(len(cat.codes)), + ) + def __rmul__(self, other): if isinstance(other, (int, float)): return _InteractableCategoricalColumn( @@ -421,3 +427,39 @@ def _interact_categoricals( categories=new_categories, multipliers=left.multipliers * right.multipliers, ) + + +def _C( + data, + *, + spans_intercept: bool = True, +): + """ + Mark data as being categorical. + + A reduced-functionality version of the ``formulaic`` ``C()`` function. It does not + support custom contrasts or the level argument, but it allows setting + ``spans_intercept=False`` to avoid dropping categories. + """ + + def encoder( + values, + reduced_rank, + drop_rows, + encoder_state, + model_spec, + ): + values = pandas.Series(values).astype("category") + if drop_rows: + values = values.drop(index=values.index[drop_rows]) + cat = values._values + return _InteractableCategoricalColumn.from_categorical( + cat, reduced_rank=reduced_rank + ) + + return FactorValues( + data, + kind="categorical", + spans_intercept=spans_intercept, + encoder=encoder, + ) diff --git a/tests/test_formula.py b/tests/test_formula.py index 0a7235c1..8cc77d75 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -248,10 +248,13 @@ def test_matrix_against_expectation_qcl(df, formula, expected): pytest.param( "poly(num_1, 3, raw=True) + poly(num_2, 3, raw=False)", id="polynomial" ), + pytest.param( + "C(num_1)", + id="convert_to_categorical", + ), pytest.param( "C(cat_1, spans_intercept=False) * cat_2 * cat_3", id="custom_contrasts", - marks=pytest.mark.xfail, ), ], ) @@ -351,6 +354,14 @@ def test_names_against_expectation_qcl(df, formula, expected_names): pytest.param( "poly(num_1, 3, raw=True) + poly(num_2, 3, raw=False)", id="polynomial" ), + pytest.param( + "C(num_1)", + id="convert_to_categorical", + ), + pytest.param( + "C(cat_1, spans_intercept=False) * cat_2 * cat_3", + id="custom_contrasts", + ), ], ) def test_names_against_pandas(df, formula, ensure_full_rank): From 8abca00ec54e6f724cf0ab0b4281bc52dc073873 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 22 Jun 2023 14:22:40 +0200 Subject: [PATCH 14/72] Auto-convert strings to categories --- src/tabmat/formula.py | 2 +- tests/test_formula.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index 19800936..41a12efd 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -86,7 +86,7 @@ def _encode_categorical( # We do not do any encoding here as it is handled by tabmat if drop_rows: values = values.drop(index=values.index[drop_rows]) - cat = values._values + cat = pandas.Categorical(values._values) return _InteractableCategoricalColumn.from_categorical( cat, reduced_rank=reduced_rank ) diff --git a/tests/test_formula.py b/tests/test_formula.py index 8cc77d75..b197b502 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -16,6 +16,7 @@ def df(): "cat_1": pd.Categorical(["a", "b", "c", "b", "a"]), "cat_2": pd.Categorical(["x", "y", "z", "x", "y"]), "cat_3": pd.Categorical(["1", "2", "1", "2", "1"]), + "str_1": ["a", "b", "c", "b", "a"], } ) return df @@ -256,6 +257,7 @@ def test_matrix_against_expectation_qcl(df, formula, expected): "C(cat_1, spans_intercept=False) * cat_2 * cat_3", id="custom_contrasts", ), + pytest.param("str_1", id="string_as_categorical"), ], ) def test_matrix_against_pandas(df, formula, ensure_full_rank): From 124d47cfec3eac59443f26317acb0777fbb202bf Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 22 Jun 2023 17:05:29 +0200 Subject: [PATCH 15/72] Fix C() not working from materializer interface --- src/tabmat/constructor.py | 6 ++---- src/tabmat/formula.py | 9 ++++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index f02e50a6..4d000726 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -11,7 +11,7 @@ from .categorical_matrix import CategoricalMatrix from .dense_matrix import DenseMatrix -from .formula import _C, TabmatMaterializer +from .formula import TabmatMaterializer from .matrix_base import MatrixBase from .sparse_matrix import SparseMatrix from .split_matrix import SplitMatrix @@ -229,10 +229,8 @@ def from_formula( if hasattr(sys, "_getframe"): frame = sys._getframe(context + 1) context = LayeredMapping(frame.f_locals, frame.f_globals) - # We can override the built-in C function here - context["C"] = _C else: - context = {"C": _C} # pragma: no cover + context = None spec = ModelSpec( formula=Formula(formula), ensure_full_rank=ensure_full_rank, diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index 41a12efd..6a535a82 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -30,10 +30,13 @@ class TabmatMaterializer(FormulaMaterializer): def _init(self): self.interaction_separator = self.params.get("interaction_separator", ":") self.categorical_format = self.params.get( - "categorical_format", "{name}:[{category}]" + "categorical_format", "{name}[T.{category}]" ) self.intercept_name = self.params.get("intercept_name", "Intercept") + # We can override formulaic's C() function here + self.context["C"] = _C + @override def _is_categorical(self, values): if isinstance(values, (pandas.Series, pandas.Categorical)): @@ -77,7 +80,7 @@ def _encode_numerical(self, values, metadata, encoder_state, spec, drop_rows): values = values.drop(index=values.index[drop_rows]) if isinstance(values, pandas.Series): values = values.to_numpy() - return _InteractableDenseColumn(values) + return _InteractableDenseColumn(values.astype(numpy.float_)) @override def _encode_categorical( @@ -337,7 +340,7 @@ def to_non_interactable(self): def get_names(self): return self.categories - def set_name(self, name, name_format="{name}[T.{cat}]"): + def set_name(self, name, name_format="{name}[T.{category}]"): if self.name is None: # Make sure to only format the name once self.name = name From bb1faf631c6e2ceaaa6d564fc2d610dae583b0b3 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 22 Jun 2023 17:06:48 +0200 Subject: [PATCH 16/72] Add the pandasmaterializer tests from formulaic --- tests/test_formula.py | 330 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 330 insertions(+) diff --git a/tests/test_formula.py b/tests/test_formula.py index b197b502..93397002 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -1,10 +1,17 @@ +import pickle +import re +from io import BytesIO + import formulaic import numpy as np import pandas as pd import pytest +from formulaic.materializers.types import EvaluatedFactor, FactorValues +from formulaic.parser.types import Factor from scipy import sparse as sps import tabmat as tm +from tabmat.formula import TabmatMaterializer @pytest.fixture @@ -372,3 +379,326 @@ def test_names_against_pandas(df, formula, ensure_full_rank): model_tabmat = tm.from_formula(formula, df, ensure_full_rank=ensure_full_rank) assert model_tabmat.model_spec.column_names == model_df.model_spec.column_names assert model_tabmat.model_spec.column_names == tuple(model_df.columns) + + +FORMULAIC_TESTS = { + # '': (, , , ) + "a": (["Intercept", "a"], ["Intercept", "a"], ["Intercept", "a"], 2), + "A": ( + ["Intercept", "A[T.b]", "A[T.c]"], + ["Intercept", "A[T.a]", "A[T.b]", "A[T.c]"], + ["Intercept", "A[T.c]"], + 2, + ), + "C(A)": ( + ["Intercept", "C(A)[T.b]", "C(A)[T.c]"], + ["Intercept", "C(A)[T.a]", "C(A)[T.b]", "C(A)[T.c]"], + ["Intercept", "C(A)[T.c]"], + 2, + ), + "A:a": ( + ["Intercept", "A[T.a]:a", "A[T.b]:a", "A[T.c]:a"], + ["Intercept", "A[T.a]:a", "A[T.b]:a", "A[T.c]:a"], + ["Intercept", "A[T.a]:a"], + 1, + ), + "A:B": ( + [ + "Intercept", + "B[T.b]", + "B[T.c]", + "A[T.b]:B[T.a]", + "A[T.c]:B[T.a]", + "A[T.b]:B[T.b]", + "A[T.c]:B[T.b]", + "A[T.b]:B[T.c]", + "A[T.c]:B[T.c]", + ], + [ + "Intercept", + "A[T.a]:B[T.a]", + "A[T.b]:B[T.a]", + "A[T.c]:B[T.a]", + "A[T.a]:B[T.b]", + "A[T.b]:B[T.b]", + "A[T.c]:B[T.b]", + "A[T.a]:B[T.c]", + "A[T.b]:B[T.c]", + "A[T.c]:B[T.c]", + ], + ["Intercept"], + 1, + ), +} + + +class TestFormulaicTests: + @pytest.fixture + def data(self): + return pd.DataFrame( + {"a": [1, 2, 3], "b": [1, 2, 3], "A": ["a", "b", "c"], "B": ["a", "b", "c"]} + ) + + @pytest.fixture + def data_with_nulls(self): + return pd.DataFrame( + {"a": [1, 2, None], "A": ["a", None, "c"], "B": ["a", "b", None]} + ) + + @pytest.fixture + def materializer(self, data): + return TabmatMaterializer(data) + + @pytest.mark.parametrize("formula,tests", FORMULAIC_TESTS.items()) + def test_get_model_matrix(self, materializer, formula, tests): + mm = materializer.get_model_matrix(formula, ensure_full_rank=True) + assert isinstance(mm, tm.MatrixBase) + assert mm.shape == (3, len(tests[0])) + assert list(mm.model_spec.column_names) == tests[0] + + mm = materializer.get_model_matrix(formula, ensure_full_rank=False) + assert isinstance(mm, tm.MatrixBase) + assert mm.shape == (3, len(tests[1])) + assert list(mm.model_spec.column_names) == tests[1] + + def test_get_model_matrix_edge_cases(self, materializer): + mm = materializer.get_model_matrix(("a",), ensure_full_rank=True) + assert isinstance(mm, formulaic.ModelMatrices) + assert isinstance(mm[0], tm.MatrixBase) + + mm = materializer.get_model_matrix("a ~ A", ensure_full_rank=True) + assert isinstance(mm, formulaic.ModelMatrices) + assert "lhs" in mm.model_spec + assert "rhs" in mm.model_spec + + mm = materializer.get_model_matrix(("a ~ A",), ensure_full_rank=True) + assert isinstance(mm, formulaic.ModelMatrices) + assert isinstance(mm[0], formulaic.ModelMatrices) + + def test_get_model_matrix_invalid_output(self, materializer): + with pytest.raises( + formulaic.errors.FormulaMaterializationError, + match=r"Nominated output .* is invalid\. Available output types are: ", + ): + materializer.get_model_matrix( + "a", ensure_full_rank=True, output="invalid_output" + ) + + @pytest.mark.parametrize("formula,tests", FORMULAIC_TESTS.items()) + def test_na_handling(self, data_with_nulls, formula, tests): + mm = TabmatMaterializer(data_with_nulls).get_model_matrix(formula) + assert isinstance(mm, tm.MatrixBase) + assert mm.shape == (tests[3], len(tests[2])) + assert list(mm.model_spec.column_names) == tests[2] + + # Tabmat does not allo NAs in categoricals + if formula == "a": + mm = TabmatMaterializer(data_with_nulls).get_model_matrix( + formula, na_action="ignore" + ) + assert isinstance(mm, tm.MatrixBase) + assert mm.shape == (3, len(tests[0]) + (-1 if "A" in formula else 0)) + + if formula != "C(A)": # C(A) pre-encodes the data, stripping out nulls. + with pytest.raises(ValueError): + TabmatMaterializer(data_with_nulls).get_model_matrix( + formula, na_action="raise" + ) + + def test_state(self, materializer): + mm = materializer.get_model_matrix("center(a) - 1") + assert isinstance(mm, tm.MatrixBase) + assert list(mm.model_spec.column_names) == ["center(a)"] + assert np.allclose(mm.getcol(0).squeeze(), [-1, 0, 1]) + + mm2 = TabmatMaterializer(pd.DataFrame({"a": [4, 5, 6]})).get_model_matrix( + mm.model_spec + ) + assert isinstance(mm2, tm.MatrixBase) + assert list(mm2.model_spec.column_names) == ["center(a)"] + assert np.allclose(mm2.getcol(0).squeeze(), [2, 3, 4]) + + mm3 = mm.model_spec.get_model_matrix(pd.DataFrame({"a": [4, 5, 6]})) + assert isinstance(mm3, tm.MatrixBase) + assert list(mm3.model_spec.column_names) == ["center(a)"] + assert np.allclose(mm3.getcol(0).squeeze(), [2, 3, 4]) + + def test_factor_evaluation_edge_cases(self, materializer): + # Test that categorical kinds are set if type would otherwise be numerical + ev_factor = materializer._evaluate_factor( + Factor("a", eval_method="lookup", kind="categorical"), + formulaic.model_spec.ModelSpec(formula=[]), + drop_rows=set(), + ) + assert ev_factor.metadata.kind.value == "categorical" + + # Test that other kind mismatches result in an exception + materializer.factor_cache = {} + with pytest.raises( + formulaic.errors.FactorEncodingError, + match=re.escape( + "Factor `A` is expecting values of kind 'numerical', " + "but they are actually of kind 'categorical'." + ), + ): + materializer._evaluate_factor( + Factor("A", eval_method="lookup", kind="numerical"), + formulaic.model_spec.ModelSpec(formula=[]), + drop_rows=set(), + ) + + # Test that if an encoding has already been determined, that an exception is raised + # if the new encoding does not match + materializer.factor_cache = {} + with pytest.raises( + formulaic.errors.FactorEncodingError, + match=re.escape( + "The model specification expects factor `a` to have values of kind " + "`categorical`, but they are actually of kind `numerical`." + ), + ): + materializer._evaluate_factor( + Factor("a", eval_method="lookup", kind="numerical"), + formulaic.model_spec.ModelSpec( + formula=[], encoder_state={"a": ("categorical", {})} + ), + drop_rows=set(), + ) + + def test__is_categorical(self, materializer): + assert materializer._is_categorical([1, 2, 3]) is False + assert materializer._is_categorical(pd.Series(["a", "b", "c"])) is True + assert materializer._is_categorical(pd.Categorical(["a", "b", "c"])) is True + assert materializer._is_categorical(FactorValues({}, kind="categorical")) + + def test_encoding_edge_cases(self, materializer): + # Verify that constant encoding works well + encoded_factor = materializer._encode_evaled_factor( + factor=EvaluatedFactor( + factor=Factor("10", eval_method="literal", kind="constant"), + values=FactorValues(10, kind="constant"), + ), + spec=formulaic.model_spec.ModelSpec(formula=[]), + drop_rows=[], + ) + np.testing.assert_array_equal(encoded_factor["10"].values, [10, 10, 10]) + + # Verify that unencoded dictionaries with drop-fields work + encoded_factor = materializer._encode_evaled_factor( + factor=EvaluatedFactor( + factor=Factor("a", eval_method="lookup", kind="numerical"), + values=FactorValues( + {"a": pd.Series([1, 2, 3]), "b": pd.Series([4, 5, 6])}, + kind="numerical", + spans_intercept=True, + drop_field="a", + ), + ), + spec=formulaic.model_spec.ModelSpec(formula=[]), + drop_rows=set(), + ) + np.testing.assert_array_equal(encoded_factor["a[a]"].values, [1, 2, 3]) + np.testing.assert_array_equal(encoded_factor["a[b]"].values, [4, 5, 6]) + + encoded_factor = materializer._encode_evaled_factor( + factor=EvaluatedFactor( + factor=Factor("a", eval_method="lookup", kind="numerical"), + values=FactorValues( + {"a": pd.Series([1, 2, 3]), "b": pd.Series([4, 5, 6])}, + kind="numerical", + spans_intercept=True, + drop_field="a", + ), + ), + spec=formulaic.model_spec.ModelSpec(formula=[]), + drop_rows=set(), + reduced_rank=True, + ) + np.testing.assert_array_equal(encoded_factor["a[b]"].values, [4, 5, 6]) + + # Verify that encoding of nested dictionaries works well + encoded_factor = materializer._encode_evaled_factor( + factor=EvaluatedFactor( + factor=Factor("A", eval_method="python", kind="numerical"), + values=FactorValues( + { + "a": pd.Series([1, 2, 3]), + "b": pd.Series([4, 5, 6]), + "__metadata__": None, + }, + kind="numerical", + ), + ), + spec=formulaic.model_spec.ModelSpec(formula=[]), + drop_rows=[], + ) + np.testing.assert_array_equal(encoded_factor["A[a]"].values, [1, 2, 3]) + + encoded_factor = materializer._encode_evaled_factor( + factor=EvaluatedFactor( + factor=Factor("B", eval_method="python", kind="categorical"), + values=FactorValues( + {"a": pd.Series(["a", "b", "c"])}, kind="categorical" + ), + ), + spec=formulaic.model_spec.ModelSpec(formula=[]), + drop_rows=[], + ) + encoded_matrix = encoded_factor["B[a]"].set_name("B[a]").to_non_interactable() + assert list(encoded_matrix.cat) == ["B[a][T.a]", "B[a][T.b]", "B[a][T.c]"] + + @pytest.mark.xfail(reason="Cannot create an empty SplitMatrix in tabmat") + def test_empty(self, materializer): + mm = materializer.get_model_matrix("0", ensure_full_rank=True) + assert mm.shape[1] == 0 + mm = materializer.get_model_matrix("0", ensure_full_rank=False) + assert mm.shape[1] == 0 + + def test_category_reordering(self): + data = pd.DataFrame({"A": ["a", "b", "c"]}) + data2 = pd.DataFrame({"A": ["c", "b", "a"]}) + data3 = pd.DataFrame( + {"A": pd.Categorical(["c", "b", "a"], categories=["c", "b", "a"])} + ) + + m = TabmatMaterializer(data).get_model_matrix("A + 0", ensure_full_rank=False) + assert list(m.model_spec.column_names) == ["A[T.a]", "A[T.b]", "A[T.c]"] + + m2 = TabmatMaterializer(data2).get_model_matrix("A + 0", ensure_full_rank=False) + assert list(m2.model_spec.column_names) == ["A[T.a]", "A[T.b]", "A[T.c]"] + + m3 = TabmatMaterializer(data3).get_model_matrix("A + 0", ensure_full_rank=False) + assert list(m3.model_spec.column_names) == ["A[T.c]", "A[T.b]", "A[T.a]"] + + def test_term_clustering(self, materializer): + assert materializer.get_model_matrix( + "a + b + a:A + b:A" + ).model_spec.column_names == ( + "Intercept", + "a", + "b", + "a:A[T.b]", + "a:A[T.c]", + "b:A[T.b]", + "b:A[T.c]", + ) + assert materializer.get_model_matrix( + "a + b + a:A + b:A", cluster_by="numerical_factors" + ).model_spec.column_names == ( + "Intercept", + "a", + "a:A[T.b]", + "a:A[T.c]", + "b", + "b:A[T.b]", + "b:A[T.c]", + ) + + def test_model_spec_pickleable(self, materializer): + o = BytesIO() + ms = materializer.get_model_matrix("a ~ a:A") + pickle.dump(ms.model_spec, o) + o.seek(0) + ms2 = pickle.load(o) + assert isinstance(ms, formulaic.parser.types.Structured) + assert ms2.lhs.formula.root == ["a"] From 5716573b96a9cf7c84b8d8037b274c85b4237402 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 22 Jun 2023 17:11:22 +0200 Subject: [PATCH 17/72] Add formulaic to setup.py deps --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4c2a36ed..0ee93ea4 100644 --- a/setup.py +++ b/setup.py @@ -157,7 +157,7 @@ ], package_dir={"": "src"}, packages=find_packages(where="src"), - install_requires=["numpy", "pandas", "scipy"], + install_requires=["numpy", "pandas", "scipy", "formulaic"], ext_modules=cythonize( ext_modules, annotate=False, From 21fcdff454c6c67cbee5c4b6cd84e6f61aa78b91 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 22 Jun 2023 17:21:56 +0200 Subject: [PATCH 18/72] Implement suggestions from code review --- src/tabmat/constructor.py | 9 +++++++-- src/tabmat/formula.py | 12 +++--------- tests/test_formula.py | 9 +++++++++ 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index 4d000726..b74deaf6 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -214,16 +214,21 @@ def from_formula( context=0, ): """ - Transform a pandas DataFrame to a SplitMatrix using a Wilkinson formula. + Transform a pandas data frame to a SplitMatrix using a Wilkinson formula. Parameters ---------- formula: str A formula accepted by formulaic. df: pd.DataFrame - pandas DataFrame to be converted. + pandas data frame to be converted. ensure_full_rank: bool, default False If True, ensure that the matrix has full structural rank by categories. + context: + The context to use for evaluating the formula. If an integer, the + context is taken from the stack frame of the caller at the given + depth. If None, the context is taken from the stack frame of the + caller at depth 1. If a dict, it is used as the context directly. """ if isinstance(context, int): if hasattr(sys, "_getframe"): diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index 6a535a82..b789067d 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -50,13 +50,7 @@ def _check_for_nulls(self, name, values, na_action, drop_rows): if na_action is NAAction.IGNORE: return - if isinstance( - values, dict - ): # pragma: no cover; no formulaic transforms return dictionaries any more - for key, vs in values.items(): - self._check_for_nulls(f"{name}[{key}]", vs, na_action, drop_rows) - - elif na_action is NAAction.RAISE: + if na_action is NAAction.RAISE: if isinstance(values, pandas.Series) and values.isnull().values.any(): raise ValueError(f"`{name}` contains null values after evaluation.") @@ -67,7 +61,7 @@ def _check_for_nulls(self, name, values, na_action, drop_rows): else: raise ValueError( f"Do not know how to interpret `na_action` = {repr(na_action)}." - ) # pragma: no cover; this is currently impossible to reach + ) @override def _encode_constant(self, value, metadata, encoder_state, spec, drop_rows): @@ -104,7 +98,7 @@ def _combine_columns(self, cols, spec, drop_rows): # Otherwise, concatenate columns into SplitMatrix return SplitMatrix([col[1].to_non_interactable() for col in cols]) - # Have to override this because of culumn names + # Have to override this because of column names # (and possibly intercept later on) @override def _build_model_matrix(self, spec: ModelSpec, drop_rows): diff --git a/tests/test_formula.py b/tests/test_formula.py index 93397002..55f5ddc8 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd import pytest +from formulaic.materializers import FormulaMaterializer from formulaic.materializers.types import EvaluatedFactor, FactorValues from formulaic.parser.types import Factor from scipy import sparse as sps @@ -29,6 +30,14 @@ def df(): return df +def test_retrieval(): + assert FormulaMaterializer.for_materializer("tabmat") is TabmatMaterializer + assert ( + FormulaMaterializer.for_data(pd.DataFrame(), output="tabmat") + is TabmatMaterializer + ) + + @pytest.mark.parametrize( "formula, expected", [ From eaf968ed2017846e363f10cda98efd5deead29cb Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 22 Jun 2023 17:51:57 +0200 Subject: [PATCH 19/72] Clean up code - Add docstrings - Add type hints - Rename some classes --- src/tabmat/constructor.py | 9 +- src/tabmat/formula.py | 184 +++++++++++++++++++++++++++----------- tests/test_formula.py | 2 +- 3 files changed, 140 insertions(+), 55 deletions(-) diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index b74deaf6..0ba66adb 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -212,7 +212,7 @@ def from_formula( categorical_format: str = "{name}[T.{category}]", intercept_name: str = "Intercept", context=0, -): +) -> SplitMatrix: """ Transform a pandas data frame to a SplitMatrix using a Wilkinson formula. @@ -224,6 +224,13 @@ def from_formula( pandas data frame to be converted. ensure_full_rank: bool, default False If True, ensure that the matrix has full structural rank by categories. + interaction_separator: str, default ":" + The separator between the names of interacted variables. + categorical_format: str, default "{name}[T.{category}]" + The format string used to generate the names of categorical variables. + Has to include the placeholders ``{name}`` and ``{category}``. + intercept_name: str, default "Intercept" + The name of the intercept column. context: The context to use for evaluating the formula. If an integer, the context is taken from the stack frame of the caller at the given diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index b789067d..c3daf8bb 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -2,7 +2,7 @@ import itertools from abc import ABC, abstractmethod from collections import OrderedDict -from typing import List, Optional +from typing import List, Optional, Union import numpy import pandas @@ -15,6 +15,7 @@ from .categorical_matrix import CategoricalMatrix from .dense_matrix import DenseMatrix +from .matrix_base import MatrixBase from .sparse_matrix import SparseMatrix from .split_matrix import SplitMatrix @@ -66,7 +67,7 @@ def _check_for_nulls(self, name, values, na_action, drop_rows): @override def _encode_constant(self, value, metadata, encoder_state, spec, drop_rows): series = value * numpy.ones(self.nrows - len(drop_rows)) - return _InteractableDenseColumn(series, name=self.intercept_name) + return _InteractableDenseVector(series, name=self.intercept_name) @override def _encode_numerical(self, values, metadata, encoder_state, spec, drop_rows): @@ -74,7 +75,7 @@ def _encode_numerical(self, values, metadata, encoder_state, spec, drop_rows): values = values.drop(index=values.index[drop_rows]) if isinstance(values, pandas.Series): values = values.to_numpy() - return _InteractableDenseColumn(values.astype(numpy.float_)) + return _InteractableDenseVector(values.astype(numpy.float_)) @override def _encode_categorical( @@ -84,7 +85,7 @@ def _encode_categorical( if drop_rows: values = values.drop(index=values.index[drop_rows]) cat = pandas.Categorical(values._values) - return _InteractableCategoricalColumn.from_categorical( + return _InteractableCategoricalVector.from_categorical( cat, reduced_rank=reduced_rank ) @@ -96,7 +97,7 @@ def _combine_columns(self, cols, spec, drop_rows): return SplitMatrix([DenseMatrix(values)]) # Otherwise, concatenate columns into SplitMatrix - return SplitMatrix([col[1].to_non_interactable() for col in cols]) + return SplitMatrix([col[1].to_tabmat() for col in cols]) # Have to override this because of column names # (and possibly intercept later on) @@ -201,69 +202,101 @@ def _get_columns_for_term(self, factors, spec, scale=1): return out -class _InteractableColumn(ABC): +class _InteractableVector(ABC): + """Abstract base class for interactable vectors, which are mostly thin + wrappers over numpy arrays, scipy sparse matrices and pandas categoricals. + """ + name: Optional[str] @abstractmethod - def to_non_interactable(self): + def to_tabmat(self) -> MatrixBase: + """Convert to an actual tabmat matrix.""" pass @abstractmethod - def get_names(self, col): + def get_names(self) -> List[str]: + """Return the names of the columns represented by this vector. + + Returns + ------- + List[str] + The names of the columns represented by this vector. + """ pass @abstractmethod - def set_name(self, name): + def set_name(self, name, name_format): + """Set the name of the vector. + + Parameters + ---------- + name : str + The name to set. + name_format : str + The format string to use to format the name. Only used for + categoricals. Has to include the placeholders ``{name}`` + and ``{category}`` + + Returns + ------- + self + A reference to the vector itself. + """ pass -class _InteractableDenseColumn(_InteractableColumn): +class _InteractableDenseVector(_InteractableVector): def __init__(self, values: numpy.ndarray, name: Optional[str] = None): self.values = values self.name = name def __rmul__(self, other): if isinstance(other, (int, float)): - return _InteractableDenseColumn( + return _InteractableDenseVector( values=self.values * other, name=self.name, ) - def to_non_interactable(self): + def to_tabmat(self) -> DenseMatrix: return DenseMatrix(self.values) - def get_names(self): + def get_names(self) -> List[str]: + if self.name is None: + raise RuntimeError("Name not set") return [self.name] - def set_name(self, name, name_format=None): + def set_name(self, name, name_format=None) -> "_InteractableDenseVector": self.name = name return self -class _InteractableSparseColumn(_InteractableColumn): +class _InteractableSparseVector(_InteractableVector): def __init__(self, values: sparse.csc_matrix, name: Optional[str] = None): self.values = values self.name = name def __rmul__(self, other): if isinstance(other, (int, float)): - return _InteractableSparseColumn( + return _InteractableSparseVector( values=self.values * other, name=self.name, ) - def to_non_interactable(self): + def to_tabmat(self) -> SparseMatrix: return SparseMatrix(self.values) - def get_names(self): + def get_names(self) -> List[str]: + if self.name is None: + raise RuntimeError("Name not set") return [self.name] - def set_name(self, name, name_format=None): + def set_name(self, name, name_format=None) -> "_InteractableSparseVector": self.name = name return self -class _InteractableCategoricalColumn(_InteractableColumn): +class _InteractableCategoricalVector(_InteractableVector): def __init__( self, codes: numpy.ndarray, @@ -277,10 +310,13 @@ def __init__( self.codes = codes self.categories = categories self.multipliers = multipliers - self.name = None + self.name = name @classmethod - def from_categorical(cls, cat: pandas.Categorical, reduced_rank: bool): + def from_categorical( + cls, cat: pandas.Categorical, reduced_rank: bool + ) -> "_InteractableCategoricalVector": + """Create an interactable categorical vector from a pandas categorical.""" categories = list(cat.categories) codes = cat.codes.copy().astype(numpy.int64) if reduced_rank: @@ -295,13 +331,14 @@ def from_categorical(cls, cat: pandas.Categorical, reduced_rank: bool): def __rmul__(self, other): if isinstance(other, (int, float)): - return _InteractableCategoricalColumn( + return _InteractableCategoricalVector( categories=self.categories, codes=self.codes, multipliers=self.multipliers * other, + name=self.name, ) - def to_non_interactable(self): + def to_tabmat(self) -> Union[CategoricalMatrix, SparseMatrix]: codes = self.codes.copy() categories = self.categories.copy() if -2 in self.codes: @@ -331,10 +368,14 @@ def to_non_interactable(self): ) ) - def get_names(self): + def get_names(self) -> List[str]: + if self.name is None: + raise RuntimeError("Name not set") return self.categories - def set_name(self, name, name_format="{name}[T.{category}]"): + def set_name( + self, name, name_format="{name}[T.{category}]" + ) -> "_InteractableCategoricalVector": if self.name is None: # Make sure to only format the name once self.name = name @@ -345,26 +386,44 @@ def set_name(self, name, name_format="{name}[T.{category}]"): def _interact( - left: _InteractableColumn, right: _InteractableColumn, reverse=False, separator=":" -): - if isinstance(left, _InteractableDenseColumn): - if isinstance(right, _InteractableDenseColumn): + left: _InteractableVector, right: _InteractableVector, reverse=False, separator=":" +) -> _InteractableVector: + """Interact two interactable vectors. + + Parameters + ---------- + left : _InteractableVector + The left vector. + right : _InteractableVector + The right vector. + reverse : bool, optional + Whether to reverse the order of the interaction, by default False + separator : str, optional + The separator to use between the names of the interacted vectors, by default ":" + + Returns + ------- + _InteractableVector + The interacted vector. + """ + if isinstance(left, _InteractableDenseVector): + if isinstance(right, _InteractableDenseVector): if not reverse: new_name = f"{left.name}{separator}{right.name}" else: new_name = f"{right.name}{separator}{left.name}" - return _InteractableDenseColumn(left.values * right.values, name=new_name) + return _InteractableDenseVector(left.values * right.values, name=new_name) else: return _interact(right, left, reverse=True, separator=separator) - if isinstance(left, _InteractableSparseColumn): - if isinstance(right, (_InteractableDenseColumn, _InteractableSparseColumn)): + if isinstance(left, _InteractableSparseVector): + if isinstance(right, (_InteractableDenseVector, _InteractableSparseVector)): if not reverse: new_name = f"{left.name}{separator}{right.name}" else: new_name = f"{right.name}{separator}{left.name}" - return _InteractableSparseColumn( + return _InteractableSparseVector( left.values.multiply(right.values), name=new_name, ) @@ -372,9 +431,9 @@ def _interact( else: return _interact(right, left, reverse=True, separator=separator) - if isinstance(left, _InteractableCategoricalColumn): - if isinstance(right, (_InteractableDenseColumn, _InteractableSparseColumn)): - if isinstance(right, _InteractableDenseColumn): + if isinstance(left, _InteractableCategoricalVector): + if isinstance(right, (_InteractableDenseVector, _InteractableSparseVector)): + if isinstance(right, _InteractableDenseVector): right_values = right.values else: right_values = right.values.todense() @@ -382,29 +441,48 @@ def _interact( new_categories = [ f"{cat}{separator}{right.name}" for cat in left.categories ] + new_name = f"{left.name}{separator}{right.name}" else: new_categories = [ f"{right.name}{separator}{cat}" for cat in left.categories ] - return _InteractableCategoricalColumn( - left.codes, - new_categories, - left.multipliers * right_values, + new_name = f"{right.name}{separator}{left.name}" + return _InteractableCategoricalVector( + codes=left.codes, + categories=new_categories, + multipliers=left.multipliers * right_values, + name=new_name, ) - elif isinstance(right, _InteractableCategoricalColumn): + elif isinstance(right, _InteractableCategoricalVector): return _interact_categoricals(left, right, separator=separator) - raise TypeError( - f"Cannot interact {type(left).__name__} with {type(right).__name__}" - ) + raise TypeError( + f"Cannot interact {type(left).__name__} with {type(right).__name__}" + ) def _interact_categoricals( - left: _InteractableCategoricalColumn, - right: _InteractableCategoricalColumn, + left: _InteractableCategoricalVector, + right: _InteractableCategoricalVector, separator=":", -): +) -> _InteractableCategoricalVector: + """Interact two categorical vectors. + + Parameters + ---------- + left : _InteractableCategoricalVector + The left categorical vector. + right : _InteractableCategoricalVector + The right categorical vector. + separator : str, optional + The separator to use between the names of the interacted vectors, by default ":" + + Returns + ------- + _InteractableCategoricalVector + The interacted categorical vector. + """ cardinality_left = len(left.categories) new_codes = right.codes * cardinality_left + left.codes @@ -419,10 +497,11 @@ def _interact_categoricals( for right_cat, left_cat in itertools.product(right.categories, left.categories) ] - return _InteractableCategoricalColumn( + return _InteractableCategoricalVector( codes=new_codes, categories=new_categories, multipliers=left.multipliers * right.multipliers, + name=f"{left.name}{separator}{right.name}", ) @@ -432,7 +511,7 @@ def _C( spans_intercept: bool = True, ): """ - Mark data as being categorical. + Mark data as categorical. A reduced-functionality version of the ``formulaic`` ``C()`` function. It does not support custom contrasts or the level argument, but it allows setting @@ -446,11 +525,10 @@ def encoder( encoder_state, model_spec, ): - values = pandas.Series(values).astype("category") if drop_rows: values = values.drop(index=values.index[drop_rows]) - cat = values._values - return _InteractableCategoricalColumn.from_categorical( + cat = pandas.Categorical(values._values) + return _InteractableCategoricalVector.from_categorical( cat, reduced_rank=reduced_rank ) diff --git a/tests/test_formula.py b/tests/test_formula.py index 55f5ddc8..a366daa6 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -653,7 +653,7 @@ def test_encoding_edge_cases(self, materializer): spec=formulaic.model_spec.ModelSpec(formula=[]), drop_rows=[], ) - encoded_matrix = encoded_factor["B[a]"].set_name("B[a]").to_non_interactable() + encoded_matrix = encoded_factor["B[a]"].set_name("B[a]").to_tabmat() assert list(encoded_matrix.cat) == ["B[a][T.a]", "B[a][T.b]", "B[a][T.c]"] @pytest.mark.xfail(reason="Cannot create an empty SplitMatrix in tabmat") From 7cb70f623b695492aed7b847870bfcfdf1ed07f2 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 22 Jun 2023 17:59:10 +0200 Subject: [PATCH 20/72] Pin formulaic minimum version --- conda.recipe/meta.yaml | 2 +- environment-win.yml | 2 +- environment.yml | 2 +- setup.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index 6855671a..2d183c3d 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -38,7 +38,7 @@ requirements: - {{ pin_compatible('numpy') }} - pandas - scipy - - formulaic + - formulaic>=0.4 test: requires: diff --git a/environment-win.yml b/environment-win.yml index dadb5a1a..fce22a21 100644 --- a/environment-win.yml +++ b/environment-win.yml @@ -5,7 +5,7 @@ channels: dependencies: - libblas>=0=*mkl - pandas - - formulaic + - formulaic>=0.4 # development tools - black diff --git a/environment.yml b/environment.yml index bdc6b749..9c425c19 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,7 @@ channels: - nodefaults dependencies: - pandas - - formulaic + - formulaic>=0.4 # development tools - black diff --git a/setup.py b/setup.py index 0ee93ea4..632728ac 100644 --- a/setup.py +++ b/setup.py @@ -157,7 +157,7 @@ ], package_dir={"": "src"}, packages=find_packages(where="src"), - install_requires=["numpy", "pandas", "scipy", "formulaic"], + install_requires=["numpy", "pandas", "scipy", "formulaic>=0.4"], ext_modules=cythonize( ext_modules, annotate=False, From fb629c6215a77fc4b8acc9e6b060bb5f27587c89 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Thu, 15 Jun 2023 22:20:48 +0200 Subject: [PATCH 21/72] Add support for architectures not supported by xsimd (#262) --- ...arch64_python3.10_default.____cpython.yaml | 16 +++++++++++ ...pc64le_python3.10_default.____cpython.yaml | 14 ++++++++++ .github/workflows/ci.yml | 2 ++ .github/workflows/conda-build.sh | 9 ++++-- CHANGELOG.rst | 7 +++++ src/tabmat/ext/dense_helpers-tmpl.cpp | 28 +++++++++++++++++++ src/tabmat/ext/sparse_helpers-tmpl.cpp | 18 ++++++++++++ 7 files changed, 92 insertions(+), 2 deletions(-) create mode 100644 .ci_support/linux_aarch64_python3.10_default.____cpython.yaml create mode 100644 .ci_support/linux_ppc64le_python3.10_default.____cpython.yaml diff --git a/.ci_support/linux_aarch64_python3.10_default.____cpython.yaml b/.ci_support/linux_aarch64_python3.10_default.____cpython.yaml new file mode 100644 index 00000000..1c3d120f --- /dev/null +++ b/.ci_support/linux_aarch64_python3.10_default.____cpython.yaml @@ -0,0 +1,16 @@ +BUILD: +- aarch64-conda_cos7-linux-gnu +c_compiler: +- gcc +c_compiler_version: +- '12' +cxx_compiler: +- gxx +cxx_compiler_version: +- '12' +numpy: +- '1.21' +python: +- 3.10.* *_cpython +target_platform: +- linux-aarch64 diff --git a/.ci_support/linux_ppc64le_python3.10_default.____cpython.yaml b/.ci_support/linux_ppc64le_python3.10_default.____cpython.yaml new file mode 100644 index 00000000..d1e7b0f7 --- /dev/null +++ b/.ci_support/linux_ppc64le_python3.10_default.____cpython.yaml @@ -0,0 +1,14 @@ +c_compiler: +- gcc +c_compiler_version: +- '12' +cxx_compiler: +- gxx +cxx_compiler_version: +- '12' +numpy: +- '1.21' +python: +- 3.10.* *_cpython +target_platform: +- linux-ppc64le diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 604e6244..0ee0c664 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -48,6 +48,8 @@ jobs: matrix: CONDA_BUILD_YML: - linux_64_python3.7_default.____cpython + - linux_aarch64_python3.10_default.____cpython + - linux_ppc64le_python3.10_default.____cpython steps: - name: Pull image run: docker pull condaforge/mambaforge:latest diff --git a/.github/workflows/conda-build.sh b/.github/workflows/conda-build.sh index 71ec92f4..c6c6d28b 100755 --- a/.github/workflows/conda-build.sh +++ b/.github/workflows/conda-build.sh @@ -7,5 +7,10 @@ export CONDA_BUILD_YML=$1 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" source ${SCRIPT_DIR}/base.sh $* conda activate base -mamba install -y conda-build -conda build -m .ci_support/${CONDA_BUILD_YML}.yaml conda.recipe +mamba install -y boa + +if grep -q "linux-aarch64\|linux-ppc64le" .ci_support/${CONDA_BUILD_YML}.yaml; then + CONDA_BUILD_ARGS="${CONDA_BUILD_ARGS:-} --no-test" +fi + +conda mambabuild ${CONDA_BUILD_ARGS:-} -m .ci_support/${CONDA_BUILD_YML}.yaml conda.recipe diff --git a/CHANGELOG.rst b/CHANGELOG.rst index a0fd5c5b..0ae862e7 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,13 @@ Changelog ========= +unreleased +---------- + +**Other changes:** + +- Support building on architectures that are unsupported by xsimd. + 3.1.8 - 2023-06-13 ------------------ diff --git a/src/tabmat/ext/dense_helpers-tmpl.cpp b/src/tabmat/ext/dense_helpers-tmpl.cpp index 00af6375..430d028d 100644 --- a/src/tabmat/ext/dense_helpers-tmpl.cpp +++ b/src/tabmat/ext/dense_helpers-tmpl.cpp @@ -45,7 +45,11 @@ namespace xs = xsimd; // setup simd accumulators % for ir in range(IBLOCK): % for jr in range(JBLOCK): +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE + auto accumsimd${ir}_${jr} = (F)0.0; +#else auto accumsimd${ir}_${jr} = xs::XSIMD_BROADCAST(((F)0.0)); +#endif % endfor % endfor @@ -78,10 +82,18 @@ namespace xs = xsimd; % endfor ) { % for ir in range(IBLOCK): +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE + auto Xtd${ir} = *Lptr${ir}; +#else auto Xtd${ir} = xs::load_aligned(Lptr${ir}); +#endif % for jr in range(JBLOCK): { +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE + auto Xsimd = *Rptr${jr}; +#else auto Xsimd = xs::load_aligned(Rptr${jr}); +#endif accumsimd${ir}_${jr} = xs::fma(Xtd${ir}, Xsimd, accumsimd${ir}_${jr}); } % endfor @@ -91,7 +103,11 @@ namespace xs = xsimd; // horizontal sum of the simd blocks % for ir in range(IBLOCK): % for jr in range(JBLOCK): +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE + F accum${ir}_${jr} = accumsimd${ir}_${jr}; +#else F accum${ir}_${jr} = xs::XSIMD_REDUCE_ADD(accumsimd${ir}_${jr}); +#endif % endfor % endfor @@ -150,7 +166,11 @@ void dense_base${kparallel}(F* R, F* L, F* d, F* out, Py_ssize_t jmin2, Py_ssize_t jmax2, Py_ssize_t kmin, Py_ssize_t kmax, Int innerblock, Int kstep) { +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE + constexpr std::size_t simd_size = 1; +#else constexpr std::size_t simd_size = xsimd::simd_type::size; +#endif for (Py_ssize_t imin = imin2; imin < imax2; imin+=innerblock) { Py_ssize_t imax = imin + innerblock; if (imax > imax2) { @@ -248,7 +268,11 @@ template void _dense${order}_sandwich(Int* rows, Int* cols, F* X, F* d, F* out, Int in_n, Int out_m, Int m, Int n, Int thresh1d, Int kratio, Int innerblock) { +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE + constexpr std::size_t simd_size = 1; +#else constexpr std::size_t simd_size = xsimd::simd_type::size; +#endif constexpr auto alignment = simd_size * sizeof(F); bool kparallel = (in_n / (kratio*thresh1d)) > (out_m / thresh1d); @@ -292,7 +316,11 @@ template void _dense${order}_rmatvec(Int* rows, Int* cols, F* X, F* v, F* out, Int n_rows, Int n_cols, Int m, Int n) { +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE + constexpr std::size_t simd_size = 1; +#else constexpr std::size_t simd_size = xsimd::simd_type::size; +#endif constexpr std::size_t alignment = simd_size * sizeof(F); auto outglobal = make_aligned_unique(omp_get_max_threads()*n_cols, alignment); diff --git a/src/tabmat/ext/sparse_helpers-tmpl.cpp b/src/tabmat/ext/sparse_helpers-tmpl.cpp index 9704d4cc..bdf535fd 100644 --- a/src/tabmat/ext/sparse_helpers-tmpl.cpp +++ b/src/tabmat/ext/sparse_helpers-tmpl.cpp @@ -30,7 +30,12 @@ void _csr_dense${order}_sandwich( Int nrows, Int nA_cols, Int nB_cols ) { +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE + constexpr Int simd_size = 1; +#else constexpr Int simd_size = xsimd::simd_type::size; +#endif + constexpr auto alignment = simd_size*sizeof(F); int kblock = 128; @@ -95,15 +100,28 @@ void _csr_dense${order}_sandwich( } F Q = Adata[A_idx]; +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE + auto Qsimd = Q; +#else auto Qsimd = xs::XSIMD_BROADCAST(Q); +#endif Py_ssize_t Cj = Cjj; Py_ssize_t Cjmax2 = Cjj + ((Cjmax - Cjj) / simd_size) * simd_size; for (; Cj < Cjmax2; Cj+=simd_size) { +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE + auto Bsimd = R[(Py_ssize_t) (Ck-Ckk) * jblock + (Cj-Cjj)]; + auto outsimd = outtemp.get()[Ci * nB_cols_rounded + Cj]; +#else auto Bsimd = xs::load_aligned(&R[(Py_ssize_t) (Ck-Ckk) * jblock + (Cj-Cjj)]); auto outsimd = xs::load_aligned(&outtemp.get()[Ci * nB_cols_rounded + Cj]); +#endif outsimd = xs::fma(Qsimd, Bsimd, outsimd); +#ifdef XSIMD_NO_SUPPORTED_ARCHITECTURE + outtemp.get()[Ci * nB_cols_rounded + Cj] = outsimd; +#else outsimd.store_aligned(&outtemp.get()[Ci * nB_cols_rounded + Cj]); +#endif } for (; Cj < Cjmax; Cj++) { From 9fb2993ecf21943b67e5dfae6101bff99c339648 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 16 Jun 2023 13:59:31 +0200 Subject: [PATCH 22/72] Release 3.1.9 (#263) --- CHANGELOG.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 0ae862e7..9c10aa55 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,8 +7,8 @@ Changelog ========= -unreleased ----------- +3.1.9 - 2023-06-16 +------------------- **Other changes:** From 1ad6a93810d9699ed9801b755927c5bc52ad6422 Mon Sep 17 00:00:00 2001 From: "quant-ranger[bot]" <132915763+quant-ranger[bot]@users.noreply.github.com> Date: Mon, 19 Jun 2023 13:30:25 +0200 Subject: [PATCH 23/72] Pre-commit autoupdate (#264) Co-authored-by: quant-ranger[bot] <132915763+quant-ranger[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 338b677b..2fa1dbd2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,7 +33,7 @@ repos: additional_dependencies: - python=3.8 - repo: https://github.com/Quantco/pre-commit-mirrors-pyupgrade - rev: 3.6.0 + rev: 3.7.0 hooks: - id: pyupgrade-conda - repo: https://github.com/Quantco/pre-commit-mirrors-cython-lint From 5b2c88577b2485a93664a87530db2571f1be0441 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Fri, 23 Jun 2023 11:16:47 +0200 Subject: [PATCH 24/72] Add params for density and cardinality thresholds --- src/tabmat/constructor.py | 6 ++++ src/tabmat/formula.py | 75 +++++++++++++++++++++++++++++---------- tests/test_formula.py | 11 ++++-- 3 files changed, 71 insertions(+), 21 deletions(-) diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index 0ba66adb..232d6426 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -207,6 +207,9 @@ def from_csc(mat: sps.csc_matrix, threshold=0.1): def from_formula( formula: Union[str, Formula], df: pd.DataFrame, + dtype: np.dtype = np.float64, + sparse_threshold: float = 0.1, + cat_threshold: int = 4, ensure_full_rank: bool = False, interaction_separator: str = ":", categorical_format: str = "{name}[T.{category}]", @@ -253,5 +256,8 @@ def from_formula( interaction_separator=interaction_separator, categorical_format=categorical_format, intercept_name=intercept_name, + dtype=dtype, + sparse_threshold=sparse_threshold, + cat_threshold=cat_threshold, ) return materializer.get_model_matrix(spec) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index c3daf8bb..a9a72e3c 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -13,6 +13,8 @@ from interface_meta import override from scipy import sparse +import tabmat as tm + from .categorical_matrix import CategoricalMatrix from .dense_matrix import DenseMatrix from .matrix_base import MatrixBase @@ -34,6 +36,9 @@ def _init(self): "categorical_format", "{name}[T.{category}]" ) self.intercept_name = self.params.get("intercept_name", "Intercept") + self.dtype = self.params.get("dtype", numpy.float64) + self.sparse_threshold = self.params.get("sparse_threshold", 0.1) + self.cat_threshold = self.params.get("cat_threshold", 4) # We can override formulaic's C() function here self.context["C"] = _C @@ -74,8 +79,13 @@ def _encode_numerical(self, values, metadata, encoder_state, spec, drop_rows): if drop_rows: values = values.drop(index=values.index[drop_rows]) if isinstance(values, pandas.Series): - values = values.to_numpy() - return _InteractableDenseVector(values.astype(numpy.float_)) + values = values.to_numpy().astype(self.dtype) + if (values != 0).mean() <= self.sparse_threshold: + return _InteractableSparseVector( + sparse.csc_matrix(values[:, numpy.newaxis]) + ) + else: + return _InteractableDenseVector(values) @override def _encode_categorical( @@ -93,11 +103,16 @@ def _encode_categorical( def _combine_columns(self, cols, spec, drop_rows): # Special case no columns if not cols: - values = numpy.empty((self.data.shape[0], 0)) - return SplitMatrix([DenseMatrix(values)]) + values = numpy.empty((self.data.shape[0], 0), dtype=self.dtype) + return DenseMatrix(values) # Otherwise, concatenate columns into SplitMatrix - return SplitMatrix([col[1].to_tabmat() for col in cols]) + return SplitMatrix( + [ + col[1].to_tabmat(self.dtype, self.sparse_threshold, self.cat_threshold) + for col in cols + ] + ) # Have to override this because of column names # (and possibly intercept later on) @@ -210,7 +225,12 @@ class _InteractableVector(ABC): name: Optional[str] @abstractmethod - def to_tabmat(self) -> MatrixBase: + def to_tabmat( + self, + dtype: numpy.dtype, + sparse_threshold: float, + cat_threshold: int, + ) -> MatrixBase: """Convert to an actual tabmat matrix.""" pass @@ -258,8 +278,17 @@ def __rmul__(self, other): name=self.name, ) - def to_tabmat(self) -> DenseMatrix: - return DenseMatrix(self.values) + def to_tabmat( + self, + dtype: numpy.dtype = numpy.float64, + sparse_threshold: float = 0.1, + cat_threshold: int = 4, + ) -> DenseMatrix: + if (self.values != 0).mean() > sparse_threshold: + return DenseMatrix(self.values) + else: + # Columns can become sparser, but not denser through interactions + return SparseMatrix(sparse.csc_matrix(self.values[:, numpy.newaxis])) def get_names(self) -> List[str]: if self.name is None: @@ -283,7 +312,12 @@ def __rmul__(self, other): name=self.name, ) - def to_tabmat(self) -> SparseMatrix: + def to_tabmat( + self, + dtype: numpy.dtype = numpy.float64, + sparse_threshold: float = 0.1, + cat_threshold: int = 4, + ) -> SparseMatrix: return SparseMatrix(self.values) def get_names(self) -> List[str]: @@ -338,7 +372,12 @@ def __rmul__(self, other): name=self.name, ) - def to_tabmat(self) -> Union[CategoricalMatrix, SparseMatrix]: + def to_tabmat( + self, + dtype: numpy.dtype = numpy.float64, + sparse_threshold: float = 0.1, + cat_threshold: int = 4, + ) -> Union[CategoricalMatrix, SplitMatrix]: codes = self.codes.copy() categories = self.categories.copy() if -2 in self.codes: @@ -355,18 +394,18 @@ def to_tabmat(self) -> Union[CategoricalMatrix, SparseMatrix]: ordered=False, ) - categorical_part = CategoricalMatrix(cat, drop_first=drop_first) + categorical_part = CategoricalMatrix(cat, drop_first=drop_first, dtype=dtype) - if (self.multipliers == 1).all(): + if (self.codes == -2).all(): + # All values are dropped + return DenseMatrix(numpy.empty((len(codes), 0), dtype=dtype)) + elif (self.multipliers == 1).all() and len(categories) >= cat_threshold: return categorical_part else: - return SparseMatrix( - sparse.csc_matrix( - categorical_part.tocsr().multiply( - self.multipliers[:, numpy.newaxis] - ) - ) + sparse_matrix = sparse.csc_matrix( + categorical_part.tocsr().multiply(self.multipliers[:, numpy.newaxis]) ) + return tm.from_csc(sparse_matrix, threshold=sparse_threshold) def get_names(self) -> List[str]: if self.name is None: diff --git a/tests/test_formula.py b/tests/test_formula.py index a366daa6..0300f52d 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -127,7 +127,9 @@ def test_retrieval(): ], ) def test_matrix_against_expectation(df, formula, expected): - model_df = tm.from_formula(formula, df, ensure_full_rank=True) + model_df = tm.from_formula( + formula, df, ensure_full_rank=True, cat_threshold=1, sparse_threshold=0.5 + ) assert len(model_df.matrices) == len(expected.matrices) for res, exp in zip(model_df.matrices, expected.matrices): assert type(res) == type(exp) @@ -232,6 +234,8 @@ def test_matrix_against_expectation_qcl(df, formula, expected): model_df = tm.from_formula( formula, df, + cat_threshold=1, + sparse_threshold=0.5, ensure_full_rank=True, interaction_separator="__x__", categorical_format="{name}__{category}", @@ -653,10 +657,11 @@ def test_encoding_edge_cases(self, materializer): spec=formulaic.model_spec.ModelSpec(formula=[]), drop_rows=[], ) - encoded_matrix = encoded_factor["B[a]"].set_name("B[a]").to_tabmat() + encoded_matrix = ( + encoded_factor["B[a]"].set_name("B[a]").to_tabmat(cat_threshold=1) + ) assert list(encoded_matrix.cat) == ["B[a][T.a]", "B[a][T.b]", "B[a][T.c]"] - @pytest.mark.xfail(reason="Cannot create an empty SplitMatrix in tabmat") def test_empty(self, materializer): mm = materializer.get_model_matrix("0", ensure_full_rank=True) assert mm.shape[1] == 0 From 63cbc7b6d4345cf25d36775f72fe8053b96e32c7 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Fri, 23 Jun 2023 13:35:11 +0200 Subject: [PATCH 25/72] Skip python 3.6 build --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index ec42dcbe..278280a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ default_section = 'THIRDPARTY' [tool.cibuildwheel] skip = [ + "cp36-*", "*-win32", "*-manylinux_i686", "pp*", From 2daba934f50ddbe9987fb32aea1cf5878b974b55 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Fri, 23 Jun 2023 14:22:10 +0200 Subject: [PATCH 26/72] Refactor to avoid circular imports --- src/tabmat/constructor.py | 28 ++-------------------------- src/tabmat/constructor_util.py | 32 ++++++++++++++++++++++++++++++++ src/tabmat/formula.py | 23 +++++++++++++---------- tests/test_split_matrix.py | 2 +- 4 files changed, 48 insertions(+), 37 deletions(-) create mode 100644 src/tabmat/constructor_util.py diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index 232d6426..ecb5745f 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -1,6 +1,6 @@ import sys import warnings -from typing import List, Tuple, Union +from typing import List, Union import numpy as np import pandas as pd @@ -10,6 +10,7 @@ from scipy import sparse as sps from .categorical_matrix import CategoricalMatrix +from .constructor_util import _split_sparse_and_dense_parts from .dense_matrix import DenseMatrix from .formula import TabmatMaterializer from .matrix_base import MatrixBase @@ -168,31 +169,6 @@ def from_pandas( return matrices[0] -def _split_sparse_and_dense_parts( - arg1: sps.csc_matrix, threshold: float = 0.1 -) -> Tuple[DenseMatrix, SparseMatrix, np.ndarray, np.ndarray]: - """ - Split matrix. - - Return the dense and sparse parts of a matrix and the corresponding indices - for each at the provided threshold. - """ - if not isinstance(arg1, sps.csc_matrix): - raise TypeError( - f"X must be of type scipy.sparse.csc_matrix or matrix.SparseMatrix," - f"not {type(arg1)}" - ) - if not 0 <= threshold <= 1: - raise ValueError("Threshold must be between 0 and 1.") - densities = np.diff(arg1.indptr) / arg1.shape[0] - dense_indices = np.where(densities > threshold)[0] - sparse_indices = np.setdiff1d(np.arange(densities.shape[0]), dense_indices) - - X_dense_F = DenseMatrix(np.asfortranarray(arg1[:, dense_indices].toarray())) - X_sparse = SparseMatrix(arg1[:, sparse_indices]) - return X_dense_F, X_sparse, dense_indices, sparse_indices - - def from_csc(mat: sps.csc_matrix, threshold=0.1): """ Convert a CSC-format sparse matrix into a ``SplitMatrix``. diff --git a/src/tabmat/constructor_util.py b/src/tabmat/constructor_util.py new file mode 100644 index 00000000..958b34a8 --- /dev/null +++ b/src/tabmat/constructor_util.py @@ -0,0 +1,32 @@ +from typing import Tuple + +import numpy as np +import scipy.sparse as sps + +from .dense_matrix import DenseMatrix +from .sparse_matrix import SparseMatrix + + +def _split_sparse_and_dense_parts( + arg1: sps.csc_matrix, threshold: float = 0.1 +) -> Tuple[DenseMatrix, SparseMatrix, np.ndarray, np.ndarray]: + """ + Split matrix. + + Return the dense and sparse parts of a matrix and the corresponding indices + for each at the provided threshold. + """ + if not isinstance(arg1, sps.csc_matrix): + raise TypeError( + f"X must be of type scipy.sparse.csc_matrix or matrix.SparseMatrix," + f"not {type(arg1)}" + ) + if not 0 <= threshold <= 1: + raise ValueError("Threshold must be between 0 and 1.") + densities = np.diff(arg1.indptr) / arg1.shape[0] + dense_indices = np.where(densities > threshold)[0] + sparse_indices = np.setdiff1d(np.arange(densities.shape[0]), dense_indices) + + X_dense_F = DenseMatrix(np.asfortranarray(arg1[:, dense_indices].toarray())) + X_sparse = SparseMatrix(arg1[:, sparse_indices]) + return X_dense_F, X_sparse, dense_indices, sparse_indices diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index a9a72e3c..a9812719 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -11,11 +11,10 @@ from formulaic.materializers.base import EncodedTermStructure from formulaic.materializers.types import FactorValues, NAAction from interface_meta import override -from scipy import sparse - -import tabmat as tm +from scipy import sparse as sps from .categorical_matrix import CategoricalMatrix +from .constructor_util import _split_sparse_and_dense_parts from .dense_matrix import DenseMatrix from .matrix_base import MatrixBase from .sparse_matrix import SparseMatrix @@ -81,9 +80,7 @@ def _encode_numerical(self, values, metadata, encoder_state, spec, drop_rows): if isinstance(values, pandas.Series): values = values.to_numpy().astype(self.dtype) if (values != 0).mean() <= self.sparse_threshold: - return _InteractableSparseVector( - sparse.csc_matrix(values[:, numpy.newaxis]) - ) + return _InteractableSparseVector(sps.csc_matrix(values[:, numpy.newaxis])) else: return _InteractableDenseVector(values) @@ -288,7 +285,7 @@ def to_tabmat( return DenseMatrix(self.values) else: # Columns can become sparser, but not denser through interactions - return SparseMatrix(sparse.csc_matrix(self.values[:, numpy.newaxis])) + return SparseMatrix(sps.csc_matrix(self.values[:, numpy.newaxis])) def get_names(self) -> List[str]: if self.name is None: @@ -301,7 +298,7 @@ def set_name(self, name, name_format=None) -> "_InteractableDenseVector": class _InteractableSparseVector(_InteractableVector): - def __init__(self, values: sparse.csc_matrix, name: Optional[str] = None): + def __init__(self, values: sps.csc_matrix, name: Optional[str] = None): self.values = values self.name = name @@ -402,10 +399,16 @@ def to_tabmat( elif (self.multipliers == 1).all() and len(categories) >= cat_threshold: return categorical_part else: - sparse_matrix = sparse.csc_matrix( + sparse_matrix = sps.csc_matrix( categorical_part.tocsr().multiply(self.multipliers[:, numpy.newaxis]) ) - return tm.from_csc(sparse_matrix, threshold=sparse_threshold) + ( + dense_part, + sparse_part, + dense_idx, + sparse_idx, + ) = _split_sparse_and_dense_parts(sparse_matrix, sparse_threshold) + return SplitMatrix([dense_part, sparse_part], [dense_idx, sparse_idx]) def get_names(self) -> List[str]: if self.name is None: diff --git a/tests/test_split_matrix.py b/tests/test_split_matrix.py index 30d9f621..2d0fdc56 100644 --- a/tests/test_split_matrix.py +++ b/tests/test_split_matrix.py @@ -7,7 +7,7 @@ import tabmat as tm from tabmat import from_pandas -from tabmat.constructor import _split_sparse_and_dense_parts +from tabmat.constructor_util import _split_sparse_and_dense_parts from tabmat.dense_matrix import DenseMatrix from tabmat.ext.sparse import csr_dense_sandwich from tabmat.split_matrix import SplitMatrix From ef84e7dc2e65295b467f8dee0ea84e20ef02b1d2 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Mon, 26 Jun 2023 07:51:45 +0200 Subject: [PATCH 27/72] Interaction of dropped and NA is dropped --- src/tabmat/formula.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index a9812719..fb2b398f 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -531,8 +531,8 @@ def _interact_categoricals( na_mask = (left.codes == -1) | (right.codes == -1) drop_mask = (left.codes == -2) | (right.codes == -2) - new_codes[drop_mask] = -2 new_codes[na_mask] = -1 + new_codes[drop_mask] = -2 new_categories = [ f"{left_cat}{separator}{right_cat}" From 927b2bee45d5b21e7c7e6cf8771df5ba84d69c4d Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 27 Jun 2023 09:58:25 +0200 Subject: [PATCH 28/72] Add type hint for context --- src/tabmat/constructor.py | 6 +++--- tests/test_formula.py | 6 ------ 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index ecb5745f..fbb803b7 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -1,6 +1,6 @@ import sys import warnings -from typing import List, Union +from typing import Any, List, Mapping, Optional, Union import numpy as np import pandas as pd @@ -190,7 +190,7 @@ def from_formula( interaction_separator: str = ":", categorical_format: str = "{name}[T.{category}]", intercept_name: str = "Intercept", - context=0, + context: Optional[Union[int, Mapping[str, Any]]] = 0, ) -> SplitMatrix: """ Transform a pandas data frame to a SplitMatrix using a Wilkinson formula. @@ -210,7 +210,7 @@ def from_formula( Has to include the placeholders ``{name}`` and ``{category}``. intercept_name: str, default "Intercept" The name of the intercept column. - context: + context: Union[int, Mapping[str, Any]], default 0 The context to use for evaluating the formula. If an integer, the context is taken from the stack frame of the caller at the given depth. If None, the context is taken from the stack frame of the diff --git a/tests/test_formula.py b/tests/test_formula.py index 0300f52d..26299af3 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -512,12 +512,6 @@ def test_na_handling(self, data_with_nulls, formula, tests): assert isinstance(mm, tm.MatrixBase) assert mm.shape == (3, len(tests[0]) + (-1 if "A" in formula else 0)) - if formula != "C(A)": # C(A) pre-encodes the data, stripping out nulls. - with pytest.raises(ValueError): - TabmatMaterializer(data_with_nulls).get_model_matrix( - formula, na_action="raise" - ) - def test_state(self, materializer): mm = materializer.get_model_matrix("center(a) - 1") assert isinstance(mm, tm.MatrixBase) From fbd9ad9873c12258df9afd7059d6cef40a2421e6 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 27 Jun 2023 15:06:31 +0200 Subject: [PATCH 29/72] Add unit tests for interactable vectors --- src/tabmat/formula.py | 13 +++++---- tests/test_formula.py | 62 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 6 deletions(-) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index fb2b398f..94dabfa4 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -457,7 +457,7 @@ def _interact( return _InteractableDenseVector(left.values * right.values, name=new_name) else: - return _interact(right, left, reverse=True, separator=separator) + return _interact(right, left, reverse=not reverse, separator=separator) if isinstance(left, _InteractableSparseVector): if isinstance(right, (_InteractableDenseVector, _InteractableSparseVector)): @@ -466,19 +466,19 @@ def _interact( else: new_name = f"{right.name}{separator}{left.name}" return _InteractableSparseVector( - left.values.multiply(right.values), + left.values.multiply(right.values.reshape((-1, 1))), name=new_name, ) else: - return _interact(right, left, reverse=True, separator=separator) + return _interact(right, left, reverse=not reverse, separator=separator) if isinstance(left, _InteractableCategoricalVector): if isinstance(right, (_InteractableDenseVector, _InteractableSparseVector)): if isinstance(right, _InteractableDenseVector): right_values = right.values else: - right_values = right.values.todense() + right_values = right.values.toarray().squeeze() if not reverse: new_categories = [ f"{cat}{separator}{right.name}" for cat in left.categories @@ -497,7 +497,10 @@ def _interact( ) elif isinstance(right, _InteractableCategoricalVector): - return _interact_categoricals(left, right, separator=separator) + if not reverse: + return _interact_categoricals(left, right, separator=separator) + else: + return _interact_categoricals(right, left, separator=separator) raise TypeError( f"Cannot interact {type(left).__name__} with {type(right).__name__}" diff --git a/tests/test_formula.py b/tests/test_formula.py index 26299af3..4b615000 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -12,7 +12,13 @@ from scipy import sparse as sps import tabmat as tm -from tabmat.formula import TabmatMaterializer +from tabmat.formula import ( + TabmatMaterializer, + _interact, + _InteractableCategoricalVector, + _InteractableDenseVector, + _InteractableSparseVector, +) @pytest.fixture @@ -394,6 +400,60 @@ def test_names_against_pandas(df, formula, ensure_full_rank): assert model_tabmat.model_spec.column_names == tuple(model_df.columns) +VECTORS = [ + _InteractableDenseVector(np.array([1, 2, 3, 4, 5], dtype=np.float64)).set_name( + "dense" + ), + _InteractableSparseVector( + sps.csc_matrix(np.array([[1, 0, 0, 0, 2]], dtype=np.float64).T) + ).set_name("sparse"), + _InteractableCategoricalVector.from_categorical( + pd.Categorical(["a", "b", "c", "b", "a"]), reduced_rank=True + ).set_name("cat_reduced"), + _InteractableCategoricalVector.from_categorical( + pd.Categorical(["a", "b", "c", "b", "a"]), reduced_rank=False + ).set_name("cat_full"), +] + + +@pytest.mark.parametrize( + "left", VECTORS, ids=["dense", "sparse", "cat_full", "cat_reduced"] +) +@pytest.mark.parametrize( + "right", VECTORS, ids=["dense", "sparse", "cat_full", "cat_reduced"] +) +@pytest.mark.parametrize("reverse", [False, True], ids=["not_reversed", "reversed"]) +def test_interactable_vectors(left, right, reverse): + n = left.to_tabmat().shape[0] + left_np = left.to_tabmat().A.reshape((n, -1)) + right_np = right.to_tabmat().A.reshape((n, -1)) + + if reverse: + left_np, right_np = right_np, left_np + + if isinstance(left, _InteractableCategoricalVector) and isinstance( + right, _InteractableCategoricalVector + ): + result_np = np.zeros((n, left_np.shape[1] * right_np.shape[1])) + for i in range(right_np.shape[1]): + for j in range(left_np.shape[1]): + result_np[:, i * left_np.shape[1] + j] = left_np[:, j] * right_np[:, i] + else: + result_np = left_np * right_np + + result_tm = _interact(left, right, reverse=reverse) + np.testing.assert_array_equal( + result_tm.to_tabmat().A.squeeze(), result_np.squeeze() + ) + if not reverse: + assert result_tm.name == left.name + ":" + right.name + else: + assert result_tm.name == right.name + ":" + left.name + + +# Tests from formulaic's test suite +# --------------------------------- + FORMULAIC_TESTS = { # '': (, , , ) "a": (["Intercept", "a"], ["Intercept", "a"], ["Intercept", "a"], 2), From 20b617b7f51512ef482b04cdbd29b3b7a6709a0a Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 27 Jun 2023 15:10:30 +0200 Subject: [PATCH 30/72] Add more checks --- tests/test_formula.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/tests/test_formula.py b/tests/test_formula.py index 4b615000..098f1f55 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -441,14 +441,30 @@ def test_interactable_vectors(left, right, reverse): else: result_np = left_np * right_np - result_tm = _interact(left, right, reverse=reverse) + result_vec = _interact(left, right, reverse=reverse) + + # Test types + if isinstance(left, _InteractableCategoricalVector) or isinstance( + right, _InteractableCategoricalVector + ): + assert isinstance(result_vec, _InteractableCategoricalVector) + elif isinstance(left, _InteractableSparseVector) or isinstance( + right, _InteractableSparseVector + ): + assert isinstance(result_vec, _InteractableSparseVector) + else: + assert isinstance(result_vec, _InteractableDenseVector) + + # Test values np.testing.assert_array_equal( - result_tm.to_tabmat().A.squeeze(), result_np.squeeze() + result_vec.to_tabmat().A.squeeze(), result_np.squeeze() ) + + # Test names if not reverse: - assert result_tm.name == left.name + ":" + right.name + assert result_vec.name == left.name + ":" + right.name else: - assert result_tm.name == right.name + ":" + left.name + assert result_vec.name == right.name + ":" + left.name # Tests from formulaic's test suite From 010ad8ec323b5326d49b31612af1c589222315e3 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 27 Jun 2023 16:45:00 +0200 Subject: [PATCH 31/72] Change argument name --- src/tabmat/constructor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index fbb803b7..c19c8908 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -182,7 +182,7 @@ def from_csc(mat: sps.csc_matrix, threshold=0.1): def from_formula( formula: Union[str, Formula], - df: pd.DataFrame, + data: pd.DataFrame, dtype: np.dtype = np.float64, sparse_threshold: float = 0.1, cat_threshold: int = 4, @@ -199,7 +199,7 @@ def from_formula( ---------- formula: str A formula accepted by formulaic. - df: pd.DataFrame + data: pd.DataFrame pandas data frame to be converted. ensure_full_rank: bool, default False If True, ensure that the matrix has full structural rank by categories. @@ -227,7 +227,7 @@ def from_formula( ensure_full_rank=ensure_full_rank, ) materializer = TabmatMaterializer( - df, + data, context=context, interaction_separator=interaction_separator, categorical_format=categorical_format, From fcc1a916c26dfb4b7a5816fd2d7d5ddd94f8abd0 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 27 Jun 2023 19:03:59 +0200 Subject: [PATCH 32/72] Make C() stateful (remember levels) --- src/tabmat/formula.py | 100 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 90 insertions(+), 10 deletions(-) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index 94dabfa4..8a8aefc9 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -2,14 +2,17 @@ import itertools from abc import ABC, abstractmethod from collections import OrderedDict -from typing import List, Optional, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import numpy import pandas from formulaic import ModelMatrix, ModelSpec +from formulaic.errors import FactorEncodingError from formulaic.materializers import FormulaMaterializer from formulaic.materializers.base import EncodedTermStructure -from formulaic.materializers.types import FactorValues, NAAction +from formulaic.materializers.types import FactorValues, NAAction, ScopedTerm +from formulaic.parser.types import Term +from formulaic.transforms import stateful_transform from interface_meta import override from scipy import sparse as sps @@ -213,6 +216,51 @@ def _get_columns_for_term(self, factors, spec, scale=1): ) return out + # Again, need a small change to handle categoricals properly + @override + def _enforce_structure( + self, + cols: List[Tuple[Term, List[ScopedTerm], Dict[str, Any]]], + spec, + drop_rows: set, + ): + # TODO: Verify that imputation strategies are intuitive and make sense. + assert len(cols) == len(spec.structure) + for i, col_spec in enumerate(cols): + scoped_cols = col_spec[2] + target_cols = spec.structure[i][2] + if len(scoped_cols) > len(target_cols): + raise FactorEncodingError( + f"Term `{col_spec[0]}` has generated too many columns compared to " + f"specification: generated {list(scoped_cols)}, expecting " + f"{target_cols}." + ) + if len(scoped_cols) < len(target_cols): + if len(scoped_cols) == 0: + col = self._encode_constant(0, None, None, spec, drop_rows) + elif len(scoped_cols) == 1: + col = tuple(scoped_cols.values())[0] + # This is the small change: + if isinstance(col, _InteractableCategoricalVector): + target_cols = [col.name] + else: + raise FactorEncodingError( + f"Term `{col_spec[0]}` has generated insufficient columns " + "compared to specification: generated {list(scoped_cols)}, " + f"expecting {target_cols}." + ) + scoped_cols = {name: col for name in target_cols} + elif set(scoped_cols) != set(target_cols): + raise FactorEncodingError( + f"Term `{col_spec[0]}` has generated columns that are inconsistent " + "with specification: generated {list(scoped_cols)}, expecting " + f"{target_cols}." + ) + + yield col_spec[0], col_spec[1], { + col: scoped_cols[col] for col in target_cols + } + class _InteractableVector(ABC): """Abstract base class for interactable vectors, which are mostly thin @@ -553,6 +601,7 @@ def _interact_categoricals( def _C( data, *, + levels: Optional[Iterable[str]] = None, spans_intercept: bool = True, ): """ @@ -564,17 +613,20 @@ def _C( """ def encoder( - values, - reduced_rank, - drop_rows, - encoder_state, - model_spec, + values: Any, + reduced_rank: bool, + drop_rows: List[int], + encoder_state: Dict[str, Any], + model_spec: ModelSpec, ): if drop_rows: values = values.drop(index=values.index[drop_rows]) - cat = pandas.Categorical(values._values) - return _InteractableCategoricalVector.from_categorical( - cat, reduced_rank=reduced_rank + return encode_contrasts( + values, + levels=levels, + reduced_rank=reduced_rank, + _state=encoder_state, + _spec=model_spec, ) return FactorValues( @@ -583,3 +635,31 @@ def encoder( spans_intercept=spans_intercept, encoder=encoder, ) + + +@stateful_transform +def encode_contrasts( + data, + *, + levels: Optional[Iterable[str]] = None, + reduced_rank: bool = False, + _state=None, + _spec=None, +) -> FactorValues[_InteractableCategoricalVector]: + """ + Encode a categorical dataset into one an _InteractableCategoricalVector + + Parameters + ---------- + data: The categorical data array/series to be encoded. + levels: The complete set of levels (categories) posited to be present in + the data. This can also be used to reorder the levels as needed. + reduced_rank: Whether to reduce the rank of output encoded columns in + order to avoid spanning the intercept. + """ + levels = levels if levels is not None else _state.get("categories") + cat = pandas.Categorical(data._values, categories=levels) + _state["categories"] = cat.categories + return _InteractableCategoricalVector.from_categorical( + cat, reduced_rank=reduced_rank + ) From d9d1353104aff0da550834c83e6806ad211cc20f Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 27 Jun 2023 19:27:55 +0200 Subject: [PATCH 33/72] Add test for categorizer state --- tests/test_formula.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test_formula.py b/tests/test_formula.py index 098f1f55..7d10aada 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -400,6 +400,19 @@ def test_names_against_pandas(df, formula, ensure_full_rank): assert model_tabmat.model_spec.column_names == tuple(model_df.columns) +@pytest.mark.parametrize( + "ensure_full_rank", [True, False], ids=["full_rank", "all_levels"] +) +def test_C_state(df, ensure_full_rank): + model_tabmat = tm.from_formula("C(str_1) : C(cat_1)", df, cat_threshold=0) + model_tabmat_2 = model_tabmat.model_spec.get_model_matrix(df[:2]) + np.testing.assert_array_equal(model_tabmat.A[:2, :], model_tabmat_2.A) + np.testing.assert_array_equal( + model_tabmat.matrices[1].cat.categories, + model_tabmat_2.matrices[1].cat.categories, + ) + + VECTORS = [ _InteractableDenseVector(np.array([1, 2, 3, 4, 5], dtype=np.float64)).set_name( "dense" From 695de6f4e268466936c0d634a7e0ca09aa074ff5 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 28 Jun 2023 14:56:51 +0200 Subject: [PATCH 34/72] More correct handling of encoding categoricals --- src/tabmat/formula.py | 58 ++++++++++++++++++++++++++++++++++++------- tests/test_formula.py | 13 ++++++++-- 2 files changed, 60 insertions(+), 11 deletions(-) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index 8a8aefc9..0735e6c8 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -94,9 +94,12 @@ def _encode_categorical( # We do not do any encoding here as it is handled by tabmat if drop_rows: values = values.drop(index=values.index[drop_rows]) - cat = pandas.Categorical(values._values) - return _InteractableCategoricalVector.from_categorical( - cat, reduced_rank=reduced_rank + return encode_contrasts( + values, + reduced_rank=reduced_rank, + _metadata=metadata, + _state=encoder_state, + _spec=spec, ) @override @@ -216,7 +219,7 @@ def _get_columns_for_term(self, factors, spec, scale=1): ) return out - # Again, need a small change to handle categoricals properly + # Again, need a correction to handle categoricals properly @override def _enforce_structure( self, @@ -224,11 +227,23 @@ def _enforce_structure( spec, drop_rows: set, ): - # TODO: Verify that imputation strategies are intuitive and make sense. assert len(cols) == len(spec.structure) for i, col_spec in enumerate(cols): scoped_cols = col_spec[2] - target_cols = spec.structure[i][2] + target_cols = spec.structure[i][2].copy() + + # Correction for categorical variables: + for name, col in scoped_cols.items(): + if isinstance(col, _InteractableCategoricalVector): + try: + _replace_sequence(target_cols, col.get_names(), name) + except ValueError: + raise FactorEncodingError( + f"Term `{col_spec[0]}` has generated columns that are " + "inconsistent with the specification: generated: " + f"{col.get_names()}, expecting: {target_cols}." + ) + if len(scoped_cols) > len(target_cols): raise FactorEncodingError( f"Term `{col_spec[0]}` has generated too many columns compared to " @@ -240,9 +255,6 @@ def _enforce_structure( col = self._encode_constant(0, None, None, spec, drop_rows) elif len(scoped_cols) == 1: col = tuple(scoped_cols.values())[0] - # This is the small change: - if isinstance(col, _InteractableCategoricalVector): - target_cols = [col.name] else: raise FactorEncodingError( f"Term `{col_spec[0]}` has generated insufficient columns " @@ -663,3 +675,31 @@ def encode_contrasts( return _InteractableCategoricalVector.from_categorical( cat, reduced_rank=reduced_rank ) + + +def _replace_sequence(lst: List[str], sequence: List[str], replacement: "str") -> None: + """Replace a sequence of elements in a list with a single element. + + Raises a ValueError if the sequence is not in the list in the correct order. + Only checks for the first possible start of the sequence. + + Parameters + ---------- + lst : List[str] + The list to replace elements in. + sequence : List[str] + The sequence of elements to replace. + replacement : str + The element to replace the sequence with. + """ + try: + start = lst.index(sequence[0]) + except ValueError: + start = 0 # Will handle this below + + for elem in sequence: + if lst[start] != elem: + raise ValueError("The sequence is not in the list") + del lst[start] + + lst.insert(start, replacement) diff --git a/tests/test_formula.py b/tests/test_formula.py index 7d10aada..55344e3f 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -400,11 +400,20 @@ def test_names_against_pandas(df, formula, ensure_full_rank): assert model_tabmat.model_spec.column_names == tuple(model_df.columns) +@pytest.mark.parametrize( + "formula", + [ + pytest.param("str_1 : cat_1", id="implicit"), + pytest.param("C(str_1) : C(cat_1, spans_intercept=False)", id="explicit"), + ], +) @pytest.mark.parametrize( "ensure_full_rank", [True, False], ids=["full_rank", "all_levels"] ) -def test_C_state(df, ensure_full_rank): - model_tabmat = tm.from_formula("C(str_1) : C(cat_1)", df, cat_threshold=0) +def test_C_state(df, formula, ensure_full_rank): + model_tabmat = tm.from_formula( + "str_1 : cat_1", df, cat_threshold=0, ensure_full_rank=ensure_full_rank + ) model_tabmat_2 = model_tabmat.model_spec.get_model_matrix(df[:2]) np.testing.assert_array_equal(model_tabmat.A[:2, :], model_tabmat_2.A) np.testing.assert_array_equal( From 064daac74b35aa4f80b6de5abc021c12ddb5dc51 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 28 Jun 2023 19:19:20 +0200 Subject: [PATCH 35/72] Make adding an intercept implicitly parametrizable Default is False --- src/tabmat/constructor.py | 9 ++- tests/test_formula.py | 125 ++++++++++++++++++++++++++++---------- 2 files changed, 101 insertions(+), 33 deletions(-) diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index c19c8908..bb19af53 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd from formulaic import Formula, ModelSpec +from formulaic.parser import DefaultFormulaParser from formulaic.utils.layered_mapping import LayeredMapping from pandas.api.types import is_numeric_dtype from scipy import sparse as sps @@ -190,6 +191,7 @@ def from_formula( interaction_separator: str = ":", categorical_format: str = "{name}[T.{category}]", intercept_name: str = "Intercept", + include_intercept: bool = False, context: Optional[Union[int, Mapping[str, Any]]] = 0, ) -> SplitMatrix: """ @@ -210,6 +212,9 @@ def from_formula( Has to include the placeholders ``{name}`` and ``{category}``. intercept_name: str, default "Intercept" The name of the intercept column. + include_intercept: bool, default False + Whether to include an intercept column if the formula does not + include (``+ 1``) or exclude (``+ 0`` or ``- 1``) it explicitly. context: Union[int, Mapping[str, Any]], default 0 The context to use for evaluating the formula. If an integer, the context is taken from the stack frame of the caller at the given @@ -223,7 +228,9 @@ def from_formula( else: context = None spec = ModelSpec( - formula=Formula(formula), + formula=Formula( + formula, _parser=DefaultFormulaParser(include_intercept=include_intercept) + ), ensure_full_rank=ensure_full_rank, ) materializer = TabmatMaterializer( diff --git a/tests/test_formula.py b/tests/test_formula.py index 55344e3f..01c7887e 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -48,7 +48,7 @@ def test_retrieval(): "formula, expected", [ pytest.param( - "num_1", + "1 + num_1", tm.SplitMatrix( [ tm.DenseMatrix( @@ -61,7 +61,7 @@ def test_retrieval(): id="numeric", ), pytest.param( - "cat_1", + "1 + cat_1", tm.SplitMatrix( [ tm.DenseMatrix(np.array([[1.0, 1.0, 1.0, 1.0, 1.0]]).T), @@ -83,7 +83,7 @@ def test_retrieval(): id="categorical", ), pytest.param( - "num_1 : cat_1", + "1 + num_1 : cat_1", tm.SplitMatrix( [ tm.DenseMatrix(np.array([[1.0, 1.0, 1.0, 1.0, 1.0]]).T), @@ -152,7 +152,7 @@ def test_matrix_against_expectation(df, formula, expected): "formula, expected", [ pytest.param( - "num_1", + "1 + num_1", tm.SplitMatrix( [ tm.DenseMatrix( @@ -165,7 +165,7 @@ def test_matrix_against_expectation(df, formula, expected): id="numeric", ), pytest.param( - "cat_1", + "1 + cat_1", tm.SplitMatrix( [ tm.DenseMatrix(np.array([[1.0, 1.0, 1.0, 1.0, 1.0]]).T), @@ -187,7 +187,7 @@ def test_matrix_against_expectation(df, formula, expected): id="categorical", ), pytest.param( - "num_1 : cat_1", + "1 + num_1 : cat_1", tm.SplitMatrix( [ tm.DenseMatrix(np.array([[1.0, 1.0, 1.0, 1.0, 1.0]]).T), @@ -289,20 +289,24 @@ def test_matrix_against_expectation_qcl(df, formula, expected): def test_matrix_against_pandas(df, formula, ensure_full_rank): num_in_scope = 2 # noqa model_df = formulaic.model_matrix(formula, df, ensure_full_rank=ensure_full_rank) - model_tabmat = tm.from_formula(formula, df, ensure_full_rank=ensure_full_rank) + model_tabmat = tm.from_formula( + formula, df, ensure_full_rank=ensure_full_rank, include_intercept=True + ) np.testing.assert_array_equal(model_df.to_numpy(), model_tabmat.A) @pytest.mark.parametrize( "formula, expected_names", [ - pytest.param("num_1 + num_2", ("Intercept", "num_1", "num_2"), id="numeric"), + pytest.param( + "1 + num_1 + num_2", ("Intercept", "num_1", "num_2"), id="numeric" + ), pytest.param("num_1 + num_2 - 1", ("num_1", "num_2"), id="no_intercept"), pytest.param( - "cat_1", ("Intercept", "cat_1[T.b]", "cat_1[T.c]"), id="categorical" + "1 + cat_1", ("Intercept", "cat_1[T.b]", "cat_1[T.c]"), id="categorical" ), pytest.param( - "cat_2 * cat_3", + "1 + cat_2 * cat_3", ( "Intercept", "cat_2[T.y]", @@ -319,7 +323,9 @@ def test_matrix_against_pandas(df, formula, ensure_full_rank): id="polynomial", ), pytest.param( - "{np.log(num_1 ** 2)}", ("Intercept", "np.log(num_1 ** 2)"), id="functions" + "1 + {np.log(num_1 ** 2)}", + ("Intercept", "np.log(num_1 ** 2)"), + id="functions", ), ], ) @@ -331,9 +337,11 @@ def test_names_against_expectation(df, formula, expected_names): @pytest.mark.parametrize( "formula, expected_names", [ - pytest.param("cat_1", ("intercept", "cat_1__b", "cat_1__c"), id="categorical"), pytest.param( - "cat_2 * cat_3", + "1 + cat_1", ("intercept", "cat_1__b", "cat_1__c"), id="categorical" + ), + pytest.param( + "1 + cat_2 * cat_3", ( "intercept", "cat_2__y", @@ -350,7 +358,9 @@ def test_names_against_expectation(df, formula, expected_names): id="polynomial", ), pytest.param( - "{np.log(num_1 ** 2)}", ("intercept", "np.log(num_1 ** 2)"), id="functions" + "1 + {np.log(num_1 ** 2)}", + ("intercept", "np.log(num_1 ** 2)"), + id="functions", ), ], ) @@ -372,22 +382,22 @@ def test_names_against_expectation_qcl(df, formula, expected_names): @pytest.mark.parametrize( "formula", [ - pytest.param("num_1 + num_2", id="numeric"), - pytest.param("cat_1 + cat_2", id="categorical"), - pytest.param("cat_1 * cat_2 * cat_3", id="interaction"), - pytest.param("num_1 + cat_1 * num_2 * cat_2", id="mixed"), - pytest.param("{np.log(num_1)} + {num_in_scope * num_2}", id="functions"), - pytest.param("{num_1 * num_in_scope}", id="variable_in_scope"), - pytest.param("bs(num_1, 3)", id="spline"), + pytest.param("1 + num_1 + num_2", id="numeric"), + pytest.param("1 + cat_1 + cat_2", id="categorical"), + pytest.param("1 + cat_1 * cat_2 * cat_3", id="interaction"), + pytest.param("1 + num_1 + cat_1 * num_2 * cat_2", id="mixed"), + pytest.param("1 + {np.log(num_1)} + {num_in_scope * num_2}", id="functions"), + pytest.param("1 + {num_1 * num_in_scope}", id="variable_in_scope"), + pytest.param("1 + bs(num_1, 3)", id="spline"), pytest.param( - "poly(num_1, 3, raw=True) + poly(num_2, 3, raw=False)", id="polynomial" + "1 + poly(num_1, 3, raw=True) + poly(num_2, 3, raw=False)", id="polynomial" ), pytest.param( - "C(num_1)", + "1 + C(num_1)", id="convert_to_categorical", ), pytest.param( - "C(cat_1, spans_intercept=False) * cat_2 * cat_3", + "1 + C(cat_1, spans_intercept=False) * cat_2 * cat_3", id="custom_contrasts", ), ], @@ -400,6 +410,56 @@ def test_names_against_pandas(df, formula, ensure_full_rank): assert model_tabmat.model_spec.column_names == tuple(model_df.columns) +@pytest.mark.parametrize( + "ensure_full_rank", [True, False], ids=["full_rank", "all_levels"] +) +@pytest.mark.parametrize( + "formula, formula_with_intercept, formula_wo_intercept", + [ + ("num_1", "1 + num_1", "num_1 - 1"), + ("cat_1", "1 + cat_1", "cat_1 - 1"), + ( + "num_1 * cat_1 * cat_2", + "1 + num_1 * cat_1 * cat_2", + "num_1 * cat_1 * cat_2 - 1", + ), + ], + ids=["numeric", "categorical", "mixed"], +) +def test_include_intercept( + df, formula, formula_with_intercept, formula_wo_intercept, ensure_full_rank +): + model_no_include = tm.from_formula( + formula, df, include_intercept=False, ensure_full_rank=ensure_full_rank + ) + model_no_intercept = tm.from_formula( + formula_wo_intercept, + df, + include_intercept=True, + ensure_full_rank=ensure_full_rank, + ) + np.testing.assert_array_equal(model_no_include.A, model_no_intercept.A) + assert ( + model_no_include.model_spec.column_names + == model_no_intercept.model_spec.column_names + ) + + model_include = tm.from_formula( + formula, df, include_intercept=True, ensure_full_rank=ensure_full_rank + ) + model_intercept = tm.from_formula( + formula_with_intercept, + df, + include_intercept=False, + ensure_full_rank=ensure_full_rank, + ) + np.testing.assert_array_equal(model_include.A, model_intercept.A) + assert ( + model_no_include.model_spec.column_names + == model_no_intercept.model_spec.column_names + ) + + @pytest.mark.parametrize( "formula", [ @@ -412,7 +472,7 @@ def test_names_against_pandas(df, formula, ensure_full_rank): ) def test_C_state(df, formula, ensure_full_rank): model_tabmat = tm.from_formula( - "str_1 : cat_1", df, cat_threshold=0, ensure_full_rank=ensure_full_rank + "str_1 : cat_1 + 1", df, cat_threshold=0, ensure_full_rank=ensure_full_rank ) model_tabmat_2 = model_tabmat.model_spec.get_model_matrix(df[:2]) np.testing.assert_array_equal(model_tabmat.A[:2, :], model_tabmat_2.A) @@ -602,13 +662,14 @@ def test_na_handling(self, data_with_nulls, formula, tests): assert mm.shape == (tests[3], len(tests[2])) assert list(mm.model_spec.column_names) == tests[2] - # Tabmat does not allo NAs in categoricals - if formula == "a": - mm = TabmatMaterializer(data_with_nulls).get_model_matrix( - formula, na_action="ignore" - ) - assert isinstance(mm, tm.MatrixBase) - assert mm.shape == (3, len(tests[0]) + (-1 if "A" in formula else 0)) + if formula != "a": + pytest.skip("Tabmat does not allo NAs in categoricals") + + mm = TabmatMaterializer(data_with_nulls).get_model_matrix( + formula, na_action="ignore" + ) + assert isinstance(mm, tm.MatrixBase) + assert mm.shape == (3, len(tests[0]) + (-1 if "A" in formula else 0)) def test_state(self, materializer): mm = materializer.get_model_matrix("center(a) - 1") From 55ae36f756f856123152b37084a40394fafbb57a Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Fri, 30 Jun 2023 09:17:06 +0200 Subject: [PATCH 36/72] Add na_action parameter to constrictor --- src/tabmat/constructor.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index bb19af53..73e78462 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd from formulaic import Formula, ModelSpec +from formulaic.materializers.types import NAAction from formulaic.parser import DefaultFormulaParser from formulaic.utils.layered_mapping import LayeredMapping from pandas.api.types import is_numeric_dtype @@ -184,10 +185,11 @@ def from_csc(mat: sps.csc_matrix, threshold=0.1): def from_formula( formula: Union[str, Formula], data: pd.DataFrame, + ensure_full_rank: bool = False, + na_action: Union[str, NAAction] = NAAction.IGNORE, dtype: np.dtype = np.float64, sparse_threshold: float = 0.1, cat_threshold: int = 4, - ensure_full_rank: bool = False, interaction_separator: str = ":", categorical_format: str = "{name}[T.{category}]", intercept_name: str = "Intercept", @@ -205,6 +207,15 @@ def from_formula( pandas data frame to be converted. ensure_full_rank: bool, default False If True, ensure that the matrix has full structural rank by categories. + na_action: Union[str, NAAction], default NAAction.IGNORE + How to handle missing values. Can be one of "drop", "ignore", "raise". + dtype: np.dtype, default np.float64 + The dtype of the resulting matrix. + sparse_threshold: float, default 0.1 + The density below which a column is treated as sparse. + cat_threshold: int, default 4 + The number of categories below which a categorical column is one-hot + encoded. This is only checked after interactions have been applied. interaction_separator: str, default ":" The separator between the names of interacted variables. categorical_format: str, default "{name}[T.{category}]" @@ -232,6 +243,7 @@ def from_formula( formula, _parser=DefaultFormulaParser(include_intercept=include_intercept) ), ensure_full_rank=ensure_full_rank, + na_action=na_action, ) materializer = TabmatMaterializer( data, From 5f253a3457d4f3c53247d6d217a15095002058e5 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Fri, 30 Jun 2023 12:44:31 +0200 Subject: [PATCH 37/72] Add test for sparse numerical columns --- tests/test_formula.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/test_formula.py b/tests/test_formula.py index 01c7887e..89d9ff9d 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -82,6 +82,25 @@ def test_retrieval(): ), id="categorical", ), + pytest.param( + "{np.where(num_1 >= 2, num_1, 0)} * {np.where(num_2 <= 2, num_2, 0)}", + tm.SplitMatrix( + [ + tm.DenseMatrix(np.array([[0.0, 2.0, 3.0, 4.0, 5.0]]).T), + tm.SparseMatrix( + sps.csc_matrix( + np.array( + [ + [1.0, 2.0, 0.0, 0.0, 0.0], + [0.0, 2.0, 0.0, 0.0, 0.0], + ] + ) + ).T + ), + ] + ), + id="numeric_sparse", + ), pytest.param( "1 + num_1 : cat_1", tm.SplitMatrix( From 9caa6dfbf87a9ed4d9a20f384396ee93ee5fac2b Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 2 Aug 2023 13:04:46 +0200 Subject: [PATCH 38/72] Add option to not add the constant column --- src/tabmat/constructor.py | 7 ++++++- src/tabmat/formula.py | 5 +++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index 73e78462..b281bbed 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -194,6 +194,7 @@ def from_formula( categorical_format: str = "{name}[T.{category}]", intercept_name: str = "Intercept", include_intercept: bool = False, + add_column_for_intercept: bool = True, context: Optional[Union[int, Mapping[str, Any]]] = 0, ) -> SplitMatrix: """ @@ -224,8 +225,11 @@ def from_formula( intercept_name: str, default "Intercept" The name of the intercept column. include_intercept: bool, default False - Whether to include an intercept column if the formula does not + Whether to include an intercept term if the formula does not include (``+ 1``) or exclude (``+ 0`` or ``- 1``) it explicitly. + add_column_for_intercept: bool, default = True + Whether to add a column of ones for the intercept, or just + have a term without a corresponding column. For advanced use only. context: Union[int, Mapping[str, Any]], default 0 The context to use for evaluating the formula. If an integer, the context is taken from the stack frame of the caller at the given @@ -254,5 +258,6 @@ def from_formula( dtype=dtype, sparse_threshold=sparse_threshold, cat_threshold=cat_threshold, + add_column_for_intercept=add_column_for_intercept, ) return materializer.get_model_matrix(spec) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index 0735e6c8..4cea1162 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -41,6 +41,9 @@ def _init(self): self.dtype = self.params.get("dtype", numpy.float64) self.sparse_threshold = self.params.get("sparse_threshold", 0.1) self.cat_threshold = self.params.get("cat_threshold", 4) + self.add_column_for_intercept = self.params.get( + "add_column_for_intercept", True + ) # We can override formulaic's C() function here self.context["C"] = _C @@ -139,6 +142,8 @@ def _build_model_matrix(self, spec: ModelSpec, drop_rows): scoped_cols = OrderedDict() for scoped_term in scoped_terms: if not scoped_term.factors: + if not self.add_column_for_intercept: + continue scoped_cols[ "Intercept" ] = scoped_term.scale * self._encode_constant( From f180075bd3aee5c0f3113721c4f68d30feca2d6b Mon Sep 17 00:00:00 2001 From: "quant-ranger[bot]" <132915763+quant-ranger[bot]@users.noreply.github.com> Date: Mon, 26 Jun 2023 08:10:06 +0100 Subject: [PATCH 39/72] Pre-commit autoupdate (#274) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2fa1dbd2..94355447 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,7 +27,7 @@ repos: - id: isort-conda additional_dependencies: [toml] - repo: https://github.com/Quantco/pre-commit-mirrors-mypy - rev: "1.3.0" + rev: "1.4.0" hooks: - id: mypy-conda additional_dependencies: From 5eaba1365890bfc5b021072a7af980a1c0ce9c72 Mon Sep 17 00:00:00 2001 From: "quant-ranger[bot]" <132915763+quant-ranger[bot]@users.noreply.github.com> Date: Mon, 3 Jul 2023 08:39:23 +0100 Subject: [PATCH 40/72] Pre-commit autoupdate (#276) Co-authored-by: quant-ranger[bot] <132915763+quant-ranger[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 94355447..5a37c63e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,7 +27,7 @@ repos: - id: isort-conda additional_dependencies: [toml] - repo: https://github.com/Quantco/pre-commit-mirrors-mypy - rev: "1.4.0" + rev: "1.4.1" hooks: - id: mypy-conda additional_dependencies: From 6fb5a96e86b060dfca340d15bdde0339c7627c5c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Jul 2023 08:40:15 +0100 Subject: [PATCH 41/72] Bump pypa/gh-action-pypi-publish from 1.8.6 to 1.8.7 (#277) Bumps [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish) from 1.8.6 to 1.8.7. - [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases) - [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/v1.8.6...v1.8.7) --- updated-dependencies: - dependency-name: pypa/gh-action-pypi-publish dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/build_wheels_release.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_wheels_release.yml b/.github/workflows/build_wheels_release.yml index 0edbde90..3f61d37c 100644 --- a/.github/workflows/build_wheels_release.yml +++ b/.github/workflows/build_wheels_release.yml @@ -60,7 +60,7 @@ jobs: name: artifact path: dist - - uses: pypa/gh-action-pypi-publish@v1.8.6 + - uses: pypa/gh-action-pypi-publish@v1.8.7 with: user: __token__ password: ${{ secrets.GH_TESTPYPI_UPLOAD }} @@ -75,7 +75,7 @@ jobs: name: artifact path: dist - - uses: pypa/gh-action-pypi-publish@v1.8.6 + - uses: pypa/gh-action-pypi-publish@v1.8.7 with: user: __token__ password: ${{ secrets.GH_PYPI_UPLOAD }} From a4735770e77ab97e4db3fdd834418116975cf842 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 17 Jul 2023 08:29:56 -0400 Subject: [PATCH 42/72] Bump pypa/gh-action-pypi-publish from 1.8.7 to 1.8.8 (#279) Bumps [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish) from 1.8.7 to 1.8.8. - [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases) - [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/v1.8.7...v1.8.8) --- updated-dependencies: - dependency-name: pypa/gh-action-pypi-publish dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/build_wheels_release.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_wheels_release.yml b/.github/workflows/build_wheels_release.yml index 3f61d37c..b43f3018 100644 --- a/.github/workflows/build_wheels_release.yml +++ b/.github/workflows/build_wheels_release.yml @@ -60,7 +60,7 @@ jobs: name: artifact path: dist - - uses: pypa/gh-action-pypi-publish@v1.8.7 + - uses: pypa/gh-action-pypi-publish@v1.8.8 with: user: __token__ password: ${{ secrets.GH_TESTPYPI_UPLOAD }} @@ -75,7 +75,7 @@ jobs: name: artifact path: dist - - uses: pypa/gh-action-pypi-publish@v1.8.7 + - uses: pypa/gh-action-pypi-publish@v1.8.8 with: user: __token__ password: ${{ secrets.GH_PYPI_UPLOAD }} From 1084b53774219caaecc84880b8abff75f8594d18 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 17 Jul 2023 08:30:20 -0400 Subject: [PATCH 43/72] Bump pypa/cibuildwheel from 2.13.1 to 2.14.1 (#280) Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.13.1 to 2.14.1. - [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.13.1...v2.14.1) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/build_wheels.yml | 2 +- .github/workflows/build_wheels_release.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index b531b3bc..e147b2bf 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -20,7 +20,7 @@ jobs: with: platforms: all - name: Build wheels - uses: pypa/cibuildwheel@v2.13.1 + uses: pypa/cibuildwheel@v2.14.1 env: CIBW_ARCHS_LINUX: auto CIBW_ARCHS_MACOS: x86_64 arm64 diff --git a/.github/workflows/build_wheels_release.yml b/.github/workflows/build_wheels_release.yml index b43f3018..9f6cd123 100644 --- a/.github/workflows/build_wheels_release.yml +++ b/.github/workflows/build_wheels_release.yml @@ -21,7 +21,7 @@ jobs: with: platforms: all - name: Build wheels - uses: pypa/cibuildwheel@v2.13.1 + uses: pypa/cibuildwheel@v2.14.1 env: CIBW_ARCHS_LINUX: auto aarch64 CIBW_ARCHS_MACOS: x86_64 arm64 From ce96be8f332d50d51914c8358358df704393c1fb Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 18 Jul 2023 17:17:08 +0200 Subject: [PATCH 44/72] Minimal implementation (tests green) --- src/tabmat/categorical_matrix.py | 6 ++- src/tabmat/dense_matrix.py | 84 +++++++++++++++++++++++--------- src/tabmat/sparse_matrix.py | 9 ++-- tests/test_matrices.py | 4 +- 4 files changed, 71 insertions(+), 32 deletions(-) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 44c47efd..7a751fdb 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -452,8 +452,10 @@ def _cross_sandwich( R_cols: Optional[np.ndarray] = None, ) -> np.ndarray: """Perform a sandwich product: X.T @ diag(d) @ Y.""" - if isinstance(other, np.ndarray): - return self._cross_dense(other, d, rows, L_cols, R_cols) + from .dense_matrix import DenseMatrix + + if isinstance(other, (np.ndarray, DenseMatrix)): + return self._cross_dense(np.asarray(other), d, rows, L_cols, R_cols) if isinstance(other, sps.csc_matrix): return self._cross_sparse(other, d, rows, L_cols, R_cols) if isinstance(other, CategoricalMatrix): diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 84ef1f1d..c854041f 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -17,7 +17,7 @@ ) -class DenseMatrix(np.ndarray, MatrixBase): +class DenseMatrix(np.lib.mixins.NDArrayOperatorsMixin, MatrixBase): """ A ``numpy.ndarray`` subclass with several additional functions that allow it to share the MatrixBase API with SparseMatrix and CategoricalMatrix. @@ -32,29 +32,65 @@ class DenseMatrix(np.ndarray, MatrixBase): """ - def __new__(cls, input_array): # noqa - """ - Details of how to subclass np.ndarray are explained here: + def __init__(self, input_array): + self._array = np.asarray(input_array) - https://docs.scipy.org/doc/numpy/user/basics.subclassing.html\ - #slightly-more-realistic-example-attribute-added-to-existing-array - """ - obj = np.asarray(input_array).view(cls) - if not np.issubdtype(obj.dtype, np.floating): - raise NotImplementedError("DenseMatrix is only implemented for float data") - return obj + def __getitem__(self, key): + return type(self)(self._array.__getitem__(key)) + + def __array__(self, dtype=None): + return self._array.astype(dtype, copy=False) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + inputs = (x._array if isinstance(x, DenseMatrix) else x for x in inputs) + result = getattr(ufunc, method)(*inputs, **kwargs) + if method in ["__call__", "accumulate"]: + return type(self)(result) + else: + return result + + def __matmul__(self, other): + return self._array.__matmul__(other) + + def __rmatmul__(self, other): + return self._array.__rmatmul__(other) + + @property + def shape(self): + """Tuple of array dimensions.""" + return self._array.shape + + @property + def ndim(self): + """Number of array dimensions.""" # noqa: D401 + return self._array.ndim + + @property + def dtype(self): + """Data-type of the array’s elements.""" # noqa: D401 + return self._array.dtype + + def transpose(self): + """Returns a view of the array with axes transposed.""" # noqa: D401 + return type(self)(self._array.T) + + T = property(transpose) + + def astype(self, dtype, order="K", casting="unsafe", copy=True): + """Copy of the array, cast to a specified type.""" + return type(self)(self._array.astype(dtype, order, casting, copy)) - def __array_finalize__(self, obj): - if obj is None: - return + def sum(self, *args, **kwargs): + """Return the sum of the array elements over the given axis.""" + return self._array.sum(*args, **kwargs) def getcol(self, i): """Return matrix column at specified index.""" - return self[:, [i]] + return type(self)(self._array[:, [i]]) def toarray(self): """Return array representation of matrix.""" - return np.asarray(self) + return self._array def sandwich( self, d: np.ndarray, rows: np.ndarray = None, cols: np.ndarray = None @@ -62,7 +98,7 @@ def sandwich( """Perform a sandwich product: X.T @ diag(d) @ X.""" d = np.asarray(d) rows, cols = setup_restrictions(self.shape, rows, cols) - return dense_sandwich(self, d, rows, cols) + return dense_sandwich(self._array, d, rows, cols) def _cross_sandwich( self, @@ -81,7 +117,7 @@ def _cross_sandwich( def _get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarray: """Get standard deviations of columns.""" - sqrt_arg = transpose_square_dot_weights(self, weights) - col_means**2 + sqrt_arg = transpose_square_dot_weights(self._array, weights) - col_means**2 # Minor floating point errors above can result in a very slightly # negative sqrt_arg (e.g. -5e-16). We just set those values equal to # zero. @@ -105,7 +141,7 @@ def _matvec_helper( # this without an explosion of code? vec = np.asarray(vec) check_matvec_dimensions(self, vec, transpose=transpose) - X = self.T if transpose else self + X = self._array.T if transpose else self._array # NOTE: We assume that rows and cols are unique unrestricted_rows = rows is None or len(rows) == self.shape[0] @@ -122,11 +158,11 @@ def _matvec_helper( # TODO: should take 'out' parameter fast_fnc = dense_rmatvec if transpose else dense_matvec if vec.ndim == 1: - res = fast_fnc(self, vec, rows, cols) + res = fast_fnc(self._array, vec, rows, cols) elif vec.ndim == 2 and vec.shape[1] == 1: - res = fast_fnc(self, vec[:, 0], rows, cols)[:, None] + res = fast_fnc(self._array, vec[:, 0], rows, cols)[:, None] else: - subset = self[np.ix_(rows, cols)] + subset = self._array[np.ix_(rows, cols)] res = subset.T.dot(vec[rows]) if transpose else subset.dot(vec[cols]) if out is None: return res @@ -164,5 +200,5 @@ def multiply(self, other): This assumes that ``other`` is a vector of size ``self.shape[0]``. """ if np.asanyarray(other).ndim == 1: - return super().__mul__(other[:, np.newaxis]) - return super().__mul__(other) + return type(self)(self._array.__mul__(other[:, np.newaxis])) + return type(self)(self._array.__mul__(other)) diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 7f1b44ad..3befbad9 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -59,8 +59,7 @@ def sandwich( self, d: np.ndarray, rows: np.ndarray = None, cols: np.ndarray = None ) -> np.ndarray: """Perform a sandwich product: X.T @ diag(d) @ X.""" - if not hasattr(d, "dtype"): - d = np.asarray(d) + d = np.asarray(d) if not self.dtype == d.dtype: raise TypeError( f"""self and d need to be of same dtype, either np.float64 @@ -80,9 +79,11 @@ def _cross_sandwich( R_cols: Optional[np.ndarray] = None, ): """Perform a sandwich product: X.T @ diag(d) @ Y.""" - if isinstance(other, np.ndarray): - return self.sandwich_dense(other, d, rows, L_cols, R_cols) from .categorical_matrix import CategoricalMatrix + from .dense_matrix import DenseMatrix + + if isinstance(other, (np.ndarray, DenseMatrix)): + return self.sandwich_dense(np.asarray(other), d, rows, L_cols, R_cols) if isinstance(other, CategoricalMatrix): return other._cross_sandwich(self, d, rows, R_cols, L_cols).T diff --git a/tests/test_matrices.py b/tests/test_matrices.py index 64317747..5d314c7e 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -24,7 +24,7 @@ def dense_matrix_C() -> tm.DenseMatrix: def dense_matrix_not_writeable() -> tm.DenseMatrix: mat = dense_matrix_F() - mat.setflags(write=False) + mat._array.setflags(write=False) return mat @@ -440,7 +440,7 @@ def test_rmatmul(mat: Union[tm.MatrixBase, tm.StandardizedMatrix], vec_type): expected = vec_as_list @ mat.A np.testing.assert_allclose(res, expected) np.testing.assert_allclose(res2, expected) - assert isinstance(res, np.ndarray) + assert isinstance(res, (np.ndarray, tm.DenseMatrix)) @pytest.mark.parametrize("mat", get_matrices()) From 6fdde75bb404cf673a4ed6d3a329159f5a6ccb6e Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 19 Jul 2023 09:33:51 +0200 Subject: [PATCH 45/72] Remove sum method and rely on np.sum --- src/tabmat/dense_matrix.py | 4 ---- src/tabmat/standardized_mat.py | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index c854041f..ade5d355 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -80,10 +80,6 @@ def astype(self, dtype, order="K", casting="unsafe", copy=True): """Copy of the array, cast to a specified type.""" return type(self)(self._array.astype(dtype, order, casting, copy)) - def sum(self, *args, **kwargs): - """Return the sum of the array elements over the given axis.""" - return self._array.sum(*args, **kwargs) - def getcol(self, i): """Return matrix column at specified index.""" return type(self)(self._array[:, [i]]) diff --git a/src/tabmat/standardized_mat.py b/src/tabmat/standardized_mat.py index 0d8a0190..19b04f5a 100644 --- a/src/tabmat/standardized_mat.py +++ b/src/tabmat/standardized_mat.py @@ -147,7 +147,7 @@ def sandwich( limited_shift = self.shift[cols] if cols is not None else self.shift limited_d = d[rows] if rows is not None else d - term3_and_4 = np.outer(limited_shift, d_mat + limited_shift * limited_d.sum()) + term3_and_4 = np.outer(limited_shift, d_mat + limited_shift * np.sum(limited_d)) res = term2 + term3_and_4 if isinstance(term1, sps.dia_matrix): idx = np.arange(res.shape[0]) From a8cbf9660ac1a4c7d3809fcb6ce934396c492398 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 19 Jul 2023 16:00:07 +0200 Subject: [PATCH 46/72] Force DenseMatrix to always be 2-dimensional --- src/tabmat/dense_matrix.py | 16 +++++++++++++++- tests/test_matrices.py | 11 ++++++----- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index ade5d355..dcad444a 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -33,9 +33,22 @@ class DenseMatrix(np.lib.mixins.NDArrayOperatorsMixin, MatrixBase): """ def __init__(self, input_array): + input_array = np.asarray(input_array) + + if input_array.ndim == 1: + input_array = input_array.reshape(-1, 1) + elif input_array.ndim > 2: + raise ValueError("Input array must be 1- or 2-dimensional") + self._array = np.asarray(input_array) def __getitem__(self, key): + if not isinstance(key, tuple): + key = (key,) + + # Always return a 2d array + key = tuple([key_i] if np.isscalar(key_i) else key_i for key_i in key) + return type(self)(self._array.__getitem__(key)) def __array__(self, dtype=None): @@ -44,7 +57,8 @@ def __array__(self, dtype=None): def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): inputs = (x._array if isinstance(x, DenseMatrix) else x for x in inputs) result = getattr(ufunc, method)(*inputs, **kwargs) - if method in ["__call__", "accumulate"]: + if method in ("call", "accumulate") and ufunc.signature is None: + # Does not change shape return type(self)(result) else: return result diff --git a/tests/test_matrices.py b/tests/test_matrices.py index 5d314c7e..779b160c 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -233,7 +233,7 @@ def test_to_array_standardized_mat(mat: tm.StandardizedMatrix): @pytest.mark.parametrize("mat", get_matrices()) @pytest.mark.parametrize( "other_type", - [lambda x: x, np.asarray, tm.DenseMatrix], + [lambda x: x, np.asarray], ) @pytest.mark.parametrize("cols", [None, [], [1], np.array([1])]) @pytest.mark.parametrize("other_shape", [[], [1], [2]]) @@ -243,7 +243,7 @@ def test_matvec( """ Mat. - other_type: Function transforming list to list, array, or DenseMatrix + t: Function transforming list to list, array, or DenseMatrix cols: Argument 1 to matvec, specifying which columns of the matrix (and which elements of 'other') to use other_shape: Second dimension of 'other.shape', if any. If other_shape is [], then @@ -303,7 +303,7 @@ def process_mat_vec_subsets(mat, vec, mat_rows, mat_cols, vec_idxs): @pytest.mark.parametrize("mat", get_matrices()) @pytest.mark.parametrize( "other_type", - [lambda x: x, np.array, tm.DenseMatrix], + [lambda x: x, np.array], ) @pytest.mark.parametrize("rows", [None, [], [2], np.arange(2)]) @pytest.mark.parametrize("cols", [None, [], [1], np.arange(1)]) @@ -373,7 +373,7 @@ def test_cross_sandwich( @pytest.mark.parametrize("mat", get_matrices()) @pytest.mark.parametrize( "vec_type", - [lambda x: x, np.array, tm.DenseMatrix], + [lambda x: x, np.array], ) @pytest.mark.parametrize("rows", [None, [], [1], np.arange(2)]) @pytest.mark.parametrize("cols", [None, [], [0], np.arange(1)]) @@ -430,7 +430,7 @@ def test_transpose(mat): @pytest.mark.parametrize("mat", get_matrices()) @pytest.mark.parametrize( "vec_type", - [lambda x: x, np.array, tm.DenseMatrix], + [lambda x: x, np.array], ) def test_rmatmul(mat: Union[tm.MatrixBase, tm.StandardizedMatrix], vec_type): vec_as_list = [3.0, -0.1, 0] @@ -559,6 +559,7 @@ def test_indexing_int_row(mat: Union[tm.MatrixBase, tm.StandardizedMatrix]): @pytest.mark.parametrize("mat", get_matrices()) def test_indexing_range_row(mat: Union[tm.MatrixBase, tm.StandardizedMatrix]): res = mat[0:2, :] + assert res.ndim == 2 if not isinstance(res, np.ndarray): res = res.A expected = mat.A[0:2, :] From b80cdc12351e1d36741290e352323a24fb486f2b Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 19 Jul 2023 16:40:17 +0200 Subject: [PATCH 47/72] Add __repr__ and __str__ methods --- src/tabmat/dense_matrix.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index dcad444a..842d4464 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -1,3 +1,4 @@ +import textwrap from typing import List, Optional, Union import numpy as np @@ -69,6 +70,18 @@ def __matmul__(self, other): def __rmatmul__(self, other): return self._array.__rmatmul__(other) + def __str__(self): + return "{}x{} DenseMatrix:\n\n".format(*self.shape) + np.array_str(self._array) + + def __repr__(self): + class_name = type(self).__name__ + array_str = f"{class_name}({np.array2string(self._array, separator=', ')})" + return textwrap.indent( + array_str, + " " * (len(class_name) + 1), + predicate=lambda line: not line.startswith(class_name), + ) + @property def shape(self): """Tuple of array dimensions.""" From 8983f4d1ff5cc45b5d363d4e74d68174dfddf1da Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 20 Jul 2023 10:31:35 +0200 Subject: [PATCH 48/72] Fix as_mx --- src/tabmat/split_matrix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py index a7618912..aaf88414 100644 --- a/src/tabmat/split_matrix.py +++ b/src/tabmat/split_matrix.py @@ -29,7 +29,7 @@ def as_mx(a: Any): return a elif sps.issparse(a): return SparseMatrix(a) - elif isinstance(a, np.ndarray): + elif isinstance(a, (np.ndarray, DenseMatrix)): return DenseMatrix(a) else: raise ValueError(f"Cannot convert type {type(a)} to Matrix.") From 16e02175ae3662f050187933c25cfdd813931f6e Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 20 Jul 2023 12:31:38 +0200 Subject: [PATCH 49/72] Fix ufunc return value --- src/tabmat/dense_matrix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 842d4464..953fe505 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -58,7 +58,7 @@ def __array__(self, dtype=None): def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): inputs = (x._array if isinstance(x, DenseMatrix) else x for x in inputs) result = getattr(ufunc, method)(*inputs, **kwargs) - if method in ("call", "accumulate") and ufunc.signature is None: + if method in ("__call__", "accumulate") and ufunc.signature is None: # Does not change shape return type(self)(result) else: From 272ba65797d92f9d8fe21c8b9ab227c460427776 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 20 Jul 2023 17:08:49 +0200 Subject: [PATCH 50/72] Wrap SparseMatrix, too --- src/tabmat/sparse_matrix.py | 96 ++++++++++++++++++++++++++----------- 1 file changed, 69 insertions(+), 27 deletions(-) diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 3befbad9..11043bb8 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -31,29 +31,65 @@ class SparseMatrix(sps.csc_matrix, MatrixBase): """ def __init__(self, arg1, shape=None, dtype=None, copy=False): - super().__init__(arg1, shape, dtype, copy) - self.idx_dtype = max(self.indices.dtype, self.indptr.dtype) - if self.indices.dtype != self.idx_dtype: - self.indices = self.indices.astype(self.idx_dtype) - if self.indptr.dtype != self.idx_dtype: - self.indptr = self.indptr.astype(self.idx_dtype) + self._array = sps.csc_matrix(arg1, shape, dtype, copy) + + self.idx_dtype = max(self._array.indices.dtype, self._array.indptr.dtype) + if self._array.indices.dtype != self.idx_dtype: + self._array.indices = self._array.indices.astype(self.idx_dtype) + if self._array.indptr.dtype != self.idx_dtype: + self._array.indptr = self._array.indptr.astype(self.idx_dtype) assert self.indices.dtype == self.idx_dtype - if not self.has_sorted_indices: - self.sort_indices() - self._x_csr = None + if not self._array.has_sorted_indices: + self._array.sort_indices() + self._array_csr = None + + @property + def shape(self): + """Tuple of array dimensions.""" + return self._array.shape + + @property + def ndim(self): + """Number of array dimensions.""" # noqa: D401 + return self._array.ndim + + @property + def dtype(self): + """Data-type of the array’s elements.""" # noqa: D401 + return self._array.dtype + + @property + def indices(self): + """Indices of the matrix.""" # noqa: D401 + return self._array.indices + + @property + def indptr(self): + """Indptr of the matrix.""" # noqa: D401 + return self._array.indptr + + @property + def data(self): + """Data of the matrix.""" # noqa: D401 + return self._array.data + + @property + def array_csc(self): + """Return the CSC representation of the matrix.""" + return self._array @property - def x_csr(self): + def array_csr(self): """Cache the CSR representation of the matrix.""" - if self._x_csr is None: - self._x_csr = self.tocsr(copy=False) - if self._x_csr.indices.dtype != self.idx_dtype: - self._x_csr.indices = self._x_csr.indices.astype(self.idx_dtype) - if self._x_csr.indptr.dtype != self.idx_dtype: - self._x_csr.indptr = self._x_csr.indptr.astype(self.idx_dtype) + if self._array_csr is None: + self._array_csr = self._array.tocsr(copy=False) + if self._array_csr.indices.dtype != self.idx_dtype: + self._array_csr.indices = self._array_csr.indices.astype(self.idx_dtype) + if self._array_csr.indptr.dtype != self.idx_dtype: + self._array_csr.indptr = self._array_csr.indptr.astype(self.idx_dtype) - return self._x_csr + return self._array_csr def sandwich( self, d: np.ndarray, rows: np.ndarray = None, cols: np.ndarray = None @@ -68,7 +104,7 @@ def sandwich( ) rows, cols = setup_restrictions(self.shape, rows, cols, dtype=self.idx_dtype) - return sparse_sandwich(self, self.x_csr, d, rows, cols) + return sparse_sandwich(self, self.array_csr, d, rows, cols) def _cross_sandwich( self, @@ -112,7 +148,7 @@ def sandwich_dense( rows, L_cols = setup_restrictions(self.shape, rows, L_cols) R_cols = set_up_rows_or_cols(R_cols, B.shape[1]) - return csr_dense_sandwich(self.x_csr, B, d, rows, L_cols, R_cols) + return csr_dense_sandwich(self.array_csr, B, d, rows, L_cols, R_cols) def _matvec_helper( self, @@ -129,9 +165,11 @@ def _matvec_helper( unrestricted_cols = cols is None or len(cols) == self.shape[1] if unrestricted_rows and unrestricted_cols and vec.ndim == 1: if transpose: - return csc_rmatvec_unrestricted(self, vec, out, self.indices) + return csc_rmatvec_unrestricted(self.array_csc, vec, out, self.indices) else: - return csr_matvec_unrestricted(self.x_csr, vec, out, self.x_csr.indices) + return csr_matvec_unrestricted( + self.array_csr, vec, out, self.array_csr.indices + ) matrix_matvec = lambda x, v: sps.csc_matrix.dot(x, v) if transpose: @@ -139,9 +177,9 @@ def _matvec_helper( rows, cols = setup_restrictions(self.shape, rows, cols, dtype=self.idx_dtype) if transpose: - fast_fnc = lambda v: csc_rmatvec(self, v, rows, cols) + fast_fnc = lambda v: csc_rmatvec(self.array_csc, v, rows, cols) else: - fast_fnc = lambda v: csr_matvec(self.x_csr, v, rows, cols) + fast_fnc = lambda v: csr_matvec(self.array_csr, v, rows, cols) if vec.ndim == 1: res = fast_fnc(vec) elif vec.ndim == 2 and vec.shape[1] == 1: @@ -180,7 +218,11 @@ def _get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarra """Get standard deviations of columns.""" sqrt_arg = ( transpose_square_dot_weights( - self.data, self.indices, self.indptr, weights, weights.dtype + self._array.data, + self._array.indices, + self._array.indptr, + weights, + weights.dtype, ) - col_means**2 ) @@ -192,7 +234,7 @@ def _get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarra def astype(self, dtype, order="K", casting="unsafe", copy=True): """Return SparseMatrix cast to new type.""" - return super().astype(dtype, casting, copy) + return type(self)(self._array.astype(dtype, casting, copy)) def multiply(self, other): """Element-wise multiplication. @@ -202,5 +244,5 @@ def multiply(self, other): ``self.shape[0]``. """ if other.ndim == 1: - return SparseMatrix(super().multiply(other[:, np.newaxis])) - return SparseMatrix(super().multiply(other)) + return type(self)(self._array.multiply(other[:, np.newaxis])) + return type(self)(self._array.multiply(other)) From 7775b790d9bc5dc026b44e906ae3d2afb77bfa02 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 20 Jul 2023 17:31:25 +0200 Subject: [PATCH 51/72] Demo of how the ufunc interface can be implemented --- src/tabmat/sparse_matrix.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 11043bb8..8074a67c 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -44,6 +44,26 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False): self._array.sort_indices() self._array_csr = None + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + from .dense_matrix import DenseMatrix + + if ufunc.nin == 1 and ufunc.nout == 1: + if getattr(ufunc, method)(0) == 0: + result_matrix = sps.csc_matrix( + ( + getattr(ufunc, method)(self._array.data, **kwargs), + self._array.indices, + self._array.indptr, + ), + shape=self._array.shape, + ) + return type(self)(result_matrix) + else: + result_matrix = getattr(ufunc, method)(self._array.todense(), **kwargs) + return DenseMatrix(result_matrix) + else: + return NotImplemented + @property def shape(self): """Tuple of array dimensions.""" @@ -201,8 +221,6 @@ def matvec(self, vec, cols: np.ndarray = None, out: np.ndarray = None): check_matvec_out_shape(self, out) return self._matvec_helper(vec, None, cols, out, False) - __array_priority__ = 12 - def transpose_matvec( self, vec: Union[np.ndarray, List], From a493e031652e819ed5ea0db9595bba15a6b77383 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 20 Jul 2023 19:12:12 +0200 Subject: [PATCH 52/72] Do not subclass csc_matrix --- src/tabmat/categorical_matrix.py | 2 ++ src/tabmat/sparse_matrix.py | 31 +++++++++++++++++++++++++++++-- src/tabmat/split_matrix.py | 10 ++++++++-- 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 7a751fdb..6e781691 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -458,6 +458,8 @@ def _cross_sandwich( return self._cross_dense(np.asarray(other), d, rows, L_cols, R_cols) if isinstance(other, sps.csc_matrix): return self._cross_sparse(other, d, rows, L_cols, R_cols) + if isinstance(other, SparseMatrix): + return self._cross_sparse(other.array_csc, d, rows, L_cols, R_cols) if isinstance(other, CategoricalMatrix): return self._cross_categorical(other, d, rows, L_cols, R_cols) raise TypeError diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 8074a67c..40644763 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -22,7 +22,7 @@ ) -class SparseMatrix(sps.csc_matrix, MatrixBase): +class SparseMatrix(MatrixBase): """ A scipy.sparse csc matrix subclass that allows such objects to conform to the ``MatrixBase`` interface. @@ -44,6 +44,15 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False): self._array.sort_indices() self._array_csr = None + def __getitem__(self, key): + if not isinstance(key, tuple): + key = (key,) + + # Always return a 2d array + key = tuple([key_i] if np.isscalar(key_i) else key_i for key_i in key) + + return type(self)(self._array.__getitem__(key)) + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): from .dense_matrix import DenseMatrix @@ -111,6 +120,24 @@ def array_csr(self): return self._array_csr + def transpose(self): + """Returns a view of the array with axes transposed.""" # noqa: D401 + return type(self)(self._array.T) + + T = property(transpose) + + def getcol(self, i): + """Return matrix column at specified index.""" + return type(self)(self._array.getcol(i)) + + def toarray(self): + """Return a dense ndarray representation of the matrix.""" + return self._array.toarray() + + def dot(self, other): + """Return the dot product as a scipy sparse matrix.""" + return self._array.dot(other) + def sandwich( self, d: np.ndarray, rows: np.ndarray = None, cols: np.ndarray = None ) -> np.ndarray: @@ -206,7 +233,7 @@ def _matvec_helper( res = fast_fnc(vec[:, 0])[:, None] else: res = matrix_matvec( - self[np.ix_(rows, cols)], vec[rows] if transpose else vec[cols] + self[np.ix_(rows, cols)]._array, vec[rows] if transpose else vec[cols] ) if out is None: return res diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py index aaf88414..2f7438fa 100644 --- a/src/tabmat/split_matrix.py +++ b/src/tabmat/split_matrix.py @@ -75,8 +75,14 @@ def _combine_matrices(matrices, indices): n_row = matrices[0].shape[0] for mat_type_, stack_fn in [ - (DenseMatrix, np.hstack), - (SparseMatrix, sps.hstack), + ( + DenseMatrix, + lambda matrices: np.hstack([mat._array for mat in matrices]), + ), + ( + SparseMatrix, + lambda matrices: sps.hstack([mat._array for mat in matrices]), + ), ]: this_type_matrices = [ i for i, mat in enumerate(matrices) if isinstance(mat, mat_type_) From 008dfa330275a2eeb68cde068ce1ddd12a971a98 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 18 Jul 2023 20:29:18 +0200 Subject: [PATCH 53/72] Improve the performance of `from_pandas` in the case of low-cardinality categoricals (#275) * Improve the performance of `from_pandas` * Update changelog according to review --- CHANGELOG.rst | 7 +++++++ src/tabmat/constructor.py | 12 ++---------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 1305812e..87c866c0 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,13 @@ Changelog ========= +Unreleased +---------- + +**Other changes:** + +- Improve the performance of ``from_pandas`` in the case of low-cardinality categorical variables. + 3.1.10 - 2023-06-23 ------------------- diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index b281bbed..6c8daad0 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -79,6 +79,7 @@ def from_pandas( if object_as_cat and coldata.dtype == object: coldata = coldata.astype("category") if isinstance(coldata.dtype, pd.CategoricalDtype): + cat = CategoricalMatrix(coldata, drop_first=drop_first, dtype=dtype) if len(coldata.cat.categories) < cat_threshold: ( X_dense_F, @@ -86,15 +87,7 @@ def from_pandas( dense_indices, sparse_indices, ) = _split_sparse_and_dense_parts( - pd.get_dummies( - coldata, - prefix=colname, - sparse=True, - drop_first=drop_first, - dtype=np.float64, - ) - .sparse.to_coo() - .tocsc(), + sps.csc_matrix(cat.tocsr(), dtype=dtype), threshold=sparse_threshold, ) matrices.append(X_dense_F) @@ -110,7 +103,6 @@ def from_pandas( indices.append(sparse_indices) else: - cat = CategoricalMatrix(coldata, drop_first=drop_first, dtype=dtype) matrices.append(cat) is_cat.append(True) if cat_position == "expand": From 407a12f3ba6cbb6af33b7e30a33a2a132ed5ac40 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 19 Jul 2023 20:02:42 +0200 Subject: [PATCH 54/72] Add benchmark data to .gitignore (#282) --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index c528d376..186ba948 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Project-specific benchmark/*.csv +benchmark/data/*.pkl # Files created by templating dense.cpp From ac9c121a7b22fb98c8ceabe309210b3ee86ff6a3 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Fri, 21 Jul 2023 09:03:01 +0200 Subject: [PATCH 55/72] Demonstrate binary ufuncs for sparse --- src/tabmat/categorical_matrix.py | 4 +--- src/tabmat/dense_matrix.py | 3 +++ src/tabmat/sparse_matrix.py | 11 +++++++++++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 6e781691..1a996cc0 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -454,10 +454,8 @@ def _cross_sandwich( """Perform a sandwich product: X.T @ diag(d) @ Y.""" from .dense_matrix import DenseMatrix - if isinstance(other, (np.ndarray, DenseMatrix)): + if isinstance(other, DenseMatrix): return self._cross_dense(np.asarray(other), d, rows, L_cols, R_cols) - if isinstance(other, sps.csc_matrix): - return self._cross_sparse(other, d, rows, L_cols, R_cols) if isinstance(other, SparseMatrix): return self._cross_sparse(other.array_csc, d, rows, L_cols, R_cols) if isinstance(other, CategoricalMatrix): diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 953fe505..4bd76501 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -56,6 +56,9 @@ def __array__(self, dtype=None): return self._array.astype(dtype, copy=False) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + if not all(isinstance(x, (np.ndarray, DenseMatrix)) for x in inputs): + return NotImplemented + inputs = (x._array if isinstance(x, DenseMatrix) else x for x in inputs) result = getattr(ufunc, method)(*inputs, **kwargs) if method in ("__call__", "accumulate") and ufunc.signature is None: diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 40644763..b144dcc1 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -70,6 +70,17 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): else: result_matrix = getattr(ufunc, method)(self._array.todense(), **kwargs) return DenseMatrix(result_matrix) + + elif ufunc == np.multiply: + if isinstance(inputs[0], SparseMatrix) and isinstance( + inputs[1], SparseMatrix + ): + return SparseMatrix(inputs[0].array_csc.multiply(inputs[1].array_csc)) + elif isinstance(inputs[0], SparseMatrix): + return SparseMatrix(inputs[0].array_csc.multiply(inputs[1])) + else: + return SparseMatrix(inputs[1].array_csc.multiply(inputs[0])) + else: return NotImplemented From 95bc477bce725f1e3fa1a5adae99a8fba937bcc8 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Fri, 21 Jul 2023 09:07:14 +0200 Subject: [PATCH 56/72] Add tocsc method --- src/tabmat/sparse_matrix.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index b144dcc1..bda43efd 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -131,6 +131,10 @@ def array_csr(self): return self._array_csr + def tocsc(self, copy=False): + """Return the matrix in CSC format.""" + return self._array.tocsc(copy=copy) + def transpose(self): """Returns a view of the array with axes transposed.""" # noqa: D401 return type(self)(self._array.T) From a6173f5b05d0e67ed88ce38fa2cec2f89580df26 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Fri, 21 Jul 2023 09:39:56 +0200 Subject: [PATCH 57/72] Fix type checks --- src/tabmat/dense_matrix.py | 4 ++-- src/tabmat/sparse_matrix.py | 12 +++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 4bd76501..fced90c4 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -56,10 +56,10 @@ def __array__(self, dtype=None): return self._array.astype(dtype, copy=False) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - if not all(isinstance(x, (np.ndarray, DenseMatrix)) for x in inputs): + if not all(isinstance(x, (np.ndarray, type(self))) for x in inputs): return NotImplemented - inputs = (x._array if isinstance(x, DenseMatrix) else x for x in inputs) + inputs = (x._array if isinstance(x, type(self)) else x for x in inputs) result = getattr(ufunc, method)(*inputs, **kwargs) if method in ("__call__", "accumulate") and ufunc.signature is None: # Does not change shape diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index bda43efd..06f54505 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -72,14 +72,12 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return DenseMatrix(result_matrix) elif ufunc == np.multiply: - if isinstance(inputs[0], SparseMatrix) and isinstance( - inputs[1], SparseMatrix - ): - return SparseMatrix(inputs[0].array_csc.multiply(inputs[1].array_csc)) - elif isinstance(inputs[0], SparseMatrix): - return SparseMatrix(inputs[0].array_csc.multiply(inputs[1])) + if isinstance(inputs[0], type(self)) and isinstance(inputs[1], type(self)): + return type(self)(inputs[0].array_csc.multiply(inputs[1].array_csc)) + elif isinstance(inputs[0], type(self)): + return type(self)(inputs[0].array_csc.multiply(inputs[1])) else: - return SparseMatrix(inputs[1].array_csc.multiply(inputs[0])) + return type(self)(inputs[1].array_csc.multiply(inputs[0])) else: return NotImplemented From 35e7330fd9f0d992354ce185c379a18b36e4203a Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Fri, 21 Jul 2023 11:18:07 +0200 Subject: [PATCH 58/72] Minor improvements --- src/tabmat/dense_matrix.py | 5 ++++- src/tabmat/sparse_matrix.py | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index fced90c4..a520389d 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -1,3 +1,4 @@ +import numbers import textwrap from typing import List, Optional, Union @@ -56,7 +57,9 @@ def __array__(self, dtype=None): return self._array.astype(dtype, copy=False) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - if not all(isinstance(x, (np.ndarray, type(self))) for x in inputs): + if not all( + isinstance(x, (np.ndarray, type(self), numbers.Number)) for x in inputs + ): return NotImplemented inputs = (x._array if isinstance(x, type(self)) else x for x in inputs) diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 06f54505..7d52773a 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -56,6 +56,9 @@ def __getitem__(self, key): def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): from .dense_matrix import DenseMatrix + if "out" in kwargs: + raise NotImplementedError("out argument is not supported") + if ufunc.nin == 1 and ufunc.nout == 1: if getattr(ufunc, method)(0) == 0: result_matrix = sps.csc_matrix( From aa264df82f26d2acf512c8909d5877d32e1b61d3 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Fri, 21 Jul 2023 11:43:19 +0200 Subject: [PATCH 59/72] ufunc support for categoricals --- src/tabmat/categorical_matrix.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 1a996cc0..2c688053 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -265,6 +265,12 @@ def __init__( self.x_csc: Optional[Tuple[Optional[np.ndarray], np.ndarray, np.ndarray]] = None self.dtype = np.dtype(dtype) + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + inputs = ( + x.to_sparse_matrix() if isinstance(x, type(self)) else x for x in inputs + ) + return getattr(ufunc, method)(*inputs, **kwargs) + def recover_orig(self) -> np.ndarray: """ Return 1d numpy array with same data as what was initially fed to __init__. @@ -491,6 +497,12 @@ def tocsr(self) -> sps.csr_matrix: shape=self.shape, ) + def to_sparse_matrix(self): + """Return a tabmat.SparseMatrix representation.""" + from .sparse_matrix import SparseMatrix + + return SparseMatrix(self.tocsr()) + def toarray(self) -> np.ndarray: """Return array representation of matrix.""" return self.tocsr().A From 51d31e58b7c8d5012b8fa057a808877438aef67f Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 25 Jul 2023 09:15:43 +0200 Subject: [PATCH 60/72] Remove __array_ufunc__ interface --- src/tabmat/categorical_matrix.py | 8 ++----- src/tabmat/dense_matrix.py | 18 +--------------- src/tabmat/sparse_matrix.py | 36 +++----------------------------- 3 files changed, 6 insertions(+), 56 deletions(-) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 2c688053..f6e84c1d 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -265,11 +265,7 @@ def __init__( self.x_csc: Optional[Tuple[Optional[np.ndarray], np.ndarray, np.ndarray]] = None self.dtype = np.dtype(dtype) - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - inputs = ( - x.to_sparse_matrix() if isinstance(x, type(self)) else x for x in inputs - ) - return getattr(ufunc, method)(*inputs, **kwargs) + __array_ufunc__ = None def recover_orig(self) -> np.ndarray: """ @@ -461,7 +457,7 @@ def _cross_sandwich( from .dense_matrix import DenseMatrix if isinstance(other, DenseMatrix): - return self._cross_dense(np.asarray(other), d, rows, L_cols, R_cols) + return self._cross_dense(other._array, d, rows, L_cols, R_cols) if isinstance(other, SparseMatrix): return self._cross_sparse(other.array_csc, d, rows, L_cols, R_cols) if isinstance(other, CategoricalMatrix): diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index a520389d..464d1f70 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -1,4 +1,3 @@ -import numbers import textwrap from typing import List, Optional, Union @@ -53,22 +52,7 @@ def __getitem__(self, key): return type(self)(self._array.__getitem__(key)) - def __array__(self, dtype=None): - return self._array.astype(dtype, copy=False) - - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - if not all( - isinstance(x, (np.ndarray, type(self), numbers.Number)) for x in inputs - ): - return NotImplemented - - inputs = (x._array if isinstance(x, type(self)) else x for x in inputs) - result = getattr(ufunc, method)(*inputs, **kwargs) - if method in ("__call__", "accumulate") and ufunc.signature is None: - # Does not change shape - return type(self)(result) - else: - return result + __array_ufunc__ = None def __matmul__(self, other): return self._array.__matmul__(other) diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 7d52773a..5eba5adc 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -53,37 +53,7 @@ def __getitem__(self, key): return type(self)(self._array.__getitem__(key)) - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - from .dense_matrix import DenseMatrix - - if "out" in kwargs: - raise NotImplementedError("out argument is not supported") - - if ufunc.nin == 1 and ufunc.nout == 1: - if getattr(ufunc, method)(0) == 0: - result_matrix = sps.csc_matrix( - ( - getattr(ufunc, method)(self._array.data, **kwargs), - self._array.indices, - self._array.indptr, - ), - shape=self._array.shape, - ) - return type(self)(result_matrix) - else: - result_matrix = getattr(ufunc, method)(self._array.todense(), **kwargs) - return DenseMatrix(result_matrix) - - elif ufunc == np.multiply: - if isinstance(inputs[0], type(self)) and isinstance(inputs[1], type(self)): - return type(self)(inputs[0].array_csc.multiply(inputs[1].array_csc)) - elif isinstance(inputs[0], type(self)): - return type(self)(inputs[0].array_csc.multiply(inputs[1])) - else: - return type(self)(inputs[1].array_csc.multiply(inputs[0])) - - else: - return NotImplemented + __array_ufunc__ = None @property def shape(self): @@ -181,8 +151,8 @@ def _cross_sandwich( from .categorical_matrix import CategoricalMatrix from .dense_matrix import DenseMatrix - if isinstance(other, (np.ndarray, DenseMatrix)): - return self.sandwich_dense(np.asarray(other), d, rows, L_cols, R_cols) + if isinstance(other, DenseMatrix): + return self.sandwich_dense(other._array, d, rows, L_cols, R_cols) if isinstance(other, CategoricalMatrix): return other._cross_sandwich(self, d, rows, R_cols, L_cols).T From 86e31782bce32346990928b807d33291301f4068 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 25 Jul 2023 11:33:00 +0200 Subject: [PATCH 61/72] Remove numpy operator mixin --- src/tabmat/categorical_matrix.py | 4 ++-- src/tabmat/dense_matrix.py | 2 +- src/tabmat/split_matrix.py | 9 ++++----- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index f6e84c1d..7783d5fd 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -466,7 +466,7 @@ def _cross_sandwich( # TODO: best way to return this depends on the use case. See what that is # See how csr getcol works - def getcol(self, i: int) -> sps.csc_matrix: + def getcol(self, i: int) -> SparseMatrix: """Return matrix column at specified index.""" i %= self.shape[1] # wrap-around indexing @@ -474,7 +474,7 @@ def getcol(self, i: int) -> sps.csc_matrix: i += 1 col_i = sps.csc_matrix((self.indices == i).astype(int)[:, None]) - return col_i + return SparseMatrix(col_i) def tocsr(self) -> sps.csr_matrix: """Return scipy csr representation of matrix.""" diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 464d1f70..587d244b 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -18,7 +18,7 @@ ) -class DenseMatrix(np.lib.mixins.NDArrayOperatorsMixin, MatrixBase): +class DenseMatrix(MatrixBase): """ A ``numpy.ndarray`` subclass with several additional functions that allow it to share the MatrixBase API with SparseMatrix and CategoricalMatrix. diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py index 2f7438fa..f936bfb1 100644 --- a/src/tabmat/split_matrix.py +++ b/src/tabmat/split_matrix.py @@ -1,10 +1,9 @@ import warnings -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List, Optional, Sequence, Tuple, Union import numpy as np from scipy import sparse as sps -from .categorical_matrix import CategoricalMatrix from .dense_matrix import DenseMatrix from .ext.split import is_sorted, split_col_subsets from .matrix_base import MatrixBase @@ -29,7 +28,7 @@ def as_mx(a: Any): return a elif sps.issparse(a): return SparseMatrix(a) - elif isinstance(a, (np.ndarray, DenseMatrix)): + elif isinstance(a, np.ndarray): return DenseMatrix(a) else: raise ValueError(f"Cannot convert type {type(a)} to Matrix.") @@ -135,7 +134,7 @@ class SplitMatrix(MatrixBase): def __init__( self, - matrices: List[Union[DenseMatrix, SparseMatrix, CategoricalMatrix]], + matrices: Sequence[MatrixBase], indices: Optional[List[np.ndarray]] = None, ): flatten_matrices = [] @@ -149,7 +148,7 @@ def __init__( if isinstance(mat, SplitMatrix): # Flatten out the SplitMatrix current_idx = 0 - for iind, imat in zip(mat.indices, mat.matrices): + for iind, imat in zip(mat.indices, mat.matrices): # type: ignore flatten_matrices.append(imat) index_corrections.append( iind - np.arange(len(iind), dtype=np.int64) - current_idx From e4bb2ea576975f771ef4a7bed52cd50ab9a1c983 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 26 Jul 2023 12:18:14 +0200 Subject: [PATCH 62/72] Add hstack function --- src/tabmat/__init__.py | 4 +++- src/tabmat/split_matrix.py | 30 +++++++++++++++++++++++++++--- tests/test_matrices.py | 29 +++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 4 deletions(-) diff --git a/src/tabmat/__init__.py b/src/tabmat/__init__.py index 1d295c5e..0a3f8233 100644 --- a/src/tabmat/__init__.py +++ b/src/tabmat/__init__.py @@ -3,7 +3,7 @@ from .dense_matrix import DenseMatrix from .matrix_base import MatrixBase from .sparse_matrix import SparseMatrix -from .split_matrix import SplitMatrix +from .split_matrix import SplitMatrix, as_tabmat, hstack from .standardized_mat import StandardizedMatrix __all__ = [ @@ -16,4 +16,6 @@ "from_csc", "from_formula", "from_pandas", + "as_tabmat", + "hstack", ] diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py index f936bfb1..a091949f 100644 --- a/src/tabmat/split_matrix.py +++ b/src/tabmat/split_matrix.py @@ -1,5 +1,5 @@ import warnings -from typing import Any, List, Optional, Sequence, Tuple, Union +from typing import List, Optional, Sequence, Tuple, Union import numpy as np from scipy import sparse as sps @@ -16,7 +16,7 @@ ) -def as_mx(a: Any): +def as_tabmat(a: Union[MatrixBase, StandardizedMatrix, np.ndarray, sps.spmatrix]): """Convert an array to a corresponding MatrixBase type. If the input is already a MatrixBase, return untouched. @@ -27,13 +27,37 @@ def as_mx(a: Any): if isinstance(a, (MatrixBase, StandardizedMatrix)): return a elif sps.issparse(a): - return SparseMatrix(a) + return SparseMatrix(a.tocsc(copy=False)) elif isinstance(a, np.ndarray): return DenseMatrix(a) else: raise ValueError(f"Cannot convert type {type(a)} to Matrix.") +def hstack(tup: Sequence[Union[MatrixBase, np.ndarray, sps.spmatrix]]) -> MatrixBase: + """Stack arrays in sequence horizontally (column wise). + + This is equivalent to concatenation along the second axis, + except for 1-D arrays where it concatenates along the first axis. + + Parameters + ---------- + tup: sequence of arrays + The arrays must have the same shape along all but the second axis. + """ + matrices = [as_tabmat(a) for a in tup] + + if len(matrices) == 0: + raise ValueError("Need at least one array to concatenate.") + + if all(isinstance(mat, SparseMatrix) for mat in matrices): + return SparseMatrix(sps.hstack([mat._array for mat in matrices])) + elif all(isinstance(mat, DenseMatrix) for mat in matrices): + return DenseMatrix(np.hstack([mat._array for mat in matrices])) + else: + return SplitMatrix(matrices) + + def _prepare_out_array(out: Optional[np.ndarray], out_shape, out_dtype): if out is None: out = np.zeros(out_shape, out_dtype) diff --git a/tests/test_matrices.py b/tests/test_matrices.py index 779b160c..34f6a5bb 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -632,3 +632,32 @@ def test_multiply(mat): for act in actual: assert isinstance(act, MatrixBase) np.testing.assert_allclose(act.A, expected) + + +@pytest.mark.parametrize( + "mat_1", + get_all_matrix_base_subclass_mats() + + [base_array()] + + [sps.csc_matrix(base_array())], +) +@pytest.mark.parametrize( + "mat_2", + get_all_matrix_base_subclass_mats() + + [base_array()] + + [sps.csc_matrix(base_array())], +) +def test_hstack(mat_1, mat_2): + mats = [mat_1, mat_2] + stacked = tm.hstack(mats) + + if all(isinstance(mat, (np.ndarray, tm.DenseMatrix)) for mat in mats): + assert isinstance(stacked, tm.DenseMatrix) + elif all(isinstance(mat, (sps.csc_matrix, tm.SparseMatrix)) for mat in mats): + assert isinstance(stacked, tm.SparseMatrix) + else: + assert isinstance(stacked, tm.SplitMatrix) + + np.testing.assert_array_equal( + stacked.A, + np.hstack([mat.A if not isinstance(mat, np.ndarray) else mat for mat in mats]), + ) From 17c36ca6b038e71c5586c63209264f46326fb34a Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 26 Jul 2023 14:31:13 +0200 Subject: [PATCH 63/72] Add method for unpacking underlying array --- src/tabmat/categorical_matrix.py | 4 ++++ src/tabmat/dense_matrix.py | 4 ++++ src/tabmat/sparse_matrix.py | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 7783d5fd..4968c628 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -503,6 +503,10 @@ def toarray(self) -> np.ndarray: """Return array representation of matrix.""" return self.tocsr().A + def unpack(self): + """Return the underlying pandas.Categorical.""" + return self.cat + def astype(self, dtype, order="K", casting="unsafe", copy=True): """Return CategoricalMatrix cast to new type.""" self.dtype = dtype diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 587d244b..1a70457f 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -105,6 +105,10 @@ def toarray(self): """Return array representation of matrix.""" return self._array + def unpack(self): + """Return the underlying numpy.ndarray.""" + return self._array + def sandwich( self, d: np.ndarray, rows: np.ndarray = None, cols: np.ndarray = None ) -> np.ndarray: diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 5eba5adc..8d7a30bc 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -116,6 +116,10 @@ def getcol(self, i): """Return matrix column at specified index.""" return type(self)(self._array.getcol(i)) + def unpack(self): + """Return the underlying scipy.sparse.csc_matrix.""" + return self._array + def toarray(self): """Return a dense ndarray representation of the matrix.""" return self._array.toarray() From 9dd638dc02eaef01328a35180d646e310dfe49b4 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 26 Jul 2023 14:38:32 +0200 Subject: [PATCH 64/72] Add __matmul__ methods to SparseMatrix --- src/tabmat/sparse_matrix.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 8d7a30bc..188f6862 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -53,6 +53,12 @@ def __getitem__(self, key): return type(self)(self._array.__getitem__(key)) + def __matmul__(self, other): + return self._array.__matmul__(other) + + def __rmatmul__(self, other): + return self._array.__rmatmul__(other) + __array_ufunc__ = None @property From ba2b70ef7431c9942327c2e65744d0f25c8f76b4 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 27 Jul 2023 12:13:54 +0200 Subject: [PATCH 65/72] Stricter and more consistent indexing --- src/tabmat/categorical_matrix.py | 48 ++++++++++----------------- src/tabmat/dense_matrix.py | 9 ++--- src/tabmat/sparse_matrix.py | 9 ++--- src/tabmat/util.py | 48 +++++++++++++++++++++++++++ tests/test_matrices.py | 57 ++++++++++++++++++++++++++++++-- 5 files changed, 124 insertions(+), 47 deletions(-) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 4968c628..68161445 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -161,7 +161,7 @@ def matvec(mat, vec): """ -from typing import Any, List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import numpy as np import pandas as pd @@ -181,6 +181,7 @@ def matvec(mat, vec): from .matrix_base import MatrixBase from .sparse_matrix import SparseMatrix from .util import ( + _check_indexer, check_matvec_dimensions, check_matvec_out_shape, check_transpose_matvec_out_shape, @@ -189,21 +190,15 @@ def matvec(mat, vec): ) -def _is_indexer_full_length(full_length: int, indexer: Any): - if isinstance(indexer, int): - return full_length == 1 - elif isinstance(indexer, list): - if (np.asarray(indexer) > full_length - 1).any(): - raise IndexError("Index out-of-range.") - return len(set(indexer)) == full_length - elif isinstance(indexer, np.ndarray): +def _is_indexer_full_length(full_length: int, indexer: Union[slice, np.ndarray]): + if isinstance(indexer, np.ndarray): if (indexer > full_length - 1).any(): raise IndexError("Index out-of-range.") - return len(np.unique(indexer)) == full_length + # Order is important in indexing. Could achieve similar results + # by rearranging categories. + return np.array_equal(indexer.ravel(), np.arange(full_length)) elif isinstance(indexer, slice): return len(range(*indexer.indices(full_length))) == full_length - else: - raise ValueError(f"Indexing with {type(indexer)} is not allowed.") def _row_col_indexing( @@ -522,25 +517,18 @@ def _get_col_stds(self, weights: np.ndarray, col_means: np.ndarray) -> np.ndarra return np.sqrt(mean - col_means**2) def __getitem__(self, item): - if isinstance(item, tuple): - row, col = item - if _is_indexer_full_length(self.shape[1], col): - if isinstance(row, int): - row = [row] - return CategoricalMatrix( - self.cat[row], drop_first=self.drop_first, dtype=self.dtype - ) - else: - # return a SparseMatrix if we subset columns - # TODO: this is inefficient. See issue #101. - return SparseMatrix(self.tocsr()[row, col], dtype=self.dtype) + row, col = _check_indexer(item) + + if _is_indexer_full_length(self.shape[1], col): + if isinstance(row, np.ndarray): + row = row.ravel() + return CategoricalMatrix( + self.cat[row], drop_first=self.drop_first, dtype=self.dtype + ) else: - row = item - if isinstance(row, int): - row = [row] - return CategoricalMatrix( - self.cat[row], drop_first=self.drop_first, dtype=self.dtype - ) + # return a SparseMatrix if we subset columns + # TODO: this is inefficient. See issue #101. + return self.to_sparse_matrix()[row, col] def _cross_dense( self, diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 1a70457f..55c9a088 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -11,6 +11,7 @@ ) from .matrix_base import MatrixBase from .util import ( + _check_indexer, check_matvec_dimensions, check_matvec_out_shape, check_transpose_matvec_out_shape, @@ -44,13 +45,7 @@ def __init__(self, input_array): self._array = np.asarray(input_array) def __getitem__(self, key): - if not isinstance(key, tuple): - key = (key,) - - # Always return a 2d array - key = tuple([key_i] if np.isscalar(key_i) else key_i for key_i in key) - - return type(self)(self._array.__getitem__(key)) + return type(self)(self._array.__getitem__(_check_indexer(key))) __array_ufunc__ = None diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 188f6862..8c2a3b2b 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -14,6 +14,7 @@ ) from .matrix_base import MatrixBase from .util import ( + _check_indexer, check_matvec_dimensions, check_matvec_out_shape, check_transpose_matvec_out_shape, @@ -45,13 +46,7 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False): self._array_csr = None def __getitem__(self, key): - if not isinstance(key, tuple): - key = (key,) - - # Always return a 2d array - key = tuple([key_i] if np.isscalar(key_i) else key_i for key_i in key) - - return type(self)(self._array.__getitem__(key)) + return type(self)(self._array.__getitem__(_check_indexer(key))) def __matmul__(self, other): return self._array.__matmul__(other) diff --git a/src/tabmat/util.py b/src/tabmat/util.py index 2dd570ec..24cfbe30 100644 --- a/src/tabmat/util.py +++ b/src/tabmat/util.py @@ -50,3 +50,51 @@ def check_matvec_dimensions(mat, vec: np.ndarray, transpose: bool) -> None: f"shapes {mat.shape} and {vec.shape} not aligned: " f"{mat.shape[match_dim]} (dim {match_dim}) != {vec.shape[0]} (dim 0)" ) + + +def _check_indexer(indexer): + """Check that the indexer is valid, and transform it to a canonical format.""" + if not isinstance(indexer, tuple): + indexer = (indexer, slice(None, None, None)) + + if len(indexer) > 2: + raise ValueError("More than two indexers are not supported.") + + row_indexer, col_indexer = indexer + + if isinstance(row_indexer, slice): + if isinstance(col_indexer, slice): + return row_indexer, col_indexer + else: + col_indexer = np.asarray(col_indexer) + if col_indexer.ndim > 1: + raise ValueError( + "Indexing would result in a matrix with more than 2 dimensions." + ) + else: + return row_indexer, col_indexer.reshape(-1) + + elif isinstance(col_indexer, slice): + row_indexer = np.asarray(row_indexer) + if row_indexer.ndim > 1: + raise ValueError( + "Indexing would result in a matrix with more than 2 dimensions." + ) + else: + return row_indexer.reshape(-1), col_indexer + + else: + row_indexer = np.asarray(row_indexer) + col_indexer = np.asarray(col_indexer) + if row_indexer.ndim <= 1 and col_indexer.ndim <= 1: + return np.ix_(row_indexer.reshape(-1), col_indexer.reshape(-1)) + elif ( + row_indexer.ndim == 2 + and row_indexer.shape[1] == 1 + and col_indexer.ndim == 2 + and col_indexer.shape[0] == 1 + ): + # support for np.ix_-ed indices + return row_indexer, col_indexer + else: + raise ValueError("This type of indexing is not supported.") diff --git a/tests/test_matrices.py b/tests/test_matrices.py index 34f6a5bb..815c48e3 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -552,8 +552,8 @@ def test_indexing_int_row(mat: Union[tm.MatrixBase, tm.StandardizedMatrix]): res = mat[0, :] if not isinstance(res, np.ndarray): res = res.A - expected = mat.A[0, :] - np.testing.assert_allclose(np.squeeze(res), expected) + expected = mat.A[[0], :] + np.testing.assert_allclose(res, expected) @pytest.mark.parametrize("mat", get_matrices()) @@ -563,7 +563,58 @@ def test_indexing_range_row(mat: Union[tm.MatrixBase, tm.StandardizedMatrix]): if not isinstance(res, np.ndarray): res = res.A expected = mat.A[0:2, :] - np.testing.assert_allclose(np.squeeze(res), expected) + np.testing.assert_array_equal(res, expected) + + +@pytest.mark.parametrize("mat", get_unscaled_matrices()) +def test_indexing_int_col(mat): + res = mat[:, 0] + if not isinstance(res, np.ndarray): + res = res.A + assert res.shape == (mat.shape[0], 1) + expected = mat.A[:, [0]] + np.testing.assert_array_equal(res, expected) + + +@pytest.mark.parametrize("mat", get_unscaled_matrices()) +def test_indexing_range_col(mat): + res = mat[:, 0:2] + if not isinstance(res, np.ndarray): + res = res.A + assert res.shape == (mat.shape[0], 2) + expected = mat.A[:, 0:2] + np.testing.assert_array_equal(res, expected) + + +@pytest.mark.parametrize("mat", get_unscaled_matrices()) +def test_indexing_int_both(mat): + res = mat[0, 0] + if not isinstance(res, np.ndarray): + res = res.A + assert res.shape == (1, 1) + expected = mat.A[0, 0] + np.testing.assert_array_equal(res, expected) + + +@pytest.mark.parametrize("mat", get_unscaled_matrices()) +def test_indexing_seq_both(mat): + res = mat[[0, 1], [0, 1]] + if not isinstance(res, np.ndarray): + res = res.A + assert res.shape == (2, 2) + expected = mat.A[np.ix_([0, 1], [0, 1])] + np.testing.assert_array_equal(res, expected) + + +@pytest.mark.parametrize("mat", get_unscaled_matrices()) +def test_indexing_ix_both(mat): + indexer = np.ix_([0, 1], [0, 1]) + res = mat[indexer] + if not isinstance(res, np.ndarray): + res = res.A + assert res.shape == (2, 2) + expected = mat.A[indexer] + np.testing.assert_array_equal(res, expected) def test_pandas_to_matrix(): From 9b04f8c78f57ecbba01d620ee52de03e5b812cee Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 9 Aug 2023 15:09:39 +0200 Subject: [PATCH 66/72] Be consistent when instantiating from 1d arrays --- src/tabmat/sparse_matrix.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 8c2a3b2b..d98f180f 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -31,8 +31,14 @@ class SparseMatrix(MatrixBase): SparseMatrix is instantiated in the same way as scipy.sparse.csc_matrix. """ - def __init__(self, arg1, shape=None, dtype=None, copy=False): - self._array = sps.csc_matrix(arg1, shape, dtype, copy) + def __init__(self, input_array, shape=None, dtype=None, copy=False): + if isinstance(input_array, np.ndarray): + if input_array.ndim == 1: + input_array = input_array.reshape(-1, 1) + elif input_array.ndim > 2: + raise ValueError("Input array must be 1- or 2-dimensional") + + self._array = sps.csc_matrix(input_array, shape, dtype, copy) self.idx_dtype = max(self._array.indices.dtype, self._array.indptr.dtype) if self._array.indices.dtype != self.idx_dtype: From 5c064c2fae983290c4b37f7c99ade8621d6ed286 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 9 Aug 2023 15:12:06 +0200 Subject: [PATCH 67/72] Adjust tests to work with v4 --- tests/test_formula.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/tests/test_formula.py b/tests/test_formula.py index 89d9ff9d..bd2c712a 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -158,9 +158,7 @@ def test_matrix_against_expectation(df, formula, expected): assert len(model_df.matrices) == len(expected.matrices) for res, exp in zip(model_df.matrices, expected.matrices): assert type(res) == type(exp) - if isinstance(res, tm.DenseMatrix): - np.testing.assert_array_equal(res, exp) - elif isinstance(res, tm.SparseMatrix): + if isinstance(res, (tm.DenseMatrix, tm.SparseMatrix)): np.testing.assert_array_equal(res.A, res.A) elif isinstance(res, tm.CategoricalMatrix): assert (exp.cat == res.cat).all() @@ -269,9 +267,7 @@ def test_matrix_against_expectation_qcl(df, formula, expected): assert len(model_df.matrices) == len(expected.matrices) for res, exp in zip(model_df.matrices, expected.matrices): assert type(res) == type(exp) - if isinstance(res, tm.DenseMatrix): - np.testing.assert_array_equal(res, exp) - elif isinstance(res, tm.SparseMatrix): + if isinstance(res, (tm.DenseMatrix, tm.SparseMatrix)): np.testing.assert_array_equal(res.A, res.A) elif isinstance(res, tm.CategoricalMatrix): assert (exp.cat == res.cat).all() @@ -694,19 +690,19 @@ def test_state(self, materializer): mm = materializer.get_model_matrix("center(a) - 1") assert isinstance(mm, tm.MatrixBase) assert list(mm.model_spec.column_names) == ["center(a)"] - assert np.allclose(mm.getcol(0).squeeze(), [-1, 0, 1]) + assert np.allclose(mm.getcol(0).unpack().squeeze(), [-1, 0, 1]) mm2 = TabmatMaterializer(pd.DataFrame({"a": [4, 5, 6]})).get_model_matrix( mm.model_spec ) assert isinstance(mm2, tm.MatrixBase) assert list(mm2.model_spec.column_names) == ["center(a)"] - assert np.allclose(mm2.getcol(0).squeeze(), [2, 3, 4]) + assert np.allclose(mm2.getcol(0).unpack().squeeze(), [2, 3, 4]) mm3 = mm.model_spec.get_model_matrix(pd.DataFrame({"a": [4, 5, 6]})) assert isinstance(mm3, tm.MatrixBase) assert list(mm3.model_spec.column_names) == ["center(a)"] - assert np.allclose(mm3.getcol(0).squeeze(), [2, 3, 4]) + assert np.allclose(mm3.getcol(0).unpack().squeeze(), [2, 3, 4]) def test_factor_evaluation_edge_cases(self, materializer): # Test that categorical kinds are set if type would otherwise be numerical From f1ba3041f98ef581a38aed32909eb65bb26ba376 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 9 Aug 2023 15:22:19 +0200 Subject: [PATCH 68/72] Fix type hints --- src/tabmat/formula.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index 4cea1162..a4471211 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -345,7 +345,7 @@ def to_tabmat( dtype: numpy.dtype = numpy.float64, sparse_threshold: float = 0.1, cat_threshold: int = 4, - ) -> DenseMatrix: + ) -> Union[SparseMatrix, DenseMatrix]: if (self.values != 0).mean() > sparse_threshold: return DenseMatrix(self.values) else: @@ -439,7 +439,7 @@ def to_tabmat( dtype: numpy.dtype = numpy.float64, sparse_threshold: float = 0.1, cat_threshold: int = 4, - ) -> Union[CategoricalMatrix, SplitMatrix]: + ) -> Union[DenseMatrix, CategoricalMatrix, SplitMatrix]: codes = self.codes.copy() categories = self.categories.copy() if -2 in self.codes: From 603293bd83b1e4d3d7b90bc27b9e56f6cf652cbf Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 15 Aug 2023 09:52:59 +0200 Subject: [PATCH 69/72] Add changelog entry --- CHANGELOG.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 2e34b3c5..b3750c98 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -13,6 +13,7 @@ Unreleased **New features:** - Add column name and term name metadata to ``MatrixBase`` objects. These are automatically populated when initializing a ``MatrixBase`` from a ``pandas.DataFrame``. In addition, they can be accessed and modified via the ``column_names`` and ``term_names`` properties. +- Add a formula interface for creating tabmat matrices from pandas data frames. See :func:`tabmat.from_formula` for details. **Other changes:** @@ -28,7 +29,7 @@ Unreleased - Fixed ``getcol`` not respecting the ``drop_first`` attribute of a ``CategoricalMatrix``. 3.1.9 - 2023-06-16 -------------------- +------------------ **Other changes:** From 36863712d2a1f542bd80524c9d682516e3442182 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 15 Aug 2023 10:10:22 +0200 Subject: [PATCH 70/72] term and column names for formula-based matrices --- src/tabmat/constructor.py | 17 +++++++++++++++-- src/tabmat/formula.py | 4 ++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index 641b0005..d9148cff 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -206,7 +206,7 @@ def from_formula( sparse_threshold: float = 0.1, cat_threshold: int = 4, interaction_separator: str = ":", - categorical_format: str = "{name}[T.{category}]", + categorical_format: str = "{name}[{category}]", intercept_name: str = "Intercept", include_intercept: bool = False, add_column_for_intercept: bool = True, @@ -275,4 +275,17 @@ def from_formula( cat_threshold=cat_threshold, add_column_for_intercept=add_column_for_intercept, ) - return materializer.get_model_matrix(spec) + result = materializer.get_model_matrix(spec) + + column_names = [] + term_names = [] + + for term, _, columns in result.model_spec.structure: + for column in columns: + column_names.append(str(column)) + term_names.append(str(term)) + + result.column_names = column_names + result.term_names = term_names + + return result diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index a4471211..b27c0f69 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -35,7 +35,7 @@ class TabmatMaterializer(FormulaMaterializer): def _init(self): self.interaction_separator = self.params.get("interaction_separator", ":") self.categorical_format = self.params.get( - "categorical_format", "{name}[T.{category}]" + "categorical_format", "{name}[{category}]" ) self.intercept_name = self.params.get("intercept_name", "Intercept") self.dtype = self.params.get("dtype", numpy.float64) @@ -481,7 +481,7 @@ def get_names(self) -> List[str]: return self.categories def set_name( - self, name, name_format="{name}[T.{category}]" + self, name, name_format="{name}[{category}]" ) -> "_InteractableCategoricalVector": if self.name is None: # Make sure to only format the name once From 7aa36a4dd9a0d9d795c5f6109d82ca121310ecfe Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 15 Aug 2023 10:59:20 +0200 Subject: [PATCH 71/72] Fix handling of formula-based names --- src/tabmat/constructor.py | 14 ++++---------- src/tabmat/formula.py | 22 +++++++++++++++++----- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index d9148cff..3426a76b 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -277,15 +277,9 @@ def from_formula( ) result = materializer.get_model_matrix(spec) - column_names = [] - term_names = [] - - for term, _, columns in result.model_spec.structure: - for column in columns: - column_names.append(str(column)) - term_names.append(str(term)) - - result.column_names = column_names - result.term_names = term_names + term_names = np.zeros(len(result.term_names), dtype="object") + for term, indices in result.model_spec.term_indices.items(): + term_names[indices] = str(term) + result.term_names = term_names.tolist() return result diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py index b27c0f69..b97d5617 100644 --- a/src/tabmat/formula.py +++ b/src/tabmat/formula.py @@ -347,10 +347,12 @@ def to_tabmat( cat_threshold: int = 4, ) -> Union[SparseMatrix, DenseMatrix]: if (self.values != 0).mean() > sparse_threshold: - return DenseMatrix(self.values) + return DenseMatrix(self.values, column_names=[self.name]) else: # Columns can become sparser, but not denser through interactions - return SparseMatrix(sps.csc_matrix(self.values[:, numpy.newaxis])) + return SparseMatrix( + sps.csc_matrix(self.values[:, numpy.newaxis]), column_names=[self.name] + ) def get_names(self) -> List[str]: if self.name is None: @@ -380,7 +382,7 @@ def to_tabmat( sparse_threshold: float = 0.1, cat_threshold: int = 4, ) -> SparseMatrix: - return SparseMatrix(self.values) + return SparseMatrix(self.values, column_names=[self.name]) def get_names(self) -> List[str]: if self.name is None: @@ -456,7 +458,13 @@ def to_tabmat( ordered=False, ) - categorical_part = CategoricalMatrix(cat, drop_first=drop_first, dtype=dtype) + categorical_part = CategoricalMatrix( + cat, + drop_first=drop_first, + dtype=dtype, + column_name=self.name, + column_name_format="{category}", + ) if (self.codes == -2).all(): # All values are dropped @@ -472,7 +480,11 @@ def to_tabmat( sparse_part, dense_idx, sparse_idx, - ) = _split_sparse_and_dense_parts(sparse_matrix, sparse_threshold) + ) = _split_sparse_and_dense_parts( + sparse_matrix, + sparse_threshold, + column_names=categorical_part.column_names, + ) return SplitMatrix([dense_part, sparse_part], [dense_idx, sparse_idx]) def get_names(self) -> List[str]: From c9cfc0f1a86c597fb40e33fb681a17992df68448 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 15 Aug 2023 10:59:30 +0200 Subject: [PATCH 72/72] Add tests for formula-based names --- tests/test_formula.py | 186 ++++++++++++++++++++++++++++-------------- 1 file changed, 126 insertions(+), 60 deletions(-) diff --git a/tests/test_formula.py b/tests/test_formula.py index bd2c712a..88a55c71 100644 --- a/tests/test_formula.py +++ b/tests/test_formula.py @@ -69,12 +69,12 @@ def test_retrieval(): pd.Categorical( [ "__drop__", - "cat_1[T.b]", - "cat_1[T.c]", - "cat_1[T.b]", + "cat_1[b]", + "cat_1[c]", + "cat_1[b]", "__drop__", ], - categories=["__drop__", "cat_1[T.b]", "cat_1[T.c]"], + categories=["__drop__", "cat_1[b]", "cat_1[c]"], ), drop_first=True, ), @@ -128,19 +128,19 @@ def test_retrieval(): tm.CategoricalMatrix( pd.Categorical( [ - "cat_1[T.a]:cat_3[T.1]", - "cat_1[T.b]:cat_3[T.2]", - "cat_1[T.c]:cat_3[T.1]", - "cat_1[T.b]:cat_3[T.2]", - "cat_1[T.a]:cat_3[T.1]", + "cat_1[a]:cat_3[1]", + "cat_1[b]:cat_3[2]", + "cat_1[c]:cat_3[1]", + "cat_1[b]:cat_3[2]", + "cat_1[a]:cat_3[1]", ], categories=[ - "cat_1[T.a]:cat_3[T.1]", - "cat_1[T.b]:cat_3[T.1]", - "cat_1[T.c]:cat_3[T.1]", - "cat_1[T.a]:cat_3[T.2]", - "cat_1[T.c]:cat_3[T.2]", - "cat_1[T.b]:cat_3[T.2]", + "cat_1[a]:cat_3[1]", + "cat_1[b]:cat_3[1]", + "cat_1[c]:cat_3[1]", + "cat_1[a]:cat_3[2]", + "cat_1[c]:cat_3[2]", + "cat_1[b]:cat_3[2]", ], ), drop_first=False, @@ -318,17 +318,17 @@ def test_matrix_against_pandas(df, formula, ensure_full_rank): ), pytest.param("num_1 + num_2 - 1", ("num_1", "num_2"), id="no_intercept"), pytest.param( - "1 + cat_1", ("Intercept", "cat_1[T.b]", "cat_1[T.c]"), id="categorical" + "1 + cat_1", ("Intercept", "cat_1[b]", "cat_1[c]"), id="categorical" ), pytest.param( "1 + cat_2 * cat_3", ( "Intercept", - "cat_2[T.y]", - "cat_2[T.z]", - "cat_3[T.2]", - "cat_2[T.y]:cat_3[T.2]", - "cat_2[T.z]:cat_3[T.2]", + "cat_2[y]", + "cat_2[z]", + "cat_3[2]", + "cat_2[y]:cat_3[2]", + "cat_2[z]:cat_3[2]", ), id="interaction", ), @@ -347,6 +347,7 @@ def test_matrix_against_pandas(df, formula, ensure_full_rank): def test_names_against_expectation(df, formula, expected_names): model_tabmat = tm.from_formula(formula, df, ensure_full_rank=True) assert model_tabmat.model_spec.column_names == expected_names + assert model_tabmat.column_names == list(expected_names) @pytest.mark.parametrize( @@ -389,6 +390,65 @@ def test_names_against_expectation_qcl(df, formula, expected_names): intercept_name="intercept", ) assert model_tabmat.model_spec.column_names == expected_names + assert model_tabmat.column_names == list(expected_names) + + +@pytest.mark.parametrize( + "formula, expected_names", + [ + pytest.param("1 + cat_1", ("1", "cat_1", "cat_1"), id="categorical"), + pytest.param( + "1 + cat_2 * cat_3", + ( + "1", + "cat_2", + "cat_2", + "cat_3", + "cat_2:cat_3", + "cat_2:cat_3", + ), + id="interaction", + ), + pytest.param( + "poly(num_1, 3) - 1", + ("poly(num_1, 3)", "poly(num_1, 3)", "poly(num_1, 3)"), + id="polynomial", + ), + pytest.param( + "1 + {np.log(num_1 ** 2)}", + ("1", "np.log(num_1 ** 2)"), + id="functions", + ), + ], +) +def test_term_names_against_expectation(df, formula, expected_names): + model_tabmat = tm.from_formula( + formula, + df, + ensure_full_rank=True, + intercept_name="intercept", + ) + assert model_tabmat.term_names == list(expected_names) + + +@pytest.mark.parametrize( + "categorical_format", + ["{name}[{category}]", "{name}__{category}"], + ids=["brackets", "double_underscore"], +) +def test_all_names_against_from_pandas(df, categorical_format): + mat_from_pandas = tm.from_pandas( + df, drop_first=False, object_as_cat=True, categorical_format=categorical_format + ) + mat_from_formula = tm.from_formula( + "num_1 + num_2 + cat_1 + cat_2 + cat_3 + str_1 - 1", + data=df, + ensure_full_rank=False, + categorical_format=categorical_format, + ) + + assert mat_from_formula.column_names == mat_from_pandas.column_names + assert mat_from_formula.term_names == mat_from_pandas.term_names @pytest.mark.parametrize( @@ -420,9 +480,15 @@ def test_names_against_expectation_qcl(df, formula, expected_names): def test_names_against_pandas(df, formula, ensure_full_rank): num_in_scope = 2 # noqa model_df = formulaic.model_matrix(formula, df, ensure_full_rank=ensure_full_rank) - model_tabmat = tm.from_formula(formula, df, ensure_full_rank=ensure_full_rank) + model_tabmat = tm.from_formula( + formula, + df, + ensure_full_rank=ensure_full_rank, + categorical_format="{name}[T.{category}]", + ) assert model_tabmat.model_spec.column_names == model_df.model_spec.column_names assert model_tabmat.model_spec.column_names == tuple(model_df.columns) + assert model_tabmat.column_names == list(model_df.columns) @pytest.mark.parametrize( @@ -571,46 +637,46 @@ def test_interactable_vectors(left, right, reverse): # '': (, , , ) "a": (["Intercept", "a"], ["Intercept", "a"], ["Intercept", "a"], 2), "A": ( - ["Intercept", "A[T.b]", "A[T.c]"], - ["Intercept", "A[T.a]", "A[T.b]", "A[T.c]"], - ["Intercept", "A[T.c]"], + ["Intercept", "A[b]", "A[c]"], + ["Intercept", "A[a]", "A[b]", "A[c]"], + ["Intercept", "A[c]"], 2, ), "C(A)": ( - ["Intercept", "C(A)[T.b]", "C(A)[T.c]"], - ["Intercept", "C(A)[T.a]", "C(A)[T.b]", "C(A)[T.c]"], - ["Intercept", "C(A)[T.c]"], + ["Intercept", "C(A)[b]", "C(A)[c]"], + ["Intercept", "C(A)[a]", "C(A)[b]", "C(A)[c]"], + ["Intercept", "C(A)[c]"], 2, ), "A:a": ( - ["Intercept", "A[T.a]:a", "A[T.b]:a", "A[T.c]:a"], - ["Intercept", "A[T.a]:a", "A[T.b]:a", "A[T.c]:a"], - ["Intercept", "A[T.a]:a"], + ["Intercept", "A[a]:a", "A[b]:a", "A[c]:a"], + ["Intercept", "A[a]:a", "A[b]:a", "A[c]:a"], + ["Intercept", "A[a]:a"], 1, ), "A:B": ( [ "Intercept", - "B[T.b]", - "B[T.c]", - "A[T.b]:B[T.a]", - "A[T.c]:B[T.a]", - "A[T.b]:B[T.b]", - "A[T.c]:B[T.b]", - "A[T.b]:B[T.c]", - "A[T.c]:B[T.c]", + "B[b]", + "B[c]", + "A[b]:B[a]", + "A[c]:B[a]", + "A[b]:B[b]", + "A[c]:B[b]", + "A[b]:B[c]", + "A[c]:B[c]", ], [ "Intercept", - "A[T.a]:B[T.a]", - "A[T.b]:B[T.a]", - "A[T.c]:B[T.a]", - "A[T.a]:B[T.b]", - "A[T.b]:B[T.b]", - "A[T.c]:B[T.b]", - "A[T.a]:B[T.c]", - "A[T.b]:B[T.c]", - "A[T.c]:B[T.c]", + "A[a]:B[a]", + "A[b]:B[a]", + "A[c]:B[a]", + "A[a]:B[b]", + "A[b]:B[b]", + "A[c]:B[b]", + "A[a]:B[c]", + "A[b]:B[c]", + "A[c]:B[c]", ], ["Intercept"], 1, @@ -828,7 +894,7 @@ def test_encoding_edge_cases(self, materializer): encoded_matrix = ( encoded_factor["B[a]"].set_name("B[a]").to_tabmat(cat_threshold=1) ) - assert list(encoded_matrix.cat) == ["B[a][T.a]", "B[a][T.b]", "B[a][T.c]"] + assert list(encoded_matrix.cat) == ["B[a][a]", "B[a][b]", "B[a][c]"] def test_empty(self, materializer): mm = materializer.get_model_matrix("0", ensure_full_rank=True) @@ -844,13 +910,13 @@ def test_category_reordering(self): ) m = TabmatMaterializer(data).get_model_matrix("A + 0", ensure_full_rank=False) - assert list(m.model_spec.column_names) == ["A[T.a]", "A[T.b]", "A[T.c]"] + assert list(m.model_spec.column_names) == ["A[a]", "A[b]", "A[c]"] m2 = TabmatMaterializer(data2).get_model_matrix("A + 0", ensure_full_rank=False) - assert list(m2.model_spec.column_names) == ["A[T.a]", "A[T.b]", "A[T.c]"] + assert list(m2.model_spec.column_names) == ["A[a]", "A[b]", "A[c]"] m3 = TabmatMaterializer(data3).get_model_matrix("A + 0", ensure_full_rank=False) - assert list(m3.model_spec.column_names) == ["A[T.c]", "A[T.b]", "A[T.a]"] + assert list(m3.model_spec.column_names) == ["A[c]", "A[b]", "A[a]"] def test_term_clustering(self, materializer): assert materializer.get_model_matrix( @@ -859,21 +925,21 @@ def test_term_clustering(self, materializer): "Intercept", "a", "b", - "a:A[T.b]", - "a:A[T.c]", - "b:A[T.b]", - "b:A[T.c]", + "a:A[b]", + "a:A[c]", + "b:A[b]", + "b:A[c]", ) assert materializer.get_model_matrix( "a + b + a:A + b:A", cluster_by="numerical_factors" ).model_spec.column_names == ( "Intercept", "a", - "a:A[T.b]", - "a:A[T.c]", + "a:A[b]", + "a:A[c]", "b", - "b:A[T.b]", - "b:A[T.c]", + "b:A[b]", + "b:A[c]", ) def test_model_spec_pickleable(self, materializer):