Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add column name metadata to tabmat matrices #278

Merged
merged 21 commits into from
Aug 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ Changelog
Unreleased
----------

**New features:**

- Add column name and term name metadata to ``MatrixBase`` objects. These are automatically populated when initializing a ``MatrixBase`` from a ``pandas.DataFrame``. In addition, they can be accessed and modified via the ``column_names`` and ``term_names`` properties.

**Other changes:**

- Improve the performance of ``from_pandas`` in the case of low-cardinality categorical variables.
Expand Down
140 changes: 134 additions & 6 deletions src/tabmat/categorical_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ def matvec(mat, vec):

"""

import re
from typing import List, Optional, Tuple, Union

import numpy as np
Expand Down Expand Up @@ -245,6 +246,9 @@ def __init__(
cat_vec: Union[List, np.ndarray, pd.Categorical],
drop_first: bool = False,
dtype: np.dtype = np.float64,
column_name: Optional[str] = None,
term_name: Optional[str] = None,
column_name_format: str = "{name}[{category}]",
):
if pd.isnull(cat_vec).any():
raise ValueError("Categorical data can't have missing values.")
Expand All @@ -260,6 +264,13 @@ def __init__(
self.x_csc: Optional[Tuple[Optional[np.ndarray], np.ndarray, np.ndarray]] = None
self.dtype = np.dtype(dtype)

self._colname = column_name
if term_name is None:
self._term = self._colname
else:
self._term = term_name
self._colname_format = column_name_format

__array_ufunc__ = None

def recover_orig(self) -> np.ndarray:
Expand Down Expand Up @@ -466,10 +477,16 @@ def getcol(self, i: int) -> SparseMatrix:
i %= self.shape[1] # wrap-around indexing

if self.drop_first:
i += 1
i_corr = i + 1
else:
i_corr = i

col_i = sps.csc_matrix((self.indices == i).astype(int)[:, None])
return SparseMatrix(col_i)
col_i = sps.csc_matrix((self.indices == i_corr).astype(int)[:, None])
return SparseMatrix(
col_i,
column_names=[self.column_names[i]],
term_names=[self.term_names[i]],
)

def tocsr(self) -> sps.csr_matrix:
"""Return scipy csr representation of matrix."""
Expand All @@ -492,7 +509,11 @@ def to_sparse_matrix(self):
"""Return a tabmat.SparseMatrix representation."""
from .sparse_matrix import SparseMatrix

return SparseMatrix(self.tocsr())
return SparseMatrix(
self.tocsr(),
column_names=self.column_names,
term_names=self.term_names,
)

def toarray(self) -> np.ndarray:
"""Return array representation of matrix."""
Expand Down Expand Up @@ -523,7 +544,11 @@ def __getitem__(self, item):
if isinstance(row, np.ndarray):
row = row.ravel()
return CategoricalMatrix(
self.cat[row], drop_first=self.drop_first, dtype=self.dtype
self.cat[row],
drop_first=self.drop_first,
dtype=self.dtype,
column_name=self._colname,
column_name_format=self._colname_format,
)
else:
# return a SparseMatrix if we subset columns
Expand Down Expand Up @@ -638,8 +663,111 @@ def multiply(self, other) -> SparseMatrix:
np.arange(self.shape[0] + 1, dtype=int),
),
shape=self.shape,
)
),
column_names=self.column_names,
term_names=self.term_names,
)

def __repr__(self):
    """Return the string form of the stored categorical data (``self.cat``)."""
    categorical = self.cat
    return str(categorical)

def get_names(
    self,
    type: str = "column",
    missing_prefix: Optional[str] = None,
    indices: Optional[List[int]] = None,
) -> List[Optional[str]]:
    """Get column names.

    A categorical matrix stores a single base name. Column names are built
    from it by applying the matrix's column name format to every (kept)
    category, whereas term names simply repeat the base name once per column.

    If the matrix does not have a name and ``missing_prefix`` is given, a
    default base name is created using the following pattern:
    ``"{missing_prefix}{indices[0]}-{indices[-1]}"``, i.e. the prefix followed
    by the first and the last column index.

    Parameters
    ----------
    type: str {'column'|'term'}
        Whether to get column names or term names. The main difference is that
        a categorical submatrix is counted as a single term, whereas it is
        counted as multiple columns. Furthermore, matrices created from formulas
        have a difference between a column and term (cf. ``formulaic`` docs).
    missing_prefix: Optional[str], default None
        Prefix to use for columns that do not have a name. If None, then no
        default name is created.
    indices
        The indices used for columns that do not have a name. If ``None``,
        then the indices are ``0, 1, ..., n_columns - 1``.

    Returns
    -------
    List[Optional[str]]
        One name per column; all ``None`` if the matrix is unnamed and no
        ``missing_prefix`` is given. Empty if the matrix has no columns.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'column'`` nor ``'term'``.
    """
    if type == "column":
        name = self._colname
    elif type == "term":
        name = self._term
    else:
        raise ValueError(f"Type must be 'column' or 'term', got {type}")

    # When ``drop_first`` is set, the first category has no column of its own.
    categories = self.cat.categories[int(self.drop_first) :]
    n_cols = len(categories)

    # Nothing to name; also avoids indexing into an empty ``indices`` below
    # (e.g. a single category combined with ``drop_first=True``).
    if n_cols == 0:
        return []

    if name is None:
        if missing_prefix is None:
            return [None] * n_cols
        if indices is None:
            indices = list(range(n_cols))
        name = f"{missing_prefix}{indices[0]}-{indices[-1]}"

    if type == "column":
        return [
            self._colname_format.format(name=name, category=cat)
            for cat in categories
        ]
    return [name] * n_cols

def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"):
    """Set column names.

    A categorical matrix stores a single base name, so ``names`` must boil
    down to exactly one value. Accepted inputs are a plain string, a list with
    one element, or a list with one entry per column. In the latter case, for
    ``type="column"``, each entry is matched against the matrix's column name
    format for the corresponding category and the captured base name is used
    (entries that do not match are kept as they are); the resulting entries
    must then all be equal.

    Parameters
    ----------
    names: List[Optional[str]]
        Names to set.
    type: str {'column'|'term'}
        Whether to set column names or term names. The main difference is that
        a categorical submatrix is counted as a single term, whereas it is
        counted as multiple columns. Furthermore, matrices created from formulas
        have a difference between a column and term (cf. ``formulaic`` docs).

    Raises
    ------
    ValueError
        If ``names`` cannot be reduced to a single name (wrong number of
        entries, or per-column entries that disagree), or if ``type`` is
        neither ``'column'`` nor ``'term'``.
    """
    if isinstance(names, str):
        names = [names]

    n_cols = self.shape[1]

    # Per-column names are only considered when there is exactly one entry
    # per column. Without this length check, ``zip`` below would silently
    # drop surplus entries and accept a list that is too long.
    if len(names) != 1 and len(names) > 0 and len(names) == n_cols:
        if type == "column":
            # Try finding the column name
            base_names = []
            categories = self.cat.categories[int(self.drop_first) :]
            for name, cat in zip(names, categories):
                if name is None:
                    base_names.append(None)
                    continue
                # Format with a placeholder, escape the result, then turn
                # the placeholder into a capture group for the base name.
                partial_name = self._colname_format.format(
                    name="__CAPTURE__", category=cat
                )
                pattern = re.escape(partial_name).replace("__CAPTURE__", "(.*)")
                match = re.search(pattern, name)
                base_names.append(name if match is None else match.group(1))
            names = base_names

        if len(names) == n_cols and all(name == names[0] for name in names):
            names = [names[0]]

    if len(names) != 1:
        raise ValueError("A categorical matrix has only one name")

    if type == "column":
        self._colname = names[0]
    elif type == "term":
        self._term = names[0]
    else:
        raise ValueError(f"Type must be 'column' or 'term', got {type}")
55 changes: 47 additions & 8 deletions src/tabmat/constructor.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import warnings
from typing import List, Tuple, Union
from typing import List, Optional, Sequence, Tuple, Union

import numpy as np
import pandas as pd
Expand All @@ -21,6 +21,7 @@ def from_pandas(
object_as_cat: bool = False,
cat_position: str = "expand",
drop_first: bool = False,
categorical_format: str = "{name}[{category}]",
) -> MatrixBase:
"""
Transform a pandas.DataFrame into an efficient SplitMatrix. For most users, this
Expand Down Expand Up @@ -72,7 +73,14 @@ def from_pandas(
if object_as_cat and coldata.dtype == object:
coldata = coldata.astype("category")
if isinstance(coldata.dtype, pd.CategoricalDtype):
cat = CategoricalMatrix(coldata, drop_first=drop_first, dtype=dtype)
cat = CategoricalMatrix(
coldata,
drop_first=drop_first,
dtype=dtype,
column_name=colname,
term_name=colname,
column_name_format=categorical_format,
)
if len(coldata.cat.categories) < cat_threshold:
(
X_dense_F,
Expand All @@ -82,6 +90,8 @@ def from_pandas(
) = _split_sparse_and_dense_parts(
sps.csc_matrix(cat.tocsr(), dtype=dtype),
threshold=sparse_threshold,
column_names=cat.get_names("column"),
term_names=cat.get_names("term"),
)
matrices.append(X_dense_F)
is_cat.append(True)
Expand Down Expand Up @@ -128,13 +138,26 @@ def from_pandas(
f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype."
)
if len(dense_dfidx) > 0:
matrices.append(DenseMatrix(df.iloc[:, dense_dfidx].astype(dtype)))
matrices.append(
DenseMatrix(
df.iloc[:, dense_dfidx].astype(dtype),
column_names=df.columns[dense_dfidx],
term_names=df.columns[dense_dfidx],
)
)
indices.append(dense_mxidx)
is_cat.append(False)
if len(sparse_dfcols) > 0:
sparse_dict = {i: v for i, v in enumerate(sparse_dfcols)}
full_sparse = pd.DataFrame(sparse_dict).sparse.to_coo()
matrices.append(SparseMatrix(full_sparse, dtype=dtype))
matrices.append(
SparseMatrix(
full_sparse,
dtype=dtype,
column_names=[col.name for col in sparse_dfcols],
term_names=[col.name for col in sparse_dfcols],
)
)
indices.append(sparse_mxidx)
is_cat.append(False)

Expand All @@ -157,7 +180,10 @@ def from_pandas(


def _split_sparse_and_dense_parts(
arg1: sps.csc_matrix, threshold: float = 0.1
arg1: sps.csc_matrix,
threshold: float = 0.1,
column_names: Optional[Sequence[Optional[str]]] = None,
term_names: Optional[Sequence[Optional[str]]] = None,
) -> Tuple[DenseMatrix, SparseMatrix, np.ndarray, np.ndarray]:
"""
Split matrix.
Expand All @@ -176,12 +202,25 @@ def _split_sparse_and_dense_parts(
dense_indices = np.where(densities > threshold)[0]
sparse_indices = np.setdiff1d(np.arange(densities.shape[0]), dense_indices)

X_dense_F = DenseMatrix(np.asfortranarray(arg1[:, dense_indices].toarray()))
X_sparse = SparseMatrix(arg1[:, sparse_indices])
if column_names is None:
column_names = [None] * arg1.shape[1]
if term_names is None:
term_names = column_names

X_dense_F = DenseMatrix(
np.asfortranarray(arg1[:, dense_indices].toarray()),
column_names=[column_names[i] for i in dense_indices],
term_names=[term_names[i] for i in dense_indices],
)
X_sparse = SparseMatrix(
arg1[:, sparse_indices],
column_names=[column_names[i] for i in sparse_indices],
term_names=[term_names[i] for i in sparse_indices],
)
return X_dense_F, X_sparse, dense_indices, sparse_indices


def from_csc(mat: sps.csc_matrix, threshold=0.1):
def from_csc(mat: sps.csc_matrix, threshold=0.1, column_names=None, term_names=None):
"""
Convert a CSC-format sparse matrix into a ``SplitMatrix``.

Expand Down
Loading
Loading