Skip to content

Commit

Permalink
Add column name getters
Browse files Browse the repository at this point in the history
  • Loading branch information
stanmart committed Jul 11, 2023
1 parent 212a1c6 commit 9fe977e
Show file tree
Hide file tree
Showing 6 changed files with 353 additions and 0 deletions.
64 changes: 64 additions & 0 deletions src/tabmat/categorical_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,10 @@ def __init__(
self.x_csc: Optional[Tuple[Optional[np.ndarray], np.ndarray, np.ndarray]] = None
self.dtype = np.dtype(dtype)

self._colname = None
self._term = None
self._colname_format = "{name}[{category}]"

def recover_orig(self) -> np.ndarray:
"""
Return 1d numpy array with same data as what was initially fed to __init__.
Expand Down Expand Up @@ -641,3 +645,63 @@ def multiply(self, other) -> SparseMatrix:

def __repr__(self):
    """Return the string form of the wrapped categorical, ``self.cat``."""
    categorical = self.cat
    return str(categorical)

def get_column_names(
    self, missing_prefix: str = "_col_", start_index: int = 0
) -> List[str]:
    """Get column names.

    One name is produced per retained category by filling
    ``self._colname_format`` with the name of the matrix and the category.
    If the matrix does not have a name (``self._colname`` is ``None``), the
    default name ``"{missing_prefix}{start_index}"`` is used instead; the
    same default is shared by all categories.

    Parameters
    ----------
    missing_prefix
        Prefix to use when the matrix does not have a name.
    start_index
        Index appended to ``missing_prefix`` when creating the default name.

    Returns
    -------
    list of str
        Column names, one per category. The first category is skipped when
        ``self.drop_first`` is truthy.
    """
    colname = (
        f"{missing_prefix}{start_index}" if self._colname is None else self._colname
    )
    # ``drop_first`` is a flag. Convert it explicitly so that the slice bound
    # is a genuine integer even if the flag is stored as a NumPy boolean,
    # which is not accepted as an index.
    n_dropped = int(self.drop_first)
    return [
        self._colname_format.format(name=colname, category=category)
        for category in self.cat.categories[n_dropped:]
    ]

def get_term_names(
    self, missing_prefix: str = "_col_", start_index: int = 0
) -> List[str]:
    """Get term names.

    A categorical matrix counts as a single term, so the same term name is
    repeated once for every column of the matrix. If the matrix has no term
    name (``self._term`` is ``None``), the default
    ``"{missing_prefix}{start_index}"`` is used.

    Parameters
    ----------
    missing_prefix
        Prefix to use when the matrix does not have a term name.
    start_index
        Index appended to ``missing_prefix`` when creating the default name.

    Returns
    -------
    list of str
        Term names, one entry per column.
    """
    if self._term is not None:
        term_name = self._term
    else:
        term_name = f"{missing_prefix}{start_index}"
    # One column per category, minus the dropped first category (if any).
    n_columns = len(self.cat.categories) - self.drop_first
    return [term_name] * n_columns
64 changes: 64 additions & 0 deletions src/tabmat/dense_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ def __new__(cls, input_array): # noqa
obj = np.asarray(input_array).view(cls)
if not np.issubdtype(obj.dtype, np.floating):
raise NotImplementedError("DenseMatrix is only implemented for float data")

if obj.ndim == 2:
obj._colnames = [None] * obj.shape[1]
obj._terms = [None] * obj.shape[1]

return obj

def __array_finalize__(self, obj):
Expand Down Expand Up @@ -166,3 +171,62 @@ def multiply(self, other):
if np.asanyarray(other).ndim == 1:
return super().__mul__(other[:, np.newaxis])
return super().__mul__(other)

def get_column_names(
    self, missing_prefix: str = "_col_", start_index: int = 0
) -> List[str]:
    """Get column names.

    For columns that do not have a name, a default name is created using the
    following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
    the index of the column.

    Parameters
    ----------
    missing_prefix
        Prefix to use for columns that do not have a name.
    start_index
        Index to start from when creating default names.

    Returns
    -------
    list of str
        Column names.
    """
    # Substitute defaults in plain Python: stored names are returned as the
    # very objects kept in ``self._colnames`` (no conversion to NumPy string
    # scalars), and missing names are detected by identity with ``None``.
    return [
        f"{missing_prefix}{start_index + i}" if name is None else name
        for i, name in enumerate(self._colnames)
    ]

def get_term_names(
    self, missing_prefix: str = "_col_", start_index: int = 0
) -> List[str]:
    """Get term names.

    The main difference to ``get_column_names`` is that a categorical submatrix
    is counted as a single term. Furthermore, matrices created from formulas
    have a difference between a column and term (c.f. ``formulaic`` docs).
    For terms that do not have a name, a default name is created using the
    following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
    the index of the term.

    Parameters
    ----------
    missing_prefix
        Prefix to use for terms that do not have a name.
    start_index
        Index to start from when creating default names.

    Returns
    -------
    list of str
        Term names.
    """
    # Substitute defaults in plain Python: stored names are returned as the
    # very objects kept in ``self._terms`` (no conversion to NumPy string
    # scalars), and missing names are detected by identity with ``None``.
    return [
        f"{missing_prefix}{start_index + i}" if term is None else term
        for i, term in enumerate(self._terms)
    ]
51 changes: 51 additions & 0 deletions src/tabmat/matrix_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,57 @@ def standardize(
def __getitem__(self, item):
    # Intentionally empty: this definition provides no indexing behavior of
    # its own and returns None when called directly.
    # NOTE(review): presumably declared abstract by a decorator that sits
    # above this hunk and is implemented by subclasses — confirm.
    pass

@abstractmethod
def get_column_names(
    self, missing_prefix: str = "_col_", start_index: int = 0
) -> List[str]:
    """Return the names of the columns of the matrix.

    Abstract method without an implementation here. The intended contract is
    that a column without a name receives the default name
    ``"{missing_prefix}{start_index + i}"``, where ``i`` is the index of
    the column.

    Parameters
    ----------
    missing_prefix
        Prefix to use for columns that do not have a name.
    start_index
        Index to start from when creating default names.

    Returns
    -------
    list of str
        Column names.
    """

@abstractmethod
def get_term_names(
    self, missing_prefix: str = "_col_", start_index: int = 0
) -> List[str]:
    """Return the names of the terms of the matrix.

    Abstract method without an implementation here. Compared with
    ``get_column_names``, a categorical submatrix is counted as a single
    term, and matrices created from formulas distinguish between a column
    and a term (c.f. ``formulaic`` docs). The intended contract is that a
    term without a name receives the default name
    ``"{missing_prefix}{start_index + i}"``, where ``i`` is the index of
    the term.

    Parameters
    ----------
    missing_prefix
        Prefix to use for terms that do not have a name.
    start_index
        Index to start from when creating default names.

    Returns
    -------
    list of str
        Term names.
    """

# Give this class a higher priority than plain numpy arrays, so that the
# behavior of operations like "@" defaults to the behavior of this class.
__array_priority__ = 11
Expand Down
62 changes: 62 additions & 0 deletions src/tabmat/sparse_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False):
self.sort_indices()
self._x_csr = None

self._colnames = [None] * self.shape[1]
self._terms = [None] * self.shape[1]

@property
def x_csr(self):
"""Cache the CSR representation of the matrix."""
Expand Down Expand Up @@ -203,3 +206,62 @@ def multiply(self, other):
if other.ndim == 1:
return SparseMatrix(super().multiply(other[:, np.newaxis]))
return SparseMatrix(super().multiply(other))

def get_column_names(
    self, missing_prefix: str = "_col_", start_index: int = 0
) -> List[str]:
    """Get column names.

    For columns that do not have a name, a default name is created using the
    following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
    the index of the column.

    Parameters
    ----------
    missing_prefix
        Prefix to use for columns that do not have a name.
    start_index
        Index to start from when creating default names.

    Returns
    -------
    list of str
        Column names.
    """
    # Substitute defaults in plain Python: stored names are returned as the
    # very objects kept in ``self._colnames`` (no conversion to NumPy string
    # scalars), and missing names are detected by identity with ``None``.
    return [
        f"{missing_prefix}{start_index + i}" if name is None else name
        for i, name in enumerate(self._colnames)
    ]

def get_term_names(
    self, missing_prefix: str = "_col_", start_index: int = 0
) -> List[str]:
    """Get term names.

    The main difference to ``get_column_names`` is that a categorical submatrix
    is counted as a single term. Furthermore, matrices created from formulas
    have a difference between a column and term (c.f. ``formulaic`` docs).
    For terms that do not have a name, a default name is created using the
    following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
    the index of the term.

    Parameters
    ----------
    missing_prefix
        Prefix to use for terms that do not have a name.
    start_index
        Index to start from when creating default names.

    Returns
    -------
    list of str
        Term names.
    """
    # Substitute defaults in plain Python: stored names are returned as the
    # very objects kept in ``self._terms`` (no conversion to NumPy string
    # scalars), and missing names are detected by identity with ``None``.
    return [
        f"{missing_prefix}{start_index + i}" if term is None else term
        for i, term in enumerate(self._terms)
    ]
63 changes: 63 additions & 0 deletions src/tabmat/split_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,3 +448,66 @@ def __repr__(self):
return out

# NOTE(review): presumably chosen to exceed the priority of 11 set in
# matrix_base.py, so that this class takes precedence in mixed operations
# such as "@" — confirm.
__array_priority__ = 13

def get_column_names(
    self, missing_prefix: str = "_col_", start_index: int = 0
) -> List[str]:
    """Get column names.

    The names reported by each submatrix are written to the positions that
    the submatrix occupies in this matrix. For columns that do not have a
    name, a default name is created using the following pattern:
    ``"{missing_prefix}{start_index + i}"``. The running index handed to the
    submatrices advances by one for a ``CategoricalMatrix`` and by the number
    of columns for any other submatrix.

    Parameters
    ----------
    missing_prefix
        Prefix to use for columns that do not have a name.
    start_index
        Index to start from when creating default names.

    Returns
    -------
    list of str
        Column names.
    """
    names = np.empty(self.shape[1], dtype=object)
    next_index = start_index
    for positions, submatrix in zip(self.indices, self.matrices):
        names[positions] = submatrix.get_column_names(missing_prefix, next_index)
        # A categorical submatrix uses a single default name for all of its
        # columns; every other submatrix uses one default name per column.
        is_categorical = isinstance(submatrix, CategoricalMatrix)
        next_index += 1 if is_categorical else submatrix.shape[1]
    return list(names)

def get_term_names(
    self, missing_prefix: str = "_col_", start_index: int = 0
) -> List[str]:
    """Get term names.

    The main difference to ``get_column_names`` is that a categorical submatrix
    is counted as a single term. Furthermore, matrices created from formulas
    have a difference between a column and term (c.f. ``formulaic`` docs).
    The term names reported by each submatrix are written to the positions
    that the submatrix occupies in this matrix. For terms that do not have a
    name, a default name is created using the following pattern:
    ``"{missing_prefix}{start_index + i}"``. The running index handed to the
    submatrices advances by one for a ``CategoricalMatrix`` and by the number
    of columns for any other submatrix.

    Parameters
    ----------
    missing_prefix
        Prefix to use for terms that do not have a name.
    start_index
        Index to start from when creating default names.

    Returns
    -------
    list of str
        Term names.
    """
    names = np.empty(self.shape[1], dtype=object)
    next_index = start_index
    for positions, submatrix in zip(self.indices, self.matrices):
        names[positions] = submatrix.get_term_names(missing_prefix, next_index)
        # A categorical submatrix is a single term; every other submatrix
        # contributes one term per column.
        is_categorical = isinstance(submatrix, CategoricalMatrix)
        next_index += 1 if is_categorical else submatrix.shape[1]
    return list(names)
Loading

0 comments on commit 9fe977e

Please sign in to comment.