From ed016b4a5d4e979a72262397c9da38a58990f378 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 11 Jul 2023 14:33:01 +0200 Subject: [PATCH 01/19] Add column name getters --- src/tabmat/categorical_matrix.py | 63 ++++++++++++++++++++++++++++++++ src/tabmat/dense_matrix.py | 62 +++++++++++++++++++++++++++++++ src/tabmat/matrix_base.py | 51 ++++++++++++++++++++++++++ src/tabmat/sparse_matrix.py | 63 ++++++++++++++++++++++++++++++++ src/tabmat/split_matrix.py | 63 ++++++++++++++++++++++++++++++++ src/tabmat/standardized_mat.py | 49 +++++++++++++++++++++++++ 6 files changed, 351 insertions(+) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 4968c628..df878393 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -264,6 +264,9 @@ def __init__( self.indices = self.cat.codes.astype(np.int32) self.x_csc: Optional[Tuple[Optional[np.ndarray], np.ndarray, np.ndarray]] = None self.dtype = np.dtype(dtype) + self._colname = None + self._term = None + self._colname_format = "{name}[{category}]" __array_ufunc__ = None @@ -655,3 +658,63 @@ def multiply(self, other) -> SparseMatrix: def __repr__(self): return str(self.cat) + + def get_column_names( + self, missing_prefix: str = "_col_", start_index: int = 0 + ) -> List[str]: + """Get column names. + + For columns that do not have a name, a default name is created using the + followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the column. + + Parameters + ---------- + missing_prefix + Prefix to use for columns that do not have a name. + start_index + Index to start from when creating default names. + + Returns + ------- + list of str + Column names. + """ + if self._colname is None: + colname = f"{missing_prefix}{start_index}" + else: + colname = self._colname + return [ + self._colname_format.format(name=colname, category=cat) + for cat in self.cat.categories[self.drop_first :] + ] + + def get_term_names( + self, missing_prefix: str = "_col_", start_index: int = 0 + ) -> List[str]: + """Get term names. + + The main difference to ``get_column_names`` is that a categorical submatrix + is counted as a single term. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + For terms that do not have a name, a default name is created using the + followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the term. + + Parameters + ---------- + missing_prefix + Prefix to use for terms that do not have a name. + start_index + Index to start from when creating default names. + + Returns + ------- + list of str + Term names. + """ + if self._term is None: + term = f"{missing_prefix}{start_index}" + else: + term = self._term + return [term] * (len(self.cat.categories) - self.drop_first) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 1a70457f..b624832c 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -43,6 +43,9 @@ def __init__(self, input_array): self._array = np.asarray(input_array) + self._colnames = [None] * input_array.shape[1] + self._terms = [None] * input_array.shape[1] + def __getitem__(self, key): if not isinstance(key, tuple): key = (key,) @@ -219,3 +222,62 @@ def multiply(self, other): if np.asanyarray(other).ndim == 1: return type(self)(self._array.__mul__(other[:, np.newaxis])) return type(self)(self._array.__mul__(other)) + + def get_column_names( + self, missing_prefix: str = "_col_", start_index: int = 0 + ) -> List[str]: + """Get column names. + + For columns that do not have a name, a default name is created using the + followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the column. + + Parameters + ---------- + missing_prefix + Prefix to use for columns that do not have a name. + start_index + Index to start from when creating default names. + + Returns + ------- + list of str + Column names. + """ + colnames = np.array(self._colnames) + default_colnames = np.array( + [f"{missing_prefix}{start_index + i}" for i in range(self.shape[1])] + ) + colnames[colnames == None] = default_colnames[colnames == None] # noqa: E711 + return list(colnames) + + def get_term_names( + self, missing_prefix: str = "_col_", start_index: int = 0 + ) -> List[str]: + """Get term names. + + The main difference to ``get_column_names`` is that a categorical submatrix + is counted as a single term. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + For terms that do not have a name, a default name is created using the + followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the term. + + Parameters + ---------- + missing_prefix + Prefix to use for terms that do not have a name. + start_index + Index to start from when creating default names. + + Returns + ------- + list of str + Term names. + """ + terms = np.array(self._terms) + default_terms = np.array( + [f"{missing_prefix}{start_index + i}" for i in range(self.shape[1])] + ) + terms[terms == None] = default_terms[terms == None] # noqa: E711 + return list(terms) diff --git a/src/tabmat/matrix_base.py b/src/tabmat/matrix_base.py index 88091834..318fd4d9 100644 --- a/src/tabmat/matrix_base.py +++ b/src/tabmat/matrix_base.py @@ -164,6 +164,57 @@ def standardize( def __getitem__(self, item): pass + @abstractmethod + def get_column_names( + self, missing_prefix: str = "_col_", start_index: int = 0 + ) -> List[str]: + """Get column names. + + For columns that do not have a name, a default name is created using the + followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the column. + + Parameters + ---------- + missing_prefix + Prefix to use for columns that do not have a name. + start_index + Index to start from when creating default names. + + Returns + ------- + list of str + Column names. + """ + pass + + @abstractmethod + def get_term_names( + self, missing_prefix: str = "_col_", start_index: int = 0 + ) -> List[str]: + """Get term names. + + The main difference to ``get_column_names`` is that a categorical submatrix + is counted as a single term. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + For terms that do not have a name, a default name is created using the + followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the term. + + Parameters + ---------- + missing_prefix + Prefix to use for terms that do not have a name. + start_index + Index to start from when creating default names. + + Returns + ------- + list of str + Term names. + """ + pass + # Higher priority than numpy arrays, so behavior for funcs like "@" defaults to the # behavior of this class __array_priority__ = 11 diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 188f6862..7053236d 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -44,6 +44,10 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False): self._array.sort_indices() self._array_csr = None + self._colnames = [None] * self.shape[1] + self._terms = [None] * self.shape[1] + + def __getitem__(self, key): if not isinstance(key, tuple): key = (key,) @@ -287,3 +291,62 @@ def multiply(self, other): if other.ndim == 1: return type(self)(self._array.multiply(other[:, np.newaxis])) return type(self)(self._array.multiply(other)) + + def get_column_names( + self, missing_prefix: str = "_col_", start_index: int = 0 + ) -> List[str]: + """Get column names. + + For columns that do not have a name, a default name is created using the + followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the column. + + Parameters + ---------- + missing_prefix + Prefix to use for columns that do not have a name. + start_index + Index to start from when creating default names. + + Returns + ------- + list of str + Column names. + """ + colnames = np.array(self._colnames) + default_colnames = np.array( + [f"{missing_prefix}{start_index + i}" for i in range(self.shape[1])] + ) + colnames[colnames == None] = default_colnames[colnames == None] # noqa: E711 + return list(colnames) + + def get_term_names( + self, missing_prefix: str = "_col_", start_index: int = 0 + ) -> List[str]: + """Get term names. + + The main difference to ``get_column_names`` is that a categorical submatrix + is counted as a single term. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + For terms that do not have a name, a default name is created using the + followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the term. + + Parameters + ---------- + missing_prefix + Prefix to use for terms that do not have a name. + start_index + Index to start from when creating default names. + + Returns + ------- + list of str + Term names. + """ + terms = np.array(self._terms) + default_terms = np.array( + [f"{missing_prefix}{start_index + i}" for i in range(self.shape[1])] + ) + terms[terms == None] = default_terms[terms == None] # noqa: E711 + return list(terms) diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py index a091949f..4cfe36c1 100644 --- a/src/tabmat/split_matrix.py +++ b/src/tabmat/split_matrix.py @@ -477,3 +477,66 @@ def __repr__(self): return out __array_priority__ = 13 + + def get_column_names( + self, missing_prefix: str = "_col_", start_index: int = 0 + ) -> List[str]: + """Get column names. + + For columns that do not have a name, a default name is created using the + followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the column. + + Parameters + ---------- + missing_prefix + Prefix to use for columns that do not have a name. + start_index + Index to start from when creating default names. + + Returns + ------- + list of str + Column names. + """ + column_names = np.empty(self.shape[1], dtype=object) + for idx, mat in zip(self.indices, self.matrices): + column_names[idx] = mat.get_column_names(missing_prefix, start_index) + if isinstance(mat, CategoricalMatrix): + start_index += 1 + else: + start_index += mat.shape[1] + return list(column_names) + + def get_term_names( + self, missing_prefix: str = "_col_", start_index: int = 0 + ) -> List[str]: + """Get term names. + + The main difference to ``get_column_names`` is that a categorical submatrix + is counted as a single term. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + For terms that do not have a name, a default name is created using the + followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the term. + + Parameters + ---------- + missing_prefix + Prefix to use for terms that do not have a name. + start_index + Index to start from when creating default names. + + Returns + ------- + list of str + Term names. + """ + term_names = np.empty(self.shape[1], dtype=object) + for idx, mat in zip(self.indices, self.matrices): + term_names[idx] = mat.get_term_names(missing_prefix, start_index) + if isinstance(mat, CategoricalMatrix): + start_index += 1 + else: + start_index += mat.shape[1] + return list(term_names) diff --git a/src/tabmat/standardized_mat.py b/src/tabmat/standardized_mat.py index 19b04f5a..df69b15a 100644 --- a/src/tabmat/standardized_mat.py +++ b/src/tabmat/standardized_mat.py @@ -298,3 +298,52 @@ def __repr__(self): Mult: {self.mult} """ return out + + def get_column_names( + self, missing_prefix: str = "_col_", start_index: int = 0 + ) -> List[str]: + """Get column names. + + For columns that do not have a name, a default name is created using the + followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the column. + + Parameters + ---------- + missing_prefix + Prefix to use for columns that do not have a name. + start_index + Index to start from when creating default names. + + Returns + ------- + list of str + Column names. + """ + return self.mat.get_column_names(missing_prefix, start_index) + + def get_term_names( + self, missing_prefix: str = "_col_", start_index: int = 0 + ) -> List[str]: + """Get term names. + + The main difference to ``get_column_names`` is that a categorical submatrix + is counted as a single term. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + For terms that do not have a name, a default name is created using the + followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is + the index of the term. + + Parameters + ---------- + missing_prefix + Prefix to use for terms that do not have a name. + start_index + Index to start from when creating default names. + + Returns + ------- + list of str + Term names. + """ + return self.mat.get_term_names(missing_prefix, start_index) From 0519d7af3e2eff5df43e7e93ea67df7b296098c2 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 11 Jul 2023 14:48:30 +0200 Subject: [PATCH 02/19] Matrix names are also combined --- src/tabmat/split_matrix.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py index 4cfe36c1..e00dc0ee 100644 --- a/src/tabmat/split_matrix.py +++ b/src/tabmat/split_matrix.py @@ -113,8 +113,16 @@ def _combine_matrices(matrices, indices): if len(this_type_matrices) > 1: new_matrix = mat_type_(stack_fn([matrices[i] for i in this_type_matrices])) new_indices = np.concatenate([indices[i] for i in this_type_matrices]) + new_colnames = np.concatenate( + np.array([matrices[i]._colnames for i in this_type_matrices]) + ) + new_terms = np.concatenate( + np.array([matrices[i]._terms for i in this_type_matrices]) + ) sorter = np.argsort(new_indices) sorted_matrix = new_matrix[:, sorter] + sorted_matrix._colnames = list(new_colnames[sorter]) + sorted_matrix._terms = list(new_terms[sorter]) sorted_indices = new_indices[sorter] assert sorted_matrix.shape[0] == n_row From 9b652aec134cfd3fa930fb465ed07c84764c5f91 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 11 Jul 2023 18:15:39 +0200 Subject: [PATCH 03/19] Add names to constructors --- src/tabmat/categorical_matrix.py | 13 +++++-- src/tabmat/constructor.py | 59 +++++++++++++++++++++++++++----- src/tabmat/dense_matrix.py | 20 +++++++++-- src/tabmat/sparse_matrix.py | 29 ++++++++++++++-- src/tabmat/split_matrix.py | 4 +-- 5 files changed, 106 insertions(+), 19 deletions(-) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index df878393..6d87adc7 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -250,6 +250,9 @@ def __init__( cat_vec: Union[List, np.ndarray, pd.Categorical], drop_first: bool = False, dtype: np.dtype = np.float64, + column_name: Optional[str] = None, + term_name: Optional[str] = None, + column_name_format: str = "{name}[{category}]", ): if pd.isnull(cat_vec).any(): raise ValueError("Categorical data can't have missing values.") @@ -264,9 +267,13 @@ def __init__( self.indices = self.cat.codes.astype(np.int32) self.x_csc: Optional[Tuple[Optional[np.ndarray], np.ndarray, np.ndarray]] = None self.dtype = np.dtype(dtype) - self._colname = None - self._term = None - self._colname_format = "{name}[{category}]" + + self._colname = column_name + if term_name is None: + self._term = self._colname + else: + self._term = term_name + self._colname_format = column_name_format __array_ufunc__ = None diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index f8e23c31..08b09316 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -1,5 +1,5 @@ import warnings -from typing import List, Tuple, Union +from typing import List, Optional, Sequence, Tuple, Union import numpy as np import pandas as pd @@ -21,6 +21,7 @@ def from_pandas( object_as_cat: bool = False, cat_position: str = "expand", drop_first: bool = False, + categorical_format: str = "{name}[{category}]", ) -> MatrixBase: """ Transform a pandas.DataFrame into an efficient SplitMatrix. For most users, this @@ -72,7 +73,14 @@ def from_pandas( if object_as_cat and coldata.dtype == object: coldata = coldata.astype("category") if isinstance(coldata.dtype, pd.CategoricalDtype): - cat = CategoricalMatrix(coldata, drop_first=drop_first, dtype=dtype) + cat = CategoricalMatrix( + coldata, + drop_first=drop_first, + dtype=dtype, + column_name=colname, + term_name=colname, + column_name_format=categorical_format, + ) if len(coldata.cat.categories) < cat_threshold: ( X_dense_F, @@ -82,6 +90,8 @@ def from_pandas( ) = _split_sparse_and_dense_parts( sps.csc_matrix(cat.tocsr(), dtype=dtype), threshold=sparse_threshold, + column_names=cat.get_column_names(), + term_names=cat.get_term_names(), ) matrices.append(X_dense_F) is_cat.append(True) @@ -128,13 +138,26 @@ def from_pandas( f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype." ) if len(dense_dfidx) > 0: - matrices.append(DenseMatrix(df.iloc[:, dense_dfidx].astype(dtype))) + matrices.append( + DenseMatrix( + df.iloc[:, dense_dfidx].astype(dtype), + column_names=df.columns[dense_dfidx], + term_names=df.columns[dense_dfidx], + ) + ) indices.append(dense_mxidx) is_cat.append(False) if len(sparse_dfcols) > 0: sparse_dict = {i: v for i, v in enumerate(sparse_dfcols)} full_sparse = pd.DataFrame(sparse_dict).sparse.to_coo() - matrices.append(SparseMatrix(full_sparse, dtype=dtype)) + matrices.append( + SparseMatrix( + full_sparse, + dtype=dtype, + column_names=[col.name for col in sparse_dfcols], + term_names=[col.name for col in sparse_dfcols], + ) + ) indices.append(sparse_mxidx) is_cat.append(False) @@ -157,7 +180,10 @@ def from_pandas( def _split_sparse_and_dense_parts( - arg1: sps.csc_matrix, threshold: float = 0.1 + arg1: sps.csc_matrix, + threshold: float = 0.1, + column_names: Optional[Sequence[Optional[str]]] = None, + term_names: Optional[Sequence[Optional[str]]] = None, ) -> Tuple[DenseMatrix, SparseMatrix, np.ndarray, np.ndarray]: """ Split matrix. @@ -176,17 +202,34 @@ def _split_sparse_and_dense_parts( dense_indices = np.where(densities > threshold)[0] sparse_indices = np.setdiff1d(np.arange(densities.shape[0]), dense_indices) - X_dense_F = DenseMatrix(np.asfortranarray(arg1[:, dense_indices].toarray())) - X_sparse = SparseMatrix(arg1[:, sparse_indices]) + if column_names is None: + column_names = [None] * arg1.shape[1] + if term_names is None: + term_names = column_names + + X_dense_F = DenseMatrix( + np.asfortranarray(arg1[:, dense_indices].toarray()), + column_names=[column_names[i] for i in dense_indices], + term_names=[term_names[i] for i in dense_indices], + ) + X_sparse = SparseMatrix( + arg1[:, sparse_indices], + column_names=[column_names[i] for i in sparse_indices], + term_names=[term_names[i] for i in sparse_indices], + ) return X_dense_F, X_sparse, dense_indices, sparse_indices -def from_csc(mat: sps.csc_matrix, threshold=0.1): +def from_csc(mat: sps.csc_matrix, threshold=0.1, column_names=None, term_names=None): """ Convert a CSC-format sparse matrix into a ``SplitMatrix``. The ``threshold`` parameter specifies the density below which a column is treated as sparse. """ + if column_names is None: + column_names = [None] * mat.shape[1] + if term_names is None: + term_names = column_names dense, sparse, dense_idx, sparse_idx = _split_sparse_and_dense_parts(mat, threshold) return SplitMatrix([dense, sparse], [dense_idx, sparse_idx]) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index b624832c..7b35b654 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -33,7 +33,7 @@ class DenseMatrix(MatrixBase): """ - def __init__(self, input_array): + def __init__(self, input_array, column_names=None, term_names=None): input_array = np.asarray(input_array) if input_array.ndim == 1: @@ -42,9 +42,23 @@ def __init__(self, input_array): raise ValueError("Input array must be 1- or 2-dimensional") self._array = np.asarray(input_array) + width = self._array.shape[1] + + if column_names is not None: + if len(column_names) != width: + raise ValueError( + f"Expected {width} column names, got {len(column_names)}" + ) + self._colnames = column_names + else: + self._colnames = [None] * width - self._colnames = [None] * input_array.shape[1] - self._terms = [None] * input_array.shape[1] + if term_names is not None: + if len(term_names) != width: + raise ValueError(f"Expected {width} term names, got {len(term_names)}") + self._terms = term_names + else: + self._terms = obj._colnames def __getitem__(self, key): if not isinstance(key, tuple): diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 7053236d..e62de092 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -30,7 +30,15 @@ class SparseMatrix(MatrixBase): SparseMatrix is instantiated in the same way as scipy.sparse.csc_matrix. """ - def __init__(self, arg1, shape=None, dtype=None, copy=False): + def __init__( + self, + arg1, + shape=None, + dtype=None, + copy=False, + column_names=None, + term_names=None, + ): self._array = sps.csc_matrix(arg1, shape, dtype, copy) self.idx_dtype = max(self._array.indices.dtype, self._array.indptr.dtype) @@ -44,8 +52,23 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False): self._array.sort_indices() self._array_csr = None - self._colnames = [None] * self.shape[1] - self._terms = [None] * self.shape[1] + if column_names is not None: + if len(column_names) != self.shape[1]: + raise ValueError( + f"Expected {self.shape[1]} column names, got {len(column_names)}" + ) + self._colnames = column_names + else: + self._colnames = [None] * self.shape[1] + + if term_names is not None: + if len(term_names) != self.shape[1]: + raise ValueError( + f"Expected {self.shape[1]} term names, got {len(term_names)}" + ) + self._terms = term_names + else: + self._terms = self._colnames def __getitem__(self, key): diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py index e00dc0ee..e7669fba 100644 --- a/src/tabmat/split_matrix.py +++ b/src/tabmat/split_matrix.py @@ -114,10 +114,10 @@ def _combine_matrices(matrices, indices): new_matrix = mat_type_(stack_fn([matrices[i] for i in this_type_matrices])) new_indices = np.concatenate([indices[i] for i in this_type_matrices]) new_colnames = np.concatenate( - np.array([matrices[i]._colnames for i in this_type_matrices]) + [np.array(matrices[i]._colnames) for i in this_type_matrices] ) new_terms = np.concatenate( - np.array([matrices[i]._terms for i in this_type_matrices]) + [np.array(matrices[i]._terms) for i in this_type_matrices] ) sorter = np.argsort(new_indices) sorted_matrix = new_matrix[:, sorter] From 96c2e5658eb6f478fc543d2546f34b59d2d901ef Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Tue, 11 Jul 2023 18:23:35 +0200 Subject: [PATCH 04/19] Add indexing support for column names --- src/tabmat/dense_matrix.py | 8 ++++++++ src/tabmat/sparse_matrix.py | 14 ++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 7b35b654..42ae5b22 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -114,6 +114,14 @@ def astype(self, dtype, order="K", casting="unsafe", copy=True): """Copy of the array, cast to a specified type.""" return type(self)(self._array.astype(dtype, order, casting, copy)) + def __getitem__(self, key): + """Return a subset of the matrix.""" + result = super().__getitem__(key) + if len(key) == 2: + result._colnames = list(np.array(self._colnames)[key[1]]) + result._terms = list(np.array(self._terms)[key[1]]) + return result + def getcol(self, i): """Return matrix column at specified index.""" return type(self)(self._array[:, [i]]) diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index e62de092..af25eaad 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -70,7 +70,6 @@ def __init__( else: self._terms = self._colnames - def __getitem__(self, key): if not isinstance(key, tuple): key = (key,) @@ -78,7 +77,18 @@ def __getitem__(self, key): # Always return a 2d array key = tuple([key_i] if np.isscalar(key_i) else key_i for key_i in key) - return type(self)(self._array.__getitem__(key)) + if len(key) == 2: + colnames = list(np.array(self._colnames)[key[1]]) + terms = list(np.array(self._terms)[key[1]]) + else: + colnames = self._colnames + terms = self._terms + + return type(self)( + self._array.__getitem__(key), + column_names=colnames, + term_names=terms + ) def __matmul__(self, other): return self._array.__matmul__(other) From 7ba73b2399977476615de83010cf3e88b5144887 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 12 Jul 2023 08:57:27 +0200 Subject: [PATCH 05/19] Remove unnecessary code --- src/tabmat/constructor.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index 08b09316..ed274377 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -227,9 +227,5 @@ def from_csc(mat: sps.csc_matrix, threshold=0.1, column_names=None, term_names=N The ``threshold`` parameter specifies the density below which a column is treated as sparse. """ - if column_names is None: - column_names = [None] * mat.shape[1] - if term_names is None: - term_names = column_names dense, sparse, dense_idx, sparse_idx = _split_sparse_and_dense_parts(mat, threshold) return SplitMatrix([dense, sparse], [dense_idx, sparse_idx]) From cd1ca1e33609f536d103d4ee76e307c164e929a4 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 12 Jul 2023 10:12:02 +0200 Subject: [PATCH 06/19] Better default column names --- src/tabmat/categorical_matrix.py | 22 ++++++++++++++-------- src/tabmat/dense_matrix.py | 26 ++++++++++++++------------ src/tabmat/matrix_base.py | 14 ++++++++------ src/tabmat/sparse_matrix.py | 26 ++++++++++++++------------ src/tabmat/split_matrix.py | 24 ++++++++---------------- src/tabmat/standardized_mat.py | 18 +++++++++--------- 6 files changed, 67 insertions(+), 63 deletions(-) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 6d87adc7..31c8282d 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -667,7 +667,7 @@ def __repr__(self): return str(self.cat) def get_column_names( - self, missing_prefix: str = "_col_", start_index: int = 0 + self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None ) -> List[str]: """Get column names. @@ -679,16 +679,19 @@ def get_column_names( ---------- missing_prefix Prefix to use for columns that do not have a name. - start_index - Index to start from when creating default names. + indices + The indices used for columns that do not have a name. If ``None``, + then the indices are ``list(range(self.shape[1]))``. Returns ------- list of str Column names. """ + if indices is None: + indices = list(range(len(self.cat.categories) - self.drop_first)) if self._colname is None: - colname = f"{missing_prefix}{start_index}" + colname = f"{missing_prefix}{indices[0]}-{indices[-1]}" else: colname = self._colname return [ @@ -697,7 +700,7 @@ def get_column_names( ] def get_term_names( - self, missing_prefix: str = "_col_", start_index: int = 0 + self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None ) -> List[str]: """Get term names. @@ -712,16 +715,19 @@ def get_term_names( ---------- missing_prefix Prefix to use for terms that do not have a name. - start_index - Index to start from when creating default names. + indices + The indices used for columns that do not have a name. If ``None``, + then the indices are ``list(range(self.shape[1]))``. Returns ------- list of str Term names. """ + if indices is None: + indices = list(range(len(self.cat.categories) - self.drop_first)) if self._term is None: - term = f"{missing_prefix}{start_index}" + term = f"{missing_prefix}{indices[0]}-{indices[-1]}" else: term = self._term return [term] * (len(self.cat.categories) - self.drop_first) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 42ae5b22..c4fe345c 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -246,7 +246,7 @@ def multiply(self, other): return type(self)(self._array.__mul__(other)) def get_column_names( - self, missing_prefix: str = "_col_", start_index: int = 0 + self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None ) -> List[str]: """Get column names. @@ -258,23 +258,24 @@ def get_column_names( ---------- missing_prefix Prefix to use for columns that do not have a name. - start_index - Index to start from when creating default names. + indices + The indices used for columns that do not have a name. If ``None``, + then the indices are ``list(range(self.shape[1]))``. Returns ------- list of str Column names. """ + if indices is None: + indices = list(range(self.shape[1])) colnames = np.array(self._colnames) - default_colnames = np.array( - [f"{missing_prefix}{start_index + i}" for i in range(self.shape[1])] - ) + default_colnames = np.array([f"{missing_prefix}{i}" for i in indices]) colnames[colnames == None] = default_colnames[colnames == None] # noqa: E711 return list(colnames) def get_term_names( - self, missing_prefix: str = "_col_", start_index: int = 0 + self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None ) -> List[str]: """Get term names. @@ -289,17 +290,18 @@ def get_term_names( ---------- missing_prefix Prefix to use for terms that do not have a name. - start_index - Index to start from when creating default names. + indices + The indices used for columns that do not have a name. If ``None``, + then the indices are ``list(range(self.shape[1]))``. Returns ------- list of str Term names. """ + if indices is None: + indices = list(range(self.shape[1])) terms = np.array(self._terms) - default_terms = np.array( - [f"{missing_prefix}{start_index + i}" for i in range(self.shape[1])] - ) + default_terms = np.array([f"{missing_prefix}{i}" for i in indices]) terms[terms == None] = default_terms[terms == None] # noqa: E711 return list(terms) diff --git a/src/tabmat/matrix_base.py b/src/tabmat/matrix_base.py index 318fd4d9..393196b0 100644 --- a/src/tabmat/matrix_base.py +++ b/src/tabmat/matrix_base.py @@ -166,7 +166,7 @@ def __getitem__(self, item): @abstractmethod def get_column_names( - self, missing_prefix: str = "_col_", start_index: int = 0 + self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None ) -> List[str]: """Get column names. @@ -178,8 +178,9 @@ def get_column_names( ---------- missing_prefix Prefix to use for columns that do not have a name. - start_index - Index to start from when creating default names. + indices + The indices used for columns that do not have a name. If ``None``, + then the indices are ``list(range(self.shape[1]))``. Returns ------- @@ -190,7 +191,7 @@ def get_column_names( @abstractmethod def get_term_names( - self, missing_prefix: str = "_col_", start_index: int = 0 + self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None ) -> List[str]: """Get term names. @@ -205,8 +206,9 @@ def get_term_names( ---------- missing_prefix Prefix to use for terms that do not have a name. - start_index - Index to start from when creating default names. + indices + The indices used for columns that do not have a name. If ``None``, + then the indices are ``list(range(self.shape[1]))``. Returns ------- diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index af25eaad..18e554f3 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -326,7 +326,7 @@ def multiply(self, other): return type(self)(self._array.multiply(other)) def get_column_names( - self, missing_prefix: str = "_col_", start_index: int = 0 + self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None ) -> List[str]: """Get column names. @@ -338,23 +338,24 @@ def get_column_names( ---------- missing_prefix Prefix to use for columns that do not have a name. - start_index - Index to start from when creating default names. + indices + The indices used for columns that do not have a name. If ``None``, + then the indices are ``list(range(self.shape[1]))``. Returns ------- list of str Column names. """ + if indices is None: + indices = list(range(self.shape[1])) colnames = np.array(self._colnames) - default_colnames = np.array( - [f"{missing_prefix}{start_index + i}" for i in range(self.shape[1])] - ) + default_colnames = np.array([f"{missing_prefix}{i}" for i in indices]) colnames[colnames == None] = default_colnames[colnames == None] # noqa: E711 return list(colnames) def get_term_names( - self, missing_prefix: str = "_col_", start_index: int = 0 + self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None ) -> List[str]: """Get term names. @@ -369,17 +370,18 @@ def get_term_names( ---------- missing_prefix Prefix to use for terms that do not have a name. - start_index - Index to start from when creating default names. + indices + The indices used for columns that do not have a name. If ``None``, + then the indices are ``list(range(self.shape[1]))``. Returns ------- list of str Term names. """ + if indices is None: + indices = list(range(self.shape[1])) terms = np.array(self._terms) - default_terms = np.array( - [f"{missing_prefix}{start_index + i}" for i in range(self.shape[1])] - ) + default_terms = np.array([f"{missing_prefix}{i}" for i in indices]) terms[terms == None] = default_terms[terms == None] # noqa: E711 return list(terms) diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py index e7669fba..ec2f34cc 100644 --- a/src/tabmat/split_matrix.py +++ b/src/tabmat/split_matrix.py @@ -487,7 +487,7 @@ def __repr__(self): __array_priority__ = 13 def get_column_names( - self, missing_prefix: str = "_col_", start_index: int = 0 + self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None ) -> List[str]: """Get column names. @@ -499,8 +499,8 @@ def get_column_names( ---------- missing_prefix Prefix to use for columns that do not have a name. - start_index - Index to start from when creating default names. + indices + Ignored. Returns ------- @@ -509,15 +509,11 @@ def get_column_names( """ column_names = np.empty(self.shape[1], dtype=object) for idx, mat in zip(self.indices, self.matrices): - column_names[idx] = mat.get_column_names(missing_prefix, start_index) - if isinstance(mat, CategoricalMatrix): - start_index += 1 - else: - start_index += mat.shape[1] + column_names[idx] = mat.get_column_names(missing_prefix, idx) return list(column_names) def get_term_names( - self, missing_prefix: str = "_col_", start_index: int = 0 + self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None ) -> List[str]: """Get term names. @@ -532,8 +528,8 @@ def get_term_names( ---------- missing_prefix Prefix to use for terms that do not have a name. - start_index - Index to start from when creating default names. + indices + Ignored. Returns ------- @@ -542,9 +538,5 @@ def get_term_names( """ term_names = np.empty(self.shape[1], dtype=object) for idx, mat in zip(self.indices, self.matrices): - term_names[idx] = mat.get_term_names(missing_prefix, start_index) - if isinstance(mat, CategoricalMatrix): - start_index += 1 - else: - start_index += mat.shape[1] + term_names[idx] = mat.get_term_names(missing_prefix, idx) return list(term_names) diff --git a/src/tabmat/standardized_mat.py b/src/tabmat/standardized_mat.py index df69b15a..8a8099c7 100644 --- a/src/tabmat/standardized_mat.py +++ b/src/tabmat/standardized_mat.py @@ -1,4 +1,4 @@ -from typing import List, Union +from typing import List, Optional, Union import numpy as np from scipy import sparse as sps @@ -300,7 +300,7 @@ def __repr__(self): return out def get_column_names( - self, missing_prefix: str = "_col_", start_index: int = 0 + self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None ) -> List[str]: """Get column names. @@ -312,18 +312,18 @@ def get_column_names( ---------- missing_prefix Prefix to use for columns that do not have a name. - start_index - Index to start from when creating default names. + indices + Ignored. Returns ------- list of str Column names. """ - return self.mat.get_column_names(missing_prefix, start_index) + return self.mat.get_column_names(missing_prefix, indices) def get_term_names( - self, missing_prefix: str = "_col_", start_index: int = 0 + self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None ) -> List[str]: """Get term names. @@ -338,12 +338,12 @@ def get_term_names( ---------- missing_prefix Prefix to use for terms that do not have a name. - start_index - Index to start from when creating default names. + indices + Ignored. Returns ------- list of str Term names. """ - return self.mat.get_term_names(missing_prefix, start_index) + return self.mat.get_term_names(missing_prefix, indices) From f6549160ce2b4e9f704b0c4e9e0ee82c8f98f2a8 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 12 Jul 2023 10:36:15 +0200 Subject: [PATCH 07/19] Reduce code duplication --- src/tabmat/categorical_matrix.py | 64 ++++++++++++-------------------- src/tabmat/constructor.py | 4 +- src/tabmat/dense_matrix.py | 57 ++++++++++------------------ src/tabmat/matrix_base.py | 40 +++++--------------- src/tabmat/sparse_matrix.py | 40 +++++++++++++------- src/tabmat/split_matrix.py | 50 ++++++++----------------- src/tabmat/standardized_mat.py | 43 +++++++-------------- 7 files changed, 110 insertions(+), 188 deletions(-) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 31c8282d..41a6e1d8 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -666,8 +666,11 @@ def multiply(self, other) -> SparseMatrix: def __repr__(self): return str(self.cat) - def get_column_names( - self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None + def get_names( + self, + type: str = "column", + missing_prefix: str = "_col_", + indices: Optional[List[int]] = None, ) -> List[str]: """Get column names. @@ -677,6 +680,11 @@ def get_column_names( Parameters ---------- + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). missing_prefix Prefix to use for columns that do not have a name. indices @@ -688,46 +696,22 @@ def get_column_names( list of str Column names. """ - if indices is None: - indices = list(range(len(self.cat.categories) - self.drop_first)) - if self._colname is None: - colname = f"{missing_prefix}{indices[0]}-{indices[-1]}" + if type == "column": + name = self._colname + elif type == "term": + name = self._term else: - colname = self._colname - return [ - self._colname_format.format(name=colname, category=cat) - for cat in self.cat.categories[self.drop_first :] - ] - - def get_term_names( - self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None - ) -> List[str]: - """Get term names. + raise ValueError(f"Type must be 'column' or 'term', got {type}") - The main difference to ``get_column_names`` is that a categorical submatrix - is counted as a single term. Furthermore, matrices created from formulas - have a difference between a column and term (c.f. ``formulaic`` docs). - For terms that do not have a name, a default name is created using the - followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is - the index of the term. - - Parameters - ---------- - missing_prefix - Prefix to use for terms that do not have a name. - indices - The indices used for columns that do not have a name. If ``None``, - then the indices are ``list(range(self.shape[1]))``. - - Returns - ------- - list of str - Term names. - """ if indices is None: indices = list(range(len(self.cat.categories) - self.drop_first)) - if self._term is None: - term = f"{missing_prefix}{indices[0]}-{indices[-1]}" + if name is None: + name = f"{missing_prefix}{indices[0]}-{indices[-1]}" + + if type == "column": + return [ + self._colname_format.format(name=name, category=cat) + for cat in self.cat.categories[self.drop_first :] + ] else: - term = self._term - return [term] * (len(self.cat.categories) - self.drop_first) + return [name] * (len(self.cat.categories) - self.drop_first) diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index ed274377..d01aa711 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -90,8 +90,8 @@ def from_pandas( ) = _split_sparse_and_dense_parts( sps.csc_matrix(cat.tocsr(), dtype=dtype), threshold=sparse_threshold, - column_names=cat.get_column_names(), - term_names=cat.get_term_names(), + column_names=cat.get_names("columns"), + term_names=cat.get_names("terms"), ) matrices.append(X_dense_F) is_cat.append(True) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index c4fe345c..f93e777e 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -245,8 +245,11 @@ def multiply(self, other): return type(self)(self._array.__mul__(other[:, np.newaxis])) return type(self)(self._array.__mul__(other)) - def get_column_names( - self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None + def get_names( + self, + type: str = "column", + missing_prefix: str = "_col_", + indices: Optional[List[int]] = None, ) -> List[str]: """Get column names. @@ -256,6 +259,11 @@ def get_column_names( Parameters ---------- + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). missing_prefix Prefix to use for columns that do not have a name. indices @@ -267,41 +275,16 @@ def get_column_names( list of str Column names. """ - if indices is None: - indices = list(range(self.shape[1])) - colnames = np.array(self._colnames) - default_colnames = np.array([f"{missing_prefix}{i}" for i in indices]) - colnames[colnames == None] = default_colnames[colnames == None] # noqa: E711 - return list(colnames) - - def get_term_names( - self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None - ) -> List[str]: - """Get term names. - - The main difference to ``get_column_names`` is that a categorical submatrix - is counted as a single term. Furthermore, matrices created from formulas - have a difference between a column and term (c.f. ``formulaic`` docs). - For terms that do not have a name, a default name is created using the - followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is - the index of the term. - - Parameters - ---------- - missing_prefix - Prefix to use for terms that do not have a name. - indices - The indices used for columns that do not have a name. If ``None``, - then the indices are ``list(range(self.shape[1]))``. + if type == "column": + names = np.array(self._colnames) + elif type == "term": + names = np.array(self._terms) + else: + raise ValueError(f"Type must be 'column' or 'term', got {type}") - Returns - ------- - list of str - Term names. - """ if indices is None: indices = list(range(self.shape[1])) - terms = np.array(self._terms) - default_terms = np.array([f"{missing_prefix}{i}" for i in indices]) - terms[terms == None] = default_terms[terms == None] # noqa: E711 - return list(terms) + + default_names = np.array([f"{missing_prefix}{i}" for i in indices]) + names[names == None] = default_names[names == None] # noqa: E711 + return list(names) diff --git a/src/tabmat/matrix_base.py b/src/tabmat/matrix_base.py index 393196b0..ad9a8cff 100644 --- a/src/tabmat/matrix_base.py +++ b/src/tabmat/matrix_base.py @@ -165,8 +165,11 @@ def __getitem__(self, item): pass @abstractmethod - def get_column_names( - self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None + def get_names( + self, + type: str = "column", + missing_prefix: str = "_col_", + indices: Optional[List[int]] = None, ) -> List[str]: """Get column names. @@ -176,6 +179,11 @@ def get_column_names( Parameters ---------- + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). missing_prefix Prefix to use for columns that do not have a name. indices @@ -189,34 +197,6 @@ def get_column_names( """ pass - @abstractmethod - def get_term_names( - self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None - ) -> List[str]: - """Get term names. - - The main difference to ``get_column_names`` is that a categorical submatrix - is counted as a single term. Furthermore, matrices created from formulas - have a difference between a column and term (c.f. ``formulaic`` docs). - For terms that do not have a name, a default name is created using the - followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is - the index of the term. - - Parameters - ---------- - missing_prefix - Prefix to use for terms that do not have a name. - indices - The indices used for columns that do not have a name. If ``None``, - then the indices are ``list(range(self.shape[1]))``. - - Returns - ------- - list of str - Term names. - """ - pass - # Higher priority than numpy arrays, so behavior for funcs like "@" defaults to the # behavior of this class __array_priority__ = 11 diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 18e554f3..edd69f15 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -354,22 +354,27 @@ def get_column_names( colnames[colnames == None] = default_colnames[colnames == None] # noqa: E711 return list(colnames) - def get_term_names( - self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None + def get_names( + self, + type: str = "column", + missing_prefix: str = "_col_", + indices: Optional[List[int]] = None, ) -> List[str]: - """Get term names. + """Get column names. - The main difference to ``get_column_names`` is that a categorical submatrix - is counted as a single term. Furthermore, matrices created from formulas - have a difference between a column and term (c.f. ``formulaic`` docs). - For terms that do not have a name, a default name is created using the + For columns that do not have a name, a default name is created using the followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is - the index of the term. + the index of the column. Parameters ---------- + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). missing_prefix - Prefix to use for terms that do not have a name. + Prefix to use for columns that do not have a name. indices The indices used for columns that do not have a name. If ``None``, then the indices are ``list(range(self.shape[1]))``. @@ -377,11 +382,18 @@ def get_term_names( Returns ------- list of str - Term names. + Column names. """ + if type == "column": + names = np.array(self._colnames) + elif type == "term": + names = np.array(self._terms) + else: + raise ValueError(f"Type must be 'column' or 'term', got {type}") + if indices is None: indices = list(range(self.shape[1])) - terms = np.array(self._terms) - default_terms = np.array([f"{missing_prefix}{i}" for i in indices]) - terms[terms == None] = default_terms[terms == None] # noqa: E711 - return list(terms) + + default_names = np.array([f"{missing_prefix}{i}" for i in indices]) + names[names == None] = default_names[names == None] # noqa: E711 + return list(names) diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py index ec2f34cc..133b3f68 100644 --- a/src/tabmat/split_matrix.py +++ b/src/tabmat/split_matrix.py @@ -486,8 +486,11 @@ def __repr__(self): __array_priority__ = 13 - def get_column_names( - self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None + def get_names( + self, + type: str = "column", + missing_prefix: str = "_col_", + indices: Optional[List[int]] = None, ) -> List[str]: """Get column names. @@ -497,46 +500,23 @@ def get_column_names( Parameters ---------- + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). missing_prefix Prefix to use for columns that do not have a name. indices - Ignored. + The indices used for columns that do not have a name. If ``None``, + then the indices are ``list(range(self.shape[1]))``. Returns ------- list of str Column names. """ - column_names = np.empty(self.shape[1], dtype=object) - for idx, mat in zip(self.indices, self.matrices): - column_names[idx] = mat.get_column_names(missing_prefix, idx) - return list(column_names) - - def get_term_names( - self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None - ) -> List[str]: - """Get term names. - - The main difference to ``get_column_names`` is that a categorical submatrix - is counted as a single term. Furthermore, matrices created from formulas - have a difference between a column and term (c.f. ``formulaic`` docs). - For terms that do not have a name, a default name is created using the - followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is - the index of the term. - - Parameters - ---------- - missing_prefix - Prefix to use for terms that do not have a name. - indices - Ignored. - - Returns - ------- - list of str - Term names. - """ - term_names = np.empty(self.shape[1], dtype=object) + names = np.empty(self.shape[1], dtype=object) for idx, mat in zip(self.indices, self.matrices): - term_names[idx] = mat.get_term_names(missing_prefix, idx) - return list(term_names) + names[idx] = mat.get_names(type, missing_prefix, idx) + return list(names) diff --git a/src/tabmat/standardized_mat.py b/src/tabmat/standardized_mat.py index 8a8099c7..397113ed 100644 --- a/src/tabmat/standardized_mat.py +++ b/src/tabmat/standardized_mat.py @@ -299,8 +299,11 @@ def __repr__(self): """ return out - def get_column_names( - self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None + def get_names( + self, + type: str = "column", + missing_prefix: str = "_col_", + indices: Optional[List[int]] = None, ) -> List[str]: """Get column names. @@ -310,40 +313,20 @@ def get_column_names( Parameters ---------- + type: str {'column'|'term'} + Whether to get column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). missing_prefix Prefix to use for columns that do not have a name. indices - Ignored. + The indices used for columns that do not have a name. If ``None``, + then the indices are ``list(range(self.shape[1]))``. Returns ------- list of str Column names. """ - return self.mat.get_column_names(missing_prefix, indices) - - def get_term_names( - self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None - ) -> List[str]: - """Get term names. - - The main difference to ``get_column_names`` is that a categorical submatrix - is counted as a single term. Furthermore, matrices created from formulas - have a difference between a column and term (c.f. ``formulaic`` docs). - For terms that do not have a name, a default name is created using the - followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is - the index of the term. - - Parameters - ---------- - missing_prefix - Prefix to use for terms that do not have a name. - indices - Ignored. - - Returns - ------- - list of str - Term names. - """ - return self.mat.get_term_names(missing_prefix, indices) + return self.mat.get_names(type, missing_prefix, indices) From e0e33d1f3d738e02b9caa79e5c6fb28f3e27fb85 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 12 Jul 2023 10:53:00 +0200 Subject: [PATCH 08/19] Saner defaults --- src/tabmat/categorical_matrix.py | 15 +++++++++------ src/tabmat/dense_matrix.py | 17 ++++++++++------- src/tabmat/matrix_base.py | 11 ++++++----- src/tabmat/sparse_matrix.py | 17 ++++++++++------- src/tabmat/split_matrix.py | 11 ++++++----- src/tabmat/standardized_mat.py | 11 ++++++----- 6 files changed, 47 insertions(+), 35 deletions(-) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 41a6e1d8..d1badcd7 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -669,9 +669,9 @@ def __repr__(self): def get_names( self, type: str = "column", - missing_prefix: str = "_col_", + missing_prefix: Optional[str] = None, indices: Optional[List[int]] = None, - ) -> List[str]: + ) -> List[Optional[str]]: """Get column names. For columns that do not have a name, a default name is created using the @@ -685,15 +685,16 @@ def get_names( a categorical submatrix is counted as a single term, whereas it is counted as multiple columns. Furthermore, matrices created from formulas have a difference between a column and term (c.f. ``formulaic`` docs). - missing_prefix - Prefix to use for columns that do not have a name. + missing_prefix: Optional[str], default None + Prefix to use for columns that do not have a name. If None, then no + default name is created. indices The indices used for columns that do not have a name. If ``None``, then the indices are ``list(range(self.shape[1]))``. Returns ------- - list of str + List[Optional[str]] Column names. """ if type == "column": @@ -705,7 +706,9 @@ def get_names( if indices is None: indices = list(range(len(self.cat.categories) - self.drop_first)) - if name is None: + if name is None and missing_prefix is None: + return [None] * (len(self.cat.categories) - self.drop_first) + elif name is None: name = f"{missing_prefix}{indices[0]}-{indices[-1]}" if type == "column": diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index f93e777e..085f9d65 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -248,9 +248,9 @@ def multiply(self, other): def get_names( self, type: str = "column", - missing_prefix: str = "_col_", + missing_prefix: Optional[str] = None, indices: Optional[List[int]] = None, - ) -> List[str]: + ) -> List[Optional[str]]: """Get column names. For columns that do not have a name, a default name is created using the @@ -264,15 +264,16 @@ def get_names( a categorical submatrix is counted as a single term, whereas it is counted as multiple columns. Furthermore, matrices created from formulas have a difference between a column and term (c.f. ``formulaic`` docs). - missing_prefix - Prefix to use for columns that do not have a name. + missing_prefix: Optional[str], default None + Prefix to use for columns that do not have a name. If None, then no + default name is created. indices The indices used for columns that do not have a name. If ``None``, then the indices are ``list(range(self.shape[1]))``. Returns ------- - list of str + List[Optional[str]] Column names. """ if type == "column": @@ -285,6 +286,8 @@ def get_names( if indices is None: indices = list(range(self.shape[1])) - default_names = np.array([f"{missing_prefix}{i}" for i in indices]) - names[names == None] = default_names[names == None] # noqa: E711 + if missing_prefix is not None: + default_names = np.array([f"{missing_prefix}{i}" for i in indices]) + names[names == None] = default_names[names == None] # noqa: E711 + return list(names) diff --git a/src/tabmat/matrix_base.py b/src/tabmat/matrix_base.py index ad9a8cff..c8815452 100644 --- a/src/tabmat/matrix_base.py +++ b/src/tabmat/matrix_base.py @@ -168,9 +168,9 @@ def __getitem__(self, item): def get_names( self, type: str = "column", - missing_prefix: str = "_col_", + missing_prefix: Optional[str] = None, indices: Optional[List[int]] = None, - ) -> List[str]: + ) -> List[Optional[str]]: """Get column names. For columns that do not have a name, a default name is created using the @@ -184,15 +184,16 @@ def get_names( a categorical submatrix is counted as a single term, whereas it is counted as multiple columns. Furthermore, matrices created from formulas have a difference between a column and term (c.f. ``formulaic`` docs). - missing_prefix - Prefix to use for columns that do not have a name. + missing_prefix: Optional[str], default None + Prefix to use for columns that do not have a name. If None, then no + default name is created. indices The indices used for columns that do not have a name. If ``None``, then the indices are ``list(range(self.shape[1]))``. Returns ------- - list of str + List[Optional[str]] Column names. """ pass diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index edd69f15..4dc4421f 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -357,9 +357,9 @@ def get_column_names( def get_names( self, type: str = "column", - missing_prefix: str = "_col_", + missing_prefix: Optional[str] = None, indices: Optional[List[int]] = None, - ) -> List[str]: + ) -> List[Optional[str]]: """Get column names. For columns that do not have a name, a default name is created using the @@ -373,15 +373,16 @@ def get_names( a categorical submatrix is counted as a single term, whereas it is counted as multiple columns. Furthermore, matrices created from formulas have a difference between a column and term (c.f. ``formulaic`` docs). - missing_prefix - Prefix to use for columns that do not have a name. + missing_prefix: Optional[str], default None + Prefix to use for columns that do not have a name. If None, then no + default name is created. indices The indices used for columns that do not have a name. If ``None``, then the indices are ``list(range(self.shape[1]))``. Returns ------- - list of str + List[Optional[str]] Column names. """ if type == "column": @@ -394,6 +395,8 @@ def get_names( if indices is None: indices = list(range(self.shape[1])) - default_names = np.array([f"{missing_prefix}{i}" for i in indices]) - names[names == None] = default_names[names == None] # noqa: E711 + if missing_prefix is not None: + default_names = np.array([f"{missing_prefix}{i}" for i in indices]) + names[names == None] = default_names[names == None] # noqa: E711 + return list(names) diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py index 133b3f68..8eea2b4a 100644 --- a/src/tabmat/split_matrix.py +++ b/src/tabmat/split_matrix.py @@ -489,9 +489,9 @@ def __repr__(self): def get_names( self, type: str = "column", - missing_prefix: str = "_col_", + missing_prefix: Optional[str] = None, indices: Optional[List[int]] = None, - ) -> List[str]: + ) -> List[Optional[str]]: """Get column names. For columns that do not have a name, a default name is created using the @@ -505,15 +505,16 @@ def get_names( a categorical submatrix is counted as a single term, whereas it is counted as multiple columns. Furthermore, matrices created from formulas have a difference between a column and term (c.f. ``formulaic`` docs). - missing_prefix - Prefix to use for columns that do not have a name. + missing_prefix: Optional[str], default None + Prefix to use for columns that do not have a name. If None, then no + default name is created. indices The indices used for columns that do not have a name. If ``None``, then the indices are ``list(range(self.shape[1]))``. Returns ------- - list of str + List[Optional[str]] Column names. """ names = np.empty(self.shape[1], dtype=object) diff --git a/src/tabmat/standardized_mat.py b/src/tabmat/standardized_mat.py index 397113ed..df1304e8 100644 --- a/src/tabmat/standardized_mat.py +++ b/src/tabmat/standardized_mat.py @@ -302,9 +302,9 @@ def __repr__(self): def get_names( self, type: str = "column", - missing_prefix: str = "_col_", + missing_prefix: Optional[str] = None, indices: Optional[List[int]] = None, - ) -> List[str]: + ) -> List[Optional[str]]: """Get column names. For columns that do not have a name, a default name is created using the @@ -318,15 +318,16 @@ def get_names( a categorical submatrix is counted as a single term, whereas it is counted as multiple columns. Furthermore, matrices created from formulas have a difference between a column and term (c.f. ``formulaic`` docs). - missing_prefix - Prefix to use for columns that do not have a name. + missing_prefix: Optional[str], default None + Prefix to use for columns that do not have a name. If None, then no + default name is created. indices The indices used for columns that do not have a name. If ``None``, then the indices are ``list(range(self.shape[1]))``. Returns ------- - list of str + List[Optional[str]] Column names. """ return self.mat.get_names(type, missing_prefix, indices) From 81932ea8d5bdaa5b914964341f97c2fadc10510b Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 12 Jul 2023 11:23:15 +0200 Subject: [PATCH 09/19] Add convenient getters and setters --- src/tabmat/categorical_matrix.py | 29 +++++++++++++++++++++++++ src/tabmat/dense_matrix.py | 26 +++++++++++++++++++++++ src/tabmat/matrix_base.py | 36 ++++++++++++++++++++++++++++++++ src/tabmat/sparse_matrix.py | 26 +++++++++++++++++++++++ src/tabmat/split_matrix.py | 21 +++++++++++++++++++ src/tabmat/standardized_mat.py | 36 ++++++++++++++++++++++++++++++++ 6 files changed, 174 insertions(+) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index d1badcd7..5f5d1914 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -718,3 +718,32 @@ def get_names( ] else: return [name] * (len(self.cat.categories) - self.drop_first) + + def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"): + """Set column names. + + Parameters + ---------- + names: List[Optional[str]] + Names to set. + type: str {'column'|'term'} + Whether to set column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + """ + if isinstance(names, str): + names = [names] + + if len(names) == self.shape[1] and all(name == names[0] for name in names): + names = [names[0]] + + if len(names) != 1: + raise ValueError("A categorical matrix has only one name") + + if type == "column": + self._colname = names[0] + elif type == "term": + self._term = names[0] + else: + raise ValueError(f"Type must be 'column' or 'term', got {type}") diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 085f9d65..25371e17 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -291,3 +291,29 @@ def get_names( names[names == None] = default_names[names == None] # noqa: E711 return list(names) + + def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"): + """Set column names. + + Parameters + ---------- + names: List[Optional[str]] + Names to set. + type: str {'column'|'term'} + Whether to set column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + """ + if isinstance(names, str): + names = [names] + + if len(names) != self.shape[1]: + raise ValueError(f"Length of names must be {self.shape[1]}") + + if type == "column": + self._colnames = names + elif type == "term": + self._terms = names + else: + raise ValueError(f"Type must be 'column' or 'term', got {type}") diff --git a/src/tabmat/matrix_base.py b/src/tabmat/matrix_base.py index c8815452..ac17d717 100644 --- a/src/tabmat/matrix_base.py +++ b/src/tabmat/matrix_base.py @@ -198,6 +198,42 @@ def get_names( """ pass + def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"): + """Set column names. + + Parameters + ---------- + names: List[Optional[str]] + Names to set. + type: str {'column'|'term'} + Whether to set column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + """ + pass + + @property + def column_names(self): + """Column names of the matrix.""" + return self.get_names(type="column") + + @column_names.setter + def column_names(self, names: List[Optional[str]]): + self.set_names(names, type="column") + + @property + def term_names(self): + """Term names of the matrix. + + For differences between column names and term names, see ``get_names``. + """ + return self.get_names(type="term") + + @term_names.setter + def term_names(self, names: List[Optional[str]]): + self.set_names(names, type="term") + # Higher priority than numpy arrays, so behavior for funcs like "@" defaults to the # behavior of this class __array_priority__ = 11 diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 4dc4421f..ace52bbd 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -400,3 +400,29 @@ def get_names( names[names == None] = default_names[names == None] # noqa: E711 return list(names) + + def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"): + """Set column names. + + Parameters + ---------- + names: List[Optional[str]] + Names to set. + type: str {'column'|'term'} + Whether to set column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + """ + if isinstance(names, str): + names = [names] + + if len(names) != self.shape[1]: + raise ValueError(f"Length of names must be {self.shape[1]}") + + if type == "column": + self._colnames = names + elif type == "term": + self._terms = names + else: + raise ValueError(f"Type must be 'column' or 'term', got {type}") diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py index 8eea2b4a..a60b1fb3 100644 --- a/src/tabmat/split_matrix.py +++ b/src/tabmat/split_matrix.py @@ -521,3 +521,24 @@ def get_names( for idx, mat in zip(self.indices, self.matrices): names[idx] = mat.get_names(type, missing_prefix, idx) return list(names) + + def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"): + """Set column names. + + Parameters + ---------- + names: List[Optional[str]] + Names to set. + type: str {'column'|'term'} + Whether to set column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + """ + names_array = np.array(names) + + if len(names) != self.shape[1]: + raise ValueError(f"Length of names must be {self.shape[1]}") + + for idx, mat in zip(self.indices, self.matrices): + mat.set_names(list(names_array[idx]), type) diff --git a/src/tabmat/standardized_mat.py b/src/tabmat/standardized_mat.py index df1304e8..2e88dbb0 100644 --- a/src/tabmat/standardized_mat.py +++ b/src/tabmat/standardized_mat.py @@ -331,3 +331,39 @@ def get_names( Column names. """ return self.mat.get_names(type, missing_prefix, indices) + + def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"): + """Set column names. + + Parameters + ---------- + names: List[Optional[str]] + Names to set. + type: str {'column'|'term'} + Whether to set column names or term names. The main difference is that + a categorical submatrix is counted as a single term, whereas it is + counted as multiple columns. Furthermore, matrices created from formulas + have a difference between a column and term (c.f. ``formulaic`` docs). + """ + self.mat.set_names(names, type) + + @property + def column_names(self): + """Column names of the matrix.""" + return self.get_names(type="column") + + @column_names.setter + def column_names(self, names: List[Optional[str]]): + self.set_names(names, type="column") + + @property + def term_names(self): + """Term names of the matrix. + + For differences between column names and term names, see ``get_names``. + """ + return self.get_names(type="term") + + @term_names.setter + def term_names(self, names: List[Optional[str]]): + self.set_names(names, type="term") From 5147f8302734588f5b1398b58cc5947337dce6bb Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 12 Jul 2023 11:54:25 +0200 Subject: [PATCH 10/19] Fix indexing --- src/tabmat/constructor.py | 4 ++-- src/tabmat/dense_matrix.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index d01aa711..d280140a 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -90,8 +90,8 @@ def from_pandas( ) = _split_sparse_and_dense_parts( sps.csc_matrix(cat.tocsr(), dtype=dtype), threshold=sparse_threshold, - column_names=cat.get_names("columns"), - term_names=cat.get_names("terms"), + column_names=cat.get_names("column"), + term_names=cat.get_names("term"), ) matrices.append(X_dense_F) is_cat.append(True) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 25371e17..542c586d 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -117,7 +117,7 @@ def astype(self, dtype, order="K", casting="unsafe", copy=True): def __getitem__(self, key): """Return a subset of the matrix.""" result = super().__getitem__(key) - if len(key) == 2: + if isinstance(key, tuple) and len(key) == 2: result._colnames = list(np.array(self._colnames)[key[1]]) result._terms = list(np.array(self._terms)[key[1]]) return result From 237c9e53842453cf91b1663e899b486064c69165 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 12 Jul 2023 13:40:29 +0200 Subject: [PATCH 11/19] Smarter setter for categorical matrices --- src/tabmat/categorical_matrix.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 5f5d1914..0cdecb10 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -161,6 +161,7 @@ def matvec(mat, vec): """ +import re from typing import Any, List, Optional, Tuple, Union import numpy as np @@ -735,8 +736,23 @@ def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column" if isinstance(names, str): names = [names] - if len(names) == self.shape[1] and all(name == names[0] for name in names): - names = [names[0]] + if len(names) != 1: + if type == "column": + # Try finding the column name + base_names = [] + for name, cat in zip(names, self.cat.categories[self.drop_first :]): + partial_name = self._colname_format.format( + name="__CAPTURE__", category=cat + ) + pattern = re.escape(partial_name).replace("__CAPTURE__", "(.*)") + if (name is not None) and (match := re.search(pattern, name)): + base_names.append(match.group(1)) + else: + base_names.append(name) + names = base_names + + if len(names) == self.shape[1] and all(name == names[0] for name in names): + names = [names[0]] if len(names) != 1: raise ValueError("A categorical matrix has only one name") From 71498bfc2a7e3417c9a9bcd0aafb5b8187527c3c Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 12 Jul 2023 14:44:10 +0200 Subject: [PATCH 12/19] Add tests --- src/tabmat/dense_matrix.py | 6 +- src/tabmat/sparse_matrix.py | 31 +----- tests/test_matrices.py | 182 +++++++++++++++++++++++++++++++++++- 3 files changed, 185 insertions(+), 34 deletions(-) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 542c586d..76c43497 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -58,7 +58,7 @@ def __init__(self, input_array, column_names=None, term_names=None): raise ValueError(f"Expected {width} term names, got {len(term_names)}") self._terms = term_names else: - self._terms = obj._colnames + self._terms = self._colnames def __getitem__(self, key): if not isinstance(key, tuple): @@ -116,7 +116,7 @@ def astype(self, dtype, order="K", casting="unsafe", copy=True): def __getitem__(self, key): """Return a subset of the matrix.""" - result = super().__getitem__(key) + result = type(self)(self._array.__getitem__(key)) if isinstance(key, tuple) and len(key) == 2: result._colnames = list(np.array(self._colnames)[key[1]]) result._terms = list(np.array(self._terms)[key[1]]) @@ -284,7 +284,7 @@ def get_names( raise ValueError(f"Type must be 'column' or 'term', got {type}") if indices is None: - indices = list(range(self.shape[1])) + indices = list(range(len(self._colnames))) if missing_prefix is not None: default_names = np.array([f"{missing_prefix}{i}" for i in indices]) diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index ace52bbd..7722894d 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -325,35 +325,6 @@ def multiply(self, other): return type(self)(self._array.multiply(other[:, np.newaxis])) return type(self)(self._array.multiply(other)) - def get_column_names( - self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None - ) -> List[str]: - """Get column names. - - For columns that do not have a name, a default name is created using the - followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is - the index of the column. - - Parameters - ---------- - missing_prefix - Prefix to use for columns that do not have a name. - indices - The indices used for columns that do not have a name. If ``None``, - then the indices are ``list(range(self.shape[1]))``. - - Returns - ------- - list of str - Column names. - """ - if indices is None: - indices = list(range(self.shape[1])) - colnames = np.array(self._colnames) - default_colnames = np.array([f"{missing_prefix}{i}" for i in indices]) - colnames[colnames == None] = default_colnames[colnames == None] # noqa: E711 - return list(colnames) - def get_names( self, type: str = "column", @@ -393,7 +364,7 @@ def get_names( raise ValueError(f"Type must be 'column' or 'term', got {type}") if indices is None: - indices = list(range(self.shape[1])) + indices = list(range(len(self._colnames))) if missing_prefix is not None: default_names = np.array([f"{missing_prefix}{i}" for i in indices]) diff --git a/tests/test_matrices.py b/tests/test_matrices.py index 34f6a5bb..b192f5ba 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -621,7 +621,6 @@ def test_split_matrix_creation(mat): assert sm.shape[1] == 2 * mat.shape[1] -@pytest.mark.parametrize("mat", get_matrices()) def test_multiply(mat): other = np.arange(mat.shape[0]) expected = mat.A * other[:, np.newaxis] @@ -661,3 +660,184 @@ def test_hstack(mat_1, mat_2): stacked.A, np.hstack([mat.A if not isinstance(mat, np.ndarray) else mat for mat in mats]), ) +def test_names_against_expectation(): + X = tm.DenseMatrix( + np.ones((5, 2)), column_names=["a", None], term_names=["a", None] + ) + Xc = tm.CategoricalMatrix( + pd.Categorical(["a", "b", "c", "b", "a"]), column_name="c", term_name="c" + ) + Xc2 = tm.CategoricalMatrix(pd.Categorical(["a", "b", "c", "b", "a"])) + Xs = tm.SparseMatrix( + sps.csc_matrix(np.ones((5, 2))), + column_names=["s1", "s2"], + term_names=["s", "s"], + ) + + mat = tm.SplitMatrix(matrices=[X, Xc, Xc2, Xs]) + + assert mat.get_names(type="column") == [ + "a", + None, + "c[a]", + "c[b]", + "c[c]", + None, + None, + None, + "s1", + "s2", + ] + + assert mat.get_names(type="term") == [ + "a", + None, + "c", + "c", + "c", + None, + None, + None, + "s", + "s", + ] + + assert mat.get_names(type="column", missing_prefix="_col_") == [ + "a", + "_col_1", + "c[a]", + "c[b]", + "c[c]", + "_col_5-7[a]", + "_col_5-7[b]", + "_col_5-7[c]", + "s1", + "s2", + ] + + assert mat.get_names(type="term", missing_prefix="_col_") == [ + "a", + "_col_1", + "c", + "c", + "c", + "_col_5-7", + "_col_5-7", + "_col_5-7", + "s", + "s", + ] + + +@pytest.mark.parametrize("mat", get_matrices()) +@pytest.mark.parametrize("missing_prefix", ["_col_", "X"]) +def test_names_getter_setter(mat, missing_prefix): + names = mat.get_names(missing_prefix=missing_prefix, type="column") + mat.column_names = names + assert mat.column_names == names + + +@pytest.mark.parametrize("mat", get_matrices()) +@pytest.mark.parametrize("missing_prefix", ["_col_", "X"]) +def test_terms_getter_setter(mat, missing_prefix): + names = mat.get_names(missing_prefix=missing_prefix, type="term") + mat.term_names = names + assert mat.term_names == names + + +@pytest.mark.parametrize("indexer_1", [slice(None, None), 0, slice(2, 8)]) +@pytest.mark.parametrize("indexer_2", [[0], slice(1, 4), [0, 2, 3], [4, 3, 2, 1, 0]]) +@pytest.mark.parametrize("sparse", [True, False]) +def test_names_indexing(indexer_1, indexer_2, sparse): + X = np.ones((10, 5), dtype=np.float64) + colnames = ["a", "b", None, "d", "e"] + termnames = ["t1", "t1", None, "t4", "t5"] + + colnames_array = np.array(colnames) + termnames_array = np.array(termnames) + + if sparse: + X = tm.SparseMatrix( + sps.csc_matrix(X), column_names=colnames, term_names=termnames + ) + else: + X = tm.DenseMatrix(X, column_names=colnames, term_names=termnames) + + X_indexed = X[indexer_1, indexer_2] + if not isinstance(X_indexed, tm.MatrixBase): + pytest.skip("Does not return MatrixBase") + assert X_indexed.column_names == list(colnames_array[indexer_2]) + assert X_indexed.term_names == list(termnames_array[indexer_2]) + + +@pytest.mark.parametrize("mat_1", get_all_matrix_base_subclass_mats()) +@pytest.mark.parametrize("mat_2", get_all_matrix_base_subclass_mats()) +def test_combine_names(mat_1, mat_2): + mat_1.column_names = mat_1.get_names(missing_prefix="m1_", type="column") + mat_2.column_names = mat_2.get_names(missing_prefix="m2_", type="column") + + mat_1.term_names = mat_1.get_names(missing_prefix="m1_", type="term") + mat_2.term_names = mat_2.get_names(missing_prefix="m2_", type="term") + + combined = tm.SplitMatrix(matrices=[mat_1, mat_2]) + + assert combined.column_names == mat_1.column_names + mat_2.column_names + assert combined.term_names == mat_1.term_names + mat_2.term_names + + +@pytest.mark.parametrize("prefix_sep", ["_", ": "]) +@pytest.mark.parametrize("drop_first", [True, False]) +def test_names_pandas(prefix_sep, drop_first): + n_rows = 50 + dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64) + dense_column_with_lots_of_zeros = dense_column.copy() + dense_column_with_lots_of_zeros[:44] = 0.0 + sparse_column = np.zeros(n_rows, dtype=np.float64) + sparse_column[0] = 1.0 + cat_column_lowdim = np.tile(["a", "b"], n_rows // 2) + cat_column_highdim = np.arange(n_rows) + + dense_ser = pd.Series(dense_column) + lowdense_ser = pd.Series(dense_column_with_lots_of_zeros) + sparse_ser = pd.Series(sparse_column, dtype=pd.SparseDtype("float", 0.0)) + cat_ser_lowdim = pd.Categorical(cat_column_lowdim) + cat_ser_highdim = pd.Categorical(cat_column_highdim) + + df = pd.DataFrame( + data={ + "d": dense_ser, + "cl_obj": cat_ser_lowdim.astype(object), + "ch": cat_ser_highdim, + "ds": lowdense_ser, + "s": sparse_ser, + } + ) + + categorical_format = "{name}" + prefix_sep + "{category}" + mat_end = tm.from_pandas( + df, + dtype=np.float64, + sparse_threshold=0.3, + cat_threshold=4, + object_as_cat=True, + cat_position="end", + categorical_format=categorical_format, + drop_first=drop_first, + ) + + expanded_df = pd.get_dummies(df, prefix_sep=prefix_sep, drop_first=drop_first) + assert mat_end.column_names == expanded_df.columns.tolist() + + mat_expand = tm.from_pandas( + df, + dtype=np.float64, + sparse_threshold=0.3, + cat_threshold=4, + object_as_cat=True, + cat_position="expand", + categorical_format=categorical_format, + drop_first=drop_first, + ) + + unique_terms = list(dict.fromkeys(mat_expand.term_names)) + assert unique_terms == df.columns.tolist() From 49cdabf2fc0ca352ebafec4dd14118894fd631c2 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 12 Jul 2023 15:09:15 +0200 Subject: [PATCH 13/19] Fix subsetting with np.newaxis --- src/tabmat/dense_matrix.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 76c43497..1ca4a071 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -118,8 +118,13 @@ def __getitem__(self, key): """Return a subset of the matrix.""" result = type(self)(self._array.__getitem__(key)) if isinstance(key, tuple) and len(key) == 2: - result._colnames = list(np.array(self._colnames)[key[1]]) - result._terms = list(np.array(self._terms)[key[1]]) + if key[1] is None: + # Handle np.newaxis + result._colnames = self._colnames + result._terms = self._terms + else: + result._colnames = list(np.array(self._colnames)[key[1]]) + result._terms = list(np.array(self._terms)[key[1]]) return result def getcol(self, i): From fcc0c7369813de75472752f3b21f7296907ae8ef Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 12 Jul 2023 15:19:34 +0200 Subject: [PATCH 14/19] Remove the walrus :( --- src/tabmat/categorical_matrix.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 0cdecb10..ba4259f6 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -745,7 +745,11 @@ def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column" name="__CAPTURE__", category=cat ) pattern = re.escape(partial_name).replace("__CAPTURE__", "(.*)") - if (name is not None) and (match := re.search(pattern, name)): + if name is not None: + match = re.search(pattern, name) + else: + match = None + if match is not None: base_names.append(match.group(1)) else: base_names.append(name) From 0a14af7a9ba79e3c510b21ca8be5da233e13410d Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 12 Jul 2023 15:27:02 +0200 Subject: [PATCH 15/19] Fix test --- tests/test_matrices.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_matrices.py b/tests/test_matrices.py index b192f5ba..f8f7664e 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -621,6 +621,7 @@ def test_split_matrix_creation(mat): assert sm.shape[1] == 2 * mat.shape[1] +@pytest.mark.parametrize("mat", get_matrices()) def test_multiply(mat): other = np.arange(mat.shape[0]) expected = mat.A * other[:, np.newaxis] From db0ac7513df9cf4497e15cb5a06e04f2f0c4e586 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 27 Jul 2023 10:54:02 +0200 Subject: [PATCH 16/19] Fix indexing with np.ix_ --- src/tabmat/dense_matrix.py | 24 ++++++++++-------------- src/tabmat/sparse_matrix.py | 8 +++----- 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 1ca4a071..72fb8847 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -67,7 +67,16 @@ def __getitem__(self, key): # Always return a 2d array key = tuple([key_i] if np.isscalar(key_i) else key_i for key_i in key) - return type(self)(self._array.__getitem__(key)) + if len(key) == 2: + colnames = list(np.array(self._colnames)[key[1]].ravel()) + terms = list(np.array(self._terms)[key[1]].ravel()) + else: + colnames = self._colnames + terms = self._terms + + return type(self)( + self._array.__getitem__(key), column_names=colnames, term_names=terms + ) __array_ufunc__ = None @@ -114,19 +123,6 @@ def astype(self, dtype, order="K", casting="unsafe", copy=True): """Copy of the array, cast to a specified type.""" return type(self)(self._array.astype(dtype, order, casting, copy)) - def __getitem__(self, key): - """Return a subset of the matrix.""" - result = type(self)(self._array.__getitem__(key)) - if isinstance(key, tuple) and len(key) == 2: - if key[1] is None: - # Handle np.newaxis - result._colnames = self._colnames - result._terms = self._terms - else: - result._colnames = list(np.array(self._colnames)[key[1]]) - result._terms = list(np.array(self._terms)[key[1]]) - return result - def getcol(self, i): """Return matrix column at specified index.""" return type(self)(self._array[:, [i]]) diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 7722894d..fe8e752c 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -78,16 +78,14 @@ def __getitem__(self, key): key = tuple([key_i] if np.isscalar(key_i) else key_i for key_i in key) if len(key) == 2: - colnames = list(np.array(self._colnames)[key[1]]) - terms = list(np.array(self._terms)[key[1]]) + colnames = list(np.array(self._colnames)[key[1]].ravel()) + terms = list(np.array(self._terms)[key[1]].ravel()) else: colnames = self._colnames terms = self._terms return type(self)( - self._array.__getitem__(key), - column_names=colnames, - term_names=terms + self._array.__getitem__(key), column_names=colnames, term_names=terms ) def __matmul__(self, other): From 1bd940f4ed4339a25984afd342e07f41fff48b75 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Thu, 27 Jul 2023 13:17:13 +0200 Subject: [PATCH 17/19] Propagate column names where it makes sense --- src/tabmat/categorical_matrix.py | 16 ++++++++++++---- src/tabmat/dense_matrix.py | 24 ++++++++++++++++++++---- src/tabmat/sparse_matrix.py | 20 ++++++++++++++++---- 3 files changed, 48 insertions(+), 12 deletions(-) diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py index 8bdb4915..1180646c 100644 --- a/src/tabmat/categorical_matrix.py +++ b/src/tabmat/categorical_matrix.py @@ -477,10 +477,16 @@ def getcol(self, i: int) -> SparseMatrix: i %= self.shape[1] # wrap-around indexing if self.drop_first: - i += 1 + i_corr = i + 1 + else: + i_corr = i - col_i = sps.csc_matrix((self.indices == i).astype(int)[:, None]) - return SparseMatrix(col_i) + col_i = sps.csc_matrix((self.indices == i_corr).astype(int)[:, None]) + return SparseMatrix( + col_i, + column_names=[self.column_names[i]], + term_names=[self.term_names[i]], + ) def tocsr(self) -> sps.csr_matrix: """Return scipy csr representation of matrix.""" @@ -657,7 +663,9 @@ def multiply(self, other) -> SparseMatrix: np.arange(self.shape[0] + 1, dtype=int), ), shape=self.shape, - ) + ), + column_names=self.column_names, + term_names=self.term_names, ) def __repr__(self): diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py index 98006131..5de2c91f 100644 --- a/src/tabmat/dense_matrix.py +++ b/src/tabmat/dense_matrix.py @@ -113,11 +113,19 @@ def transpose(self): def astype(self, dtype, order="K", casting="unsafe", copy=True): """Copy of the array, cast to a specified type.""" - return type(self)(self._array.astype(dtype, order, casting, copy)) + return type(self)( + self._array.astype(dtype, order, casting, copy), + column_names=self.column_names, + term_names=self.term_names, + ) def getcol(self, i): """Return matrix column at specified index.""" - return type(self)(self._array[:, [i]]) + return type(self)( + self._array[:, [i]], + column_names=[self.column_names[i]], + term_names=[self.term_names[i]], + ) def toarray(self): """Return array representation of matrix.""" @@ -235,8 +243,16 @@ def multiply(self, other): This assumes that ``other`` is a vector of size ``self.shape[0]``. """ if np.asanyarray(other).ndim == 1: - return type(self)(self._array.__mul__(other[:, np.newaxis])) - return type(self)(self._array.__mul__(other)) + return type(self)( + self._array.__mul__(other[:, np.newaxis]), + column_names=self.column_names, + term_names=self.term_names, + ) + return type(self)( + self._array.__mul__(other), + column_names=self.column_names, + term_names=self.term_names, + ) def get_names( self, diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 39ce829f..c2be7d57 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -147,7 +147,11 @@ def transpose(self): def getcol(self, i): """Return matrix column at specified index.""" - return type(self)(self._array.getcol(i)) + return type(self)( + self._array.getcol(i), + column_names=[self.column_names[i]], + term_names=[self.term_names[i]], + ) def unpack(self): """Return the underlying scipy.sparse.csc_matrix.""" @@ -311,9 +315,17 @@ def multiply(self, other): from the parent class except that ``other`` is assumed to be a vector of size ``self.shape[0]``. """ - if other.ndim == 1: - return type(self)(self._array.multiply(other[:, np.newaxis])) - return type(self)(self._array.multiply(other)) + if np.asanyarray(other).ndim == 1: + return type(self)( + self._array.multiply(other[:, np.newaxis]), + column_names=self.column_names, + term_names=self.term_names, + ) + return type(self)( + self._array.multiply(other), + column_names=self.column_names, + term_names=self.term_names, + ) def get_names( self, From 0b133d6c1fb54c40ed7ea83e6b56ddff44945853 Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Wed, 9 Aug 2023 16:36:57 +0200 Subject: [PATCH 18/19] Fix merge mistake --- src/tabmat/sparse_matrix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py index 0754f771..1c568757 100644 --- a/src/tabmat/sparse_matrix.py +++ b/src/tabmat/sparse_matrix.py @@ -33,7 +33,7 @@ class SparseMatrix(MatrixBase): def __init__( self, - array, + input_array, shape=None, dtype=None, copy=False, From 5109466825a21cab4e713b18b0af46dadf8861dd Mon Sep 17 00:00:00 2001 From: Martin Stancsics Date: Mon, 14 Aug 2023 14:25:15 +0200 Subject: [PATCH 19/19] Add changelog entry --- CHANGELOG.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 331695b1..9de08d2f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -10,6 +10,10 @@ Changelog Unreleased ---------- +**New features:** + +- Add column name and term name metadata to ``MatrixBase`` objects. These are automatically populated when initializing a ``MatrixBase`` from a ``pandas.DataFrame``. In addition, they can be accessed and modified via the ``column_names`` and ``term_names`` properties. + **Other changes:** - Improve the performance of ``from_pandas`` in the case of low-cardinality categorical variables.