From ed016b4a5d4e979a72262397c9da38a58990f378 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@quantco.com>
Date: Tue, 11 Jul 2023 14:33:01 +0200
Subject: [PATCH 01/19] Add column name getters

---
 src/tabmat/categorical_matrix.py | 63 ++++++++++++++++++++++++++++++++
 src/tabmat/dense_matrix.py       | 62 +++++++++++++++++++++++++++++++
 src/tabmat/matrix_base.py        | 51 ++++++++++++++++++++++++++
 src/tabmat/sparse_matrix.py      | 63 ++++++++++++++++++++++++++++++++
 src/tabmat/split_matrix.py       | 63 ++++++++++++++++++++++++++++++++
 src/tabmat/standardized_mat.py   | 49 +++++++++++++++++++++++++
 6 files changed, 351 insertions(+)

diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py
index 4968c628..df878393 100644
--- a/src/tabmat/categorical_matrix.py
+++ b/src/tabmat/categorical_matrix.py
@@ -264,6 +264,9 @@ def __init__(
         self.indices = self.cat.codes.astype(np.int32)
         self.x_csc: Optional[Tuple[Optional[np.ndarray], np.ndarray, np.ndarray]] = None
         self.dtype = np.dtype(dtype)
+        self._colname = None
+        self._term = None
+        self._colname_format = "{name}[{category}]"
 
     __array_ufunc__ = None
 
@@ -655,3 +658,63 @@ def multiply(self, other) -> SparseMatrix:
 
     def __repr__(self):
         return str(self.cat)
+
+    def get_column_names(
+        self, missing_prefix: str = "_col_", start_index: int = 0
+    ) -> List[str]:
+        """Get column names.
+
+        For columns that do not have a name, a default name is created using the
+        following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
+        the index of the column.
+
+        Parameters
+        ----------
+        missing_prefix
+            Prefix to use for columns that do not have a name.
+        start_index
+            Index to start from when creating default names.
+
+        Returns
+        -------
+        list of str
+            Column names.
+        """
+        if self._colname is None:
+            colname = f"{missing_prefix}{start_index}"
+        else:
+            colname = self._colname
+        return [
+            self._colname_format.format(name=colname, category=cat)
+            for cat in self.cat.categories[self.drop_first :]
+        ]
+
+    def get_term_names(
+        self, missing_prefix: str = "_col_", start_index: int = 0
+    ) -> List[str]:
+        """Get term names.
+
+        The main difference to ``get_column_names`` is that a categorical submatrix
+        is counted as a single term. Furthermore, matrices created from formulas
+        have a difference between a column and term (c.f. ``formulaic`` docs).
+        For terms that do not have a name, a default name is created using the
+        following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
+        the index of the term.
+
+        Parameters
+        ----------
+        missing_prefix
+            Prefix to use for terms that do not have a name.
+        start_index
+            Index to start from when creating default names.
+
+        Returns
+        -------
+        list of str
+            Term names.
+        """
+        if self._term is None:
+            term = f"{missing_prefix}{start_index}"
+        else:
+            term = self._term
+        return [term] * (len(self.cat.categories) - self.drop_first)
diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py
index 1a70457f..b624832c 100644
--- a/src/tabmat/dense_matrix.py
+++ b/src/tabmat/dense_matrix.py
@@ -43,6 +43,9 @@ def __init__(self, input_array):
 
         self._array = np.asarray(input_array)
 
+        self._colnames = [None] * input_array.shape[1]
+        self._terms = [None] * input_array.shape[1]
+
     def __getitem__(self, key):
         if not isinstance(key, tuple):
             key = (key,)
@@ -219,3 +222,62 @@ def multiply(self, other):
         if np.asanyarray(other).ndim == 1:
             return type(self)(self._array.__mul__(other[:, np.newaxis]))
         return type(self)(self._array.__mul__(other))
+
+    def get_column_names(
+        self, missing_prefix: str = "_col_", start_index: int = 0
+    ) -> List[str]:
+        """Get column names.
+
+        For columns that do not have a name, a default name is created using the
+        following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
+        the index of the column.
+
+        Parameters
+        ----------
+        missing_prefix
+            Prefix to use for columns that do not have a name.
+        start_index
+            Index to start from when creating default names.
+
+        Returns
+        -------
+        list of str
+            Column names.
+        """
+        colnames = np.array(self._colnames)
+        default_colnames = np.array(
+            [f"{missing_prefix}{start_index + i}" for i in range(self.shape[1])]
+        )
+        colnames[colnames == None] = default_colnames[colnames == None]  # noqa: E711
+        return list(colnames)
+
+    def get_term_names(
+        self, missing_prefix: str = "_col_", start_index: int = 0
+    ) -> List[str]:
+        """Get term names.
+
+        The main difference to ``get_column_names`` is that a categorical submatrix
+        is counted as a single term. Furthermore, matrices created from formulas
+        have a difference between a column and term (c.f. ``formulaic`` docs).
+        For terms that do not have a name, a default name is created using the
+        following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
+        the index of the term.
+
+        Parameters
+        ----------
+        missing_prefix
+            Prefix to use for terms that do not have a name.
+        start_index
+            Index to start from when creating default names.
+
+        Returns
+        -------
+        list of str
+            Term names.
+        """
+        terms = np.array(self._terms)
+        default_terms = np.array(
+            [f"{missing_prefix}{start_index + i}" for i in range(self.shape[1])]
+        )
+        terms[terms == None] = default_terms[terms == None]  # noqa: E711
+        return list(terms)
diff --git a/src/tabmat/matrix_base.py b/src/tabmat/matrix_base.py
index 88091834..318fd4d9 100644
--- a/src/tabmat/matrix_base.py
+++ b/src/tabmat/matrix_base.py
@@ -164,6 +164,57 @@ def standardize(
     def __getitem__(self, item):
         pass
 
+    @abstractmethod
+    def get_column_names(
+        self, missing_prefix: str = "_col_", start_index: int = 0
+    ) -> List[str]:
+        """Get column names.
+
+        For columns that do not have a name, a default name is created using the
+        following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
+        the index of the column.
+
+        Parameters
+        ----------
+        missing_prefix
+            Prefix to use for columns that do not have a name.
+        start_index
+            Index to start from when creating default names.
+
+        Returns
+        -------
+        list of str
+            Column names.
+        """
+        pass
+
+    @abstractmethod
+    def get_term_names(
+        self, missing_prefix: str = "_col_", start_index: int = 0
+    ) -> List[str]:
+        """Get term names.
+
+        The main difference to ``get_column_names`` is that a categorical submatrix
+        is counted as a single term. Furthermore, matrices created from formulas
+        have a difference between a column and term (c.f. ``formulaic`` docs).
+        For terms that do not have a name, a default name is created using the
+        following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
+        the index of the term.
+
+        Parameters
+        ----------
+        missing_prefix
+            Prefix to use for terms that do not have a name.
+        start_index
+            Index to start from when creating default names.
+
+        Returns
+        -------
+        list of str
+            Term names.
+        """
+        pass
+
     # Higher priority than numpy arrays, so behavior for funcs like "@" defaults to the
     # behavior of this class
     __array_priority__ = 11
diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py
index 188f6862..7053236d 100644
--- a/src/tabmat/sparse_matrix.py
+++ b/src/tabmat/sparse_matrix.py
@@ -44,6 +44,10 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False):
             self._array.sort_indices()
         self._array_csr = None
 
+        self._colnames = [None] * self.shape[1]
+        self._terms = [None] * self.shape[1]
+
+
     def __getitem__(self, key):
         if not isinstance(key, tuple):
             key = (key,)
@@ -287,3 +291,62 @@ def multiply(self, other):
         if other.ndim == 1:
             return type(self)(self._array.multiply(other[:, np.newaxis]))
         return type(self)(self._array.multiply(other))
+
+    def get_column_names(
+        self, missing_prefix: str = "_col_", start_index: int = 0
+    ) -> List[str]:
+        """Get column names.
+
+        For columns that do not have a name, a default name is created using the
+        following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
+        the index of the column.
+
+        Parameters
+        ----------
+        missing_prefix
+            Prefix to use for columns that do not have a name.
+        start_index
+            Index to start from when creating default names.
+
+        Returns
+        -------
+        list of str
+            Column names.
+        """
+        colnames = np.array(self._colnames)
+        default_colnames = np.array(
+            [f"{missing_prefix}{start_index + i}" for i in range(self.shape[1])]
+        )
+        colnames[colnames == None] = default_colnames[colnames == None]  # noqa: E711
+        return list(colnames)
+
+    def get_term_names(
+        self, missing_prefix: str = "_col_", start_index: int = 0
+    ) -> List[str]:
+        """Get term names.
+
+        The main difference to ``get_column_names`` is that a categorical submatrix
+        is counted as a single term. Furthermore, matrices created from formulas
+        have a difference between a column and term (c.f. ``formulaic`` docs).
+        For terms that do not have a name, a default name is created using the
+        following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
+        the index of the term.
+
+        Parameters
+        ----------
+        missing_prefix
+            Prefix to use for terms that do not have a name.
+        start_index
+            Index to start from when creating default names.
+
+        Returns
+        -------
+        list of str
+            Term names.
+        """
+        terms = np.array(self._terms)
+        default_terms = np.array(
+            [f"{missing_prefix}{start_index + i}" for i in range(self.shape[1])]
+        )
+        terms[terms == None] = default_terms[terms == None]  # noqa: E711
+        return list(terms)
diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py
index a091949f..4cfe36c1 100644
--- a/src/tabmat/split_matrix.py
+++ b/src/tabmat/split_matrix.py
@@ -477,3 +477,66 @@ def __repr__(self):
         return out
 
     __array_priority__ = 13
+
+    def get_column_names(
+        self, missing_prefix: str = "_col_", start_index: int = 0
+    ) -> List[str]:
+        """Get column names.
+
+        For columns that do not have a name, a default name is created using the
+        following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
+        the index of the column.
+
+        Parameters
+        ----------
+        missing_prefix
+            Prefix to use for columns that do not have a name.
+        start_index
+            Index to start from when creating default names.
+
+        Returns
+        -------
+        list of str
+            Column names.
+        """
+        column_names = np.empty(self.shape[1], dtype=object)
+        for idx, mat in zip(self.indices, self.matrices):
+            column_names[idx] = mat.get_column_names(missing_prefix, start_index)
+            if isinstance(mat, CategoricalMatrix):
+                start_index += 1
+            else:
+                start_index += mat.shape[1]
+        return list(column_names)
+
+    def get_term_names(
+        self, missing_prefix: str = "_col_", start_index: int = 0
+    ) -> List[str]:
+        """Get term names.
+
+        The main difference to ``get_column_names`` is that a categorical submatrix
+        is counted as a single term. Furthermore, matrices created from formulas
+        have a difference between a column and term (c.f. ``formulaic`` docs).
+        For terms that do not have a name, a default name is created using the
+        following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
+        the index of the term.
+
+        Parameters
+        ----------
+        missing_prefix
+            Prefix to use for terms that do not have a name.
+        start_index
+            Index to start from when creating default names.
+
+        Returns
+        -------
+        list of str
+            Term names.
+        """
+        term_names = np.empty(self.shape[1], dtype=object)
+        for idx, mat in zip(self.indices, self.matrices):
+            term_names[idx] = mat.get_term_names(missing_prefix, start_index)
+            if isinstance(mat, CategoricalMatrix):
+                start_index += 1
+            else:
+                start_index += mat.shape[1]
+        return list(term_names)
diff --git a/src/tabmat/standardized_mat.py b/src/tabmat/standardized_mat.py
index 19b04f5a..df69b15a 100644
--- a/src/tabmat/standardized_mat.py
+++ b/src/tabmat/standardized_mat.py
@@ -298,3 +298,52 @@ def __repr__(self):
         Mult: {self.mult}
         """
         return out
+
+    def get_column_names(
+        self, missing_prefix: str = "_col_", start_index: int = 0
+    ) -> List[str]:
+        """Get column names.
+
+        For columns that do not have a name, a default name is created using the
+        following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
+        the index of the column.
+
+        Parameters
+        ----------
+        missing_prefix
+            Prefix to use for columns that do not have a name.
+        start_index
+            Index to start from when creating default names.
+
+        Returns
+        -------
+        list of str
+            Column names.
+        """
+        return self.mat.get_column_names(missing_prefix, start_index)
+
+    def get_term_names(
+        self, missing_prefix: str = "_col_", start_index: int = 0
+    ) -> List[str]:
+        """Get term names.
+
+        The main difference to ``get_column_names`` is that a categorical submatrix
+        is counted as a single term. Furthermore, matrices created from formulas
+        have a difference between a column and term (c.f. ``formulaic`` docs).
+        For terms that do not have a name, a default name is created using the
+        following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
+        the index of the term.
+
+        Parameters
+        ----------
+        missing_prefix
+            Prefix to use for terms that do not have a name.
+        start_index
+            Index to start from when creating default names.
+
+        Returns
+        -------
+        list of str
+            Term names.
+        """
+        return self.mat.get_term_names(missing_prefix, start_index)

From 0519d7af3e2eff5df43e7e93ea67df7b296098c2 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@quantco.com>
Date: Tue, 11 Jul 2023 14:48:30 +0200
Subject: [PATCH 02/19] Matrix names are also combined

---
 src/tabmat/split_matrix.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py
index 4cfe36c1..e00dc0ee 100644
--- a/src/tabmat/split_matrix.py
+++ b/src/tabmat/split_matrix.py
@@ -113,8 +113,16 @@ def _combine_matrices(matrices, indices):
         if len(this_type_matrices) > 1:
             new_matrix = mat_type_(stack_fn([matrices[i] for i in this_type_matrices]))
             new_indices = np.concatenate([indices[i] for i in this_type_matrices])
+            new_colnames = np.concatenate(
+                np.array([matrices[i]._colnames for i in this_type_matrices])
+            )
+            new_terms = np.concatenate(
+                np.array([matrices[i]._terms for i in this_type_matrices])
+            )
             sorter = np.argsort(new_indices)
             sorted_matrix = new_matrix[:, sorter]
+            sorted_matrix._colnames = list(new_colnames[sorter])
+            sorted_matrix._terms = list(new_terms[sorter])
             sorted_indices = new_indices[sorter]
 
             assert sorted_matrix.shape[0] == n_row

From 9b652aec134cfd3fa930fb465ed07c84764c5f91 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@quantco.com>
Date: Tue, 11 Jul 2023 18:15:39 +0200
Subject: [PATCH 03/19] Add names to constructors

---
 src/tabmat/categorical_matrix.py | 13 +++++--
 src/tabmat/constructor.py        | 59 +++++++++++++++++++++++++++-----
 src/tabmat/dense_matrix.py       | 20 +++++++++--
 src/tabmat/sparse_matrix.py      | 29 ++++++++++++++--
 src/tabmat/split_matrix.py       |  4 +--
 5 files changed, 106 insertions(+), 19 deletions(-)

diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py
index df878393..6d87adc7 100644
--- a/src/tabmat/categorical_matrix.py
+++ b/src/tabmat/categorical_matrix.py
@@ -250,6 +250,9 @@ def __init__(
         cat_vec: Union[List, np.ndarray, pd.Categorical],
         drop_first: bool = False,
         dtype: np.dtype = np.float64,
+        column_name: Optional[str] = None,
+        term_name: Optional[str] = None,
+        column_name_format: str = "{name}[{category}]",
     ):
         if pd.isnull(cat_vec).any():
             raise ValueError("Categorical data can't have missing values.")
@@ -264,9 +267,13 @@ def __init__(
         self.indices = self.cat.codes.astype(np.int32)
         self.x_csc: Optional[Tuple[Optional[np.ndarray], np.ndarray, np.ndarray]] = None
         self.dtype = np.dtype(dtype)
-        self._colname = None
-        self._term = None
-        self._colname_format = "{name}[{category}]"
+
+        self._colname = column_name
+        if term_name is None:
+            self._term = self._colname
+        else:
+            self._term = term_name
+        self._colname_format = column_name_format
 
     __array_ufunc__ = None
 
diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py
index f8e23c31..08b09316 100644
--- a/src/tabmat/constructor.py
+++ b/src/tabmat/constructor.py
@@ -1,5 +1,5 @@
 import warnings
-from typing import List, Tuple, Union
+from typing import List, Optional, Sequence, Tuple, Union
 
 import numpy as np
 import pandas as pd
@@ -21,6 +21,7 @@ def from_pandas(
     object_as_cat: bool = False,
     cat_position: str = "expand",
     drop_first: bool = False,
+    categorical_format: str = "{name}[{category}]",
 ) -> MatrixBase:
     """
     Transform a pandas.DataFrame into an efficient SplitMatrix. For most users, this
@@ -72,7 +73,14 @@ def from_pandas(
         if object_as_cat and coldata.dtype == object:
             coldata = coldata.astype("category")
         if isinstance(coldata.dtype, pd.CategoricalDtype):
-            cat = CategoricalMatrix(coldata, drop_first=drop_first, dtype=dtype)
+            cat = CategoricalMatrix(
+                coldata,
+                drop_first=drop_first,
+                dtype=dtype,
+                column_name=colname,
+                term_name=colname,
+                column_name_format=categorical_format,
+            )
             if len(coldata.cat.categories) < cat_threshold:
                 (
                     X_dense_F,
@@ -82,6 +90,8 @@ def from_pandas(
                 ) = _split_sparse_and_dense_parts(
                     sps.csc_matrix(cat.tocsr(), dtype=dtype),
                     threshold=sparse_threshold,
+                    column_names=cat.get_column_names(),
+                    term_names=cat.get_term_names(),
                 )
                 matrices.append(X_dense_F)
                 is_cat.append(True)
@@ -128,13 +138,26 @@ def from_pandas(
             f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype."
         )
     if len(dense_dfidx) > 0:
-        matrices.append(DenseMatrix(df.iloc[:, dense_dfidx].astype(dtype)))
+        matrices.append(
+            DenseMatrix(
+                df.iloc[:, dense_dfidx].astype(dtype),
+                column_names=df.columns[dense_dfidx],
+                term_names=df.columns[dense_dfidx],
+            )
+        )
         indices.append(dense_mxidx)
         is_cat.append(False)
     if len(sparse_dfcols) > 0:
         sparse_dict = {i: v for i, v in enumerate(sparse_dfcols)}
         full_sparse = pd.DataFrame(sparse_dict).sparse.to_coo()
-        matrices.append(SparseMatrix(full_sparse, dtype=dtype))
+        matrices.append(
+            SparseMatrix(
+                full_sparse,
+                dtype=dtype,
+                column_names=[col.name for col in sparse_dfcols],
+                term_names=[col.name for col in sparse_dfcols],
+            )
+        )
         indices.append(sparse_mxidx)
         is_cat.append(False)
 
@@ -157,7 +180,10 @@ def from_pandas(
 
 
 def _split_sparse_and_dense_parts(
-    arg1: sps.csc_matrix, threshold: float = 0.1
+    arg1: sps.csc_matrix,
+    threshold: float = 0.1,
+    column_names: Optional[Sequence[Optional[str]]] = None,
+    term_names: Optional[Sequence[Optional[str]]] = None,
 ) -> Tuple[DenseMatrix, SparseMatrix, np.ndarray, np.ndarray]:
     """
     Split matrix.
@@ -176,17 +202,34 @@ def _split_sparse_and_dense_parts(
     dense_indices = np.where(densities > threshold)[0]
     sparse_indices = np.setdiff1d(np.arange(densities.shape[0]), dense_indices)
 
-    X_dense_F = DenseMatrix(np.asfortranarray(arg1[:, dense_indices].toarray()))
-    X_sparse = SparseMatrix(arg1[:, sparse_indices])
+    if column_names is None:
+        column_names = [None] * arg1.shape[1]
+    if term_names is None:
+        term_names = column_names
+
+    X_dense_F = DenseMatrix(
+        np.asfortranarray(arg1[:, dense_indices].toarray()),
+        column_names=[column_names[i] for i in dense_indices],
+        term_names=[term_names[i] for i in dense_indices],
+    )
+    X_sparse = SparseMatrix(
+        arg1[:, sparse_indices],
+        column_names=[column_names[i] for i in sparse_indices],
+        term_names=[term_names[i] for i in sparse_indices],
+    )
     return X_dense_F, X_sparse, dense_indices, sparse_indices
 
 
-def from_csc(mat: sps.csc_matrix, threshold=0.1):
+def from_csc(mat: sps.csc_matrix, threshold=0.1, column_names=None, term_names=None):
     """
     Convert a CSC-format sparse matrix into a ``SplitMatrix``.
 
     The ``threshold`` parameter specifies the density below which a column is
     treated as sparse.
     """
+    if column_names is None:
+        column_names = [None] * mat.shape[1]
+    if term_names is None:
+        term_names = column_names
     dense, sparse, dense_idx, sparse_idx = _split_sparse_and_dense_parts(mat, threshold)
     return SplitMatrix([dense, sparse], [dense_idx, sparse_idx])
diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py
index b624832c..7b35b654 100644
--- a/src/tabmat/dense_matrix.py
+++ b/src/tabmat/dense_matrix.py
@@ -33,7 +33,7 @@ class DenseMatrix(MatrixBase):
 
     """
 
-    def __init__(self, input_array):
+    def __init__(self, input_array, column_names=None, term_names=None):
         input_array = np.asarray(input_array)
 
         if input_array.ndim == 1:
@@ -42,9 +42,23 @@ def __init__(self, input_array):
             raise ValueError("Input array must be 1- or 2-dimensional")
 
         self._array = np.asarray(input_array)
+        width = self._array.shape[1]
+
+        if column_names is not None:
+            if len(column_names) != width:
+                raise ValueError(
+                    f"Expected {width} column names, got {len(column_names)}"
+                )
+            self._colnames = column_names
+        else:
+            self._colnames = [None] * width
 
-        self._colnames = [None] * input_array.shape[1]
-        self._terms = [None] * input_array.shape[1]
+        if term_names is not None:
+            if len(term_names) != width:
+                raise ValueError(f"Expected {width} term names, got {len(term_names)}")
+            self._terms = term_names
+        else:
+            self._terms = self._colnames
 
     def __getitem__(self, key):
         if not isinstance(key, tuple):
diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py
index 7053236d..e62de092 100644
--- a/src/tabmat/sparse_matrix.py
+++ b/src/tabmat/sparse_matrix.py
@@ -30,7 +30,15 @@ class SparseMatrix(MatrixBase):
     SparseMatrix is instantiated in the same way as scipy.sparse.csc_matrix.
     """
 
-    def __init__(self, arg1, shape=None, dtype=None, copy=False):
+    def __init__(
+        self,
+        arg1,
+        shape=None,
+        dtype=None,
+        copy=False,
+        column_names=None,
+        term_names=None,
+    ):
         self._array = sps.csc_matrix(arg1, shape, dtype, copy)
 
         self.idx_dtype = max(self._array.indices.dtype, self._array.indptr.dtype)
@@ -44,8 +52,23 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False):
             self._array.sort_indices()
         self._array_csr = None
 
-        self._colnames = [None] * self.shape[1]
-        self._terms = [None] * self.shape[1]
+        if column_names is not None:
+            if len(column_names) != self.shape[1]:
+                raise ValueError(
+                    f"Expected {self.shape[1]} column names, got {len(column_names)}"
+                )
+            self._colnames = column_names
+        else:
+            self._colnames = [None] * self.shape[1]
+
+        if term_names is not None:
+            if len(term_names) != self.shape[1]:
+                raise ValueError(
+                    f"Expected {self.shape[1]} term names, got {len(term_names)}"
+                )
+            self._terms = term_names
+        else:
+            self._terms = self._colnames
 
 
     def __getitem__(self, key):
diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py
index e00dc0ee..e7669fba 100644
--- a/src/tabmat/split_matrix.py
+++ b/src/tabmat/split_matrix.py
@@ -114,10 +114,10 @@ def _combine_matrices(matrices, indices):
             new_matrix = mat_type_(stack_fn([matrices[i] for i in this_type_matrices]))
             new_indices = np.concatenate([indices[i] for i in this_type_matrices])
             new_colnames = np.concatenate(
-                np.array([matrices[i]._colnames for i in this_type_matrices])
+                [np.array(matrices[i]._colnames) for i in this_type_matrices]
             )
             new_terms = np.concatenate(
-                np.array([matrices[i]._terms for i in this_type_matrices])
+                [np.array(matrices[i]._terms) for i in this_type_matrices]
             )
             sorter = np.argsort(new_indices)
             sorted_matrix = new_matrix[:, sorter]

From 96c2e5658eb6f478fc543d2546f34b59d2d901ef Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@quantco.com>
Date: Tue, 11 Jul 2023 18:23:35 +0200
Subject: [PATCH 04/19] Add indexing support for column names

---
 src/tabmat/dense_matrix.py  |  8 ++++++++
 src/tabmat/sparse_matrix.py | 14 ++++++++++++--
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py
index 7b35b654..42ae5b22 100644
--- a/src/tabmat/dense_matrix.py
+++ b/src/tabmat/dense_matrix.py
@@ -114,6 +114,14 @@ def astype(self, dtype, order="K", casting="unsafe", copy=True):
         """Copy of the array, cast to a specified type."""
         return type(self)(self._array.astype(dtype, order, casting, copy))
 
+    def __getitem__(self, key):
+        """Return a subset of the matrix."""
+        result = super().__getitem__(key)
+        if len(key) == 2:
+            result._colnames = list(np.array(self._colnames)[key[1]])
+            result._terms = list(np.array(self._terms)[key[1]])
+        return result
+
     def getcol(self, i):
         """Return matrix column at specified index."""
         return type(self)(self._array[:, [i]])
diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py
index e62de092..af25eaad 100644
--- a/src/tabmat/sparse_matrix.py
+++ b/src/tabmat/sparse_matrix.py
@@ -70,7 +70,6 @@ def __init__(
         else:
             self._terms = self._colnames
 
-
     def __getitem__(self, key):
         if not isinstance(key, tuple):
             key = (key,)
@@ -78,7 +77,18 @@ def __getitem__(self, key):
         # Always return a 2d array
         key = tuple([key_i] if np.isscalar(key_i) else key_i for key_i in key)
 
-        return type(self)(self._array.__getitem__(key))
+        if len(key) == 2:
+            colnames = list(np.array(self._colnames)[key[1]])
+            terms = list(np.array(self._terms)[key[1]])
+        else:
+            colnames = self._colnames
+            terms = self._terms
+
+        return type(self)(
+            self._array.__getitem__(key),
+            column_names=colnames,
+            term_names=terms
+        )
 
     def __matmul__(self, other):
         return self._array.__matmul__(other)

From 7ba73b2399977476615de83010cf3e88b5144887 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@quantco.com>
Date: Wed, 12 Jul 2023 08:57:27 +0200
Subject: [PATCH 05/19] Remove unnecessary code

---
 src/tabmat/constructor.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py
index 08b09316..ed274377 100644
--- a/src/tabmat/constructor.py
+++ b/src/tabmat/constructor.py
@@ -227,9 +227,5 @@ def from_csc(mat: sps.csc_matrix, threshold=0.1, column_names=None, term_names=N
     The ``threshold`` parameter specifies the density below which a column is
     treated as sparse.
     """
-    if column_names is None:
-        column_names = [None] * mat.shape[1]
-    if term_names is None:
-        term_names = column_names
     dense, sparse, dense_idx, sparse_idx = _split_sparse_and_dense_parts(mat, threshold)
     return SplitMatrix([dense, sparse], [dense_idx, sparse_idx])

From cd1ca1e33609f536d103d4ee76e307c164e929a4 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@quantco.com>
Date: Wed, 12 Jul 2023 10:12:02 +0200
Subject: [PATCH 06/19] Better default column names

---
 src/tabmat/categorical_matrix.py | 22 ++++++++++++++--------
 src/tabmat/dense_matrix.py       | 26 ++++++++++++++------------
 src/tabmat/matrix_base.py        | 14 ++++++++------
 src/tabmat/sparse_matrix.py      | 26 ++++++++++++++------------
 src/tabmat/split_matrix.py       | 24 ++++++++----------------
 src/tabmat/standardized_mat.py   | 18 +++++++++---------
 6 files changed, 67 insertions(+), 63 deletions(-)

diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py
index 6d87adc7..31c8282d 100644
--- a/src/tabmat/categorical_matrix.py
+++ b/src/tabmat/categorical_matrix.py
@@ -667,7 +667,7 @@ def __repr__(self):
         return str(self.cat)
 
     def get_column_names(
-        self, missing_prefix: str = "_col_", start_index: int = 0
+        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
     ) -> List[str]:
         """Get column names.
 
@@ -679,16 +679,19 @@ def get_column_names(
         ----------
         missing_prefix
             Prefix to use for columns that do not have a name.
-        start_index
-            Index to start from when creating default names.
+        indices
+            The indices used for columns that do not have a name. If ``None``,
+            then the indices are ``list(range(self.shape[1]))``.
 
         Returns
         -------
         list of str
             Column names.
         """
+        if indices is None:
+            indices = list(range(len(self.cat.categories) - self.drop_first))
         if self._colname is None:
-            colname = f"{missing_prefix}{start_index}"
+            colname = f"{missing_prefix}{indices[0]}-{indices[-1]}"
         else:
             colname = self._colname
         return [
@@ -697,7 +700,7 @@ def get_column_names(
         ]
 
     def get_term_names(
-        self, missing_prefix: str = "_col_", start_index: int = 0
+        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
     ) -> List[str]:
         """Get term names.
 
@@ -712,16 +715,19 @@ def get_term_names(
         ----------
         missing_prefix
             Prefix to use for terms that do not have a name.
-        start_index
-            Index to start from when creating default names.
+        indices
+            The indices used for columns that do not have a name. If ``None``,
+            then the indices are ``list(range(self.shape[1]))``.
 
         Returns
         -------
         list of str
             Term names.
         """
+        if indices is None:
+            indices = list(range(len(self.cat.categories) - self.drop_first))
         if self._term is None:
-            term = f"{missing_prefix}{start_index}"
+            term = f"{missing_prefix}{indices[0]}-{indices[-1]}"
         else:
             term = self._term
         return [term] * (len(self.cat.categories) - self.drop_first)
diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py
index 42ae5b22..c4fe345c 100644
--- a/src/tabmat/dense_matrix.py
+++ b/src/tabmat/dense_matrix.py
@@ -246,7 +246,7 @@ def multiply(self, other):
         return type(self)(self._array.__mul__(other))
 
     def get_column_names(
-        self, missing_prefix: str = "_col_", start_index: int = 0
+        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
     ) -> List[str]:
         """Get column names.
 
@@ -258,23 +258,24 @@ def get_column_names(
         ----------
         missing_prefix
             Prefix to use for columns that do not have a name.
-        start_index
-            Index to start from when creating default names.
+        indices
+            The indices used for columns that do not have a name. If ``None``,
+            then the indices are ``list(range(self.shape[1]))``.
 
         Returns
         -------
         list of str
             Column names.
         """
+        if indices is None:
+            indices = list(range(self.shape[1]))
         colnames = np.array(self._colnames)
-        default_colnames = np.array(
-            [f"{missing_prefix}{start_index + i}" for i in range(self.shape[1])]
-        )
+        default_colnames = np.array([f"{missing_prefix}{i}" for i in indices])
         colnames[colnames == None] = default_colnames[colnames == None]  # noqa: E711
         return list(colnames)
 
     def get_term_names(
-        self, missing_prefix: str = "_col_", start_index: int = 0
+        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
     ) -> List[str]:
         """Get term names.
 
@@ -289,17 +290,18 @@ def get_term_names(
         ----------
         missing_prefix
             Prefix to use for terms that do not have a name.
-        start_index
-            Index to start from when creating default names.
+        indices
+            The indices used for columns that do not have a name. If ``None``,
+            then the indices are ``list(range(self.shape[1]))``.
 
         Returns
         -------
         list of str
             Term names.
         """
+        if indices is None:
+            indices = list(range(self.shape[1]))
         terms = np.array(self._terms)
-        default_terms = np.array(
-            [f"{missing_prefix}{start_index + i}" for i in range(self.shape[1])]
-        )
+        default_terms = np.array([f"{missing_prefix}{i}" for i in indices])
         terms[terms == None] = default_terms[terms == None]  # noqa: E711
         return list(terms)
diff --git a/src/tabmat/matrix_base.py b/src/tabmat/matrix_base.py
index 318fd4d9..393196b0 100644
--- a/src/tabmat/matrix_base.py
+++ b/src/tabmat/matrix_base.py
@@ -166,7 +166,7 @@ def __getitem__(self, item):
 
     @abstractmethod
     def get_column_names(
-        self, missing_prefix: str = "_col_", start_index: int = 0
+        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
     ) -> List[str]:
         """Get column names.
 
@@ -178,8 +178,9 @@ def get_column_names(
         ----------
         missing_prefix
             Prefix to use for columns that do not have a name.
-        start_index
-            Index to start from when creating default names.
+        indices
+            The indices used for columns that do not have a name. If ``None``,
+            then the indices are ``list(range(self.shape[1]))``.
 
         Returns
         -------
@@ -190,7 +191,7 @@ def get_column_names(
 
     @abstractmethod
     def get_term_names(
-        self, missing_prefix: str = "_col_", start_index: int = 0
+        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
     ) -> List[str]:
         """Get term names.
 
@@ -205,8 +206,9 @@ def get_term_names(
         ----------
         missing_prefix
             Prefix to use for terms that do not have a name.
-        start_index
-            Index to start from when creating default names.
+        indices
+            The indices used for columns that do not have a name. If ``None``,
+            then the indices are ``list(range(self.shape[1]))``.
 
         Returns
         -------
diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py
index af25eaad..18e554f3 100644
--- a/src/tabmat/sparse_matrix.py
+++ b/src/tabmat/sparse_matrix.py
@@ -326,7 +326,7 @@ def multiply(self, other):
         return type(self)(self._array.multiply(other))
 
     def get_column_names(
-        self, missing_prefix: str = "_col_", start_index: int = 0
+        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
     ) -> List[str]:
         """Get column names.
 
@@ -338,23 +338,24 @@ def get_column_names(
         ----------
         missing_prefix
             Prefix to use for columns that do not have a name.
-        start_index
-            Index to start from when creating default names.
+        indices
+            The indices used for columns that do not have a name. If ``None``,
+            then the indices are ``list(range(self.shape[1]))``.
 
         Returns
         -------
         list of str
             Column names.
         """
+        if indices is None:
+            indices = list(range(self.shape[1]))
         colnames = np.array(self._colnames)
-        default_colnames = np.array(
-            [f"{missing_prefix}{start_index + i}" for i in range(self.shape[1])]
-        )
+        default_colnames = np.array([f"{missing_prefix}{i}" for i in indices])
         colnames[colnames == None] = default_colnames[colnames == None]  # noqa: E711
         return list(colnames)
 
     def get_term_names(
-        self, missing_prefix: str = "_col_", start_index: int = 0
+        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
     ) -> List[str]:
         """Get term names.
 
@@ -369,17 +370,18 @@ def get_term_names(
         ----------
         missing_prefix
             Prefix to use for terms that do not have a name.
-        start_index
-            Index to start from when creating default names.
+        indices
+            The indices used for columns that do not have a name. If ``None``,
+            then the indices are ``list(range(self.shape[1]))``.
 
         Returns
         -------
         list of str
             Term names.
         """
+        if indices is None:
+            indices = list(range(self.shape[1]))
         terms = np.array(self._terms)
-        default_terms = np.array(
-            [f"{missing_prefix}{start_index + i}" for i in range(self.shape[1])]
-        )
+        default_terms = np.array([f"{missing_prefix}{i}" for i in indices])
         terms[terms == None] = default_terms[terms == None]  # noqa: E711
         return list(terms)
diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py
index e7669fba..ec2f34cc 100644
--- a/src/tabmat/split_matrix.py
+++ b/src/tabmat/split_matrix.py
@@ -487,7 +487,7 @@ def __repr__(self):
     __array_priority__ = 13
 
     def get_column_names(
-        self, missing_prefix: str = "_col_", start_index: int = 0
+        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
     ) -> List[str]:
         """Get column names.
 
@@ -499,8 +499,8 @@ def get_column_names(
         ----------
         missing_prefix
             Prefix to use for columns that do not have a name.
-        start_index
-            Index to start from when creating default names.
+        indices
+            Ignored.
 
         Returns
         -------
@@ -509,15 +509,11 @@ def get_column_names(
         """
         column_names = np.empty(self.shape[1], dtype=object)
         for idx, mat in zip(self.indices, self.matrices):
-            column_names[idx] = mat.get_column_names(missing_prefix, start_index)
-            if isinstance(mat, CategoricalMatrix):
-                start_index += 1
-            else:
-                start_index += mat.shape[1]
+            column_names[idx] = mat.get_column_names(missing_prefix, idx)
         return list(column_names)
 
     def get_term_names(
-        self, missing_prefix: str = "_col_", start_index: int = 0
+        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
     ) -> List[str]:
         """Get term names.
 
@@ -532,8 +528,8 @@ def get_term_names(
         ----------
         missing_prefix
             Prefix to use for terms that do not have a name.
-        start_index
-            Index to start from when creating default names.
+        indices
+            Ignored.
 
         Returns
         -------
@@ -542,9 +538,5 @@ def get_term_names(
         """
         term_names = np.empty(self.shape[1], dtype=object)
         for idx, mat in zip(self.indices, self.matrices):
-            term_names[idx] = mat.get_term_names(missing_prefix, start_index)
-            if isinstance(mat, CategoricalMatrix):
-                start_index += 1
-            else:
-                start_index += mat.shape[1]
+            term_names[idx] = mat.get_term_names(missing_prefix, idx)
         return list(term_names)
diff --git a/src/tabmat/standardized_mat.py b/src/tabmat/standardized_mat.py
index df69b15a..8a8099c7 100644
--- a/src/tabmat/standardized_mat.py
+++ b/src/tabmat/standardized_mat.py
@@ -1,4 +1,4 @@
-from typing import List, Union
+from typing import List, Optional, Union
 
 import numpy as np
 from scipy import sparse as sps
@@ -300,7 +300,7 @@ def __repr__(self):
         return out
 
     def get_column_names(
-        self, missing_prefix: str = "_col_", start_index: int = 0
+        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
     ) -> List[str]:
         """Get column names.
 
@@ -312,18 +312,18 @@ def get_column_names(
         ----------
         missing_prefix
             Prefix to use for columns that do not have a name.
-        start_index
-            Index to start from when creating default names.
+        indices
+            Ignored.
 
         Returns
         -------
         list of str
             Column names.
         """
-        return self.mat.get_column_names(missing_prefix, start_index)
+        return self.mat.get_column_names(missing_prefix, indices)
 
     def get_term_names(
-        self, missing_prefix: str = "_col_", start_index: int = 0
+        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
     ) -> List[str]:
         """Get term names.
 
@@ -338,12 +338,12 @@ def get_term_names(
         ----------
         missing_prefix
             Prefix to use for terms that do not have a name.
-        start_index
-            Index to start from when creating default names.
+        indices
+            Ignored.
 
         Returns
         -------
         list of str
             Term names.
         """
-        return self.mat.get_term_names(missing_prefix, start_index)
+        return self.mat.get_term_names(missing_prefix, indices)

From f6549160ce2b4e9f704b0c4e9e0ee82c8f98f2a8 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@quantco.com>
Date: Wed, 12 Jul 2023 10:36:15 +0200
Subject: [PATCH 07/19] Reduce code duplication

---
 src/tabmat/categorical_matrix.py | 64 ++++++++++++--------------------
 src/tabmat/constructor.py        |  4 +-
 src/tabmat/dense_matrix.py       | 57 ++++++++++------------------
 src/tabmat/matrix_base.py        | 40 +++++---------------
 src/tabmat/sparse_matrix.py      | 40 +++++++++++++-------
 src/tabmat/split_matrix.py       | 50 ++++++++-----------------
 src/tabmat/standardized_mat.py   | 43 +++++++--------------
 7 files changed, 110 insertions(+), 188 deletions(-)

diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py
index 31c8282d..41a6e1d8 100644
--- a/src/tabmat/categorical_matrix.py
+++ b/src/tabmat/categorical_matrix.py
@@ -666,8 +666,11 @@ def multiply(self, other) -> SparseMatrix:
     def __repr__(self):
         return str(self.cat)
 
-    def get_column_names(
-        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
+    def get_names(
+        self,
+        type: str = "column",
+        missing_prefix: str = "_col_",
+        indices: Optional[List[int]] = None,
     ) -> List[str]:
         """Get column names.
 
@@ -677,6 +680,11 @@ def get_column_names(
 
         Parameters
         ----------
+        type: str {'column'|'term'}
+            Whether to get column names or term names. The main difference is that
+            a categorical submatrix is counted as a single term, whereas it is
+            counted as multiple columns. Furthermore, matrices created from formulas
+            have a difference between a column and term (c.f. ``formulaic`` docs).
         missing_prefix
             Prefix to use for columns that do not have a name.
         indices
@@ -688,46 +696,22 @@ def get_column_names(
         list of str
             Column names.
         """
-        if indices is None:
-            indices = list(range(len(self.cat.categories) - self.drop_first))
-        if self._colname is None:
-            colname = f"{missing_prefix}{indices[0]}-{indices[-1]}"
+        if type == "column":
+            name = self._colname
+        elif type == "term":
+            name = self._term
         else:
-            colname = self._colname
-        return [
-            self._colname_format.format(name=colname, category=cat)
-            for cat in self.cat.categories[self.drop_first :]
-        ]
-
-    def get_term_names(
-        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
-    ) -> List[str]:
-        """Get term names.
+            raise ValueError(f"Type must be 'column' or 'term', got {type}")
 
-        The main difference to ``get_column_names`` is that a categorical submatrix
-        is counted as a single term. Furthermore, matrices created from formulas
-        have a difference between a column and term (c.f. ``formulaic`` docs).
-        For terms that do not have a name, a default name is created using the
-        followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
-        the index of the term.
-
-        Parameters
-        ----------
-        missing_prefix
-            Prefix to use for terms that do not have a name.
-        indices
-            The indices used for columns that do not have a name. If ``None``,
-            then the indices are ``list(range(self.shape[1]))``.
-
-        Returns
-        -------
-        list of str
-            Term names.
-        """
         if indices is None:
             indices = list(range(len(self.cat.categories) - self.drop_first))
-        if self._term is None:
-            term = f"{missing_prefix}{indices[0]}-{indices[-1]}"
+        if name is None:
+            name = f"{missing_prefix}{indices[0]}-{indices[-1]}"
+
+        if type == "column":
+            return [
+                self._colname_format.format(name=name, category=cat)
+                for cat in self.cat.categories[self.drop_first :]
+            ]
         else:
-            term = self._term
-        return [term] * (len(self.cat.categories) - self.drop_first)
+            return [name] * (len(self.cat.categories) - self.drop_first)
diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py
index ed274377..d01aa711 100644
--- a/src/tabmat/constructor.py
+++ b/src/tabmat/constructor.py
@@ -90,8 +90,8 @@ def from_pandas(
                 ) = _split_sparse_and_dense_parts(
                     sps.csc_matrix(cat.tocsr(), dtype=dtype),
                     threshold=sparse_threshold,
-                    column_names=cat.get_column_names(),
-                    term_names=cat.get_term_names(),
+                    column_names=cat.get_names("column"),
+                    term_names=cat.get_names("term"),
                 )
                 matrices.append(X_dense_F)
                 is_cat.append(True)
diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py
index c4fe345c..f93e777e 100644
--- a/src/tabmat/dense_matrix.py
+++ b/src/tabmat/dense_matrix.py
@@ -245,8 +245,11 @@ def multiply(self, other):
             return type(self)(self._array.__mul__(other[:, np.newaxis]))
         return type(self)(self._array.__mul__(other))
 
-    def get_column_names(
-        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
+    def get_names(
+        self,
+        type: str = "column",
+        missing_prefix: str = "_col_",
+        indices: Optional[List[int]] = None,
     ) -> List[str]:
         """Get column names.
 
@@ -256,6 +259,11 @@ def get_column_names(
 
         Parameters
         ----------
+        type: str {'column'|'term'}
+            Whether to get column names or term names. The main difference is that
+            a categorical submatrix is counted as a single term, whereas it is
+            counted as multiple columns. Furthermore, matrices created from formulas
+            have a difference between a column and term (c.f. ``formulaic`` docs).
         missing_prefix
             Prefix to use for columns that do not have a name.
         indices
@@ -267,41 +275,16 @@ def get_column_names(
         list of str
             Column names.
         """
-        if indices is None:
-            indices = list(range(self.shape[1]))
-        colnames = np.array(self._colnames)
-        default_colnames = np.array([f"{missing_prefix}{i}" for i in indices])
-        colnames[colnames == None] = default_colnames[colnames == None]  # noqa: E711
-        return list(colnames)
-
-    def get_term_names(
-        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
-    ) -> List[str]:
-        """Get term names.
-
-        The main difference to ``get_column_names`` is that a categorical submatrix
-        is counted as a single term. Furthermore, matrices created from formulas
-        have a difference between a column and term (c.f. ``formulaic`` docs).
-        For terms that do not have a name, a default name is created using the
-        followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
-        the index of the term.
-
-        Parameters
-        ----------
-        missing_prefix
-            Prefix to use for terms that do not have a name.
-        indices
-            The indices used for columns that do not have a name. If ``None``,
-            then the indices are ``list(range(self.shape[1]))``.
+        if type == "column":
+            names = np.array(self._colnames)
+        elif type == "term":
+            names = np.array(self._terms)
+        else:
+            raise ValueError(f"Type must be 'column' or 'term', got {type}")
 
-        Returns
-        -------
-        list of str
-            Term names.
-        """
         if indices is None:
             indices = list(range(self.shape[1]))
-        terms = np.array(self._terms)
-        default_terms = np.array([f"{missing_prefix}{i}" for i in indices])
-        terms[terms == None] = default_terms[terms == None]  # noqa: E711
-        return list(terms)
+
+        default_names = np.array([f"{missing_prefix}{i}" for i in indices])
+        names[names == None] = default_names[names == None]  # noqa: E711
+        return list(names)
diff --git a/src/tabmat/matrix_base.py b/src/tabmat/matrix_base.py
index 393196b0..ad9a8cff 100644
--- a/src/tabmat/matrix_base.py
+++ b/src/tabmat/matrix_base.py
@@ -165,8 +165,11 @@ def __getitem__(self, item):
         pass
 
     @abstractmethod
-    def get_column_names(
-        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
+    def get_names(
+        self,
+        type: str = "column",
+        missing_prefix: str = "_col_",
+        indices: Optional[List[int]] = None,
     ) -> List[str]:
         """Get column names.
 
@@ -176,6 +179,11 @@ def get_column_names(
 
         Parameters
         ----------
+        type: str {'column'|'term'}
+            Whether to get column names or term names. The main difference is that
+            a categorical submatrix is counted as a single term, whereas it is
+            counted as multiple columns. Furthermore, matrices created from formulas
+            have a difference between a column and term (c.f. ``formulaic`` docs).
         missing_prefix
             Prefix to use for columns that do not have a name.
         indices
@@ -189,34 +197,6 @@ def get_column_names(
         """
         pass
 
-    @abstractmethod
-    def get_term_names(
-        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
-    ) -> List[str]:
-        """Get term names.
-
-        The main difference to ``get_column_names`` is that a categorical submatrix
-        is counted as a single term. Furthermore, matrices created from formulas
-        have a difference between a column and term (c.f. ``formulaic`` docs).
-        For terms that do not have a name, a default name is created using the
-        followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
-        the index of the term.
-
-        Parameters
-        ----------
-        missing_prefix
-            Prefix to use for terms that do not have a name.
-        indices
-            The indices used for columns that do not have a name. If ``None``,
-            then the indices are ``list(range(self.shape[1]))``.
-
-        Returns
-        -------
-        list of str
-            Term names.
-        """
-        pass
-
     # Higher priority than numpy arrays, so behavior for funcs like "@" defaults to the
     # behavior of this class
     __array_priority__ = 11
diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py
index 18e554f3..edd69f15 100644
--- a/src/tabmat/sparse_matrix.py
+++ b/src/tabmat/sparse_matrix.py
@@ -354,22 +354,27 @@ def get_column_names(
         colnames[colnames == None] = default_colnames[colnames == None]  # noqa: E711
         return list(colnames)
 
-    def get_term_names(
-        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
+    def get_names(
+        self,
+        type: str = "column",
+        missing_prefix: str = "_col_",
+        indices: Optional[List[int]] = None,
     ) -> List[str]:
-        """Get term names.
+        """Get column names.
 
-        The main difference to ``get_column_names`` is that a categorical submatrix
-        is counted as a single term. Furthermore, matrices created from formulas
-        have a difference between a column and term (c.f. ``formulaic`` docs).
-        For terms that do not have a name, a default name is created using the
+        For columns that do not have a name, a default name is created using the
         followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
-        the index of the term.
+        the index of the column.
 
         Parameters
         ----------
+        type: str {'column'|'term'}
+            Whether to get column names or term names. The main difference is that
+            a categorical submatrix is counted as a single term, whereas it is
+            counted as multiple columns. Furthermore, matrices created from formulas
+            have a difference between a column and term (c.f. ``formulaic`` docs).
         missing_prefix
-            Prefix to use for terms that do not have a name.
+            Prefix to use for columns that do not have a name.
         indices
             The indices used for columns that do not have a name. If ``None``,
             then the indices are ``list(range(self.shape[1]))``.
@@ -377,11 +382,18 @@ def get_term_names(
         Returns
         -------
         list of str
-            Term names.
+            Column names.
         """
+        if type == "column":
+            names = np.array(self._colnames)
+        elif type == "term":
+            names = np.array(self._terms)
+        else:
+            raise ValueError(f"Type must be 'column' or 'term', got {type}")
+
         if indices is None:
             indices = list(range(self.shape[1]))
-        terms = np.array(self._terms)
-        default_terms = np.array([f"{missing_prefix}{i}" for i in indices])
-        terms[terms == None] = default_terms[terms == None]  # noqa: E711
-        return list(terms)
+
+        default_names = np.array([f"{missing_prefix}{i}" for i in indices])
+        names[names == None] = default_names[names == None]  # noqa: E711
+        return list(names)
diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py
index ec2f34cc..133b3f68 100644
--- a/src/tabmat/split_matrix.py
+++ b/src/tabmat/split_matrix.py
@@ -486,8 +486,11 @@ def __repr__(self):
 
     __array_priority__ = 13
 
-    def get_column_names(
-        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
+    def get_names(
+        self,
+        type: str = "column",
+        missing_prefix: str = "_col_",
+        indices: Optional[List[int]] = None,
     ) -> List[str]:
         """Get column names.
 
@@ -497,46 +500,23 @@ def get_column_names(
 
         Parameters
         ----------
+        type: str {'column'|'term'}
+            Whether to get column names or term names. The main difference is that
+            a categorical submatrix is counted as a single term, whereas it is
+            counted as multiple columns. Furthermore, matrices created from formulas
+            have a difference between a column and term (c.f. ``formulaic`` docs).
         missing_prefix
             Prefix to use for columns that do not have a name.
         indices
-            Ignored.
+            The indices used for columns that do not have a name. If ``None``,
+            then the indices are ``list(range(self.shape[1]))``.
 
         Returns
         -------
         list of str
             Column names.
         """
-        column_names = np.empty(self.shape[1], dtype=object)
-        for idx, mat in zip(self.indices, self.matrices):
-            column_names[idx] = mat.get_column_names(missing_prefix, idx)
-        return list(column_names)
-
-    def get_term_names(
-        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
-    ) -> List[str]:
-        """Get term names.
-
-        The main difference to ``get_column_names`` is that a categorical submatrix
-        is counted as a single term. Furthermore, matrices created from formulas
-        have a difference between a column and term (c.f. ``formulaic`` docs).
-        For terms that do not have a name, a default name is created using the
-        followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
-        the index of the term.
-
-        Parameters
-        ----------
-        missing_prefix
-            Prefix to use for terms that do not have a name.
-        indices
-            Ignored.
-
-        Returns
-        -------
-        list of str
-            Term names.
-        """
-        term_names = np.empty(self.shape[1], dtype=object)
+        names = np.empty(self.shape[1], dtype=object)
         for idx, mat in zip(self.indices, self.matrices):
-            term_names[idx] = mat.get_term_names(missing_prefix, idx)
-        return list(term_names)
+            names[idx] = mat.get_names(type, missing_prefix, idx)
+        return list(names)
diff --git a/src/tabmat/standardized_mat.py b/src/tabmat/standardized_mat.py
index 8a8099c7..397113ed 100644
--- a/src/tabmat/standardized_mat.py
+++ b/src/tabmat/standardized_mat.py
@@ -299,8 +299,11 @@ def __repr__(self):
         """
         return out
 
-    def get_column_names(
-        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
+    def get_names(
+        self,
+        type: str = "column",
+        missing_prefix: str = "_col_",
+        indices: Optional[List[int]] = None,
     ) -> List[str]:
         """Get column names.
 
@@ -310,40 +313,20 @@ def get_column_names(
 
         Parameters
         ----------
+        type: str {'column'|'term'}
+            Whether to get column names or term names. The main difference is that
+            a categorical submatrix is counted as a single term, whereas it is
+            counted as multiple columns. Furthermore, matrices created from formulas
+            have a difference between a column and term (c.f. ``formulaic`` docs).
         missing_prefix
             Prefix to use for columns that do not have a name.
         indices
-            Ignored.
+            The indices used for columns that do not have a name. If ``None``,
+            then the indices are ``list(range(self.shape[1]))``.
 
         Returns
         -------
         list of str
             Column names.
         """
-        return self.mat.get_column_names(missing_prefix, indices)
-
-    def get_term_names(
-        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
-    ) -> List[str]:
-        """Get term names.
-
-        The main difference to ``get_column_names`` is that a categorical submatrix
-        is counted as a single term. Furthermore, matrices created from formulas
-        have a difference between a column and term (c.f. ``formulaic`` docs).
-        For terms that do not have a name, a default name is created using the
-        followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
-        the index of the term.
-
-        Parameters
-        ----------
-        missing_prefix
-            Prefix to use for terms that do not have a name.
-        indices
-            Ignored.
-
-        Returns
-        -------
-        list of str
-            Term names.
-        """
-        return self.mat.get_term_names(missing_prefix, indices)
+        return self.mat.get_names(type, missing_prefix, indices)

From e0e33d1f3d738e02b9caa79e5c6fb28f3e27fb85 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@quantco.com>
Date: Wed, 12 Jul 2023 10:53:00 +0200
Subject: [PATCH 08/19] Saner defaults

---
 src/tabmat/categorical_matrix.py | 15 +++++++++------
 src/tabmat/dense_matrix.py       | 17 ++++++++++-------
 src/tabmat/matrix_base.py        | 11 ++++++-----
 src/tabmat/sparse_matrix.py      | 17 ++++++++++-------
 src/tabmat/split_matrix.py       | 11 ++++++-----
 src/tabmat/standardized_mat.py   | 11 ++++++-----
 6 files changed, 47 insertions(+), 35 deletions(-)

diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py
index 41a6e1d8..d1badcd7 100644
--- a/src/tabmat/categorical_matrix.py
+++ b/src/tabmat/categorical_matrix.py
@@ -669,9 +669,9 @@ def __repr__(self):
     def get_names(
         self,
         type: str = "column",
-        missing_prefix: str = "_col_",
+        missing_prefix: Optional[str] = None,
         indices: Optional[List[int]] = None,
-    ) -> List[str]:
+    ) -> List[Optional[str]]:
         """Get column names.
 
         For columns that do not have a name, a default name is created using the
@@ -685,15 +685,16 @@ def get_names(
             a categorical submatrix is counted as a single term, whereas it is
             counted as multiple columns. Furthermore, matrices created from formulas
             have a difference between a column and term (c.f. ``formulaic`` docs).
-        missing_prefix
-            Prefix to use for columns that do not have a name.
+        missing_prefix: Optional[str], default None
+            Prefix to use for columns that do not have a name. If None, then no
+            default name is created.
         indices
             The indices used for columns that do not have a name. If ``None``,
             then the indices are ``list(range(self.shape[1]))``.
 
         Returns
         -------
-        list of str
+        List[Optional[str]]
             Column names.
         """
         if type == "column":
@@ -705,7 +706,9 @@ def get_names(
 
         if indices is None:
             indices = list(range(len(self.cat.categories) - self.drop_first))
-        if name is None:
+        if name is None and missing_prefix is None:
+            return [None] * (len(self.cat.categories) - self.drop_first)
+        elif name is None:
             name = f"{missing_prefix}{indices[0]}-{indices[-1]}"
 
         if type == "column":
diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py
index f93e777e..085f9d65 100644
--- a/src/tabmat/dense_matrix.py
+++ b/src/tabmat/dense_matrix.py
@@ -248,9 +248,9 @@ def multiply(self, other):
     def get_names(
         self,
         type: str = "column",
-        missing_prefix: str = "_col_",
+        missing_prefix: Optional[str] = None,
         indices: Optional[List[int]] = None,
-    ) -> List[str]:
+    ) -> List[Optional[str]]:
         """Get column names.
 
         For columns that do not have a name, a default name is created using the
@@ -264,15 +264,16 @@ def get_names(
             a categorical submatrix is counted as a single term, whereas it is
             counted as multiple columns. Furthermore, matrices created from formulas
             have a difference between a column and term (c.f. ``formulaic`` docs).
-        missing_prefix
-            Prefix to use for columns that do not have a name.
+        missing_prefix: Optional[str], default None
+            Prefix to use for columns that do not have a name. If None, then no
+            default name is created.
         indices
             The indices used for columns that do not have a name. If ``None``,
             then the indices are ``list(range(self.shape[1]))``.
 
         Returns
         -------
-        list of str
+        List[Optional[str]]
             Column names.
         """
         if type == "column":
@@ -285,6 +286,8 @@ def get_names(
         if indices is None:
             indices = list(range(self.shape[1]))
 
-        default_names = np.array([f"{missing_prefix}{i}" for i in indices])
-        names[names == None] = default_names[names == None]  # noqa: E711
+        if missing_prefix is not None:
+            default_names = np.array([f"{missing_prefix}{i}" for i in indices])
+            names[names == None] = default_names[names == None]  # noqa: E711
+
         return list(names)
diff --git a/src/tabmat/matrix_base.py b/src/tabmat/matrix_base.py
index ad9a8cff..c8815452 100644
--- a/src/tabmat/matrix_base.py
+++ b/src/tabmat/matrix_base.py
@@ -168,9 +168,9 @@ def __getitem__(self, item):
     def get_names(
         self,
         type: str = "column",
-        missing_prefix: str = "_col_",
+        missing_prefix: Optional[str] = None,
         indices: Optional[List[int]] = None,
-    ) -> List[str]:
+    ) -> List[Optional[str]]:
         """Get column names.
 
         For columns that do not have a name, a default name is created using the
@@ -184,15 +184,16 @@ def get_names(
             a categorical submatrix is counted as a single term, whereas it is
             counted as multiple columns. Furthermore, matrices created from formulas
             have a difference between a column and term (c.f. ``formulaic`` docs).
-        missing_prefix
-            Prefix to use for columns that do not have a name.
+        missing_prefix: Optional[str], default None
+            Prefix to use for columns that do not have a name. If None, then no
+            default name is created.
         indices
             The indices used for columns that do not have a name. If ``None``,
             then the indices are ``list(range(self.shape[1]))``.
 
         Returns
         -------
-        list of str
+        List[Optional[str]]
             Column names.
         """
         pass
diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py
index edd69f15..4dc4421f 100644
--- a/src/tabmat/sparse_matrix.py
+++ b/src/tabmat/sparse_matrix.py
@@ -357,9 +357,9 @@ def get_column_names(
     def get_names(
         self,
         type: str = "column",
-        missing_prefix: str = "_col_",
+        missing_prefix: Optional[str] = None,
         indices: Optional[List[int]] = None,
-    ) -> List[str]:
+    ) -> List[Optional[str]]:
         """Get column names.
 
         For columns that do not have a name, a default name is created using the
@@ -373,15 +373,16 @@ def get_names(
             a categorical submatrix is counted as a single term, whereas it is
             counted as multiple columns. Furthermore, matrices created from formulas
             have a difference between a column and term (c.f. ``formulaic`` docs).
-        missing_prefix
-            Prefix to use for columns that do not have a name.
+        missing_prefix: Optional[str], default None
+            Prefix to use for columns that do not have a name. If None, then no
+            default name is created.
         indices
             The indices used for columns that do not have a name. If ``None``,
             then the indices are ``list(range(self.shape[1]))``.
 
         Returns
         -------
-        list of str
+        List[Optional[str]]
             Column names.
         """
         if type == "column":
@@ -394,6 +395,8 @@ def get_names(
         if indices is None:
             indices = list(range(self.shape[1]))
 
-        default_names = np.array([f"{missing_prefix}{i}" for i in indices])
-        names[names == None] = default_names[names == None]  # noqa: E711
+        if missing_prefix is not None:
+            default_names = np.array([f"{missing_prefix}{i}" for i in indices])
+            names[names == None] = default_names[names == None]  # noqa: E711
+
         return list(names)
diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py
index 133b3f68..8eea2b4a 100644
--- a/src/tabmat/split_matrix.py
+++ b/src/tabmat/split_matrix.py
@@ -489,9 +489,9 @@ def __repr__(self):
     def get_names(
         self,
         type: str = "column",
-        missing_prefix: str = "_col_",
+        missing_prefix: Optional[str] = None,
         indices: Optional[List[int]] = None,
-    ) -> List[str]:
+    ) -> List[Optional[str]]:
         """Get column names.
 
         For columns that do not have a name, a default name is created using the
@@ -505,15 +505,16 @@ def get_names(
             a categorical submatrix is counted as a single term, whereas it is
             counted as multiple columns. Furthermore, matrices created from formulas
             have a difference between a column and term (c.f. ``formulaic`` docs).
-        missing_prefix
-            Prefix to use for columns that do not have a name.
+        missing_prefix: Optional[str], default None
+            Prefix to use for columns that do not have a name. If None, then no
+            default name is created.
         indices
             The indices used for columns that do not have a name. If ``None``,
             then the indices are ``list(range(self.shape[1]))``.
 
         Returns
         -------
-        list of str
+        List[Optional[str]]
             Column names.
         """
         names = np.empty(self.shape[1], dtype=object)
diff --git a/src/tabmat/standardized_mat.py b/src/tabmat/standardized_mat.py
index 397113ed..df1304e8 100644
--- a/src/tabmat/standardized_mat.py
+++ b/src/tabmat/standardized_mat.py
@@ -302,9 +302,9 @@ def __repr__(self):
     def get_names(
         self,
         type: str = "column",
-        missing_prefix: str = "_col_",
+        missing_prefix: Optional[str] = None,
         indices: Optional[List[int]] = None,
-    ) -> List[str]:
+    ) -> List[Optional[str]]:
         """Get column names.
 
         For columns that do not have a name, a default name is created using the
@@ -318,15 +318,16 @@ def get_names(
             a categorical submatrix is counted as a single term, whereas it is
             counted as multiple columns. Furthermore, matrices created from formulas
             have a difference between a column and term (c.f. ``formulaic`` docs).
-        missing_prefix
-            Prefix to use for columns that do not have a name.
+        missing_prefix: Optional[str], default None
+            Prefix to use for columns that do not have a name. If None, then no
+            default name is created.
         indices
             The indices used for columns that do not have a name. If ``None``,
             then the indices are ``list(range(self.shape[1]))``.
 
         Returns
         -------
-        list of str
+        List[Optional[str]]
             Column names.
         """
         return self.mat.get_names(type, missing_prefix, indices)

From 81932ea8d5bdaa5b914964341f97c2fadc10510b Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@quantco.com>
Date: Wed, 12 Jul 2023 11:23:15 +0200
Subject: [PATCH 09/19] Add convenient getters and setters

---
 src/tabmat/categorical_matrix.py | 29 +++++++++++++++++++++++++
 src/tabmat/dense_matrix.py       | 26 +++++++++++++++++++++++
 src/tabmat/matrix_base.py        | 36 ++++++++++++++++++++++++++++++++
 src/tabmat/sparse_matrix.py      | 26 +++++++++++++++++++++++
 src/tabmat/split_matrix.py       | 21 +++++++++++++++++++
 src/tabmat/standardized_mat.py   | 36 ++++++++++++++++++++++++++++++++
 6 files changed, 174 insertions(+)

diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py
index d1badcd7..5f5d1914 100644
--- a/src/tabmat/categorical_matrix.py
+++ b/src/tabmat/categorical_matrix.py
@@ -718,3 +718,32 @@ def get_names(
             ]
         else:
             return [name] * (len(self.cat.categories) - self.drop_first)
+
+    def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"):
+        """Set column names.
+
+        Parameters
+        ----------
+        names: Union[str, List[Optional[str]]]
+            Names to set. A single string is treated as a one-element list.
+        type: str {'column'|'term'}
+            Whether to set column names or term names. The main difference is that
+            a categorical submatrix is counted as a single term, whereas it is
+            counted as multiple columns. Furthermore, matrices created from formulas
+            have a difference between a column and term (c.f. ``formulaic`` docs).
+        """
+        if isinstance(names, str):
+            names = [names]
+
+        if len(names) == self.shape[1] and all(name == names[0] for name in names):
+            names = [names[0]]
+
+        if len(names) != 1:
+            raise ValueError("A categorical matrix has only one name")
+
+        if type == "column":
+            self._colname = names[0]
+        elif type == "term":
+            self._term = names[0]
+        else:
+            raise ValueError(f"Type must be 'column' or 'term', got {type}")
diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py
index 085f9d65..25371e17 100644
--- a/src/tabmat/dense_matrix.py
+++ b/src/tabmat/dense_matrix.py
@@ -291,3 +291,29 @@ def get_names(
             names[names == None] = default_names[names == None]  # noqa: E711
 
         return list(names)
+
+    def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"):
+        """Set column names.
+
+        Parameters
+        ----------
+        names: Union[str, List[Optional[str]]]
+            Names to set. A single string is treated as a one-element list.
+        type: str {'column'|'term'}
+            Whether to set column names or term names. The main difference is that
+            a categorical submatrix is counted as a single term, whereas it is
+            counted as multiple columns. Furthermore, matrices created from formulas
+            have a difference between a column and term (c.f. ``formulaic`` docs).
+        """
+        if isinstance(names, str):
+            names = [names]
+
+        if len(names) != self.shape[1]:
+            raise ValueError(f"Length of names must be {self.shape[1]}")
+
+        if type == "column":
+            self._colnames = names
+        elif type == "term":
+            self._terms = names
+        else:
+            raise ValueError(f"Type must be 'column' or 'term', got {type}")
diff --git a/src/tabmat/matrix_base.py b/src/tabmat/matrix_base.py
index c8815452..ac17d717 100644
--- a/src/tabmat/matrix_base.py
+++ b/src/tabmat/matrix_base.py
@@ -198,6 +198,42 @@ def get_names(
         """
         pass
 
+    def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"):
+        """Set column names.
+
+        Parameters
+        ----------
+        names: List[Optional[str]]
+            Names to set.
+        type: str {'column'|'term'}
+            Whether to set column names or term names. The main difference is that
+            a categorical submatrix is counted as a single term, whereas it is
+            counted as multiple columns. Furthermore, matrices created from formulas
+            have a difference between a column and term (c.f. ``formulaic`` docs).
+        """
+        pass
+
+    @property
+    def column_names(self):
+        """Column names of the matrix."""
+        return self.get_names(type="column")
+
+    @column_names.setter
+    def column_names(self, names: List[Optional[str]]):
+        self.set_names(names, type="column")
+
+    @property
+    def term_names(self):
+        """Term names of the matrix.
+
+        For differences between column names and term names, see ``get_names``.
+        """
+        return self.get_names(type="term")
+
+    @term_names.setter
+    def term_names(self, names: List[Optional[str]]):
+        self.set_names(names, type="term")
+
     # Higher priority than numpy arrays, so behavior for funcs like "@" defaults to the
     # behavior of this class
     __array_priority__ = 11
diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py
index 4dc4421f..ace52bbd 100644
--- a/src/tabmat/sparse_matrix.py
+++ b/src/tabmat/sparse_matrix.py
@@ -400,3 +400,29 @@ def get_names(
             names[names == None] = default_names[names == None]  # noqa: E711
 
         return list(names)
+
+    def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"):
+        """Set column names.
+
+        Parameters
+        ----------
+        names: Union[str, List[Optional[str]]]
+            Names to set. A single string is treated as a one-element list.
+        type: str {'column'|'term'}
+            Whether to set column names or term names. The main difference is that
+            a categorical submatrix is counted as a single term, whereas it is
+            counted as multiple columns. Furthermore, matrices created from formulas
+            have a difference between a column and term (c.f. ``formulaic`` docs).
+        """
+        if isinstance(names, str):
+            names = [names]
+
+        if len(names) != self.shape[1]:
+            raise ValueError(f"Length of names must be {self.shape[1]}")
+
+        if type == "column":
+            self._colnames = names
+        elif type == "term":
+            self._terms = names
+        else:
+            raise ValueError(f"Type must be 'column' or 'term', got {type}")
diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py
index 8eea2b4a..a60b1fb3 100644
--- a/src/tabmat/split_matrix.py
+++ b/src/tabmat/split_matrix.py
@@ -521,3 +521,24 @@ def get_names(
         for idx, mat in zip(self.indices, self.matrices):
             names[idx] = mat.get_names(type, missing_prefix, idx)
         return list(names)
+
+    def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"):
+        """Set column names.
+
+        Parameters
+        ----------
+        names: List[Optional[str]]
+            Names to set.
+        type: str {'column'|'term'}
+            Whether to set column names or term names. The main difference is that
+            a categorical submatrix is counted as a single term, whereas it is
+            counted as multiple columns. Furthermore, matrices created from formulas
+            have a difference between a column and term (c.f. ``formulaic`` docs).
+        """
+        names_array = np.array(names)
+
+        if len(names) != self.shape[1]:
+            raise ValueError(f"Length of names must be {self.shape[1]}")
+
+        for idx, mat in zip(self.indices, self.matrices):
+            mat.set_names(list(names_array[idx]), type)
diff --git a/src/tabmat/standardized_mat.py b/src/tabmat/standardized_mat.py
index df1304e8..2e88dbb0 100644
--- a/src/tabmat/standardized_mat.py
+++ b/src/tabmat/standardized_mat.py
@@ -331,3 +331,39 @@ def get_names(
             Column names.
         """
         return self.mat.get_names(type, missing_prefix, indices)
+
+    def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"):
+        """Set column names.
+
+        Parameters
+        ----------
+        names: List[Optional[str]]
+            Names to set.
+        type: str {'column'|'term'}
+            Whether to set column names or term names. The main difference is that
+            a categorical submatrix is counted as a single term, whereas it is
+            counted as multiple columns. Furthermore, matrices created from formulas
+            have a difference between a column and term (c.f. ``formulaic`` docs).
+        """
+        self.mat.set_names(names, type)
+
+    @property
+    def column_names(self):
+        """Column names of the matrix."""
+        return self.get_names(type="column")
+
+    @column_names.setter
+    def column_names(self, names: List[Optional[str]]):
+        self.set_names(names, type="column")
+
+    @property
+    def term_names(self):
+        """Term names of the matrix.
+
+        For differences between column names and term names, see ``get_names``.
+        """
+        return self.get_names(type="term")
+
+    @term_names.setter
+    def term_names(self, names: List[Optional[str]]):
+        self.set_names(names, type="term")

From 5147f8302734588f5b1398b58cc5947337dce6bb Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@quantco.com>
Date: Wed, 12 Jul 2023 11:54:25 +0200
Subject: [PATCH 10/19] Fix indexing

---
 src/tabmat/constructor.py  | 4 ++--
 src/tabmat/dense_matrix.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py
index d01aa711..d280140a 100644
--- a/src/tabmat/constructor.py
+++ b/src/tabmat/constructor.py
@@ -90,8 +90,8 @@ def from_pandas(
                 ) = _split_sparse_and_dense_parts(
                     sps.csc_matrix(cat.tocsr(), dtype=dtype),
                     threshold=sparse_threshold,
-                    column_names=cat.get_names("columns"),
-                    term_names=cat.get_names("terms"),
+                    column_names=cat.get_names("column"),
+                    term_names=cat.get_names("term"),
                 )
                 matrices.append(X_dense_F)
                 is_cat.append(True)
diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py
index 25371e17..542c586d 100644
--- a/src/tabmat/dense_matrix.py
+++ b/src/tabmat/dense_matrix.py
@@ -117,7 +117,7 @@ def astype(self, dtype, order="K", casting="unsafe", copy=True):
     def __getitem__(self, key):
         """Return a subset of the matrix."""
         result = super().__getitem__(key)
-        if len(key) == 2:
+        if isinstance(key, tuple) and len(key) == 2:
             result._colnames = list(np.array(self._colnames)[key[1]])
             result._terms = list(np.array(self._terms)[key[1]])
         return result

From 237c9e53842453cf91b1663e899b486064c69165 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@quantco.com>
Date: Wed, 12 Jul 2023 13:40:29 +0200
Subject: [PATCH 11/19] Smarter setter for categorical matrices

---
 src/tabmat/categorical_matrix.py | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py
index 5f5d1914..0cdecb10 100644
--- a/src/tabmat/categorical_matrix.py
+++ b/src/tabmat/categorical_matrix.py
@@ -161,6 +161,7 @@ def matvec(mat, vec):
 
 """
 
+import re
 from typing import Any, List, Optional, Tuple, Union
 
 import numpy as np
@@ -735,8 +736,23 @@ def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"
         if isinstance(names, str):
             names = [names]
 
-        if len(names) == self.shape[1] and all(name == names[0] for name in names):
-            names = [names[0]]
+        if len(names) != 1:
+            if type == "column":
+                # Try finding the column name
+                base_names = []
+                for name, cat in zip(names, self.cat.categories[self.drop_first :]):
+                    partial_name = self._colname_format.format(
+                        name="__CAPTURE__", category=cat
+                    )
+                    pattern = re.escape(partial_name).replace("__CAPTURE__", "(.*)")
+                    if (name is not None) and (match := re.search(pattern, name)):
+                        base_names.append(match.group(1))
+                    else:
+                        base_names.append(name)
+                names = base_names
+
+            if len(names) == self.shape[1] and all(name == names[0] for name in names):
+                names = [names[0]]
 
         if len(names) != 1:
             raise ValueError("A categorical matrix has only one name")

From 71498bfc2a7e3417c9a9bcd0aafb5b8187527c3c Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@quantco.com>
Date: Wed, 12 Jul 2023 14:44:10 +0200
Subject: [PATCH 12/19] Add tests

---
 src/tabmat/dense_matrix.py  |   6 +-
 src/tabmat/sparse_matrix.py |  31 +-----
 tests/test_matrices.py      | 182 +++++++++++++++++++++++++++++++++++-
 3 files changed, 185 insertions(+), 34 deletions(-)

diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py
index 542c586d..76c43497 100644
--- a/src/tabmat/dense_matrix.py
+++ b/src/tabmat/dense_matrix.py
@@ -58,7 +58,7 @@ def __init__(self, input_array, column_names=None, term_names=None):
                 raise ValueError(f"Expected {width} term names, got {len(term_names)}")
             self._terms = term_names
         else:
-            self._terms = obj._colnames
+            self._terms = self._colnames
 
     def __getitem__(self, key):
         if not isinstance(key, tuple):
@@ -116,7 +116,7 @@ def astype(self, dtype, order="K", casting="unsafe", copy=True):
 
     def __getitem__(self, key):
         """Return a subset of the matrix."""
-        result = super().__getitem__(key)
+        result = type(self)(self._array.__getitem__(key))
         if isinstance(key, tuple) and len(key) == 2:
             result._colnames = list(np.array(self._colnames)[key[1]])
             result._terms = list(np.array(self._terms)[key[1]])
@@ -284,7 +284,7 @@ def get_names(
             raise ValueError(f"Type must be 'column' or 'term', got {type}")
 
         if indices is None:
-            indices = list(range(self.shape[1]))
+            indices = list(range(len(self._colnames)))
 
         if missing_prefix is not None:
             default_names = np.array([f"{missing_prefix}{i}" for i in indices])
diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py
index ace52bbd..7722894d 100644
--- a/src/tabmat/sparse_matrix.py
+++ b/src/tabmat/sparse_matrix.py
@@ -325,35 +325,6 @@ def multiply(self, other):
             return type(self)(self._array.multiply(other[:, np.newaxis]))
         return type(self)(self._array.multiply(other))
 
-    def get_column_names(
-        self, missing_prefix: str = "_col_", indices: Optional[List[int]] = None
-    ) -> List[str]:
-        """Get column names.
-
-        For columns that do not have a name, a default name is created using the
-        followig pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
-        the index of the column.
-
-        Parameters
-        ----------
-        missing_prefix
-            Prefix to use for columns that do not have a name.
-        indices
-            The indices used for columns that do not have a name. If ``None``,
-            then the indices are ``list(range(self.shape[1]))``.
-
-        Returns
-        -------
-        list of str
-            Column names.
-        """
-        if indices is None:
-            indices = list(range(self.shape[1]))
-        colnames = np.array(self._colnames)
-        default_colnames = np.array([f"{missing_prefix}{i}" for i in indices])
-        colnames[colnames == None] = default_colnames[colnames == None]  # noqa: E711
-        return list(colnames)
-
     def get_names(
         self,
         type: str = "column",
@@ -393,7 +364,7 @@ def get_names(
             raise ValueError(f"Type must be 'column' or 'term', got {type}")
 
         if indices is None:
-            indices = list(range(self.shape[1]))
+            indices = list(range(len(self._colnames)))
 
         if missing_prefix is not None:
             default_names = np.array([f"{missing_prefix}{i}" for i in indices])
diff --git a/tests/test_matrices.py b/tests/test_matrices.py
index 34f6a5bb..b192f5ba 100644
--- a/tests/test_matrices.py
+++ b/tests/test_matrices.py
@@ -621,7 +621,6 @@ def test_split_matrix_creation(mat):
     assert sm.shape[1] == 2 * mat.shape[1]
 
 
-@pytest.mark.parametrize("mat", get_matrices())
 def test_multiply(mat):
     other = np.arange(mat.shape[0])
     expected = mat.A * other[:, np.newaxis]
@@ -661,3 +660,184 @@ def test_hstack(mat_1, mat_2):
         stacked.A,
         np.hstack([mat.A if not isinstance(mat, np.ndarray) else mat for mat in mats]),
     )
+def test_names_against_expectation():
+    X = tm.DenseMatrix(
+        np.ones((5, 2)), column_names=["a", None], term_names=["a", None]
+    )
+    Xc = tm.CategoricalMatrix(
+        pd.Categorical(["a", "b", "c", "b", "a"]), column_name="c", term_name="c"
+    )
+    Xc2 = tm.CategoricalMatrix(pd.Categorical(["a", "b", "c", "b", "a"]))
+    Xs = tm.SparseMatrix(
+        sps.csc_matrix(np.ones((5, 2))),
+        column_names=["s1", "s2"],
+        term_names=["s", "s"],
+    )
+
+    mat = tm.SplitMatrix(matrices=[X, Xc, Xc2, Xs])
+
+    assert mat.get_names(type="column") == [
+        "a",
+        None,
+        "c[a]",
+        "c[b]",
+        "c[c]",
+        None,
+        None,
+        None,
+        "s1",
+        "s2",
+    ]
+
+    assert mat.get_names(type="term") == [
+        "a",
+        None,
+        "c",
+        "c",
+        "c",
+        None,
+        None,
+        None,
+        "s",
+        "s",
+    ]
+
+    assert mat.get_names(type="column", missing_prefix="_col_") == [
+        "a",
+        "_col_1",
+        "c[a]",
+        "c[b]",
+        "c[c]",
+        "_col_5-7[a]",
+        "_col_5-7[b]",
+        "_col_5-7[c]",
+        "s1",
+        "s2",
+    ]
+
+    assert mat.get_names(type="term", missing_prefix="_col_") == [
+        "a",
+        "_col_1",
+        "c",
+        "c",
+        "c",
+        "_col_5-7",
+        "_col_5-7",
+        "_col_5-7",
+        "s",
+        "s",
+    ]
+
+
+@pytest.mark.parametrize("mat", get_matrices())
+@pytest.mark.parametrize("missing_prefix", ["_col_", "X"])
+def test_names_getter_setter(mat, missing_prefix):
+    names = mat.get_names(missing_prefix=missing_prefix, type="column")
+    mat.column_names = names
+    assert mat.column_names == names
+
+
+@pytest.mark.parametrize("mat", get_matrices())
+@pytest.mark.parametrize("missing_prefix", ["_col_", "X"])
+def test_terms_getter_setter(mat, missing_prefix):
+    names = mat.get_names(missing_prefix=missing_prefix, type="term")
+    mat.term_names = names
+    assert mat.term_names == names
+
+
+@pytest.mark.parametrize("indexer_1", [slice(None, None), 0, slice(2, 8)])
+@pytest.mark.parametrize("indexer_2", [[0], slice(1, 4), [0, 2, 3], [4, 3, 2, 1, 0]])
+@pytest.mark.parametrize("sparse", [True, False])
+def test_names_indexing(indexer_1, indexer_2, sparse):
+    X = np.ones((10, 5), dtype=np.float64)
+    colnames = ["a", "b", None, "d", "e"]
+    termnames = ["t1", "t1", None, "t4", "t5"]
+
+    colnames_array = np.array(colnames)
+    termnames_array = np.array(termnames)
+
+    if sparse:
+        X = tm.SparseMatrix(
+            sps.csc_matrix(X), column_names=colnames, term_names=termnames
+        )
+    else:
+        X = tm.DenseMatrix(X, column_names=colnames, term_names=termnames)
+
+    X_indexed = X[indexer_1, indexer_2]
+    if not isinstance(X_indexed, tm.MatrixBase):
+        pytest.skip("Does not return MatrixBase")
+    assert X_indexed.column_names == list(colnames_array[indexer_2])
+    assert X_indexed.term_names == list(termnames_array[indexer_2])
+
+
+@pytest.mark.parametrize("mat_1", get_all_matrix_base_subclass_mats())
+@pytest.mark.parametrize("mat_2", get_all_matrix_base_subclass_mats())
+def test_combine_names(mat_1, mat_2):
+    mat_1.column_names = mat_1.get_names(missing_prefix="m1_", type="column")
+    mat_2.column_names = mat_2.get_names(missing_prefix="m2_", type="column")
+
+    mat_1.term_names = mat_1.get_names(missing_prefix="m1_", type="term")
+    mat_2.term_names = mat_2.get_names(missing_prefix="m2_", type="term")
+
+    combined = tm.SplitMatrix(matrices=[mat_1, mat_2])
+
+    assert combined.column_names == mat_1.column_names + mat_2.column_names
+    assert combined.term_names == mat_1.term_names + mat_2.term_names
+
+
+@pytest.mark.parametrize("prefix_sep", ["_", ": "])
+@pytest.mark.parametrize("drop_first", [True, False])
+def test_names_pandas(prefix_sep, drop_first):
+    n_rows = 50
+    dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64)
+    dense_column_with_lots_of_zeros = dense_column.copy()
+    dense_column_with_lots_of_zeros[:44] = 0.0
+    sparse_column = np.zeros(n_rows, dtype=np.float64)
+    sparse_column[0] = 1.0
+    cat_column_lowdim = np.tile(["a", "b"], n_rows // 2)
+    cat_column_highdim = np.arange(n_rows)
+
+    dense_ser = pd.Series(dense_column)
+    lowdense_ser = pd.Series(dense_column_with_lots_of_zeros)
+    sparse_ser = pd.Series(sparse_column, dtype=pd.SparseDtype("float", 0.0))
+    cat_ser_lowdim = pd.Categorical(cat_column_lowdim)
+    cat_ser_highdim = pd.Categorical(cat_column_highdim)
+
+    df = pd.DataFrame(
+        data={
+            "d": dense_ser,
+            "cl_obj": cat_ser_lowdim.astype(object),
+            "ch": cat_ser_highdim,
+            "ds": lowdense_ser,
+            "s": sparse_ser,
+        }
+    )
+
+    categorical_format = "{name}" + prefix_sep + "{category}"
+    mat_end = tm.from_pandas(
+        df,
+        dtype=np.float64,
+        sparse_threshold=0.3,
+        cat_threshold=4,
+        object_as_cat=True,
+        cat_position="end",
+        categorical_format=categorical_format,
+        drop_first=drop_first,
+    )
+
+    expanded_df = pd.get_dummies(df, prefix_sep=prefix_sep, drop_first=drop_first)
+    assert mat_end.column_names == expanded_df.columns.tolist()
+
+    mat_expand = tm.from_pandas(
+        df,
+        dtype=np.float64,
+        sparse_threshold=0.3,
+        cat_threshold=4,
+        object_as_cat=True,
+        cat_position="expand",
+        categorical_format=categorical_format,
+        drop_first=drop_first,
+    )
+
+    unique_terms = list(dict.fromkeys(mat_expand.term_names))
+    assert unique_terms == df.columns.tolist()

From 49cdabf2fc0ca352ebafec4dd14118894fd631c2 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@quantco.com>
Date: Wed, 12 Jul 2023 15:09:15 +0200
Subject: [PATCH 13/19] Fix subsetting with np.newaxis

---
 src/tabmat/dense_matrix.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py
index 76c43497..1ca4a071 100644
--- a/src/tabmat/dense_matrix.py
+++ b/src/tabmat/dense_matrix.py
@@ -118,8 +118,13 @@ def __getitem__(self, key):
         """Return a subset of the matrix."""
         result = type(self)(self._array.__getitem__(key))
         if isinstance(key, tuple) and len(key) == 2:
-            result._colnames = list(np.array(self._colnames)[key[1]])
-            result._terms = list(np.array(self._terms)[key[1]])
+            if key[1] is None:
+                # Handle np.newaxis
+                result._colnames = self._colnames
+                result._terms = self._terms
+            else:
+                result._colnames = list(np.array(self._colnames)[key[1]])
+                result._terms = list(np.array(self._terms)[key[1]])
         return result
 
     def getcol(self, i):

From fcc0c7369813de75472752f3b21f7296907ae8ef Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@quantco.com>
Date: Wed, 12 Jul 2023 15:19:34 +0200
Subject: [PATCH 14/19] Remove the walrus :(

---
 src/tabmat/categorical_matrix.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py
index 0cdecb10..ba4259f6 100644
--- a/src/tabmat/categorical_matrix.py
+++ b/src/tabmat/categorical_matrix.py
@@ -745,7 +745,11 @@ def set_names(self, names: Union[str, List[Optional[str]]], type: str = "column"
                         name="__CAPTURE__", category=cat
                     )
                     pattern = re.escape(partial_name).replace("__CAPTURE__", "(.*)")
-                    if (name is not None) and (match := re.search(pattern, name)):
+                    if name is not None:
+                        match = re.search(pattern, name)
+                    else:
+                        match = None
+                    if match is not None:
                         base_names.append(match.group(1))
                     else:
                         base_names.append(name)

From 0a14af7a9ba79e3c510b21ca8be5da233e13410d Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@quantco.com>
Date: Wed, 12 Jul 2023 15:27:02 +0200
Subject: [PATCH 15/19] Fix test_multiply: add missing parametrize decorator

---
 tests/test_matrices.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_matrices.py b/tests/test_matrices.py
index b192f5ba..f8f7664e 100644
--- a/tests/test_matrices.py
+++ b/tests/test_matrices.py
@@ -621,6 +621,7 @@ def test_split_matrix_creation(mat):
     assert sm.shape[1] == 2 * mat.shape[1]
 
 
+@pytest.mark.parametrize("mat", get_matrices())
 def test_multiply(mat):
     other = np.arange(mat.shape[0])
     expected = mat.A * other[:, np.newaxis]

From db0ac7513df9cf4497e15cb5a06e04f2f0c4e586 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@quantco.com>
Date: Thu, 27 Jul 2023 10:54:02 +0200
Subject: [PATCH 16/19] Fix indexing with np.ix_

---
 src/tabmat/dense_matrix.py  | 24 ++++++++++--------------
 src/tabmat/sparse_matrix.py |  8 +++-----
 2 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py
index 1ca4a071..72fb8847 100644
--- a/src/tabmat/dense_matrix.py
+++ b/src/tabmat/dense_matrix.py
@@ -67,7 +67,16 @@ def __getitem__(self, key):
         # Always return a 2d array
         key = tuple([key_i] if np.isscalar(key_i) else key_i for key_i in key)
 
-        return type(self)(self._array.__getitem__(key))
+        if len(key) == 2:
+            colnames = list(np.array(self._colnames)[key[1]].ravel())
+            terms = list(np.array(self._terms)[key[1]].ravel())
+        else:
+            colnames = self._colnames
+            terms = self._terms
+
+        return type(self)(
+            self._array.__getitem__(key), column_names=colnames, term_names=terms
+        )
 
     __array_ufunc__ = None
 
@@ -114,19 +123,6 @@ def astype(self, dtype, order="K", casting="unsafe", copy=True):
         """Copy of the array, cast to a specified type."""
         return type(self)(self._array.astype(dtype, order, casting, copy))
 
-    def __getitem__(self, key):
-        """Return a subset of the matrix."""
-        result = type(self)(self._array.__getitem__(key))
-        if isinstance(key, tuple) and len(key) == 2:
-            if key[1] is None:
-                # Handle np.newaxis
-                result._colnames = self._colnames
-                result._terms = self._terms
-            else:
-                result._colnames = list(np.array(self._colnames)[key[1]])
-                result._terms = list(np.array(self._terms)[key[1]])
-        return result
-
     def getcol(self, i):
         """Return matrix column at specified index."""
         return type(self)(self._array[:, [i]])
diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py
index 7722894d..fe8e752c 100644
--- a/src/tabmat/sparse_matrix.py
+++ b/src/tabmat/sparse_matrix.py
@@ -78,16 +78,14 @@ def __getitem__(self, key):
         key = tuple([key_i] if np.isscalar(key_i) else key_i for key_i in key)
 
         if len(key) == 2:
-            colnames = list(np.array(self._colnames)[key[1]])
-            terms = list(np.array(self._terms)[key[1]])
+            colnames = list(np.array(self._colnames)[key[1]].ravel())
+            terms = list(np.array(self._terms)[key[1]].ravel())
         else:
             colnames = self._colnames
             terms = self._terms
 
         return type(self)(
-            self._array.__getitem__(key),
-            column_names=colnames,
-            term_names=terms
+            self._array.__getitem__(key), column_names=colnames, term_names=terms
         )
 
     def __matmul__(self, other):

From 1bd940f4ed4339a25984afd342e07f41fff48b75 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@quantco.com>
Date: Thu, 27 Jul 2023 13:17:13 +0200
Subject: [PATCH 17/19] Propagate column names where it makes sense

---
 src/tabmat/categorical_matrix.py | 16 ++++++++++++----
 src/tabmat/dense_matrix.py       | 24 ++++++++++++++++++++----
 src/tabmat/sparse_matrix.py      | 20 ++++++++++++++++----
 3 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py
index 8bdb4915..1180646c 100644
--- a/src/tabmat/categorical_matrix.py
+++ b/src/tabmat/categorical_matrix.py
@@ -477,10 +477,16 @@ def getcol(self, i: int) -> SparseMatrix:
         i %= self.shape[1]  # wrap-around indexing
 
         if self.drop_first:
-            i += 1
+            i_corr = i + 1
+        else:
+            i_corr = i
 
-        col_i = sps.csc_matrix((self.indices == i).astype(int)[:, None])
-        return SparseMatrix(col_i)
+        col_i = sps.csc_matrix((self.indices == i_corr).astype(int)[:, None])
+        return SparseMatrix(
+            col_i,
+            column_names=[self.column_names[i]],
+            term_names=[self.term_names[i]],
+        )
 
     def tocsr(self) -> sps.csr_matrix:
         """Return scipy csr representation of matrix."""
@@ -657,7 +663,9 @@ def multiply(self, other) -> SparseMatrix:
                     np.arange(self.shape[0] + 1, dtype=int),
                 ),
                 shape=self.shape,
-            )
+            ),
+            column_names=self.column_names,
+            term_names=self.term_names,
         )
 
     def __repr__(self):
diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py
index 98006131..5de2c91f 100644
--- a/src/tabmat/dense_matrix.py
+++ b/src/tabmat/dense_matrix.py
@@ -113,11 +113,19 @@ def transpose(self):
 
     def astype(self, dtype, order="K", casting="unsafe", copy=True):
         """Copy of the array, cast to a specified type."""
-        return type(self)(self._array.astype(dtype, order, casting, copy))
+        return type(self)(
+            self._array.astype(dtype, order, casting, copy),
+            column_names=self.column_names,
+            term_names=self.term_names,
+        )
 
     def getcol(self, i):
         """Return matrix column at specified index."""
-        return type(self)(self._array[:, [i]])
+        return type(self)(
+            self._array[:, [i]],
+            column_names=[self.column_names[i]],
+            term_names=[self.term_names[i]],
+        )
 
     def toarray(self):
         """Return array representation of matrix."""
@@ -235,8 +243,16 @@ def multiply(self, other):
         This assumes that ``other`` is a vector of size ``self.shape[0]``.
         """
         if np.asanyarray(other).ndim == 1:
-            return type(self)(self._array.__mul__(other[:, np.newaxis]))
-        return type(self)(self._array.__mul__(other))
+            return type(self)(
+                self._array.__mul__(np.asanyarray(other)[:, np.newaxis]),
+                column_names=self.column_names,
+                term_names=self.term_names,
+            )
+        return type(self)(
+            self._array.__mul__(other),
+            column_names=self.column_names,
+            term_names=self.term_names,
+        )
 
     def get_names(
         self,
diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py
index 39ce829f..c2be7d57 100644
--- a/src/tabmat/sparse_matrix.py
+++ b/src/tabmat/sparse_matrix.py
@@ -147,7 +147,11 @@ def transpose(self):
 
     def getcol(self, i):
         """Return matrix column at specified index."""
-        return type(self)(self._array.getcol(i))
+        return type(self)(
+            self._array.getcol(i),
+            column_names=[self.column_names[i]],
+            term_names=[self.term_names[i]],
+        )
 
     def unpack(self):
         """Return the underlying scipy.sparse.csc_matrix."""
@@ -311,9 +315,17 @@ def multiply(self, other):
         from the parent class except that ``other`` is assumed to be a vector of size
         ``self.shape[0]``.
         """
-        if other.ndim == 1:
-            return type(self)(self._array.multiply(other[:, np.newaxis]))
-        return type(self)(self._array.multiply(other))
+        if np.asanyarray(other).ndim == 1:
+            return type(self)(
+                self._array.multiply(np.asanyarray(other)[:, np.newaxis]),
+                column_names=self.column_names,
+                term_names=self.term_names,
+            )
+        return type(self)(
+            self._array.multiply(other),
+            column_names=self.column_names,
+            term_names=self.term_names,
+        )
 
     def get_names(
         self,

From 0b133d6c1fb54c40ed7ea83e6b56ddff44945853 Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@quantco.com>
Date: Wed, 9 Aug 2023 16:36:57 +0200
Subject: [PATCH 18/19] Fix merge mistake: rename SparseMatrix.__init__ arg to input_array

---
 src/tabmat/sparse_matrix.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py
index 0754f771..1c568757 100644
--- a/src/tabmat/sparse_matrix.py
+++ b/src/tabmat/sparse_matrix.py
@@ -33,7 +33,7 @@ class SparseMatrix(MatrixBase):
 
     def __init__(
         self,
-        array,
+        input_array,
         shape=None,
         dtype=None,
         copy=False,

From 5109466825a21cab4e713b18b0af46dadf8861dd Mon Sep 17 00:00:00 2001
From: Martin Stancsics <martin.stancsics@quantco.com>
Date: Mon, 14 Aug 2023 14:25:15 +0200
Subject: [PATCH 19/19] Add changelog entry

---
 CHANGELOG.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 331695b1..9de08d2f 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -10,6 +10,10 @@ Changelog
 Unreleased
 ----------
 
+**New features:**
+
+- Add column name and term name metadata to ``MatrixBase`` objects. These are automatically populated when initializing a ``MatrixBase`` from a ``pandas.DataFrame``. In addition, they can be accessed and modified via the ``column_names`` and ``term_names`` properties.
+
 **Other changes:**
 
 - Improve the performance of ``from_pandas`` in the case of low-cardinality categorical variables.