Add column name getters

Quantco · Jul 11, 2023 · 9fe977e · 9fe977e
1 parent 212a1c6
commit 9fe977e
Show file tree

Hide file tree

Showing 6 changed files with 353 additions and 0 deletions.
diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py
@@ -265,6 +265,10 @@ def __init__(
         self.x_csc: Optional[Tuple[Optional[np.ndarray], np.ndarray, np.ndarray]] = None
         self.dtype = np.dtype(dtype)
 
+        self._colname = None
+        self._term = None
+        self._colname_format = "{name}[{category}]"
+
     def recover_orig(self) -> np.ndarray:
         """
         Return 1d numpy array with same data as what was initially fed to __init__.
@@ -641,3 +645,63 @@ def multiply(self, other) -> SparseMatrix:
 
     def __repr__(self):
         return str(self.cat)
+
+    def get_column_names(
+        self, missing_prefix: str = "_col_", start_index: int = 0
+    ) -> List[str]:
+        """Get column names.
+
+        When no name has been set, ``"{missing_prefix}{start_index}"`` is used as
+        the base name. Every returned entry combines the base name with one
+        category; categories before index ``drop_first`` are left out.
+
+        Parameters
+        ----------
+        missing_prefix
+            Prefix of the default base name used when no name has been set.
+        start_index
+            Index appended to ``missing_prefix`` in the default base name.
+
+        Returns
+        -------
+        list of str
+            One formatted name per category from index ``drop_first`` onwards.
+        """
+        if self._colname is None:
+            colname = f"{missing_prefix}{start_index}"
+        else:
+            colname = self._colname
+        return [
+            self._colname_format.format(name=colname, category=cat)
+            for cat in self.cat.categories[self.drop_first :]
+        ]
+
+    def get_term_names(
+        self, missing_prefix: str = "_col_", start_index: int = 0
+    ) -> List[str]:
+        """Get term names.
+
+        The main difference to ``get_column_names`` is that a categorical submatrix
+        is counted as a single term. Furthermore, matrices created from formulas
+        have a difference between a column and term (cf. ``formulaic`` docs).
+        If no term name has been set, ``"{missing_prefix}{start_index}"`` is used.
+        The term name is repeated ``len(categories) - drop_first`` times, so the
+        result has one entry per category from index ``drop_first`` onwards.
+
+        Parameters
+        ----------
+        missing_prefix
+            Prefix of the default term name used when no term name has been set.
+        start_index
+            Index appended to ``missing_prefix`` in the default term name.
+
+        Returns
+        -------
+        list of str
+            The (single) term name, repeated once per returned entry.
+        """
+        if self._term is None:
+            term = f"{missing_prefix}{start_index}"
+        else:
+            term = self._term
+        return [term] * (len(self.cat.categories) - self.drop_first)
diff --git a/src/tabmat/dense_matrix.py b/src/tabmat/dense_matrix.py
@@ -42,6 +42,11 @@ def __new__(cls, input_array):  # noqa
         obj = np.asarray(input_array).view(cls)
         if not np.issubdtype(obj.dtype, np.floating):
             raise NotImplementedError("DenseMatrix is only implemented for float data")
+
+        if obj.ndim == 2:
+            obj._colnames = [None] * obj.shape[1]
+            obj._terms = [None] * obj.shape[1]
+
         return obj
 
     def __array_finalize__(self, obj):
@@ -166,3 +171,62 @@ def multiply(self, other):
         if np.asanyarray(other).ndim == 1:
             return super().__mul__(other[:, np.newaxis])
         return super().__mul__(other)
+
+    def get_column_names(
+        self, missing_prefix: str = "_col_", start_index: int = 0
+    ) -> List[str]:
+        """Get column names.
+
+        For columns that do not have a name, a default name is created using the
+        following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
+        the index of the column.
+
+        Parameters
+        ----------
+        missing_prefix
+            Prefix to use for columns that do not have a name.
+        start_index
+            Index to start from when creating default names.
+
+        Returns
+        -------
+        list of str
+            Column names, with default names filled in where the name is ``None``.
+        """
+        colnames = np.array(self._colnames)
+        default_colnames = np.array(
+            [f"{missing_prefix}{start_index + i}" for i in range(self.shape[1])]
+        )
+        colnames[colnames == None] = default_colnames[colnames == None]  # noqa: E711
+        return list(colnames)
+
+    def get_term_names(
+        self, missing_prefix: str = "_col_", start_index: int = 0
+    ) -> List[str]:
+        """Get term names.
+
+        The main difference to ``get_column_names`` is that a categorical submatrix
+        is counted as a single term. Furthermore, matrices created from formulas
+        have a difference between a column and term (cf. ``formulaic`` docs).
+        For terms that do not have a name, a default name is created using the
+        following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
+        the index of the term.
+
+        Parameters
+        ----------
+        missing_prefix
+            Prefix to use for terms that do not have a name.
+        start_index
+            Index to start from when creating default names.
+
+        Returns
+        -------
+        list of str
+            Term names, with default names filled in where the term is ``None``.
+        """
+        terms = np.array(self._terms)
+        default_terms = np.array(
+            [f"{missing_prefix}{start_index + i}" for i in range(self.shape[1])]
+        )
+        terms[terms == None] = default_terms[terms == None]  # noqa: E711
+        return list(terms)
diff --git a/src/tabmat/matrix_base.py b/src/tabmat/matrix_base.py
@@ -164,6 +164,57 @@ def standardize(
     def __getitem__(self, item):
         pass
 
+    @abstractmethod
+    def get_column_names(
+        self, missing_prefix: str = "_col_", start_index: int = 0
+    ) -> List[str]:
+        """Get column names.
+
+        For columns that do not have a name, a default name is created using the
+        following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
+        the index of the column.
+
+        Parameters
+        ----------
+        missing_prefix
+            Prefix to use for columns that do not have a name.
+        start_index
+            Index to start from when creating default names.
+
+        Returns
+        -------
+        list of str
+            Column names.
+        """
+        pass
+
+    @abstractmethod
+    def get_term_names(
+        self, missing_prefix: str = "_col_", start_index: int = 0
+    ) -> List[str]:
+        """Get term names.
+
+        The main difference to ``get_column_names`` is that a categorical submatrix
+        is counted as a single term. Furthermore, matrices created from formulas
+        have a difference between a column and term (cf. ``formulaic`` docs).
+        For terms that do not have a name, a default name is created using the
+        following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
+        the index of the term.
+
+        Parameters
+        ----------
+        missing_prefix
+            Prefix to use for terms that do not have a name.
+        start_index
+            Index to start from when creating default names.
+
+        Returns
+        -------
+        list of str
+            Term names.
+        """
+        pass
+
     # Higher priority than numpy arrays, so behavior for funcs like "@" defaults to the
     # behavior of this class
     __array_priority__ = 11

diff --git a/src/tabmat/sparse_matrix.py b/src/tabmat/sparse_matrix.py
@@ -43,6 +43,9 @@ def __init__(self, arg1, shape=None, dtype=None, copy=False):
             self.sort_indices()
         self._x_csr = None
 
+        self._colnames = [None] * self.shape[1]
+        self._terms = [None] * self.shape[1]
+
     @property
     def x_csr(self):
         """Cache the CSR representation of the matrix."""
@@ -203,3 +206,62 @@ def multiply(self, other):
         if other.ndim == 1:
             return SparseMatrix(super().multiply(other[:, np.newaxis]))
         return SparseMatrix(super().multiply(other))
+
+    def get_column_names(
+        self, missing_prefix: str = "_col_", start_index: int = 0
+    ) -> List[str]:
+        """Get column names.
+
+        For columns that do not have a name, a default name is created using the
+        following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
+        the index of the column.
+
+        Parameters
+        ----------
+        missing_prefix
+            Prefix to use for columns that do not have a name.
+        start_index
+            Index to start from when creating default names.
+
+        Returns
+        -------
+        list of str
+            Column names, with default names filled in where the name is ``None``.
+        """
+        colnames = np.array(self._colnames)
+        default_colnames = np.array(
+            [f"{missing_prefix}{start_index + i}" for i in range(self.shape[1])]
+        )
+        colnames[colnames == None] = default_colnames[colnames == None]  # noqa: E711
+        return list(colnames)
+
+    def get_term_names(
+        self, missing_prefix: str = "_col_", start_index: int = 0
+    ) -> List[str]:
+        """Get term names.
+
+        The main difference to ``get_column_names`` is that a categorical submatrix
+        is counted as a single term. Furthermore, matrices created from formulas
+        have a difference between a column and term (cf. ``formulaic`` docs).
+        For terms that do not have a name, a default name is created using the
+        following pattern: ``"{missing_prefix}{start_index + i}"`` where ``i`` is
+        the index of the term.
+
+        Parameters
+        ----------
+        missing_prefix
+            Prefix to use for terms that do not have a name.
+        start_index
+            Index to start from when creating default names.
+
+        Returns
+        -------
+        list of str
+            Term names, with default names filled in where the term is ``None``.
+        """
+        terms = np.array(self._terms)
+        default_terms = np.array(
+            [f"{missing_prefix}{start_index + i}" for i in range(self.shape[1])]
+        )
+        terms[terms == None] = default_terms[terms == None]  # noqa: E711
+        return list(terms)
diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py
@@ -448,3 +448,66 @@ def __repr__(self):
         return out
 
     __array_priority__ = 13
+
+    def get_column_names(
+        self, missing_prefix: str = "_col_", start_index: int = 0
+    ) -> List[str]:
+        """Get column names.
+
+        Each submatrix supplies the names for its own column indices. The index
+        used for default names starts at ``start_index`` and advances by one per
+        categorical submatrix and by one per column for every other submatrix.
+
+        Parameters
+        ----------
+        missing_prefix
+            Prefix to use for columns that do not have a name.
+        start_index
+            Index to start from when creating default names.
+
+        Returns
+        -------
+        list of str
+            Column names, placed at the column indices of their submatrix.
+        """
+        column_names = np.empty(self.shape[1], dtype=object)
+        for idx, mat in zip(self.indices, self.matrices):
+            column_names[idx] = mat.get_column_names(missing_prefix, start_index)
+            if isinstance(mat, CategoricalMatrix):
+                start_index += 1
+            else:
+                start_index += mat.shape[1]
+        return list(column_names)
+
+    def get_term_names(
+        self, missing_prefix: str = "_col_", start_index: int = 0
+    ) -> List[str]:
+        """Get term names.
+
+        The main difference to ``get_column_names`` is that a categorical submatrix
+        is counted as a single term. Furthermore, matrices created from formulas
+        have a difference between a column and term (cf. ``formulaic`` docs).
+        Each submatrix supplies the term names for its own column indices. The
+        index used for default names starts at ``start_index`` and advances by one
+        per categorical submatrix and by one per column for every other submatrix.
+
+        Parameters
+        ----------
+        missing_prefix
+            Prefix to use for terms that do not have a name.
+        start_index
+            Index to start from when creating default names.
+
+        Returns
+        -------
+        list of str
+            Term names, placed at the column indices of their submatrix.
+        """
+        term_names = np.empty(self.shape[1], dtype=object)
+        for idx, mat in zip(self.indices, self.matrices):
+            term_names[idx] = mat.get_term_names(missing_prefix, start_index)
+            if isinstance(mat, CategoricalMatrix):
+                start_index += 1
+            else:
+                start_index += mat.shape[1]
+        return list(term_names)