Quantco · MarcAntoineSchmidtQC · Jul 20, 2020 · Jul 10, 2020 · Jul 13, 2020 · Jul 16, 2020
diff --git a/src/quantcore/matrix/__init__.py b/src/quantcore/matrix/__init__.py
@@ -1,4 +1,5 @@
 from .categorical_matrix import CategoricalMatrix
+from .constructor import from_pandas
 from .dense_matrix import DenseMatrix
 from .matrix_base import MatrixBase, one_over_var_inf_to_val
 from .sparse_matrix import SparseMatrix
@@ -14,4 +15,5 @@
     "CategoricalMatrix",
     "csc_to_split",
     "one_over_var_inf_to_val",
+    "from_pandas",
 ]
diff --git a/src/quantcore/matrix/constructor.py b/src/quantcore/matrix/constructor.py
@@ -0,0 +1,154 @@
+import warnings
+from typing import List, Union
+
+import numpy as np
+import pandas as pd
+from pandas.api.types import is_numeric_dtype
+
+from .categorical_matrix import CategoricalMatrix
+from .dense_matrix import DenseMatrix
+from .matrix_base import MatrixBase
+from .sparse_matrix import SparseMatrix
+from .split_matrix import SplitMatrix, split_sparse_and_dense_parts
+
+
+def from_pandas(
+    df: pd.DataFrame,
+    dtype: np.dtype = np.float64,
+    sparse_threshold: float = 0.1,
+    cat_threshold: int = 4,
+    object_as_cat: bool = False,
+    cat_position: str = "expand",
+) -> MatrixBase:
+    """
+    Transform a pandas.DataFrame into an efficient SplitMatrix.
+
+    The input DataFrame is never modified; dtype conversions are applied to
+    per-column copies only.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        pandas DataFrame to be converted.
+    dtype : np.dtype, default np.float64
+        dtype of all sub-matrices of the resulting SplitMatrix.
+    sparse_threshold : float, default 0.1
+        Density threshold below which numerical columns will be stored in a sparse
+        format.
+    cat_threshold : int, default 4
+        Number of levels of a categorical column under which the column will be stored
+        as sparse one-hot-encoded columns instead of CategoricalMatrix
+    object_as_cat : bool, default False
+        If True, DataFrame columns stored as python objects will be treated as
+        categorical columns.
+    cat_position : str {'end'|'expand'}, default 'expand'
+        Position of the categorical variable in the index. If "end", all the
+        categoricals (including the ones that did not satisfy cat_threshold)
+        will be placed at the end of the index list. If "expand", all the variables
+        will remain in the same order.
+
+    Returns
+    -------
+    MatrixBase
+        A SplitMatrix, or the sub-matrix itself when only one is produced.
+
+    Raises
+    ------
+    ValueError
+        If cat_position is not "expand" or "end", or if no column of df has a
+        supported dtype.
+    """
+    if cat_position not in ("expand", "end"):
+        raise ValueError(
+            f"cat_position must be 'expand' or 'end', got {cat_position!r}"
+        )
+
+    matrices: List[Union[DenseMatrix, SparseMatrix, CategoricalMatrix]] = []
+    indices: List[List[int]] = []
+    # "end" mode: (first slot, n slots, n columns) of each categorical column.
+    deferred_cats = []
+
+    dense_dfidx = []  # column index in original DataFrame
+    dense_mxidx = []  # index in the new SplitMatrix
+    sparse_cols = []  # columns converted to a sparse dtype
+    sparse_mxidx = []  # index in the new SplitMatrix
+    ignored_cols = []
+
+    mxcolidx = 0
+
+    # DataFrame.iteritems() was removed in pandas 2.0; items() is equivalent.
+    for dfcolidx, (colname, coldata) in enumerate(df.items()):
+        if object_as_cat and pd.api.types.is_object_dtype(coldata.dtype):
+            coldata = coldata.astype("category")
+
+        # categorical
+        if isinstance(coldata.dtype, pd.CategoricalDtype):
+            if len(coldata.cat.categories) < cat_threshold:
+                # Few levels: one-hot encode, then split the columns by density.
+                dummies = pd.get_dummies(
+                    coldata, prefix=colname, sparse=True, dtype=dtype
+                )
+                (
+                    X_dense_F,
+                    X_sparse,
+                    dense_indices,
+                    sparse_indices,
+                ) = split_sparse_and_dense_parts(
+                    dummies.sparse.to_coo().tocsc(), threshold=sparse_threshold
+                )
+                parts = [X_dense_F, X_sparse]
+                local_indices = [dense_indices, sparse_indices]
+            else:
+                cat = CategoricalMatrix(coldata, dtype=dtype)
+                parts = [cat]
+                local_indices = [np.arange(cat.shape[1])]
+
+            n_cols = sum(len(idx) for idx in local_indices)
+            if cat_position == "expand":
+                indices.extend(mxcolidx + idx for idx in local_indices)
+                mxcolidx += n_cols
+            else:
+                deferred_cats.append((len(indices), len(parts), n_cols))
+                indices.extend(local_indices)
+            matrices.extend(parts)
+
+        # All other numerical dtypes, including pandas sparse columns
+        elif is_numeric_dtype(coldata):
+            # The share of non-zero entries decides sparse vs. dense storage.
+            if (coldata != 0).mean() <= sparse_threshold:
+                if not isinstance(coldata.dtype, pd.SparseDtype):
+                    sparse_dtype = pd.SparseDtype(coldata.dtype, fill_value=0)
+                    coldata = coldata.astype(sparse_dtype)
+                sparse_cols.append(coldata)
+                sparse_mxidx.append(mxcolidx)
+            else:
+                dense_dfidx.append(dfcolidx)
+                dense_mxidx.append(mxcolidx)
+            mxcolidx += 1
+
+        # dtype not handled yet
+        else:
+            ignored_cols.append((dfcolidx, colname))
+
+    if ignored_cols:
+        warnings.warn(
+            f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype."
+        )
+    if dense_dfidx:
+        matrices.append(DenseMatrix(df.iloc[:, dense_dfidx].astype(dtype)))
+        indices.append(dense_mxidx)
+    if sparse_cols:
+        sparse_df = pd.concat(sparse_cols, axis=1)
+        matrices.append(SparseMatrix(sparse_df.sparse.to_coo(), dtype=dtype))
+        indices.append(sparse_mxidx)
+
+    # In "end" mode the categorical indices are still local (0-based): shift
+    # every categorical column as a whole, after all numerical columns.
+    for first, n_parts, n_cols in deferred_cats:
+        for slot in range(first, first + n_parts):
+            indices[slot] = np.asarray(indices[slot]) + mxcolidx
+        mxcolidx += n_cols
+
+    if not matrices:
+        raise ValueError("DataFrame contained no valid column")
+    return SplitMatrix(matrices, indices) if len(matrices) > 1 else matrices[0]
diff --git a/tests/test_matrices.py b/tests/test_matrices.py
@@ -2,6 +2,7 @@
 from typing import List, Optional, Union
 
 import numpy as np
+import pandas as pd
 import pytest
 from scipy import sparse as sps
 
@@ -422,3 +423,40 @@ def test_indexing_range_row(mat: Union[mx.MatrixBase, mx.StandardizedMatrix]):
         res = res.A
     expected = mat.A[0:2, :]
     np.testing.assert_allclose(np.squeeze(res), expected)
+
+
+def test_pandas_to_matrix():
+    """Check that from_pandas sends every column kind to the right sub-matrix."""
+    n_rows = 50
+    # A single non-zero entry: density 1 / n_rows, well under sparse_threshold.
+    mostly_zero = np.zeros(n_rows, dtype=np.float64)
+    mostly_zero[0] = 1.0
+
+    # Insertion order fixes the column order: dense, sparse, then a 2-level
+    # and an n_rows-level categorical.
+    columns = {
+        "d": pd.Series(np.linspace(-10, 10, num=n_rows, dtype=np.float64)),
+        "s": pd.Series(mostly_zero, dtype=pd.SparseDtype("float", 0.0)),
+        "cl": pd.Categorical(np.tile(["a", "b"], n_rows // 2)),
+        "ch": pd.Categorical(np.arange(n_rows)),
+    }
+    df = pd.DataFrame(data=columns)
+
+    mat = mx.from_pandas(df, dtype=np.float64, sparse_threshold=0.3, cat_threshold=4)
+
+    # 1 dense + 1 sparse + 2 one-hot + n_rows categorical columns.
+    expected_shape = (n_rows, n_rows + 4)
+    assert mat.shape == expected_shape
+    assert len(mat.matrices) == 3
+    assert isinstance(mat, mx.SplitMatrix)
+
+    expected_width = {
+        mx.DenseMatrix: 3,  # includes low-dimension categorical
+        mx.SparseMatrix: 1,  # sparse column
+        mx.CategoricalMatrix: n_rows,
+    }
+    # Each sub-matrix type must hold exactly the expected number of columns.
+    for part in mat.matrices:
+        n_part_cols = part.shape[1]
+        part_type = type(part)
+        assert n_part_cols == expected_width[part_type]