Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pandas to matrix function #16

Merged
merged 8 commits into from
Jul 20, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/quantcore/matrix/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .categorical_matrix import CategoricalMatrix
from .constructor import from_pandas
from .dense_matrix import DenseMatrix
from .matrix_base import MatrixBase, one_over_var_inf_to_val
from .sparse_matrix import SparseMatrix
Expand All @@ -14,4 +15,5 @@
"CategoricalMatrix",
"csc_to_split",
"one_over_var_inf_to_val",
"from_pandas",
]
154 changes: 154 additions & 0 deletions src/quantcore/matrix/constructor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
import warnings
from typing import List, Union

import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype

from .categorical_matrix import CategoricalMatrix
from .dense_matrix import DenseMatrix
from .matrix_base import MatrixBase
from .sparse_matrix import SparseMatrix
from .split_matrix import SplitMatrix, split_sparse_and_dense_parts


def from_pandas(
    df: pd.DataFrame,
    dtype: np.dtype = np.float64,
    sparse_threshold: float = 0.1,
    cat_threshold: int = 4,
    object_as_cat: bool = False,
    cat_position: str = "expand",
) -> MatrixBase:
    """
    Transform a pandas.DataFrame into an efficient SplitMatrix.

    The input DataFrame is only read: dtype conversions (object -> category,
    dense -> sparse) are applied to per-column copies, never written back.

    Parameters
    ----------
    df : pd.DataFrame
        pandas DataFrame to be converted.
    dtype : np.dtype, default np.float64
        dtype of all sub-matrices of the resulting SplitMatrix.
    sparse_threshold : float, default 0.1
        Density threshold below which numerical columns will be stored in a sparse
        format.
    cat_threshold : int, default 4
        Number of levels of a categorical column under which the column will be
        stored as sparse one-hot-encoded columns instead of CategoricalMatrix.
    object_as_cat : bool, default False
        If True, DataFrame columns stored as python objects will be treated as
        categorical columns.
    cat_position : str {'end'|'expand'}, default 'expand'
        Position of the categorical variable in the index. If "end", all the
        categoricals (including the ones that did not satisfy cat_threshold)
        will be placed at the end of the index list. If "expand", all the variables
        will remain in the same order.

    Returns
    -------
    MatrixBase
        A SplitMatrix when more than one sub-matrix was built, otherwise the
        single sub-matrix itself.

    Raises
    ------
    ValueError
        If ``cat_position`` is neither 'expand' nor 'end', or if no column of
        ``df`` has a dtype that can be converted.

    Warns
    -----
    UserWarning
        If some columns were ignored because their dtype is not handled.
    """
    if cat_position not in ("expand", "end"):
        raise ValueError(
            f"cat_position must be 'expand' or 'end', got {cat_position!r}."
        )
    expand_cats = cat_position == "expand"

    matrices: List[Union[DenseMatrix, SparseMatrix, CategoricalMatrix]] = []
    indices: List[List[int]] = []
    is_cat: List[bool] = []

    dense_dfidx = []  # column index in original DataFrame
    dense_mxidx = []  # index in the new SplitMatrix
    sparse_cols = []  # sparse-typed copies of the columns stored as sparse
    sparse_mxidx = []  # index in the new SplitMatrix
    ignored_cols = []

    # Next free column of the output. In "end" mode it only counts the
    # non-categorical columns until the final shift below.
    mxcolidx = 0
    # In "end" mode: number of categorical output columns deferred so far.
    n_deferred_cat_cols = 0

    # DataFrame.items() replaces DataFrame.iteritems(), removed in pandas 2.0.
    for dfcolidx, (colname, coldata) in enumerate(df.items()):
        # Convert on the local Series so the caller's DataFrame stays untouched.
        if object_as_cat and pd.api.types.is_object_dtype(coldata.dtype):
            coldata = coldata.astype("category")

        # categorical
        if isinstance(coldata.dtype, pd.CategoricalDtype):
            if len(coldata.cat.categories) < cat_threshold:
                # NOTE(review): the dummies are always built as float64 rather
                # than `dtype`; kept as-is because the requirements of
                # split_sparse_and_dense_parts are not visible here.
                (
                    X_dense_F,
                    X_sparse,
                    dense_indices,
                    sparse_indices,
                ) = split_sparse_and_dense_parts(
                    pd.get_dummies(
                        coldata, prefix=colname, sparse=True, dtype=np.float64
                    )
                    .sparse.to_coo()
                    .tocsc(),
                    threshold=sparse_threshold,
                )
                cat_matrices = [X_dense_F, X_sparse]
                # presumably integer arrays indexing into the one-hot block
                cat_indices = [np.asarray(dense_indices), np.asarray(sparse_indices)]
            else:
                cat = CategoricalMatrix(coldata, dtype=dtype)
                cat_matrices = [cat]
                cat_indices = [np.arange(cat.shape[1])]

            width = sum(len(idx) for idx in cat_indices)
            if expand_cats:
                offset = mxcolidx
                mxcolidx += width
            else:
                offset = n_deferred_cat_cols
                n_deferred_cat_cols += width

            # Every part of one categorical shares a single offset: the dense
            # and sparse index sets interleave inside the same one-hot block,
            # so shifting them independently would leave gaps in the index.
            for cat_matrix, local_idx in zip(cat_matrices, cat_indices):
                matrices.append(cat_matrix)
                indices.append(local_idx + offset)
                is_cat.append(True)

        # all numerical dtypes, including already-sparse columns
        elif is_numeric_dtype(coldata):
            # check if we want to store as sparse
            if (coldata != 0).mean() <= sparse_threshold:
                if not isinstance(coldata.dtype, pd.SparseDtype):
                    coldata = coldata.astype(
                        pd.SparseDtype(coldata.dtype, fill_value=0)
                    )
                sparse_cols.append(coldata)
                sparse_mxidx.append(mxcolidx)
            else:
                dense_dfidx.append(dfcolidx)
                dense_mxidx.append(mxcolidx)
            mxcolidx += 1

        # dtype not handled yet
        else:
            ignored_cols.append((dfcolidx, colname))

    if len(ignored_cols) > 0:
        warnings.warn(
            f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype."
        )
    if len(dense_dfidx) > 0:
        matrices.append(DenseMatrix(df.iloc[:, dense_dfidx].astype(dtype)))
        indices.append(dense_mxidx)
        is_cat.append(False)
    if len(sparse_cols) > 0:
        # Assemble the sparse block from the converted copies instead of
        # writing sparse columns back into `df` with iloc.
        sparse_frame = pd.concat(sparse_cols, axis=1)
        matrices.append(SparseMatrix(sparse_frame.sparse.to_coo(), dtype=dtype))
        indices.append(sparse_mxidx)
        is_cat.append(False)

    if not expand_cats:
        # mxcolidx now equals the number of non-categorical columns; move
        # every categorical block behind them.
        indices = [
            np.asarray(mat_indices) + mxcolidx if is_cat_ else mat_indices
            for mat_indices, is_cat_ in zip(indices, is_cat)
        ]

    if len(matrices) > 1:
        return SplitMatrix(matrices, indices)
    elif len(matrices) == 0:
        raise ValueError("DataFrame contained no valid column")
    else:
        return matrices[0]
38 changes: 38 additions & 0 deletions tests/test_matrices.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import List, Optional, Union

import numpy as np
import pandas as pd
import pytest
from scipy import sparse as sps

Expand Down Expand Up @@ -422,3 +423,40 @@ def test_indexing_range_row(mat: Union[mx.MatrixBase, mx.StandardizedMatrix]):
res = res.A
expected = mat.A[0:2, :]
np.testing.assert_allclose(np.squeeze(res), expected)


def test_pandas_to_matrix():
    """Check that from_pandas sends each kind of column to the expected sub-matrix."""
    n_rows = 50

    # A single non-zero entry keeps this column under the sparse threshold.
    mostly_zero = np.zeros(n_rows, dtype=np.float64)
    mostly_zero[0] = 1.0

    frame = pd.DataFrame(
        data={
            "d": pd.Series(np.linspace(-10, 10, num=n_rows, dtype=np.float64)),
            "s": pd.Series(mostly_zero, dtype=pd.SparseDtype("float", 0.0)),
            # two levels: below cat_threshold, so it is one-hot encoded
            "cl": pd.Categorical(np.tile(["a", "b"], n_rows // 2)),
            # one level per row: stays a CategoricalMatrix
            "ch": pd.Categorical(np.arange(n_rows)),
        }
    )

    result = mx.from_pandas(
        frame, dtype=np.float64, sparse_threshold=0.3, cat_threshold=4
    )

    assert result.shape == (n_rows, n_rows + 4)
    assert len(result.matrices) == 3
    assert isinstance(result, mx.SplitMatrix)

    expected_width = {
        mx.DenseMatrix: 3,  # includes low-dimension categorical
        mx.SparseMatrix: 1,  # sparse column
        mx.CategoricalMatrix: n_rows,
    }
    for part in result.matrices:
        assert part.shape[1] == expected_width[type(part)]