From 0abf742d09f62b8d35099f595791e649271267f1 Mon Sep 17 00:00:00 2001 From: Marc-Antoine Schmidt Date: Fri, 10 Jul 2020 16:56:28 -0400 Subject: [PATCH 1/7] working prototype --- src/quantcore/matrix/__init__.py | 2 ++ src/quantcore/matrix/pandas.py | 39 ++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 src/quantcore/matrix/pandas.py diff --git a/src/quantcore/matrix/__init__.py b/src/quantcore/matrix/__init__.py index 210c7a44..8ef5b54c 100644 --- a/src/quantcore/matrix/__init__.py +++ b/src/quantcore/matrix/__init__.py @@ -1,6 +1,7 @@ from .categorical_matrix import CategoricalMatrix from .dense_matrix import DenseMatrix from .matrix_base import MatrixBase, one_over_var_inf_to_val +from .pandas import from_pandas from .sparse_matrix import SparseMatrix from .split_matrix import SplitMatrix, csc_to_split from .standardized_mat import StandardizedMatrix @@ -14,4 +15,5 @@ "CategoricalMatrix", "csc_to_split", "one_over_var_inf_to_val", + "from_pandas", ] diff --git a/src/quantcore/matrix/pandas.py b/src/quantcore/matrix/pandas.py new file mode 100644 index 00000000..cd77624e --- /dev/null +++ b/src/quantcore/matrix/pandas.py @@ -0,0 +1,39 @@ +import warnings + +import pandas as pd +import scipy.sparse as sps + +from .categorical_matrix import CategoricalMatrix +from .matrix_base import MatrixBase +from .split_matrix import SplitMatrix, csc_to_split + + +def from_pandas( + df: pd.DataFrame, + sparse_threshold: float = 0.1, + cat_threshold: int = 4, + object_as_cat: bool = False, +) -> MatrixBase: + """ + TODO: + - docstring + - tests + - efficiency + - consider changing filename + """ + if object_as_cat: + for colname in df.select_dtypes("object"): + df[colname] = df[colname].astype("category") + else: + if not df.select_dtypes(include=object).empty: + warnings.warn("DataFrame contains columns with object dtypes. 
Ignoring") + + categorical_component = df.select_dtypes(include=pd.CategoricalDtype) + X_cat = [] + for colname in categorical_component: + X_cat.append(CategoricalMatrix(categorical_component[colname])) + + numerical_component = df.select_dtypes(include="number") + X_noncat = csc_to_split(sps.csc_matrix(numerical_component)) + + return SplitMatrix([*X_noncat.matrices, *X_cat]) From 1d8bee701d7e2b8544e61019f4f7cb91ea4cacd9 Mon Sep 17 00:00:00 2001 From: Marc-Antoine Schmidt Date: Mon, 13 Jul 2020 14:24:40 -0400 Subject: [PATCH 2/7] more efficient implementation + docstring --- src/quantcore/matrix/pandas.py | 95 ++++++++++++++++++++++++++++------ 1 file changed, 78 insertions(+), 17 deletions(-) diff --git a/src/quantcore/matrix/pandas.py b/src/quantcore/matrix/pandas.py index cd77624e..81b6d9ec 100644 --- a/src/quantcore/matrix/pandas.py +++ b/src/quantcore/matrix/pandas.py @@ -1,39 +1,100 @@ import warnings +import numpy as np import pandas as pd -import scipy.sparse as sps +from pandas.api.types import is_numeric_dtype from .categorical_matrix import CategoricalMatrix +from .dense_matrix import DenseMatrix from .matrix_base import MatrixBase -from .split_matrix import SplitMatrix, csc_to_split +from .sparse_matrix import SparseMatrix +from .split_matrix import SplitMatrix def from_pandas( df: pd.DataFrame, + dtype: np.dtype = np.float64, sparse_threshold: float = 0.1, cat_threshold: int = 4, object_as_cat: bool = False, ) -> MatrixBase: """ - TODO: - - docstring - - tests - - efficiency - - consider changing filename + Transform a pandas.DataFrame into an efficient SplitMatrix + + Parameters + ---------- + df : pd.DataFrame + pandas DataFrame to be converted. + dtype : np.dtype, default np.float64 + dtype of all sub-matrices of the resulting SplitMatrix. + sparse_threshold : float, default 0.1 + Density threshold below which numerical columns will be stored in a sparse + format. 
+ cat_threshold : int, default 4 + Number of levels of a categorical column under which the column will be stored + as sparse one-hot-encoded columns instead of CategoricalMatrix + object_as_cat : bool, default False + If True, DataFrame columns stored as python objects will be treated as + categorical columns. + + Returns + ------- + SplitMatrix """ if object_as_cat: for colname in df.select_dtypes("object"): df[colname] = df[colname].astype("category") - else: - if not df.select_dtypes(include=object).empty: - warnings.warn("DataFrame contains columns with object dtypes. Ignoring") - categorical_component = df.select_dtypes(include=pd.CategoricalDtype) - X_cat = [] - for colname in categorical_component: - X_cat.append(CategoricalMatrix(categorical_component[colname])) + matrices = [] + sparse_ohe_comp = [] + sparse_idx = [] + dense_idx = [] + ignored_cols = [] + for colidx, (colname, coldata) in enumerate(df.iteritems()): + # categorical + if isinstance(coldata.dtype, pd.CategoricalDtype): + if len(coldata.cat.categories) < cat_threshold: + sparse_ohe_comp.append( + pd.get_dummies(coldata, prefix=colname, sparse=True) + ) + else: + matrices.append(CategoricalMatrix(coldata, dtype=dtype)) + + # sparse data, keep in sparse format even if density is larger than threshold + elif isinstance(coldata.dtype, pd.SparseDtype): + sparse_idx.append(colidx) - numerical_component = df.select_dtypes(include="number") - X_noncat = csc_to_split(sps.csc_matrix(numerical_component)) + # All other numerical dtypes (needs to be after pd.SparseDtype) + elif is_numeric_dtype(coldata): + # check if we want to store as sparse + if (coldata != 0).mean() <= sparse_threshold: + sparse_dtype = pd.SparseDtype(coldata.dtype, fill_value=0) + df.iloc[:, colidx] = df.iloc[:, colidx].astype(sparse_dtype) + sparse_idx.append(colidx) + else: + dense_idx.append(colidx) - return SplitMatrix([*X_noncat.matrices, *X_cat]) + # dtype not handled yet + else: + ignored_cols.append((colidx, colname)) + + if 
len(ignored_cols) > 0: + warnings.warn( + f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype." + ) + if len(dense_idx) > 0: + dense_comp = DenseMatrix(df.iloc[:, dense_idx].astype(dtype)) + matrices.append(dense_comp) + if len(sparse_idx) > 0: + sparse_comp = SparseMatrix(df.iloc[:, sparse_idx].sparse.to_coo(), dtype=dtype) + matrices.append(sparse_comp) + if len(sparse_ohe_comp) > 0: + sparse_ohe_comp = SparseMatrix( + pd.concat(sparse_ohe_comp, axis=1).sparse.to_coo(), dtype=dtype + ) + matrices.append(sparse_ohe_comp) + + if len(matrices) > 1: + return SplitMatrix(matrices) + else: + return matrices[0] From 875f466a0c30ba388e0e00bdb4e985b1a810d787 Mon Sep 17 00:00:00 2001 From: Marc-Antoine Schmidt Date: Thu, 16 Jul 2020 10:28:07 -0400 Subject: [PATCH 3/7] added simple test --- src/quantcore/matrix/__init__.py | 2 +- .../matrix/{pandas.py => constructor.py} | 0 tests/test_matrices.py | 38 +++++++++++++++++++ 3 files changed, 39 insertions(+), 1 deletion(-) rename src/quantcore/matrix/{pandas.py => constructor.py} (100%) diff --git a/src/quantcore/matrix/__init__.py b/src/quantcore/matrix/__init__.py index 8ef5b54c..34887106 100644 --- a/src/quantcore/matrix/__init__.py +++ b/src/quantcore/matrix/__init__.py @@ -1,7 +1,7 @@ from .categorical_matrix import CategoricalMatrix +from .constructor import from_pandas from .dense_matrix import DenseMatrix from .matrix_base import MatrixBase, one_over_var_inf_to_val -from .pandas import from_pandas from .sparse_matrix import SparseMatrix from .split_matrix import SplitMatrix, csc_to_split from .standardized_mat import StandardizedMatrix diff --git a/src/quantcore/matrix/pandas.py b/src/quantcore/matrix/constructor.py similarity index 100% rename from src/quantcore/matrix/pandas.py rename to src/quantcore/matrix/constructor.py diff --git a/tests/test_matrices.py b/tests/test_matrices.py index 4b3310f2..f4ebed6e 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -2,6 +2,7 @@ from 
typing import List, Optional, Union import numpy as np +import pandas as pd import pytest from scipy import sparse as sps @@ -407,3 +408,40 @@ def test_indexing_range_row(mat: Union[mx.MatrixBase, mx.StandardizedMatrix]): res = res.A expected = mat.A[0:2, :] np.testing.assert_allclose(np.squeeze(res), expected) + + +def test_pandas_to_matrix(): + n_rows = 10 + dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64) + sparse_column = np.zeros(n_rows, dtype=np.float64) + sparse_column[::10] = 1.0 + cat_column_lowdim = np.tile(["a", "b"], n_rows // 2) + cat_column_highdim = np.arange(n_rows) + + dense_ser = pd.Series(dense_column) + sparse_ser = pd.Series(sparse_column, dtype=pd.SparseDtype("float", 0.0)) + cat_ser_lowdim = pd.Categorical(cat_column_lowdim) + cat_ser_highdim = pd.Categorical(cat_column_highdim) + + df = pd.DataFrame( + data={ + "d": dense_ser, + "s": sparse_ser, + "cl": cat_ser_lowdim, + "ch": cat_ser_highdim, + } + ) + + mat = mx.from_pandas(df, sparse_threshold=0.3, cat_threshold=4) + + assert mat.shape == (n_rows, 14) + assert len(mat.matrices) == 3 + assert isinstance(mat, mx.SplitMatrix) + + nb_col_by_type = { + mx.DenseMatrix: 1, + mx.SparseMatrix: 3, # sparse column + low dimensional categorical + mx.CategoricalMatrix: n_rows, + } + for submat in mat.matrices: + assert submat.shape[1] == nb_col_by_type[type(submat)] From bcc840a66621f299b5be7a16821929a5f8b85ba9 Mon Sep 17 00:00:00 2001 From: Marc-Antoine Schmidt Date: Thu, 16 Jul 2020 18:03:30 -0400 Subject: [PATCH 4/7] keep ordering --- src/quantcore/matrix/constructor.py | 79 ++++++++++++++++++----------- 1 file changed, 50 insertions(+), 29 deletions(-) diff --git a/src/quantcore/matrix/constructor.py b/src/quantcore/matrix/constructor.py index 81b6d9ec..f1bae7fc 100644 --- a/src/quantcore/matrix/constructor.py +++ b/src/quantcore/matrix/constructor.py @@ -1,4 +1,5 @@ import warnings +from typing import List, Union import numpy as np import pandas as pd @@ -8,7 +9,7 @@ from 
.dense_matrix import DenseMatrix from .matrix_base import MatrixBase from .sparse_matrix import SparseMatrix -from .split_matrix import SplitMatrix +from .split_matrix import SplitMatrix, split_sparse_and_dense_parts def from_pandas( @@ -45,56 +46,76 @@ def from_pandas( for colname in df.select_dtypes("object"): df[colname] = df[colname].astype("category") - matrices = [] - sparse_ohe_comp = [] - sparse_idx = [] - dense_idx = [] + matrices: List[Union[DenseMatrix, SparseMatrix, CategoricalMatrix]] = [] + indices: List[List[int]] = [] + + dense_dfidx = [] # column index in original DataFrame + dense_mxidx = [] # index in the new SplitMatrix + sparse_dfidx = [] # column index in original DataFrame + sparse_mxidx = [] # index in the new SplitMatrix ignored_cols = [] - for colidx, (colname, coldata) in enumerate(df.iteritems()): + + mxcolidx = 0 + + for dfcolidx, (colname, coldata) in enumerate(df.iteritems()): # categorical if isinstance(coldata.dtype, pd.CategoricalDtype): if len(coldata.cat.categories) < cat_threshold: - sparse_ohe_comp.append( - pd.get_dummies(coldata, prefix=colname, sparse=True) + ( + X_dense_F, + X_sparse, + dense_indices, + sparse_indices, + ) = split_sparse_and_dense_parts( + pd.get_dummies(coldata, prefix=colname, sparse=True), + threshold=sparse_threshold, ) + matrices.append(X_dense_F) + indices.append(mxcolidx + dense_indices) + matrices.append(X_sparse) + indices.append(mxcolidx + sparse_indices) + mxcolidx += len(dense_indices) + len(sparse_indices) else: - matrices.append(CategoricalMatrix(coldata, dtype=dtype)) - - # sparse data, keep in sparse format even if density is larger than threshold - elif isinstance(coldata.dtype, pd.SparseDtype): - sparse_idx.append(colidx) + cat = CategoricalMatrix(coldata, dtype=dtype) + matrices.append(cat) + indices.append(mxcolidx + np.arange(cat.shape[1])) + mxcolidx += cat.shape[1] # All other numerical dtypes (needs to be after pd.SparseDtype) elif is_numeric_dtype(coldata): # check if we want to 
store as sparse if (coldata != 0).mean() <= sparse_threshold: - sparse_dtype = pd.SparseDtype(coldata.dtype, fill_value=0) - df.iloc[:, colidx] = df.iloc[:, colidx].astype(sparse_dtype) - sparse_idx.append(colidx) + if not isinstance(coldata.dtype, pd.SparseDtype): + sparse_dtype = pd.SparseDtype(coldata.dtype, fill_value=0) + df.iloc[:, dfcolidx] = coldata.astype(sparse_dtype) + sparse_dfidx.append(dfcolidx) + sparse_mxidx.append(mxcolidx) + mxcolidx += 1 else: - dense_idx.append(colidx) + dense_dfidx.append(dfcolidx) + dense_mxidx.append(mxcolidx) + mxcolidx += 1 # dtype not handled yet else: - ignored_cols.append((colidx, colname)) + ignored_cols.append((dfcolidx, colname)) if len(ignored_cols) > 0: warnings.warn( f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype." ) - if len(dense_idx) > 0: - dense_comp = DenseMatrix(df.iloc[:, dense_idx].astype(dtype)) - matrices.append(dense_comp) - if len(sparse_idx) > 0: - sparse_comp = SparseMatrix(df.iloc[:, sparse_idx].sparse.to_coo(), dtype=dtype) - matrices.append(sparse_comp) - if len(sparse_ohe_comp) > 0: - sparse_ohe_comp = SparseMatrix( - pd.concat(sparse_ohe_comp, axis=1).sparse.to_coo(), dtype=dtype + if len(dense_dfidx) > 0: + matrices.append(DenseMatrix(df.iloc[:, dense_dfidx].astype(dtype))) + indices.append(dense_mxidx) + if len(sparse_dfidx) > 0: + matrices.append( + SparseMatrix(df.iloc[:, sparse_dfidx].sparse.to_coo(), dtype=dtype) ) - matrices.append(sparse_ohe_comp) + indices.append(sparse_mxidx) if len(matrices) > 1: - return SplitMatrix(matrices) + return SplitMatrix(matrices, indices) + elif len(matrices) == 0: + raise ValueError("DataFrame contained no valid column") else: return matrices[0] From ad8e9e326f3445825db35e7b09acd2930ffd0b68 Mon Sep 17 00:00:00 2001 From: Marc-Antoine Schmidt Date: Thu, 16 Jul 2020 18:17:08 -0400 Subject: [PATCH 5/7] fix test --- src/quantcore/matrix/constructor.py | 6 +++++- tests/test_matrices.py | 12 ++++++------ 2 files changed, 11 
insertions(+), 7 deletions(-) diff --git a/src/quantcore/matrix/constructor.py b/src/quantcore/matrix/constructor.py index f1bae7fc..3f2ac9d2 100644 --- a/src/quantcore/matrix/constructor.py +++ b/src/quantcore/matrix/constructor.py @@ -67,7 +67,11 @@ def from_pandas( dense_indices, sparse_indices, ) = split_sparse_and_dense_parts( - pd.get_dummies(coldata, prefix=colname, sparse=True), + pd.get_dummies( + coldata, prefix=colname, sparse=True, dtype=np.float64 + ) + .sparse.to_coo() + .tocsc(), threshold=sparse_threshold, ) matrices.append(X_dense_F) diff --git a/tests/test_matrices.py b/tests/test_matrices.py index e2c18c1a..99e61aa2 100644 --- a/tests/test_matrices.py +++ b/tests/test_matrices.py @@ -426,10 +426,10 @@ def test_indexing_range_row(mat: Union[mx.MatrixBase, mx.StandardizedMatrix]): def test_pandas_to_matrix(): - n_rows = 10 + n_rows = 50 dense_column = np.linspace(-10, 10, num=n_rows, dtype=np.float64) sparse_column = np.zeros(n_rows, dtype=np.float64) - sparse_column[::10] = 1.0 + sparse_column[0] = 1.0 cat_column_lowdim = np.tile(["a", "b"], n_rows // 2) cat_column_highdim = np.arange(n_rows) @@ -447,15 +447,15 @@ def test_pandas_to_matrix(): } ) - mat = mx.from_pandas(df, sparse_threshold=0.3, cat_threshold=4) + mat = mx.from_pandas(df, dtype=np.float64, sparse_threshold=0.3, cat_threshold=4) - assert mat.shape == (n_rows, 14) + assert mat.shape == (n_rows, n_rows + 4) assert len(mat.matrices) == 3 assert isinstance(mat, mx.SplitMatrix) nb_col_by_type = { - mx.DenseMatrix: 1, - mx.SparseMatrix: 3, # sparse column + low dimensional categorical + mx.DenseMatrix: 3, # includes low-dimension categorical + mx.SparseMatrix: 1, # sparse column mx.CategoricalMatrix: n_rows, } for submat in mat.matrices: From b658a686154cec79171f95c1cb0baacff2166e09 Mon Sep 17 00:00:00 2001 From: Marc-Antoine Schmidt Date: Thu, 16 Jul 2020 18:46:03 -0400 Subject: [PATCH 6/7] let user choose categorical location --- src/quantcore/matrix/constructor.py | 41 
++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/src/quantcore/matrix/constructor.py b/src/quantcore/matrix/constructor.py index 3f2ac9d2..a8d3006b 100644 --- a/src/quantcore/matrix/constructor.py +++ b/src/quantcore/matrix/constructor.py @@ -18,6 +18,7 @@ def from_pandas( sparse_threshold: float = 0.1, cat_threshold: int = 4, object_as_cat: bool = False, + cat_position: str = "expand", ) -> MatrixBase: """ Transform a pandas.DataFrame into an efficient SplitMatrix @@ -37,6 +38,11 @@ def from_pandas( object_as_cat : bool, default False If True, DataFrame columns stored as python objects will be treated as categorical columns. + cat_position : str {'end'|'expand'}, default 'expand' + Position of the categorical variable in the index. If "end", all the + categoricals (including the ones that did not satisfy cat_threshold) + will be placed at the end of the index list. If "expand", all the variables + will remain in the same order. Returns ------- @@ -48,6 +54,7 @@ def from_pandas( matrices: List[Union[DenseMatrix, SparseMatrix, CategoricalMatrix]] = [] indices: List[List[int]] = [] + is_cat: List[bool] = [] dense_dfidx = [] # column index in original DataFrame dense_mxidx = [] # index in the new SplitMatrix @@ -75,16 +82,26 @@ def from_pandas( threshold=sparse_threshold, ) matrices.append(X_dense_F) - indices.append(mxcolidx + dense_indices) + is_cat.append(True) matrices.append(X_sparse) - indices.append(mxcolidx + sparse_indices) - mxcolidx += len(dense_indices) + len(sparse_indices) + is_cat.append(True) + if cat_position == "expand": + indices.append(mxcolidx + dense_indices) + indices.append(mxcolidx + sparse_indices) + mxcolidx += len(dense_indices) + len(sparse_indices) + elif cat_position == "end": + indices.append(dense_indices) + indices.append(sparse_indices) + else: cat = CategoricalMatrix(coldata, dtype=dtype) matrices.append(cat) - indices.append(mxcolidx + np.arange(cat.shape[1])) - mxcolidx += cat.shape[1] -
+ is_cat.append(True) + if cat_position == "expand": + indices.append(mxcolidx + np.arange(cat.shape[1])) + mxcolidx += cat.shape[1] + elif cat_position == "end": + indices.append(np.arange(cat.shape[1])) # All other numerical dtypes (needs to be after pd.SparseDtype) elif is_numeric_dtype(coldata): # check if we want to store as sparse @@ -111,11 +128,23 @@ def from_pandas( if len(dense_dfidx) > 0: matrices.append(DenseMatrix(df.iloc[:, dense_dfidx].astype(dtype))) indices.append(dense_mxidx) + is_cat.append(False) if len(sparse_dfidx) > 0: matrices.append( SparseMatrix(df.iloc[:, sparse_dfidx].sparse.to_coo(), dtype=dtype) ) indices.append(sparse_mxidx) + is_cat.append(False) + + if cat_position == "end": + new_indices = [] + for mat_indices, is_cat_ in zip(indices, is_cat): + if is_cat: + new_indices.append(np.asarray(mat_indices) + mxcolidx) + mxcolidx += len(mat_indices) + else: + new_indices.append(mat_indices) + indices = new_indices if len(matrices) > 1: return SplitMatrix(matrices, indices) From 29586bf328d7ef852cf15d94f6be43d0e5fe9f32 Mon Sep 17 00:00:00 2001 From: Marc-Antoine Schmidt Date: Thu, 16 Jul 2020 18:51:40 -0400 Subject: [PATCH 7/7] typo --- src/quantcore/matrix/constructor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/quantcore/matrix/constructor.py b/src/quantcore/matrix/constructor.py index a8d3006b..4a54965c 100644 --- a/src/quantcore/matrix/constructor.py +++ b/src/quantcore/matrix/constructor.py @@ -139,7 +139,7 @@ def from_pandas( if cat_position == "end": new_indices = [] for mat_indices, is_cat_ in zip(indices, is_cat): - if is_cat: + if is_cat_: new_indices.append(np.asarray(mat_indices) + mxcolidx) mxcolidx += len(mat_indices) else: