From 3e6f8d1d8cbd54a105a04216fd78b3e16ca07eb6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 13 Sep 2019 20:37:19 -0500 Subject: [PATCH] ARROW-6556: [Python] Handle future removal of pandas SparseDataFrame https://issues.apache.org/jira/browse/ARROW-6556 The plan in pandas is to remove SparseDataFrame/Series in pandas 1.0. By making sure our code works with that, we can ensure that a pandas release does not break the pyarrow release that is at that moment the latest stable release. (and this also makes it easier for me to develop on master branches of both together) This isn't merged yet in pandas, so we can also wait until that is done to merge this PR. I was just trying out some things in pandas and saw that the pyarrow feather tests were failing when we remove those classes. Closes #5377 from jorisvandenbossche/ARROW-6556-pandas-sparse and squashes the following commits: 705f0a725 ARROW-6556: Handle future removal of pandas SparseDataFrame Authored-by: Joris Van den Bossche Signed-off-by: Wes McKinney --- python/pyarrow/feather.py | 3 ++- python/pyarrow/pandas-shim.pxi | 7 +++++++ python/pyarrow/serialization.py | 6 ++++-- python/pyarrow/tests/test_feather.py | 4 ++++ 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py index 418bb7a37077d..432215237d7fa 100644 --- a/python/pyarrow/feather.py +++ b/python/pyarrow/feather.py @@ -82,7 +82,8 @@ def __init__(self, dest): self.writer.open(dest) def write(self, df): - if isinstance(df, _pandas_api.pd.SparseDataFrame): + if (_pandas_api.has_sparse + and isinstance(df, _pandas_api.pd.SparseDataFrame)): df = df.to_dense() if not df.columns.is_unique: diff --git a/python/pyarrow/pandas-shim.pxi b/python/pyarrow/pandas-shim.pxi index 35ce3eeb9fb8d..1281f03550e28 100644 --- a/python/pyarrow/pandas-shim.pxi +++ b/python/pyarrow/pandas-shim.pxi @@ -32,6 +32,7 @@ cdef class _PandasAPIShim(object): object _data_frame, _index, _series, _categorical_type object _datetimetz_type, _extension_array object _array_like_types + bint has_sparse def __init__(self): self._tried_importing_pandas = False @@ -81,6 +82,12 @@ cdef class _PandasAPIShim(object): self._datetimetz_type = DatetimeTZDtype self._have_pandas = True + try: + from pandas import SparseDataFrame + self.has_sparse = True + except ImportError: + self.has_sparse = False + cdef inline _check_import(self, bint raise_=True): if self._tried_importing_pandas: if not self._have_pandas and raise_: diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py index 3513c504874d3..f622b5ecab88d 100644 --- a/python/pyarrow/serialization.py +++ b/python/pyarrow/serialization.py @@ -163,7 +163,8 @@ def _register_custom_pandas_handlers(context): ) def _serialize_pandas_dataframe(obj): - if isinstance(obj, pd.SparseDataFrame): + if (pdcompat._pandas_api.has_sparse + and isinstance(obj, pd.SparseDataFrame)): raise NotImplementedError( sparse_type_error_msg.format('SparseDataFrame') ) @@ -174,7 +175,8 @@ def _deserialize_pandas_dataframe(data): return pdcompat.serialized_dict_to_dataframe(data) def _serialize_pandas_series(obj): - if isinstance(obj, pd.SparseSeries): + if (pdcompat._pandas_api.has_sparse + and isinstance(obj, pd.SparseSeries)): raise NotImplementedError( sparse_type_error_msg.format('SparseSeries') ) diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 137dfeaeaa9a5..dfbeacc48f981 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -515,7 +515,11 @@ def test_filelike_objects(self): result = read_feather(buf) assert_frame_equal(result, df) + @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") + @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") def test_sparse_dataframe(self): + if not hasattr(pd, 'SparseDataFrame'): + pytest.skip("version of pandas does not support SparseDataFrame") # GH #221 data = {'A': [0, 1, 2], 'B': [1, 0, 1]}