diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 2c8f66dd99e72..ee097c1f4d5e8 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -4710,7 +4710,8 @@ Several caveats. indexes. This extra column can cause problems for non-Pandas consumers that are not expecting it. You can force including or omitting indexes with the ``index`` argument, regardless of the underlying engine. * Index level names, if specified, must be strings. -* Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype. +* In the ``pyarrow`` engine, categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as their primitive dtype. +* The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag. * Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message on an attempt at serialization. 
@@ -4734,7 +4735,9 @@ See the documentation for `pyarrow `__ an 'd': np.arange(4.0, 7.0, dtype='float64'), 'e': [True, False, True], 'f': pd.date_range('20130101', periods=3), - 'g': pd.date_range('20130101', periods=3, tz='US/Eastern')}) + 'g': pd.date_range('20130101', periods=3, tz='US/Eastern'), + 'h': pd.Categorical(list('abc')), + 'i': pd.Categorical(list('abc'), ordered=True)}) df df.dtypes diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index f8c4f9f3dc410..2b147f948adb1 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -176,6 +176,7 @@ Categorical - Added test to assert the :func:`fillna` raises the correct ValueError message when the value isn't a value from categories (:issue:`13628`) - Bug in :meth:`Categorical.astype` where ``NaN`` values were handled incorrectly when casting to int (:issue:`28406`) - :meth:`Categorical.searchsorted` and :meth:`CategoricalIndex.searchsorted` now work on unordered categoricals also (:issue:`21667`) +- Added test to assert roundtripping to parquet with :meth:`DataFrame.to_parquet` and :func:`read_parquet` will preserve Categorical dtypes for string types (:issue:`27955`) - diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index efc2b6d6c5b3d..2a95904d5668d 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -167,6 +167,7 @@ def compare(repeat): df.to_parquet(path, **write_kwargs) with catch_warnings(record=True): actual = read_parquet(path, **read_kwargs) + tm.assert_frame_equal(expected, actual, check_names=check_names) if path is None: @@ -461,11 +462,26 @@ def test_unsupported(self, pa): def test_categorical(self, pa): # supported in >= 0.7.0 - df = pd.DataFrame({"a": pd.Categorical(list("abc"))}) + df = pd.DataFrame() + df["a"] = pd.Categorical(list("abcdef")) - # de-serialized as object - expected = df.assign(a=df.a.astype(object)) - check_round_trip(df, pa, expected=expected) + # test for 
null, out-of-order values, and unobserved category + df["b"] = pd.Categorical( + ["bar", "foo", "foo", "bar", None, "bar"], + dtype=pd.CategoricalDtype(["foo", "bar", "baz"]), + ) + + # test for ordered flag + df["c"] = pd.Categorical( + ["a", "b", "c", "a", "c", "b"], categories=["b", "c", "d"], ordered=True + ) + + if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.0"): + check_round_trip(df, pa) + else: + # de-serialized as object for pyarrow < 0.15 + expected = df.astype(object) + check_round_trip(df, pa, expected=expected) def test_s3_roundtrip(self, df_compat, s3_resource, pa): # GH #19134