Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -375,8 +375,6 @@ Development Changes
Other API changes
^^^^^^^^^^^^^^^^^

- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last``
will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`)
- Added :meth:`DataFrame.value_counts` (:issue:`5377`)
- :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`)
- ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`)
Expand Down Expand Up @@ -741,6 +739,7 @@ Deprecations

- Lookups on a :class:`Series` with a single-item list containing a slice (e.g. ``ser[[slice(0, 4)]]``) are deprecated and will raise in a future version. Either convert the list to a tuple, or pass the slice directly instead (:issue:`31333`)

- :meth:`Series.describe` and :meth:`DataFrame.describe` treating datetime dtypes as categorical rather than numeric is deprecated. Specify ``datetime_is_numeric=True`` to show distribution percentiles for ``datetime`` dtypes; the statistics ``first`` and ``last`` are then reported as ``min`` and ``max``, matching numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`, :issue:`33903`)
- :meth:`DataFrame.mean` and :meth:`DataFrame.median` with ``numeric_only=None`` will include datetime64 and datetime64tz columns in a future version (:issue:`29941`)
- Setting values with ``.loc`` using a positional slice is deprecated and will raise in a future version. Use ``.loc`` with labels or ``.iloc`` with positions instead (:issue:`31840`)
- Passing short names for ``orient`` to :meth:`DataFrame.to_dict` is deprecated and will stop being accepted in a future version (:issue:`32515`)
Expand Down
47 changes: 42 additions & 5 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -9792,7 +9792,11 @@ def abs(self: FrameOrSeries) -> FrameOrSeries:
return np.abs(self)

def describe(
self: FrameOrSeries, percentiles=None, include=None, exclude=None
self: FrameOrSeries,
percentiles=None,
include=None,
exclude=None,
datetime_is_numeric=False,
) -> FrameOrSeries:
"""
Generate descriptive statistics.
Expand Down Expand Up @@ -9838,6 +9842,10 @@ def describe(
``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
exclude pandas categorical columns, use ``'category'``
- None (default) : The result will exclude nothing.
datetime_is_numeric : bool, default False
Whether to treat datetime dtypes as numeric.

.. versionadded:: 1.1.0

Returns
-------
Expand Down Expand Up @@ -9915,7 +9923,7 @@ def describe(
... np.datetime64("2010-01-01"),
... np.datetime64("2010-01-01")
... ])
>>> s.describe()
>>> s.describe(datetime_is_numeric=True)
count 3
mean 2006-09-01 08:00:00
min 2000-01-01 00:00:00
Expand Down Expand Up @@ -10073,8 +10081,37 @@ def describe_categorical_1d(data):
dtype = None
if result[1] > 0:
top, freq = objcounts.index[0], objcounts.iloc[0]
names += ["top", "freq"]
result += [top, freq]
if is_datetime64_any_dtype(data.dtype):
if self.ndim == 1:
stacklevel = 4
else:
stacklevel = 5
warnings.warn(
"Treating datetime data as categorical rather than numeric in "
"`.describe` is deprecated and will be removed in a future "
"version of pandas. Specify `datetime_is_numeric=True` to "
"silence this warning and adopt the future behavior now.",
FutureWarning,
stacklevel=stacklevel,
)
tz = data.dt.tz
asint = data.dropna().values.view("i8")
top = Timestamp(top)
if top.tzinfo is not None and tz is not None:
# Don't tz_localize(None) if key is already tz-aware
top = top.tz_convert(tz)
else:
top = top.tz_localize(tz)
names += ["top", "freq", "first", "last"]
result += [
top,
freq,
Timestamp(asint.min(), tz=tz),
Timestamp(asint.max(), tz=tz),
]
else:
names += ["top", "freq"]
result += [top, freq]

# If the DataFrame is empty, set 'top' and 'freq' to None
# to maintain output shape consistency
Expand All @@ -10100,7 +10137,7 @@ def describe_1d(data):
return describe_categorical_1d(data)
elif is_numeric_dtype(data):
return describe_numeric_1d(data)
elif is_datetime64_any_dtype(data.dtype):
elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
return describe_timestamp_1d(data)
elif is_timedelta64_dtype(data.dtype):
return describe_numeric_1d(data)
Expand Down
35 changes: 34 additions & 1 deletion pandas/tests/frame/methods/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,40 @@ def test_describe_tz_values(self, tz_naive_fixture):
},
index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
)
result = df.describe(include="all")
result = df.describe(include="all", datetime_is_numeric=True)
tm.assert_frame_equal(result, expected)

s1_ = s1.describe()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you make as a separate test

s2_ = pd.Series(
[
5,
5,
s2.value_counts().index[0],
1,
start.tz_localize(tz),
end.tz_localize(tz),
],
index=["count", "unique", "top", "freq", "first", "last"],
)
idx = [
"count",
"unique",
"top",
"freq",
"first",
"last",
"mean",
"std",
"min",
"25%",
"50%",
"75%",
"max",
]
expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).loc[idx]

with tm.assert_produces_warning(FutureWarning):
result = df.describe(include="all")
tm.assert_frame_equal(result, expected)

def test_describe_percentiles_integer_idx(self):
Expand Down
18 changes: 17 additions & 1 deletion pandas/tests/series/methods/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def test_describe_with_tz(self, tz_naive_fixture):
start = Timestamp(2018, 1, 1)
end = Timestamp(2018, 1, 5)
s = Series(date_range(start, end, tz=tz), name=name)
result = s.describe()
result = s.describe(datetime_is_numeric=True)
expected = Series(
[
5,
Expand All @@ -98,3 +98,19 @@ def test_describe_with_tz(self, tz_naive_fixture):
index=["count", "mean", "min", "25%", "50%", "75%", "max"],
)
tm.assert_series_equal(result, expected)

with tm.assert_produces_warning(FutureWarning):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same

result = s.describe()
expected = Series(
[
5,
5,
s.value_counts().index[0],
1,
start.tz_localize(tz),
end.tz_localize(tz),
],
name=name,
index=["count", "unique", "top", "freq", "first", "last"],
)
tm.assert_series_equal(result, expected)