- 
          
- 
                Notifications
    You must be signed in to change notification settings 
- Fork 19.2k
ENH: better dtype inference when doing DataFrame reductions #52788
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 64 commits
1e7e563
              6397977
              0e797b9
              b846e70
              76ce594
              7644598
              51da9ef
              8d925cd
              d7d1989
              54bcb60
              03b8ce4
              e0af36f
              64d8d60
              a95e5b9
              e7a75e4
              2e64191
              b6c1dc8
              32f9a73
              8bf7ba8
              35b07c5
              6a390d4
              8dc2acf
              5a65c70
              9cb34ec
              8521f18
              82cd91e
              e0bc63e
              7cf26ae
              6330840
              efae9dc
              b585f3b
              52763ab
              d4f2a84
              7bfe3fe
              f48ea09
              bbd8cb8
              c6e9a80
              5200896
              26d4059
              b6bd75e
              44dcdce
              3ebcbff
              99d034e
              79df9db
              d01fc1d
              bc582f6
              a7fd1b1
              1781d30
              68fd316
              8ceb57d
              4375cb2
              f7b354c
              f91c6ca
              9a881fa
              9d50f85
              026696f
              f603de0
              a7e69ad
              772998f
              b20a289
              082ddd9
              8032514
              3a3ec95
              77992f7
              23f22fb
              3b8d8f0
              1e39b65
              1ed3e2d
              467073a
              dd0bfe8
              49334c7
              5634106
              f85deab
              6519712
              74410f6
              e7503dc
              24e2d11
              e3afa18
              899a2fb
              File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
|  | @@ -1549,6 +1549,12 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): | |
|  | ||
| return result.as_py() | ||
|  | ||
| def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs): | ||
|          | ||
| """Takes the result of ``_reduce`` and wraps it in an ndarray/ExtensionArray.""" | ||
| result = self._reduce_pyarrow(name, skipna=skipna, **kwargs) | ||
| result = pa.array([result.as_py()], type=result.type) | ||
| return type(self)(result) | ||
|  | ||
| def __setitem__(self, key, value) -> None: | ||
| """Set one or more values inplace. | ||
|  | ||
|  | ||
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
|  | @@ -2124,6 +2124,10 @@ def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]: | |
| # ------------------------------------------------------------------ | ||
| # Reductions | ||
|  | ||
| def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs): | ||
| result = self._reduce(name, skipna=skipna, **kwargs) | ||
| return type(self)([result], dtype=self.dtype) | ||
|          | ||
|  | ||
| def min(self, *, skipna: bool = True, **kwargs): | ||
| """ | ||
| The minimum value of the object. | ||
|  | ||
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
|  | @@ -34,6 +34,10 @@ | |
| Shape, | ||
| npt, | ||
| ) | ||
| from pandas.compat import ( | ||
| IS64, | ||
| is_platform_windows, | ||
| ) | ||
| from pandas.errors import AbstractMethodError | ||
| from pandas.util._decorators import doc | ||
| from pandas.util._validators import validate_fillna_kwargs | ||
|  | @@ -1088,13 +1092,22 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): | |
|  | ||
| # median, skew, kurt, sem | ||
| op = getattr(nanops, f"nan{name}") | ||
| result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) | ||
|  | ||
| axis = kwargs.pop("axis", None) | ||
| result = op(data, axis=axis, skipna=skipna, mask=mask, **kwargs) | ||
| if np.isnan(result): | ||
| return libmissing.NA | ||
|  | ||
| return result | ||
|  | ||
| def _reduce_and_wrap(self, name: str, *, skipna: bool = True, kwargs): | ||
| res = self._reduce(name=name, skipna=skipna, **kwargs) | ||
| if res is libmissing.NA: | ||
| return self._wrap_na_result(name=name, axis=0, mask_size=(1,)) | ||
| else: | ||
| res = res.reshape(1) | ||
| mask = np.zeros(1, dtype=bool) | ||
| return self._maybe_mask_result(res, mask) | ||
|  | ||
| def _wrap_reduction_result(self, name: str, result, *, skipna, axis): | ||
| if isinstance(result, np.ndarray): | ||
| if skipna: | ||
|  | @@ -1106,6 +1119,32 @@ def _wrap_reduction_result(self, name: str, result, *, skipna, axis): | |
| return self._maybe_mask_result(result, mask) | ||
| return result | ||
|  | ||
| def _wrap_na_result(self, *, name, axis, mask_size): | ||
| mask = np.ones(mask_size, dtype=bool) | ||
|  | ||
| float_dtyp = "float32" if self.dtype == "Float32" else "float64" | ||
| if name in ["mean", "median", "var", "std", "skew"]: | ||
| np_dtype = float_dtyp | ||
| elif name in ["min", "max"] or self.dtype.itemsize == 8: | ||
| np_dtype = self.dtype.numpy_dtype.name | ||
| else: | ||
| is_windows_or_32bit = is_platform_windows() or not IS64 | ||
| int_dtyp = "int32" if is_windows_or_32bit else "int64" | ||
| uint_dtyp = "uint32" if is_windows_or_32bit else "uint64" | ||
| np_dtype = {"b": int_dtyp, "i": int_dtyp, "u": uint_dtyp, "f": float_dtyp}[ | ||
| self.dtype.kind | ||
| ] | ||
|  | ||
| value = np.array([1], dtype=np_dtype) | ||
| return self._maybe_mask_result(value, mask=mask) | ||
|  | ||
| def _wrap_min_count_reduction_result( | ||
| self, name: str, result, *, skipna, min_count, axis | ||
| ): | ||
| if min_count == 0 and isinstance(result, np.ndarray): | ||
| return self._maybe_mask_result(result, np.zeros(result.shape, dtype=bool)) | ||
| return self._wrap_reduction_result(name, result, skipna=skipna, axis=axis) | ||
|  | ||
| def sum( | ||
| self, | ||
| *, | ||
|  | @@ -1123,7 +1162,9 @@ def sum( | |
| min_count=min_count, | ||
| axis=axis, | ||
| ) | ||
| return self._wrap_reduction_result("sum", result, skipna=skipna, axis=axis) | ||
| return self._wrap_min_count_reduction_result( | ||
| "sum", result, skipna=skipna, min_count=min_count, axis=axis | ||
| ) | ||
|  | ||
| def prod( | ||
| self, | ||
|  | @@ -1134,14 +1175,17 @@ def prod( | |
| **kwargs, | ||
| ): | ||
| nv.validate_prod((), kwargs) | ||
|  | ||
| result = masked_reductions.prod( | ||
| self._data, | ||
| self._mask, | ||
| skipna=skipna, | ||
| min_count=min_count, | ||
| axis=axis, | ||
| ) | ||
| return self._wrap_reduction_result("prod", result, skipna=skipna, axis=axis) | ||
| return self._wrap_min_count_reduction_result( | ||
| "prod", result, skipna=skipna, min_count=min_count, axis=axis | ||
| ) | ||
|  | ||
| def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): | ||
| nv.validate_mean((), kwargs) | ||
|  | @@ -1181,23 +1225,25 @@ def std( | |
|  | ||
| def min(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): | ||
| nv.validate_min((), kwargs) | ||
| return masked_reductions.min( | ||
| result = masked_reductions.min( | ||
| self._data, | ||
| self._mask, | ||
| skipna=skipna, | ||
| axis=axis, | ||
| ) | ||
| return self._wrap_reduction_result("min", result, skipna=skipna, axis=axis) | ||
|  | ||
| def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): | ||
| nv.validate_max((), kwargs) | ||
| return masked_reductions.max( | ||
| result = masked_reductions.max( | ||
| self._data, | ||
| self._mask, | ||
| skipna=skipna, | ||
| axis=axis, | ||
| ) | ||
| return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis) | ||
|  | ||
| def any(self, *, skipna: bool = True, **kwargs): | ||
| def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is it needed to add the   There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'll look into it, could be connected to your previous comment. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this works, and I've made another version, will see if it passes, and then I'll look into your other comments | ||
| """ | ||
| Return whether any element is truthy. | ||
|  | ||
|  | @@ -1216,6 +1262,7 @@ def any(self, *, skipna: bool = True, **kwargs): | |
| If `skipna` is False, the result will still be True if there is | ||
| at least one element that is truthy, otherwise NA will be returned | ||
| if there are NA's present. | ||
| axis : int, optional, default 0 | ||
| **kwargs : any, default None | ||
| Additional keywords have no effect but might be accepted for | ||
| compatibility with NumPy. | ||
|  | @@ -1259,7 +1306,6 @@ def any(self, *, skipna: bool = True, **kwargs): | |
| >>> pd.array([0, 0, pd.NA]).any(skipna=False) | ||
| <NA> | ||
| """ | ||
| kwargs.pop("axis", None) | ||
| nv.validate_any((), kwargs) | ||
|  | ||
| values = self._data.copy() | ||
|  | @@ -1278,7 +1324,7 @@ def any(self, *, skipna: bool = True, **kwargs): | |
| else: | ||
| return self.dtype.na_value | ||
|  | ||
| def all(self, *, skipna: bool = True, **kwargs): | ||
| def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): | ||
| """ | ||
| Return whether all elements are truthy. | ||
|  | ||
|  | @@ -1297,6 +1343,7 @@ def all(self, *, skipna: bool = True, **kwargs): | |
| If `skipna` is False, the result will still be False if there is | ||
| at least one element that is falsey, otherwise NA will be returned | ||
| if there are NA's present. | ||
| axis : int, optional, default 0 | ||
| **kwargs : any, default None | ||
| Additional keywords have no effect but might be accepted for | ||
| compatibility with NumPy. | ||
|  | @@ -1340,7 +1387,6 @@ def all(self, *, skipna: bool = True, **kwargs): | |
| >>> pd.array([1, 0, pd.NA]).all(skipna=False) | ||
| False | ||
| """ | ||
| kwargs.pop("axis", None) | ||
| nv.validate_all((), kwargs) | ||
|  | ||
| values = self._data.copy() | ||
|  | @@ -1350,7 +1396,7 @@ def all(self, *, skipna: bool = True, **kwargs): | |
| # bool, int, float, complex, str, bytes, | ||
| # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" | ||
| np.putmask(values, self._mask, self._truthy_value) # type: ignore[arg-type] | ||
| result = values.all() | ||
| result = values.all(axis=axis) | ||
|  | ||
| if skipna: | ||
| return result | ||
|  | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note that now you are using
`subset` again on this line, passing this `axis` is not doing anything (and would actually raise an error if you would pass `axis=1` here)
(it doesn't really matter in practice because we never call this with an axis=1, but seeing
`axis` passed through might give the false impression that this algo actually supports 2D data, while that is not the case)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@topper-123 can you address this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, I thought I had answered this, apparently not...
`func` here is either `np.min` or `np.max`, so supplying `axis=axis` will not raise here, but will work as expected AFAIKS.
Additionally, without the
`axis=axis` part, `func(subset)` is similar to `np.max|min(subset, axis=None)`. Not a problem for 1d arrays, but will be a problem if we ever want to support `df.min(axis=None)` using 2d masked arrays. (I'm not sure we want to support 2d masked arrays or are going all in on arrow?)