[SPARK-45065][PYTHON][PS] Support Pandas 2.1.0 #42793
Changes from 24 commits to the pandas-on-Spark `DataFrame` implementation:
```diff
@@ -1271,6 +1271,8 @@ def applymap(self, func: Callable[[Any], Any]) -> "DataFrame":
         This method applies a function that accepts and returns a scalar
         to every element of a DataFrame.
 
+        .. deprecated:: 4.0.0
+
         .. note:: this API executes the function once to infer the type which is
             potentially expensive, for instance, when the dataset is created after
             aggregations or sorting.
```
```diff
@@ -1321,11 +1323,76 @@ def applymap(self, func: Callable[[Any], Any]) -> "DataFrame":
         0   1.000000   4.494400
         1  11.262736  20.857489
         """
+        warnings.warn(
+            "DataFrame.applymap has been deprecated. Use DataFrame.map instead", FutureWarning
+        )
+
         # TODO: We can implement shortcut theoretically since it creates new DataFrame
         # anyway and we don't have to worry about operations on different DataFrames.
         return self._apply_series_op(lambda psser: psser.apply(func))
 
+    def map(self, func: Callable[[Any], Any]) -> "DataFrame":
+        """
+        Apply a function to a Dataframe elementwise.
+
+        This method applies a function that accepts and returns a scalar
+        to every element of a DataFrame.
+
+        .. versionadded:: 4.0.0
+            DataFrame.applymap was deprecated and renamed to DataFrame.map.
+
+        .. note:: this API executes the function once to infer the type which is
+            potentially expensive, for instance, when the dataset is created after
+            aggregations or sorting.
+
+            To avoid this, specify return type in ``func``, for instance, as below:
+
+            >>> def square(x) -> np.int32:
+            ...     return x ** 2
+
+            pandas-on-Spark uses return type hints and does not try to infer the type.
+
+        Parameters
+        ----------
+        func : callable
+            Python function returns a single value from a single value.
+
+        Returns
+        -------
+        DataFrame
+            Transformed DataFrame.
+
+        Examples
+        --------
+        >>> df = ps.DataFrame([[1, 2.12], [3.356, 4.567]])
+        >>> df
+               0      1
+        0  1.000  2.120
+        1  3.356  4.567
+
+        >>> def str_len(x) -> int:
+        ...     return len(str(x))
+        >>> df.map(str_len)
+           0  1
+        0  3  4
+        1  5  5
+
+        >>> def power(x) -> float:
+        ...     return x ** 2
+        >>> df.map(power)
+                   0          1
+        0   1.000000   4.494400
+        1  11.262736  20.857489
+
+        You can omit type hints and let pandas-on-Spark infer its type.
+
+        >>> df.map(lambda x: x ** 2)
+                   0          1
+        0   1.000000   4.494400
+        1  11.262736  20.857489
+        """
+        return self.applymap(func=func)
+
     # TODO: not all arguments are implemented comparing to pandas' for now.
     def aggregate(self, func: Union[List[str], Dict[Name, List[str]]]) -> "DataFrame":
         """Aggregate using one or more operations over the specified axis.
```
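Since `map` is just the renamed `applymap` (mirroring pandas 2.1's own rename), caller-side migration is mechanical. A minimal sketch of what the change looks like from user code, assuming the behavior in the diff above (illustrative, not part of the PR):

```python
import warnings
import pyspark.pandas as ps

df = ps.DataFrame([[1, 2.12], [3.356, 4.567]])

# Old spelling: now emits a FutureWarning pointing at DataFrame.map.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    squared = df.applymap(lambda x: x ** 2)
assert any(issubclass(w.category, FutureWarning) for w in caught)

# New spelling, matching pandas >= 2.1 naming.
squared = df.map(lambda x: x ** 2)
```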
```diff
@@ -5556,6 +5623,10 @@ def from_records(
         Parameters
         ----------
         data : ndarray (structured dtype), list of tuples, dict, or DataFrame
+
+            .. deprecated:: 4.0.0
+                Passing a DataFrame is deprecated.
+
         index : string, list of fields, array-like
             Field of array to use as the index, alternately a specific set of input labels to use
         exclude : sequence, default None
```
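For the `from_records` deprecation, the replacement for passing a DataFrame is the plain constructor. A small sketch under that assumption (illustrative, not part of the PR):

```python
import pyspark.pandas as ps

# Still supported: structured records.
df = ps.DataFrame.from_records([(1, 2), (3, 4)], columns=["a", "b"])

# Deprecated in 4.0.0: passing a DataFrame to from_records.
# copied = ps.DataFrame.from_records(df)
copied = ps.DataFrame(df)  # preferred equivalent
```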
```diff
@@ -5952,6 +6023,9 @@ def fillna(
             Method to use for filling holes in reindexed Series pad / ffill: propagate last valid
             observation forward to next valid backfill / bfill:
             use NEXT valid observation to fill gap
+
+            .. deprecated:: 4.0.0
+
         axis : {0 or `index`}
             1 and `columns` are not supported.
         inplace : boolean, default False
```
```diff
@@ -5963,6 +6037,8 @@ def fillna(
             this is the maximum number of entries along the entire axis where NaNs will be filled.
             Must be greater than 0 if not None
+
+            .. deprecated:: 4.0.0
 
         Returns
         -------
         DataFrame
```
```diff
@@ -6046,6 +6122,11 @@ def op(psser: ps.Series) -> ps.Series:
                 return psser._fillna(value=value, method=method, axis=axis, limit=limit)
 
         elif method is not None:
+            warnings.warn(
+                "DataFrame.fillna with 'method' is deprecated and will raise in a future version. "
+                "Use DataFrame.ffill() or DataFrame.bfill() instead.",
+                FutureWarning,
+            )
 
             def op(psser: ps.Series) -> ps.Series:
                 return psser._fillna(value=value, method=method, axis=axis, limit=limit)
```
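The replacement the warning text points at is the dedicated fill methods. A minimal caller-side sketch (illustrative):

```python
import numpy as np
import pyspark.pandas as ps

df = ps.DataFrame({"a": [1.0, np.nan, np.nan, 3.0]})

filled = df.fillna(method="ffill")  # deprecated: emits FutureWarning
filled = df.ffill()                 # preferred forward fill
filled = df.bfill()                 # preferred backward fill
```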
```diff
@@ -6121,6 +6202,21 @@ def replace(
             If value is a list or tuple, value should be of the same length with to_replace.
         inplace : boolean, default False
             Fill in place (do not create a new object)
+        limit : int, default None
+            Maximum size gap to forward or backward fill.
+
+            .. deprecated:: 4.0.0
+
+        regex : bool or str, default False
+            Whether to interpret to_replace and/or value as regular expressions.
+            If this is True then to_replace must be a string.
+            Alternatively, this could be a regular expression in which case to_replace must be None.
+        method : 'pad', default None
+            The method to use when for replacement, when to_replace is a scalar,
+            list or tuple and value is None.
+
+            .. deprecated:: 4.0.0
+
 
         Returns
         -------
```
```diff
@@ -6189,8 +6285,18 @@ def replace(
         3     Hulk       Smash
         """
         if method != "pad":
+            warnings.warn(
+                "The 'method' keyword in DataFrame.replace is deprecated "
+                "and will be removed in a future version.",
+                FutureWarning,
+            )
             raise NotImplementedError("replace currently works only for method='pad")
         if limit is not None:
+            warnings.warn(
+                "The 'limit' keyword in DataFrame.replace is deprecated "
+                "and will be removed in a future version.",
+                FutureWarning,
+            )
             raise NotImplementedError("replace currently works only when limit=None")
         if regex is not False:
             raise NotImplementedError("replace currently doesn't supports regex")
```
```diff
@@ -6221,6 +6327,13 @@ def op(psser: ps.Series) -> ps.Series:
                     return psser
 
         else:
+            if value is None:
+                warnings.warn(
+                    "DataFrame.replace without 'value' and with non-dict-like 'to_replace' "
+                    "is deprecated and will raise in a future version. "
+                    "Explicitly specify the new values instead.",
+                    FutureWarning,
+                )
 
             def op(psser: ps.Series) -> ps.Series:
                 return psser.replace(to_replace=to_replace, value=value, regex=regex)
```
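The migration for this case is to spell out the replacement value. A small sketch (illustrative; the sample values are not from the PR):

```python
import pyspark.pandas as ps

df = ps.DataFrame({"name": ["Ironman", "Captain America"]})

# Deprecated: non-dict-like to_replace with no value.
# df.replace("Ironman")
# Preferred: explicitly specify the new value.
df = df.replace(to_replace="Ironman", value="War Machine")
```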
```diff
@@ -6344,6 +6457,8 @@ def last(self, offset: Union[str, DateOffset]) -> "DataFrame":
         When having a DataFrame with dates as index, this function can
         select the last few rows based on a date offset.
 
+        .. deprecated:: 4.0.0
+
         Parameters
         ----------
         offset : str or DateOffset
```
```diff
@@ -6383,6 +6498,11 @@ def last(self, offset: Union[str, DateOffset]) -> "DataFrame":
         3 observed days in the dataset, and therefore data for 2018-04-11 was
         not returned.
         """
+        warnings.warn(
+            "last is deprecated and will be removed in a future version. "
+            "Please create a mask and filter using `.loc` instead",
+            FutureWarning,
+        )
         # Check index type should be format DateTime
         if not isinstance(self.index, ps.DatetimeIndex):
             raise TypeError("'last' only supports a DatetimeIndex")
```
```diff
@@ -6401,6 +6521,8 @@ def first(self, offset: Union[str, DateOffset]) -> "DataFrame":
         When having a DataFrame with dates as index, this function can
         select the first few rows based on a date offset.
 
+        .. deprecated:: 4.0.0
+
         Parameters
         ----------
         offset : str or DateOffset
```
```diff
@@ -6440,6 +6562,11 @@ def first(self, offset: Union[str, DateOffset]) -> "DataFrame":
         3 observed days in the dataset, and therefore data for 2018-04-13 was
         not returned.
         """
+        warnings.warn(
+            "first is deprecated and will be removed in a future version. "
+            "Please create a mask and filter using `.loc` instead",
+            FutureWarning,
+        )
         # Check index type should be format DatetimeIndex
         if not isinstance(self.index, ps.DatetimeIndex):
             raise TypeError("'first' only supports a DatetimeIndex")
```
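The mask-and-`.loc` replacement suggested by both warnings can be written against the index directly. A sketch using plain pandas for clarity (the same pattern applies on pandas-on-Spark; the offsets and index values are illustrative):

```python
import pandas as pd

idx = pd.date_range("2018-04-09", periods=4, freq="2D")
df = pd.DataFrame({"A": [1, 2, 3, 4]}, index=idx)

# Instead of the deprecated df.first("3D") / df.last("3D"):
first_3d = df.loc[df.index < df.index.min() + pd.Timedelta("3D")]
last_3d = df.loc[df.index > df.index.max() - pd.Timedelta("3D")]
```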
```diff
@@ -10524,12 +10651,12 @@ def stack(self) -> DataFrameOrSeries:
             kg    m
         cat  1.0  2.0
         dog  3.0  4.0
-        >>> df_multi_level_cols2.stack().sort_index()  # doctest: +SKIP
-                height  weight
-        cat kg     NaN     1.0
-            m      2.0     NaN
-        dog kg     NaN     3.0
-            m      4.0     NaN
+        >>> df_multi_level_cols2.stack().sort_index()
```
Inline comment from the PR author on the updated doctest: Column ordering bug is fixed in pandas: pandas-dev/pandas#53786.
```diff
+                weight  height
+        cat kg     1.0     NaN
+            m      NaN     2.0
+        dog kg     3.0     NaN
+            m      NaN     4.0
         """
         from pyspark.pandas.series import first_series
```
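The doctest change reflects that upstream fix: before pandas 2.1, `stack` returned the remaining column level lexicographically sorted (`height`, `weight`), while 2.1 preserves the original order (`weight`, `height`). A quick local check (illustrative, not part of the PR):

```python
import pandas as pd

cols = pd.MultiIndex.from_tuples([("weight", "kg"), ("height", "m")])
df = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]], index=["cat", "dog"], columns=cols)

# On pandas >= 2.1 this prints ['weight', 'height'] (original order preserved).
print(df.stack().sort_index().columns.tolist())
```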
```diff
@@ -10555,8 +10682,6 @@ def stack(self) -> DataFrameOrSeries:
                     index_values.add(value)
 
-        column_labels = dict(sorted(column_labels.items(), key=lambda x: x[0]))
-
         index_name = self._internal.column_label_names[-1]
         column_label_names = self._internal.column_label_names[:-1]
         if len(column_label_names) == 0:
```
Reviewer comment: Can we add a line here where we tell users to have pandas version 2.1.0 installed for Spark 4.0? The only way now to find which pandas version to install is to check the Dockerfile in dev/infra: https://github.com/jupyter/docker-stacks/blob/52a999a554fe42951e017f7be132d808695a1261/images/pyspark-notebook/Dockerfile#L69
Reply from the PR author: Good idea. Related information has been added to the top of the migration guide. Thanks!
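Until that guide is published, a defensive import-time check is one way for users to catch an incompatible pandas early. A hypothetical sketch (the 2.1.0 floor mirrors the version this PR targets; this is not actual Spark code):

```python
import pandas as pd
from packaging.version import Version

# Hypothetical guard mirroring the pandas version this PR upgrades to.
if Version(pd.__version__) < Version("2.1.0"):
    raise ImportError(
        f"pyspark.pandas for Spark 4.0 expects pandas >= 2.1.0, found {pd.__version__}"
    )
```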