Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add Series|Expr.rank #1342

Draft
wants to merge 9 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api-reference/expr.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
- over
- pipe
- quantile
- rank
- replace_strict
- round
- sample
Expand Down
1 change: 1 addition & 0 deletions docs/api-reference/series.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
- null_count
- pipe
- quantile
- rank
- rename
- replace_strict
- round
Expand Down
10 changes: 10 additions & 0 deletions narwhals/_arrow/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,16 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]:
# Delegates to `reuse_series_implementation`, passing this expression and the
# method name "mode", and returns its result unchanged.
def mode(self: Self) -> Self:
return reuse_series_implementation(self, "mode")

def rank(
    self: Self,
    method: Literal["average", "min", "max", "dense", "ordinal"],
    *,
    descending: bool,
) -> Self:
    """Rank the expression by delegating to the series-level ``rank``.

    Arguments:
        method: Tie-handling method, forwarded unchanged as a keyword.
        descending: Whether to rank in descending order, forwarded unchanged
            as a keyword.

    Returns:
        The result of `reuse_series_implementation` for the "rank" method.
    """
    forwarded = {"method": method, "descending": descending}
    return reuse_series_implementation(self, "rank", **forwarded)

# Accessor property: each access builds a new ArrowExprDateTimeNamespace
# wrapping this expression.
@property
def dt(self: Self) -> ArrowExprDateTimeNamespace:
return ArrowExprDateTimeNamespace(self)
Expand Down
27 changes: 27 additions & 0 deletions narwhals/_arrow/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -746,6 +746,33 @@ def mode(self: Self) -> ArrowSeries:
plx.col(col_token) == plx.col(col_token).max()
)[self.name]

def rank(
    self: Self,
    method: Literal["average", "min", "max", "dense", "ordinal"],
    *,
    descending: bool,
) -> Self:
    """Rank the values of the series with pyarrow, keeping nulls as nulls.

    Arguments:
        method: Tie-handling method. "ordinal" is passed to pyarrow as the
            "first" tiebreaker; "min", "max" and "dense" are passed through.
        descending: Rank in descending order when True, ascending otherwise.

    Returns:
        A new series built from the ranks via `_from_native_series`, with
        null in every position where the input was null.

    Raises:
        ValueError: If `method` is "average", which this backend rejects.
    """
    if method == "average":
        msg = (
            "`rank` with `method='average'` is not supported for pyarrow backend. "
            "The available methods are {'min', 'max', 'dense', 'ordinal'}."
        )
        raise ValueError(msg)

    # Imported lazily, after argument validation.
    import pyarrow as pa  # ignore-banned-import
    import pyarrow.compute as pc  # ignore-banned-import

    sort_keys = "descending" if descending else "ascending"
    tiebreaker = "first" if method == "ordinal" else method

    native_series = self._native_series
    null_mask = pc.is_null(native_series)

    ranked = pc.rank(native_series, sort_keys=sort_keys, tiebreaker=tiebreaker)

    # Replace the rank at originally-null positions with null so that missing
    # values stay missing in the output.
    result = pc.if_else(null_mask, pa.scalar(None), ranked)
    return self._from_native_series(result)

# Iterates over the series by yielding every item produced by the native
# series' own iterator.
def __iter__(self: Self) -> Iterator[Any]:
yield from self._native_series.__iter__()

Expand Down
10 changes: 10 additions & 0 deletions narwhals/_pandas_like/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,16 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self:
# Delegates to `reuse_series_implementation`, passing this expression and the
# method name "mode", and returns its result unchanged.
def mode(self: Self) -> Self:
return reuse_series_implementation(self, "mode")

def rank(
    self: Self,
    method: Literal["average", "min", "max", "dense", "ordinal"],
    *,
    descending: bool,
) -> Self:
    """Rank the expression through the shared series-level ``rank`` method.

    Arguments:
        method: Tie-handling method, passed on as the `method` keyword.
        descending: Ranking direction flag, passed on as the `descending`
            keyword.

    Returns:
        Whatever `reuse_series_implementation` returns for the "rank" call.
    """
    series_method = "rank"
    rank_kwargs = {"method": method, "descending": descending}
    return reuse_series_implementation(self, series_method, **rank_kwargs)

# Accessor property: each access builds a new PandasLikeExprStringNamespace
# wrapping this expression.
@property
def str(self: Self) -> PandasLikeExprStringNamespace:
return PandasLikeExprStringNamespace(self)
Expand Down
50 changes: 50 additions & 0 deletions narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -719,6 +719,56 @@ def mode(self: Self) -> Self:
# Iterates over the series by yielding every item produced by the native
# series' own iterator.
def __iter__(self: Self) -> Iterator[Any]:
yield from self._native_series.__iter__()

def rank(
    self: Self,
    method: Literal["average", "min", "max", "dense", "ordinal"],
    *,
    descending: bool,
) -> Self:
    """Rank the values of the series, leaving missing values unranked.

    Arguments:
        method: Tie-handling method. "ordinal" is passed to the native
            library as "first"; every other value is passed through.
        descending: Rank in descending order when True.

    Returns:
        A new series built from the native ranks via `_from_native_series`.
    """
    pd_method = "first" if method == "ordinal" else method
    native_series = self._native_series
    rank_options = {
        "method": pd_method,
        "na_option": "keep",
        "ascending": not descending,
        "pct": False,
    }

    if (
        self._implementation is Implementation.PANDAS
        and self._backend_version < (3,)
        and self.dtype
        in {
            self._dtypes.Int64,
            self._dtypes.Int32,
            self._dtypes.Int16,
            self._dtypes.Int8,
            self._dtypes.UInt64,
            self._dtypes.UInt32,
            self._dtypes.UInt16,
            self._dtypes.UInt8,
        }
        and (null_mask := native_series.isna()).any()
    ):
        # crazy workaround for the case of `na_option="keep"` and nullable
        # integer dtypes. This should be supported in pandas > 3.0
        # https://github.com/pandas-dev/pandas/issues/56976
        #
        # Author's review note: `get_dtype_backend` could not be used here to
        # detect the nullable backend. It should not matter, since a
        # non-nullable backend would not produce an integer dtype when the
        # series contains nulls anyway.
        #
        # Fixed temporary column names are used instead of names derived from
        # `native_series.name`, so that an unnamed series (whose frame column
        # would be `0`, not `None`) can still be selected after ranking.
        value_col = "__nw_rank_values__"
        null_flag_col = "__nw_rank_is_null__"
        ranked_series = (
            native_series.to_frame(name=value_col)
            .assign(**{null_flag_col: null_mask})
            .groupby(null_flag_col)
            .rank(**rank_options)[value_col]
        )
        # Restore the caller-visible name (possibly None) on the result.
        ranked_series.name = native_series.name

    else:
        ranked_series = native_series.rank(**rank_options)

    return self._from_native_series(ranked_series)

# Accessor property: each access builds a new PandasLikeSeriesStringNamespace
# wrapping this series.
@property
def str(self) -> PandasLikeSeriesStringNamespace:
return PandasLikeSeriesStringNamespace(self)
Expand Down
87 changes: 87 additions & 0 deletions narwhals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -2385,6 +2385,93 @@ def mode(self: Self) -> Self:
"""
return self.__class__(lambda plx: self._call(plx).mode())

def rank(
    self: Self,
    method: Literal["average", "min", "max", "dense", "ordinal"] = "average",
    *,
    descending: bool = False,
) -> Self:
    """
    Assign ranks to data, dealing with ties appropriately.

    Notes:
        The resulting dtype may differ between backends.

    Arguments:
        method: The method used to assign ranks to tied elements.
            The following methods are available (default is 'average'):

            - 'average' : The average of the ranks that would have been assigned to
              all the tied values is assigned to each value.
            - 'min' : The minimum of the ranks that would have been assigned to all
              the tied values is assigned to each value. (This is also referred to
              as "competition" ranking.)
            - 'max' : The maximum of the ranks that would have been assigned to all
              the tied values is assigned to each value.
            - 'dense' : Like 'min', but the rank of the next highest element is
              assigned the rank immediately after those assigned to the tied
              elements.
            - 'ordinal' : All values are given a distinct rank, corresponding to the
              order that the values occur in the Series.

        descending: Rank in descending order.

    Returns:
        A new expression that applies `rank` with the given arguments.

    Raises:
        ValueError: If `method` is not one of the supported ranking methods.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> data = {"a": [3, 6, 1, 1, 6]}

        We define a dataframe-agnostic function that computes the dense rank for
        the data:

        >>> @nw.narwhalify
        ... def func(df):
        ...     return df.with_columns(rnk=nw.col("a").rank(method="dense"))

        We can then pass any supported library such as pandas, Polars, or PyArrow:

        >>> func(pl.DataFrame(data))
        shape: (5, 2)
        ┌─────┬─────┐
        │ a   ┆ rnk │
        │ --- ┆ --- │
        │ i64 ┆ u32 │
        ╞═════╪═════╡
        │ 3   ┆ 2   │
        │ 6   ┆ 3   │
        │ 1   ┆ 1   │
        │ 1   ┆ 1   │
        │ 6   ┆ 3   │
        └─────┴─────┘

        >>> func(pd.DataFrame(data))
           a  rnk
        0  3  2.0
        1  6  3.0
        2  1  1.0
        3  1  1.0
        4  6  3.0

        >>> func(pa.table(data))
        pyarrow.Table
        a: int64
        rnk: uint64
        ----
        a: [[3,6,1,1,6]]
        rnk: [[2,3,1,1,3]]
    """
    # A tuple (rather than a set) keeps the check usable for unhashable input
    # and lets the error message below list the methods in a fixed order.
    supported_rank_methods = ("average", "min", "max", "dense", "ordinal")
    if method not in supported_rank_methods:
        msg = (
            "Ranking method must be one of {'average', 'min', 'max', 'dense', 'ordinal'}. "
            f"Found '{method}'"
        )
        raise ValueError(msg)

    return self.__class__(
        lambda plx: self._call(plx).rank(method=method, descending=descending)
    )

# Accessor property: each access builds a new ExprStringNamespace wrapping
# this expression.
@property
def str(self: Self) -> ExprStringNamespace[Self]:
return ExprStringNamespace(self)
Expand Down
89 changes: 88 additions & 1 deletion narwhals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2601,6 +2601,93 @@ def mode(self: Self) -> Self:
# Iterates over the series by yielding every item produced by the compliant
# series' own iterator.
def __iter__(self: Self) -> Iterator[Any]:
yield from self._compliant_series.__iter__()

def rank(
    self: Self,
    method: Literal["average", "min", "max", "dense", "ordinal"] = "average",
    *,
    descending: bool = False,
) -> Self:
    """
    Assign ranks to data, dealing with ties appropriately.

    Notes:
        The resulting dtype may differ between backends.

    Arguments:
        method: The method used to assign ranks to tied elements.
            The following methods are available (default is 'average'):

            - 'average' : The average of the ranks that would have been assigned to
              all the tied values is assigned to each value.
            - 'min' : The minimum of the ranks that would have been assigned to all
              the tied values is assigned to each value. (This is also referred to
              as "competition" ranking.)
            - 'max' : The maximum of the ranks that would have been assigned to all
              the tied values is assigned to each value.
            - 'dense' : Like 'min', but the rank of the next highest element is
              assigned the rank immediately after those assigned to the tied
              elements.
            - 'ordinal' : All values are given a distinct rank, corresponding to the
              order that the values occur in the Series.

        descending: Rank in descending order.

    Returns:
        A new series holding the ranks computed by the underlying backend.

    Raises:
        ValueError: If `method` is not one of the supported ranking methods.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> data = [3, 6, 1, 1, 6]

        We define a dataframe-agnostic function that computes the dense rank for
        the data:

        >>> @nw.narwhalify
        ... def func(s):
        ...     return s.rank(method="dense")

        We can then pass any supported library such as pandas, Polars, or PyArrow:

        >>> func(pl.Series(data))  # doctest:+NORMALIZE_WHITESPACE
        shape: (5,)
        Series: '' [u32]
        [
           2
           3
           1
           1
           3
        ]

        >>> func(pd.Series(data))
        0    2.0
        1    3.0
        2    1.0
        3    1.0
        4    3.0
        dtype: float64

        >>> func(pa.chunked_array([data]))  # doctest:+ELLIPSIS
        <pyarrow.lib.ChunkedArray object at ...>
        [
          [
            2,
            3,
            1,
            1,
            3
          ]
        ]
    """
    # A tuple (rather than a set) keeps the check usable for unhashable input
    # and lets the error message below list the methods in a fixed order.
    supported_rank_methods = ("average", "min", "max", "dense", "ordinal")
    if method not in supported_rank_methods:
        msg = (
            "Ranking method must be one of {'average', 'min', 'max', 'dense', 'ordinal'}. "
            f"Found '{method}'"
        )
        raise ValueError(msg)

    return self._from_compliant_series(
        self._compliant_series.rank(method=method, descending=descending)
    )

# Accessor property: each access builds a new SeriesStringNamespace wrapping
# this series.
@property
def str(self: Self) -> SeriesStringNamespace[Self]:
return SeriesStringNamespace(self)
Expand Down Expand Up @@ -3245,7 +3332,7 @@ def to_datetime(self: Self, format: str | None = None) -> T: # noqa: A002
... def func(s):
... return s.str.to_datetime(format="%Y-%m-%d")

We can then pass any supported library such as pandas, Polars, or PyArrow::
We can then pass any supported library such as pandas, Polars, or PyArrow:

>>> func(s_pd)
0 2020-01-01
Expand Down
Loading
Loading