narwhals-dev · FBruzzesi · Nov 6, 2024 · Nov 7, 2024 · Nov 8, 2024 · Nov 9, 2024
diff --git a/docs/api-reference/expr.md b/docs/api-reference/expr.md
@@ -36,6 +36,7 @@
         - over
         - pipe
         - quantile
+        - rank
         - replace_strict
         - round
         - sample

diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md
@@ -43,6 +43,7 @@
         - null_count
         - pipe
         - quantile
+        - rank
         - rename
         - replace_strict
         - round

diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py
@@ -380,6 +380,16 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]:
     def mode(self: Self) -> Self:
         return reuse_series_implementation(self, "mode")
 
+    def rank(
+        self: Self,
+        method: Literal["average", "min", "max", "dense", "ordinal"],
+        *,
+        descending: bool,
+    ) -> Self:
+        return reuse_series_implementation(  # forwards to the series-level `rank`
+            self, "rank", descending=descending, method=method
+        )
+
     @property
     def dt(self: Self) -> ArrowExprDateTimeNamespace:
         return ArrowExprDateTimeNamespace(self)

diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py
@@ -746,6 +746,33 @@ def mode(self: Self) -> ArrowSeries:
             plx.col(col_token) == plx.col(col_token).max()
         )[self.name]
 
+    def rank(
+        self: Self,
+        method: Literal["average", "min", "max", "dense", "ordinal"],
+        *,
+        descending: bool,
+    ) -> Self:
+        """Rank values with `pyarrow.compute.rank`; nulls stay null in the result."""
+        if method == "average":
+            msg = (
+                "`rank` with `method='average'` is not supported for pyarrow backend. "
+                "The available methods are {'min', 'max', 'dense', 'ordinal'}."
+            )
+            raise ValueError(msg)
+
+        import pyarrow as pa  # ignore-banned-import
+        import pyarrow.compute as pc  # ignore-banned-import
+
+        # "ordinal" is what pyarrow calls the "first" tiebreaker.
+        tiebreaker = "first" if method == "ordinal" else method
+        sort_keys = "descending" if descending else "ascending"
+
+        native_series = self._native_series
+        ranked = pc.rank(native_series, sort_keys=sort_keys, tiebreaker=tiebreaker)
+        # `pc.rank` ranks nulls too, so null positions are blanked out again.
+        result = pc.if_else(pc.is_null(native_series), pa.scalar(None), ranked)
+        return self._from_native_series(result)
+
     def __iter__(self: Self) -> Iterator[Any]:
         yield from self._native_series.__iter__()
 

diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py
@@ -396,6 +396,16 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self:
     def mode(self: Self) -> Self:
         return reuse_series_implementation(self, "mode")
 
+    def rank(
+        self: Self,
+        method: Literal["average", "min", "max", "dense", "ordinal"],
+        *,
+        descending: bool,
+    ) -> Self:
+        return reuse_series_implementation(  # forwards to the series-level `rank`
+            self, "rank", descending=descending, method=method
+        )
+
     @property
     def str(self: Self) -> PandasLikeExprStringNamespace:
         return PandasLikeExprStringNamespace(self)

diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py
@@ -719,6 +719,56 @@ def mode(self: Self) -> Self:
     def __iter__(self: Self) -> Iterator[Any]:
         yield from self._native_series.__iter__()
 
+    def rank(
+        self: Self,
+        method: Literal["average", "min", "max", "dense", "ordinal"],
+        *,
+        descending: bool,
+    ) -> Self:
+        """Rank values with the native `rank`, keeping nulls unranked."""
+        native_series = self._native_series
+        rank_kwargs: dict[str, Any] = {
+            "method": "first" if method == "ordinal" else method,
+            "na_option": "keep",
+            "ascending": not descending,
+            "pct": False,
+        }
+
+        if (
+            self._implementation is Implementation.PANDAS
+            and self._backend_version < (3,)
+            and self.dtype
+            in {
+                self._dtypes.Int64,
+                self._dtypes.Int32,
+                self._dtypes.Int16,
+                self._dtypes.Int8,
+                self._dtypes.UInt64,
+                self._dtypes.UInt32,
+                self._dtypes.UInt16,
+                self._dtypes.UInt8,
+            }
+            and (null_mask := native_series.isna()).any()
+        ):
+            # crazy workaround for the case of `na_option="keep"` and nullable
+            # integer dtypes. This should be supported in pandas > 3.0
+            # https://github.com/pandas-dev/pandas/issues/56976
+            # The ranked column is taken by position and renamed back: `to_frame`
+            # labels an unnamed series `0`, so looking it up by `name` would fail.
+            is_null_col = f"{native_series.name}_is_null"
+            ranked_series = (
+                native_series.to_frame()
+                .assign(**{is_null_col: null_mask})
+                .groupby(is_null_col)
+                .rank(**rank_kwargs)
+                .iloc[:, 0]
+                .rename(native_series.name)
+            )
+        else:
+            ranked_series = native_series.rank(**rank_kwargs)
+
+        return self._from_native_series(ranked_series)
+
     @property
     def str(self) -> PandasLikeSeriesStringNamespace:
         return PandasLikeSeriesStringNamespace(self)

diff --git a/narwhals/expr.py b/narwhals/expr.py
@@ -2385,6 +2385,93 @@ def mode(self: Self) -> Self:
         """
         return self.__class__(lambda plx: self._call(plx).mode())
 
+    def rank(
+        self: Self,
+        method: Literal["average", "min", "max", "dense", "ordinal"] = "average",
+        *,
+        descending: bool = False,
+    ) -> Self:
+        """
+        Assign ranks to data, dealing with ties appropriately.
+
+        Notes:
+            The resulting dtype may differ between backends. The PyArrow backend does
+            not support `method='average'` (the default) and raises a `ValueError`.
+
+        Arguments:
+            method: The method used to assign ranks to tied elements.
+                The following methods are available (default is 'average'):
+
+                - 'average' : The average of the ranks that would have been assigned to
+                  all the tied values is assigned to each value.
+                - 'min' : The minimum of the ranks of all the tied values is assigned
+                  to each value (also known as "competition" ranking).
+                - 'max' : The maximum of the ranks that would have been assigned to all
+                  the tied values is assigned to each value.
+                - 'dense' : Like 'min', but the next highest element is assigned the
+                  rank immediately after the one assigned to the tied elements.
+                - 'ordinal' : All values are given a distinct rank, corresponding to the
+                  order that the values occur in the Series.
+
+            descending: Rank in descending order.
+
+        Returns:
+            A new expression with the rank of each value.
+
+        Examples:
+            >>> import narwhals as nw
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import pyarrow as pa
+            >>> data = {"a": [3, 6, 1, 1, 6]}
+
+            We define a dataframe-agnostic function that computes the dense rank:
+
+            >>> @nw.narwhalify
+            ... def func(df):
+            ...     return df.with_columns(rnk=nw.col("a").rank(method="dense"))
+
+            We can then pass any supported library such as pandas, Polars, or PyArrow:
+
+            >>> func(pl.DataFrame(data))
+            shape: (5, 2)
+            ┌─────┬─────┐
+            │ a   ┆ rnk │
+            │ --- ┆ --- │
+            │ i64 ┆ u32 │
+            ╞═════╪═════╡
+            │ 3   ┆ 2   │
+            │ 6   ┆ 3   │
+            │ 1   ┆ 1   │
+            │ 1   ┆ 1   │
+            │ 6   ┆ 3   │
+            └─────┴─────┘
+
+            >>> func(pd.DataFrame(data))
+               a  rnk
+            0  3  2.0
+            1  6  3.0
+            2  1  1.0
+            3  1  1.0
+            4  6  3.0
+
+            >>> func(pa.table(data))
+            pyarrow.Table
+            a: int64
+            rnk: uint64
+            ----
+            a: [[3,6,1,1,6]]
+            rnk: [[2,3,1,1,3]]
+        """
+
+        supported_rank_methods = {"average", "min", "max", "dense", "ordinal"}
+        if method in supported_rank_methods:
+            return self.__class__(
+                lambda plx: self._call(plx).rank(method=method, descending=descending)
+            )
+        msg = f"Ranking method must be one of {supported_rank_methods}. Found '{method}'"
+        raise ValueError(msg)
+
     @property
     def str(self: Self) -> ExprStringNamespace[Self]:
         return ExprStringNamespace(self)

diff --git a/narwhals/series.py b/narwhals/series.py
@@ -2601,6 +2601,93 @@ def mode(self: Self) -> Self:
     def __iter__(self: Self) -> Iterator[Any]:
         yield from self._compliant_series.__iter__()
 
+    def rank(
+        self: Self,
+        method: Literal["average", "min", "max", "dense", "ordinal"] = "average",
+        *,
+        descending: bool = False,
+    ) -> Self:
+        """
+        Assign ranks to data, dealing with ties appropriately.
+
+        Notes:
+            The resulting dtype may differ between backends. The PyArrow backend does
+            not support `method='average'` (the default) and raises a `ValueError`.
+
+        Arguments:
+            method: The method used to assign ranks to tied elements.
+                The following methods are available (default is 'average'):
+
+                - 'average' : The average of the ranks that would have been assigned to
+                  all the tied values is assigned to each value.
+                - 'min' : The minimum of the ranks of all the tied values is assigned
+                  to each value (also known as "competition" ranking).
+                - 'max' : The maximum of the ranks that would have been assigned to all
+                  the tied values is assigned to each value.
+                - 'dense' : Like 'min', but the next highest element is assigned the
+                  rank immediately after the one assigned to the tied elements.
+                - 'ordinal' : All values are given a distinct rank, corresponding to the
+                  order that the values occur in the Series.
+
+            descending: Rank in descending order.
+
+        Returns:
+            A new series with the rank of each value.
+
+        Examples:
+            >>> import narwhals as nw
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import pyarrow as pa
+            >>> data = [3, 6, 1, 1, 6]
+
+            We define a dataframe-agnostic function that computes the dense rank:
+
+            >>> @nw.narwhalify
+            ... def func(s):
+            ...     return s.rank(method="dense")
+
+            We can then pass any supported library such as pandas, Polars, or PyArrow:
+
+            >>> func(pl.Series(data))  # doctest:+NORMALIZE_WHITESPACE
+            shape: (5,)
+            Series: '' [u32]
+            [
+               2
+               3
+               1
+               1
+               3
+            ]
+
+            >>> func(pd.Series(data))
+            0    2.0
+            1    3.0
+            2    1.0
+            3    1.0
+            4    3.0
+            dtype: float64
+
+            >>> func(pa.chunked_array([data]))  # doctest:+ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                2,
+                3,
+                1,
+                1,
+                3
+              ]
+            ]
+        """
+        supported_rank_methods = {"average", "min", "max", "dense", "ordinal"}
+        if method in supported_rank_methods:
+            return self._from_compliant_series(
+                self._compliant_series.rank(method=method, descending=descending)
+            )
+        msg = f"Ranking method must be one of {supported_rank_methods}. Found '{method}'"
+        raise ValueError(msg)
+
     @property
     def str(self: Self) -> SeriesStringNamespace[Self]:
         return SeriesStringNamespace(self)
@@ -3245,7 +3332,7 @@ def to_datetime(self: Self, format: str | None = None) -> T:  # noqa: A002
             ... def func(s):
             ...     return s.str.to_datetime(format="%Y-%m-%d")
 
-            We can then pass any supported library such as pandas, Polars, or PyArrow::
+            We can then pass any supported library such as pandas, Polars, or PyArrow:
 
             >>> func(s_pd)
             0   2020-01-01