-
Notifications
You must be signed in to change notification settings - Fork 205
feat: add DataFrame.top_k and LazyFrame.top_k
#2977
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
181a723
a1fb7c6
6cd961b
e920e66
6cee5de
d69f663
32d9708
e2b76e1
5c8ffdb
68a505f
69f6bc5
ad7ea79
cafb18d
c57a406
bb1f349
dbc0c72
e45b151
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -43,6 +43,7 @@ | |
| - shape | ||
| - sort | ||
| - tail | ||
| - top_k | ||
| - to_arrow | ||
| - to_dict | ||
| - to_native | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,6 +25,7 @@ | |
| - sink_parquet | ||
| - sort | ||
| - tail | ||
| - top_k | ||
| - to_native | ||
| - unique | ||
| - unpivot | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -239,6 +239,14 @@ def sort( | |
| self._compliant_frame.sort(*by, descending=descending, nulls_last=nulls_last) | ||
| ) | ||
|
|
||
| def top_k( | ||
| self, k: int, *, by: str | Iterable[str], reverse: bool | Sequence[bool] = False | ||
| ) -> Self: | ||
| flatten_by = flatten([by]) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we add a check that if `reverse` is a sequence, its length matches the number of columns in `by`? From polars: df = pl.DataFrame(
{
"a": ["a", "b", "a", "b", "b", "c"],
"b": [2, 1, 1, 3, 2, 1],
}
)
df.top_k(4, by=["b", "a"], reverse=[True])
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @raisadz I would still prefer to add a check at this level to also align the error with polars (notice that the output of
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think there's some other places where this would be useful (like |
||
| return self._with_compliant( | ||
| self._compliant_frame.top_k(k, by=flatten_by, reverse=reverse) | ||
| ) | ||
|
|
||
| def join( | ||
| self, | ||
| other: Self, | ||
|
|
@@ -1855,6 +1863,43 @@ def sort( | |
| """ | ||
| return super().sort(by, *more_by, descending=descending, nulls_last=nulls_last) | ||
|
|
||
| def top_k( | ||
| self, k: int, *, by: str | Iterable[str], reverse: bool | Sequence[bool] = False | ||
| ) -> Self: | ||
| r"""Return the `k` largest rows. | ||
|
|
||
| Non-null elements are always preferred over null elements, | ||
| regardless of the value of reverse. The output is not guaranteed | ||
| to be in any particular order; sort the output afterwards if you wish it to be sorted. | |
|
|
||
| Arguments: | ||
| k: Number of rows to return. | ||
| by: Name(s) of the column(s) used to determine the top rows. | |
| reverse: Consider the k smallest elements of the by column(s) (instead of the k largest). | ||
| This can be specified per column by passing a sequence of booleans. | ||
|
|
||
| Returns: | ||
| The dataframe with the `k` largest rows. | ||
|
|
||
| Examples: | ||
| >>> import pandas as pd | ||
| >>> import narwhals as nw | ||
| >>> df_native = pd.DataFrame( | ||
| ... {"a": ["a", "b", "a", "b", None, "c"], "b": [2, 1, 1, 3, 2, 1]} | ||
| ... ) | ||
| >>> nw.from_native(df_native).top_k(4, by=["b", "a"]) | ||
| ┌──────────────────┐ | |
| |Narwhals DataFrame| | ||
| |------------------| | ||
| | a b | | ||
| | 3 b 3 | | ||
| | 0 a 2 | | ||
| | 4 None 2 | | ||
| | 5 c 1 | | ||
| └──────────────────┘ | |
| """ | ||
| return super().top_k(k, by=by, reverse=reverse) | ||
|
|
||
| def join( | ||
| self, | ||
| other: Self, | ||
|
|
@@ -3190,6 +3235,48 @@ def sort( | |
| """ | ||
| return super().sort(by, *more_by, descending=descending, nulls_last=nulls_last) | ||
|
|
||
| def top_k( | ||
| self, k: int, *, by: str | Iterable[str], reverse: bool | Sequence[bool] = False | ||
| ) -> Self: | ||
| r"""Return the `k` largest rows. | ||
|
|
||
| Non-null elements are always preferred over null elements, | ||
| regardless of the value of reverse. The output is not guaranteed | ||
| to be in any particular order; sort the output afterwards if you wish it to be sorted. | |
|
|
||
| Arguments: | ||
| k: Number of rows to return. | ||
| by: Name(s) of the column(s) used to determine the top rows. | |
| reverse: Consider the k smallest elements of the by column(s) (instead of the k largest). | ||
| This can be specified per column by passing a sequence of booleans. | ||
|
|
||
| Returns: | ||
| The LazyFrame with the `k` largest rows. | ||
|
|
||
| Examples: | ||
| >>> import duckdb | ||
| >>> import narwhals as nw | ||
| >>> df_native = duckdb.sql( | ||
| ... "SELECT * FROM VALUES ('a', 2), ('b', 1), ('a', 1), ('b', 3), (NULL, 2), ('c', 1) df(a, b)" | ||
| ... ) | ||
| >>> df = nw.from_native(df_native) | ||
| >>> df.top_k(4, by=["b", "a"]) | ||
| ┌───────────────────┐ | |
| |Narwhals LazyFrame | | |
| |-------------------| | |
| |┌─────────┬───────┐| | |
| |│    a    │   b   │| | |
| |│ varchar │ int32 │| | |
| |├─────────┼───────┤| | |
| |│ b       │     3 │| | |
| |│ a       │     2 │| | |
| |│ NULL    │     2 │| | |
| |│ c       │     1 │| | |
| |└─────────┴───────┘| | |
| └───────────────────┘ | |
| """ | ||
| return super().top_k(k, by=by, reverse=reverse) | ||
|
|
||
| def join( | ||
| self, | ||
| other: Self, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,45 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import narwhals as nw | ||
| from tests.utils import Constructor, assert_equal_data | ||
|
|
||
|
|
||
| def test_top_k(constructor: Constructor) -> None: | ||
| data = {"a": ["a", "f", "a", "d", "b", "c"], "b c": [2, 4, 5, 3, 6, 1]} | ||
| df = nw.from_native(constructor(data)) | ||
| result = df.top_k(4, by="b c") | ||
| expected = {"a": ["a", "b", "d", "f"], "b c": [5, 6, 3, 4]} | ||
| assert_equal_data(result.sort("a"), expected) | ||
| df = nw.from_native(constructor(data)) | ||
| result = df.top_k(4, by="b c", reverse=True) | ||
| expected = {"a": ["a", "c", "d", "f"], "b c": [2, 1, 3, 4]} | ||
| assert_equal_data(result.sort(by="a"), expected) | ||
|
|
||
|
|
||
| def test_top_k_by_multiple(constructor: Constructor) -> None: | ||
| data = { | ||
| "a": ["a", "f", "a", "d", "b", "c"], | ||
| "b": [2, 2, 2, 3, 1, 1], | ||
| "sf_c": ["k", "d", "s", "a", "a", "r"], | ||
| } | ||
| df = nw.from_native(constructor(data)) | ||
| result = df.top_k(4, by=["b", "sf_c"], reverse=True) | ||
| expected = { | ||
| "a": ["b", "f", "a", "c"], | ||
| "b": [1, 2, 2, 1], | ||
| "sf_c": ["a", "d", "k", "r"], | ||
| } | ||
| assert_equal_data(result.sort("sf_c"), expected) | ||
| data = { | ||
| "a": ["a", "f", "a", "d", "b", "c"], | ||
| "b": [2, 2, 2, 3, 1, 1], | ||
| "sf_c": ["k", "d", "s", "a", "a", "r"], | ||
| } | ||
| df = nw.from_native(constructor(data)) | ||
| result = df.top_k(4, by=["b", "sf_c"], reverse=[False, True]) | ||
| expected = { | ||
| "a": ["d", "f", "a", "a"], | ||
| "b": [3, 2, 2, 2], | ||
| "sf_c": ["a", "d", "k", "s"], | ||
| } | ||
| assert_equal_data(result.sort("sf_c"), expected) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you prefix the variable name with an underscore (`_df`), you can avoid the `# noqa: F841` flag. It's hacky, I know.