Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DOCS] Naming consistency of length functions #2942

Merged
merged 7 commits into from
Oct 5, 2024
14 changes: 14 additions & 0 deletions daft/expressions/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import math
import os
import warnings
from datetime import date, datetime, time
from decimal import Decimal
from typing import (
Expand Down Expand Up @@ -2936,6 +2937,19 @@ def count(self, mode: CountMode = CountMode.Valid) -> Expression:
def lengths(self) -> Expression:
"""Gets the length of each list

Returns:
Expression: a UInt64 expression which is the length of each list
"""
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you replace this docstring with something along the lines of

"(DEPRECATED) Please use Expression.list.length instead"

warnings.warn(
"This function will be deprecated from Daft version >= 0.3.5! Instead, please use 'Expression.list.length'",
category=DeprecationWarning,
)

return Expression._from_pyexpr(native.list_count(self._expr, CountMode.All))

def length(self) -> Expression:
"""Gets the length of each list

Returns:
Expression: a UInt64 expression which is the length of each list
"""
Expand Down
9 changes: 9 additions & 0 deletions daft/series.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import warnings
from typing import Any, Literal, TypeVar

from daft.arrow_utils import ensure_array, ensure_chunked_array
Expand Down Expand Up @@ -927,6 +928,14 @@ def iceberg_truncate(self, w: int) -> Series:

class SeriesListNamespace(SeriesNamespace):
def lengths(self) -> Series:
    """(DEPRECATED) Please use ``length`` instead.

    Emits a ``DeprecationWarning`` and then performs the same computation
    as ``length``: ``list_count`` on the underlying series with
    ``CountMode.All``.

    Returns:
        Series: the ``list_count(CountMode.All)`` result wrapped with
        ``Series._from_pyseries``.
    """
    warnings.warn(
        "This function will be deprecated from Daft version >= 0.3.5! Instead, please use 'length'",
        category=DeprecationWarning,
        # Attribute the warning to the caller's line rather than to this
        # wrapper, so the report points at the code that needs migrating.
        stacklevel=2,
    )

    return Series._from_pyseries(self._series.list_count(CountMode.All))

def length(self) -> Series:
    """Gets the length of each list.

    Returns:
        Series: the ``list_count(CountMode.All)`` result of the underlying
        series, wrapped with ``Series._from_pyseries``.
    """
    counted = self._series.list_count(CountMode.All)
    return Series._from_pyseries(counted)

def get(self, idx: Series, default: Series) -> Series:
Expand Down
2 changes: 1 addition & 1 deletion docs/source/api_docs/expressions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ List
:template: autosummary/accessor_method.rst

Expression.list.join
Expression.list.lengths
Expression.list.length
Expression.list.get
Expression.list.slice
Expression.list.chunk
Expand Down
16 changes: 8 additions & 8 deletions tests/series/test_cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ def test_cast_binary_to_fixed_size_binary():
assert casted.to_pylist() == [b"abc", b"def", None, b"bcd", None]


def test_cast_binary_to_fixed_size_binary_fails_with_variable_lengths():
def test_cast_binary_to_fixed_size_binary_fails_with_variable_length():
data = [b"abc", b"def", None, b"bcd", None, b"long"]

input = Series.from_pylist(data)
Expand Down Expand Up @@ -368,7 +368,7 @@ def test_series_cast_python_to_list(dtype) -> None:
assert t.datatype() == target_dtype
assert len(t) == len(data)

assert t.list.lengths().to_pylist() == [3, 3, 3, 3, 2, 2, None]
assert t.list.length().to_pylist() == [3, 3, 3, 3, 2, 2, None]

pydata = t.to_pylist()
assert pydata[-1] is None
Expand Down Expand Up @@ -397,7 +397,7 @@ def test_series_cast_python_to_fixed_size_list(dtype) -> None:
assert t.datatype() == target_dtype
assert len(t) == len(data)

assert t.list.lengths().to_pylist() == [3, 3, 3, 3, 3, 3, None]
assert t.list.length().to_pylist() == [3, 3, 3, 3, 3, 3, None]

pydata = t.to_pylist()
assert pydata[-1] is None
Expand Down Expand Up @@ -426,7 +426,7 @@ def test_series_cast_python_to_embedding(dtype) -> None:
assert t.datatype() == target_dtype
assert len(t) == len(data)

assert t.list.lengths().to_pylist() == [3, 3, 3, 3, 3, 3, None]
assert t.list.length().to_pylist() == [3, 3, 3, 3, 3, 3, None]

pydata = t.to_pylist()
assert pydata[-1] is None
Expand All @@ -448,7 +448,7 @@ def test_series_cast_list_to_embedding(dtype) -> None:
assert t.datatype() == target_dtype
assert len(t) == len(data)

assert t.list.lengths().to_pylist() == [3, 3, 3, None]
assert t.list.length().to_pylist() == [3, 3, 3, None]

pydata = t.to_pylist()
assert pydata[-1] is None
Expand All @@ -473,7 +473,7 @@ def test_series_cast_numpy_to_image() -> None:
assert t.datatype() == target_dtype
assert len(t) == len(data)

assert t.list.lengths().to_pylist() == [12, 27, None]
assert t.list.length().to_pylist() == [12, 27, None]

pydata = t.to_pylist()
assert pydata[-1] is None
Expand All @@ -495,7 +495,7 @@ def test_series_cast_numpy_to_image_infer_mode() -> None:
assert t.datatype() == target_dtype
assert len(t) == len(data)

assert t.list.lengths().to_pylist() == [4, 27, None]
assert t.list.length().to_pylist() == [4, 27, None]

pydata = t.to_arrow().to_pylist()
assert pydata[0] == {
Expand Down Expand Up @@ -536,7 +536,7 @@ def test_series_cast_python_to_fixed_shape_image() -> None:
assert t.datatype() == target_dtype
assert len(t) == len(data)

assert t.list.lengths().to_pylist() == [12, 12, None]
assert t.list.length().to_pylist() == [12, 12, None]

pydata = t.to_pylist()
assert pydata[-1] is None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,12 @@ def test_fixed_list_count(fixed_table):

result = fixed_table.eval_expression_list([col("col").list.count(CountMode.Null)])
assert result.to_pydict() == {"col": [0, 0, 1, 2, None]}


def test_list_length(fixed_table):
"""Check that ``list.lengths()`` and ``list.length()`` evaluate to the same result on ``fixed_table``.

``pytest.warns(DeprecationWarning)`` is used to require a deprecation warning while the expressions are evaluated.
"""
# NOTE(review): presumably only the deprecated ``lengths()`` call belongs inside the ``pytest.warns`` block — confirm.
with pytest.warns(DeprecationWarning):
lengths_result = fixed_table.eval_expression_list([col("col").list.lengths()])
length_result = fixed_table.eval_expression_list([col("col").list.length()])

# Both spellings must yield identical output, including the trailing None entry.
assert lengths_result.to_pydict() == {"col": [2, 2, 2, 2, None]}
assert length_result.to_pydict() == {"col": [2, 2, 2, 2, None]}
2 changes: 1 addition & 1 deletion tutorials/delta_lake/2-distributed-batch-inference.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@
"\n",
"# Prune data\n",
"df = df.limit(NUM_ROWS)\n",
"df = df.where(df[\"object\"].list.lengths() == 1)"
"df = df.where(df[\"object\"].list.length() == 1)"
]
},
{
Expand Down
Loading