diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index 01b7c1ff8b..a5f8858d4f 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -29,6 +29,7 @@ - exp - fill_null - filter + - from_iterable - from_numpy - gather_every - head diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index a60b2f06bb..311f934409 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -15,6 +15,7 @@ chunked_array, extract_native, floordiv_compat, + is_array_or_scalar, lit, narwhals_to_native_dtype, native_to_narwhals_dtype, @@ -156,10 +157,15 @@ def from_iterable( dtype: IntoDType | None = None, ) -> Self: version = context._version - dtype_pa = narwhals_to_native_dtype(dtype, version) if dtype else None - return cls.from_native( - chunked_array([data], dtype_pa), name=name, context=context - ) + if dtype is not None: + dtype_pa: pa.DataType | None = narwhals_to_native_dtype(dtype, version) + if is_array_or_scalar(data): + data = data.cast(dtype_pa) + dtype_pa = None + native = data if cls._is_native(data) else chunked_array([data], dtype_pa) + else: + native = chunked_array([data]) + return cls.from_native(native, context=context, name=name) def _from_scalar(self, value: Any) -> Self: if hasattr(value, "as_py"): diff --git a/narwhals/_arrow/utils.py b/narwhals/_arrow/utils.py index 0fc9b5d85d..a6778a6eb9 100644 --- a/narwhals/_arrow/utils.py +++ b/narwhals/_arrow/utils.py @@ -80,6 +80,11 @@ def extract_py_scalar(value: Any, /) -> Any: return maybe_extract_py_scalar(value, return_py_scalar=True) +def is_array_or_scalar(obj: Any) -> TypeIs[ArrayOrScalar]: + """Return True for any base `pyarrow` container.""" + return isinstance(obj, (pa.ChunkedArray, pa.Array, pa.Scalar)) + + def chunked_array( arr: ArrayOrScalar | list[Iterable[Any]], dtype: pa.DataType | None = None, / ) -> ChunkedArrayAny: @@ -87,7 +92,7 @@ def chunked_array( return arr if isinstance(arr, list): return pa.chunked_array(arr, dtype) - 
return pa.chunked_array([arr], arr.type) + return pa.chunked_array([arr], dtype) def nulls_like(n: int, series: ArrowSeries) -> ArrayAny: diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py index 0f219d1db2..3f6180b766 100644 --- a/narwhals/_polars/series.py +++ b/narwhals/_polars/series.py @@ -6,6 +6,8 @@ from narwhals._polars.utils import ( BACKEND_VERSION, + SERIES_ACCEPTS_PD_INDEX, + SERIES_RESPECTS_DTYPE, PolarsAnyNamespace, PolarsCatNamespace, PolarsDateTimeNamespace, @@ -19,7 +21,7 @@ native_to_narwhals_dtype, ) from narwhals._utils import Implementation, requires -from narwhals.dependencies import is_numpy_array_1d +from narwhals.dependencies import is_numpy_array_1d, is_pandas_index if TYPE_CHECKING: from collections.abc import Iterable, Iterator, Mapping, Sequence @@ -48,6 +50,7 @@ T = TypeVar("T") IncludeBreakpoint: TypeAlias = Literal[False, True] +Incomplete: TypeAlias = Any # Series methods where PolarsSeries just defers to Polars.Series directly. INHERITED_METHODS = frozenset( @@ -180,9 +183,15 @@ def from_iterable( ) -> Self: version = context._version dtype_pl = narwhals_to_native_dtype(dtype, version) if dtype else None - # NOTE: `Iterable` is fine, annotation is overly narrow - # https://github.com/pola-rs/polars/blob/82d57a4ee41f87c11ca1b1af15488459727efdd7/py-polars/polars/series/series.py#L332-L333 - native = pl.Series(name=name, values=cast("Sequence[Any]", data), dtype=dtype_pl) + values: Incomplete = data + if SERIES_RESPECTS_DTYPE: + native = pl.Series(name, values, dtype=dtype_pl) + else: # pragma: no cover + if (not SERIES_ACCEPTS_PD_INDEX) and is_pandas_index(values): + values = values.to_series() + native = pl.Series(name, values) + if dtype_pl: + native = native.cast(dtype_pl) return cls.from_native(native, context=context) @staticmethod @@ -565,8 +574,6 @@ def _bins_from_bin_count(self, bin_count: int) -> pl.Series: # pragma: no cover returns bins that range from -inf to +inf and has bin_count + 1 bins. 
for compat: convert `bin_count=` call to `bins=` """ - from typing import cast - lower = cast("float", self.native.min()) upper = cast("float", self.native.max()) diff --git a/narwhals/_polars/utils.py b/narwhals/_polars/utils.py index 712ce526aa..8a365ddd00 100644 --- a/narwhals/_polars/utils.py +++ b/narwhals/_polars/utils.py @@ -2,7 +2,7 @@ import abc from functools import lru_cache -from typing import TYPE_CHECKING, Any, ClassVar, Protocol, TypeVar, overload +from typing import TYPE_CHECKING, Any, ClassVar, Final, Protocol, TypeVar, overload import polars as pl @@ -49,6 +49,15 @@ BACKEND_VERSION = Implementation.POLARS._backend_version() """Static backend version for `polars`.""" +SERIES_RESPECTS_DTYPE: Final[bool] = BACKEND_VERSION >= (0, 20, 26) +"""`pl.Series(dtype=...)` fixed in https://github.com/pola-rs/polars/pull/15962 + +Includes `SERIES_ACCEPTS_PD_INDEX`. +""" + +SERIES_ACCEPTS_PD_INDEX: Final[bool] = BACKEND_VERSION >= (0, 20, 7) +"""`pl.Series(values: pd.Index)` fixed in https://github.com/pola-rs/polars/pull/14087""" + @overload def extract_native(obj: _StoresNative[NativeT]) -> NativeT: ... 
diff --git a/narwhals/series.py b/narwhals/series.py index af9094de58..57030c31ed 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -1,7 +1,7 @@ from __future__ import annotations import math -from collections.abc import Iterator, Mapping, Sequence +from collections.abc import Iterable, Iterator, Mapping, Sequence from typing import TYPE_CHECKING, Any, Callable, ClassVar, Generic, Literal, overload from narwhals._utils import ( @@ -13,9 +13,10 @@ is_compliant_series, is_eager_allowed, is_index_selector, + qualified_type_name, supports_arrow_c_stream, ) -from narwhals.dependencies import is_numpy_array_1d, is_numpy_scalar +from narwhals.dependencies import is_numpy_array, is_numpy_array_1d, is_numpy_scalar from narwhals.dtypes import _validate_dtype, _validate_into_dtype from narwhals.exceptions import ComputeError, InvalidOperationError from narwhals.series_cat import SeriesCatNamespace @@ -157,6 +158,71 @@ def from_numpy( ) raise ValueError(msg) + @classmethod + def from_iterable( + cls, + name: str, + values: Iterable[Any], + dtype: IntoDType | None = None, + *, + backend: ModuleType | Implementation | str, + ) -> Series[Any]: + """Construct a Series from an iterable. + + Arguments: + name: Name of resulting Series. + values: One-dimensional data represented as an iterable. + dtype: (Narwhals) dtype. If not provided, the native library + may auto-infer it from `values`. + backend: specifies which eager backend to instantiate. + + `backend` can be specified in various ways: + + - As `Implementation.<BACKEND>` with `BACKEND` being `PANDAS`, `PYARROW`, + `POLARS`, `MODIN` or `CUDF`. + - As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`. + - Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`.
+ + Returns: + A new Series + + Examples: + >>> import pandas as pd + >>> import narwhals as nw + >>> + >>> values = [4, 1, 3, 2] + >>> nw.Series.from_iterable("a", values, dtype=nw.UInt32, backend="pandas") + ┌──────────────────────┐ + | Narwhals Series | + |----------------------| + |0 4 | + |1 1 | + |2 3 | + |3 2 | + |Name: a, dtype: uint32| + └──────────────────────┘ + """ + if is_numpy_array(values): + return cls.from_numpy(name, values, dtype, backend=backend) + if dtype: + _validate_into_dtype(dtype) + if not isinstance(values, Iterable): + msg = f"Expected values to be an iterable, got: {qualified_type_name(values)!r}." + raise TypeError(msg) + implementation = Implementation.from_backend(backend) + if is_eager_allowed(implementation): + ns = cls._version.namespace.from_backend(implementation).compliant + compliant = ns._series.from_iterable( + values, context=ns, name=name, dtype=dtype + ) + return cls(compliant, level="full") + msg = ( + f"{implementation} support in Narwhals is lazy-only, but `Series.from_iterable` is an eager-only function.\n\n" + "Hint: you may want to use an eager backend and then call `.lazy`, e.g.:\n\n" + f" nw.Series.from_iterable('a', [1,2,3], backend='pyarrow').to_frame().lazy('{implementation}')" + ) + raise ValueError(msg) + @property def implementation(self) -> Implementation: """Return implementation of native Series. 
diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index d553a93f58..fb6dbf9721 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -311,6 +311,18 @@ def from_numpy( result = super().from_numpy(name, values, dtype, backend=backend) return cast("Series[Any]", result) + @classmethod + def from_iterable( + cls, + name: str, + values: Iterable[Any], + dtype: IntoDType | None = None, + *, + backend: ModuleType | Implementation | str, + ) -> Series[Any]: + result = super().from_iterable(name, values, dtype, backend=backend) + return cast("Series[Any]", result) + @property def _dataframe(self) -> type[DataFrame[Any]]: return DataFrame diff --git a/narwhals/stable/v2/__init__.py b/narwhals/stable/v2/__init__.py index bbff47f0b0..ed6fe091ec 100644 --- a/narwhals/stable/v2/__init__.py +++ b/narwhals/stable/v2/__init__.py @@ -189,6 +189,8 @@ def collect( class Series(NwSeries[IntoSeriesT]): + _version = Version.V2 + @inherit_doc(NwSeries) def __init__( self, series: Any, *, level: Literal["full", "lazy", "interchange"] @@ -203,6 +205,20 @@ def __init__( def _dataframe(self) -> type[DataFrame[Any]]: return DataFrame + # TODO @dangotbanned: Fix `from_numpy` override missing in `v2` in another PR + + @classmethod + def from_iterable( + cls, + name: str, + values: Iterable[Any], + dtype: IntoDType | None = None, + *, + backend: ModuleType | Implementation | str, + ) -> Series[Any]: + result = super().from_iterable(name, values, dtype, backend=backend) + return cast("Series[Any]", result) + def to_frame(self) -> DataFrame[Any]: return _stableify(super().to_frame()) diff --git a/tests/conftest.py b/tests/conftest.py index a55fa01270..7a0b1b11ab 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -327,3 +327,9 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: @pytest.fixture(params=TEST_EAGER_BACKENDS) def eager_backend(request: pytest.FixtureRequest) -> EagerAllowed: return request.param # type: 
ignore[no-any-return] + + +@pytest.fixture(params=[el for el in TEST_EAGER_BACKENDS if not isinstance(el, str)]) +def eager_implementation(request: pytest.FixtureRequest) -> EagerAllowed: + """Use if a test is heavily parametric, skips `str` backend.""" + return request.param # type: ignore[no-any-return] diff --git a/tests/series_only/from_iterable_test.py b/tests/series_only/from_iterable_test.py new file mode 100644 index 0000000000..733c0728ca --- /dev/null +++ b/tests/series_only/from_iterable_test.py @@ -0,0 +1,195 @@ +from __future__ import annotations + +from collections import deque +from importlib.util import find_spec +from typing import TYPE_CHECKING, Any + +import pytest + +import narwhals as nw +from narwhals._utils import qualified_type_name +from tests.utils import PANDAS_VERSION, assert_equal_series + +if TYPE_CHECKING: + from collections.abc import ( + Callable, + Generator, + Iterable, + Iterator, + KeysView, + Sequence, + ValuesView, + ) + + from typing_extensions import TypeAlias + + from narwhals._namespace import EagerAllowed + from narwhals.typing import IntoDType + + IntoIterable: TypeAlias = Callable[..., Iterable[Any]] + + +class UserDefinedIterable: + def __init__(self, iterable: Iterable[Any]) -> None: + self.iterable: Iterable[Any] = iterable + + def __iter__(self) -> Iterator[Any]: + yield from self.iterable + + +def generator_function(iterable: Iterable[Any]) -> Generator[Any, Any, None]: + yield from iterable + + +def generator_expression(iterable: Iterable[Any]) -> Generator[Any, None, None]: + return (element for element in iterable) + + +def dict_keys(iterable: Iterable[Any]) -> KeysView[Any]: + return dict.fromkeys(iterable).keys() + + +def dict_values(iterable: Iterable[Any]) -> ValuesView[Any]: + return dict(enumerate(iterable)).values() + + +_INTO_ITER_3RD_PARTY: list[IntoIterable] = [] + +if find_spec("numpy"): # pragma: no cover + import numpy as np + + _INTO_ITER_3RD_PARTY.append(np.array) +else: # pragma: no cover + ... 
+if find_spec("pandas"): # pragma: no cover + import pandas as pd + + _INTO_ITER_3RD_PARTY.extend([pd.Index, pd.array, pd.Series]) +else: # pragma: no cover + ... +if find_spec("polars"): # pragma: no cover + import polars as pl + + _INTO_ITER_3RD_PARTY.append(pl.Series) +else: # pragma: no cover + ... +if find_spec("pyarrow"): # pragma: no cover + import pyarrow as pa + + def chunked_array(iterable: Any) -> Iterable[Any]: + return pa.chunked_array([iterable]) + + _INTO_ITER_3RD_PARTY.extend([pa.array, chunked_array]) +else: # pragma: no cover + ... + +_INTO_ITER_STDLIB: tuple[IntoIterable, ...] = ( + list, + tuple, + iter, + deque, + generator_function, + generator_expression, +) +_INTO_ITER_STDLIB_EXOTIC: tuple[IntoIterable, ...] = dict_keys, dict_values +INTO_ITER: tuple[IntoIterable, ...] = ( + *_INTO_ITER_STDLIB, + *_INTO_ITER_STDLIB_EXOTIC, + UserDefinedIterable, + *_INTO_ITER_3RD_PARTY, +) + + +def _ids_into_iter(obj: Any) -> str: + module: str = "" + if (obj_module := obj.__module__) and obj_module != __name__: + module = obj.__module__ + name = qualified_type_name(obj) + if name in {"function", "builtin_function_or_method"} or "_cython" in name: + return f"{module}.{obj.__qualname__}" if module else obj.__qualname__ + return name.removeprefix(__name__).strip(".") + + +@pytest.mark.parametrize( + ("values", "dtype"), + [ + ((4, 1, 2), nw.Int32), + ((-1, 5, 100), None), + ((2.1, 2.7, 2.0), nw.Float64), + (("one", "two"), nw.String), + ], + ids=["Int32", "no-dtype", "Float64", "String"], +) +@pytest.mark.parametrize("into_iter", INTO_ITER, ids=_ids_into_iter) +def test_series_from_iterable( + eager_implementation: EagerAllowed, + values: Sequence[Any], + dtype: IntoDType, + into_iter: IntoIterable, + request: pytest.FixtureRequest, +) -> None: + name = "b" + iterable = into_iter(values) + test_name = request.node.name + request.applymarker( + pytest.mark.xfail( + ("polars-pandas" in test_name and "array" in test_name), + raises=TypeError, + reason="Polars 
doesn't support `pd.array`.\nhttps://github.com/pola-rs/polars/issues/22757", + ) + ) + request.applymarker( + pytest.mark.xfail( + ( + "pandas-polars" in test_name + and "String" in test_name + and PANDAS_VERSION >= (3,) + ), + reason=( + "Pandas nightly suddenly raising on String `pl.Series` in:\n" + "https://github.com/pandas-dev/pandas/blob/3ea783ea21e22035cf0a3605cfde3178e9348ee1/pandas/core/arrays/string_arrow.py#L202-L204" + ), + ) + ) + if ( + "pandas-pyarrow" in test_name + and "array-String" in test_name + and PANDAS_VERSION < (2, 1) + ): # pragma: no cover + pytest.skip( + "pandas being pandas with strings https://github.com/narwhals-dev/narwhals/pull/2933#issuecomment-3156009516" + ) + result = nw.Series.from_iterable(name, iterable, dtype, backend=eager_implementation) + if dtype: + assert result.dtype == dtype + assert_equal_series(result, values, name) + + +@pytest.mark.parametrize(("values", "expected_dtype"), [((4, 1, 2), nw.Int64)]) +def test_series_from_iterable_infer( + eager_backend: EagerAllowed, values: Sequence[Any], expected_dtype: IntoDType +) -> None: + name = "b" + result = nw.Series.from_iterable(name, values, backend=eager_backend) + assert result.dtype == expected_dtype + assert_equal_series(result, values, name) + + +def test_series_from_iterable_not_eager() -> None: + backend = "sqlframe" + pytest.importorskip(backend) + with pytest.raises(ValueError, match="lazy-only"): + nw.Series.from_iterable("", [1, 2, 3], backend=backend) + + +def test_series_from_iterable_numpy_not_1d(eager_backend: EagerAllowed) -> None: + pytest.importorskip("numpy") + import numpy as np + + with pytest.raises(ValueError, match="only.+1D numpy arrays"): + nw.Series.from_iterable("", np.array([[0], [2]]), backend=eager_backend) + + +def test_series_from_iterable_not_iterable(eager_backend: EagerAllowed) -> None: + with pytest.raises(TypeError, match="iterable.+got.+int"): + nw.Series.from_iterable("", 2000, backend=eager_backend) # type: ignore[arg-type] 
diff --git a/tests/series_only/from_numpy_test.py b/tests/series_only/from_numpy_test.py index 68fc3f5444..6e1bd6e6d4 100644 --- a/tests/series_only/from_numpy_test.py +++ b/tests/series_only/from_numpy_test.py @@ -9,7 +9,7 @@ import numpy as np import narwhals as nw -from tests.utils import assert_equal_data +from tests.utils import assert_equal_series if TYPE_CHECKING: from collections.abc import Sequence @@ -23,12 +23,6 @@ NAME = "a" -def assert_equal_series( - result: nw.Series[Any], expected: Sequence[Any], name: str -) -> None: - assert_equal_data(result.to_frame(), {name: expected}) - - def test_series_from_numpy(eager_backend: EagerAllowed) -> None: expected = [5, 2, 0, 1] result = nw.Series.from_numpy(NAME, arr, backend=eager_backend) diff --git a/tests/utils.py b/tests/utils.py index 482e4362e1..eee7684f79 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -146,6 +146,12 @@ def assert_equal_data(result: Any, expected: Mapping[str, Any]) -> None: ) +def assert_equal_series( + result: nw.Series[Any], expected: Sequence[Any], name: str +) -> None: + assert_equal_data(result.to_frame(), {name: expected}) + + def maybe_get_modin_df(df_pandas: pd.DataFrame) -> Any: """Convert a pandas DataFrame to a Modin DataFrame if Modin is available.""" try: diff --git a/tests/v1_test.py b/tests/v1_test.py index db88b95a51..91dacc4e94 100644 --- a/tests/v1_test.py +++ b/tests/v1_test.py @@ -1,6 +1,7 @@ # Test assorted functions which we overwrite in stable.v1 from __future__ import annotations +from collections import deque from contextlib import nullcontext as does_not_raise from datetime import datetime, timedelta from typing import TYPE_CHECKING, Any, Callable, cast @@ -37,6 +38,7 @@ Constructor, ConstructorEager, assert_equal_data, + assert_equal_series, ) if TYPE_CHECKING: @@ -1049,3 +1051,30 @@ def test_series_from_numpy( if dtype: assert result.dtype == dtype assert_equal_data(result.to_frame(), {name: expected}) + + +@pytest.mark.parametrize( + ("dtype", 
"expected"), + [ + (None, [5, 2, 0, 1]), + (nw_v1.Int64, [5, 2, 0, 1]), + (nw_v1.String, ("a", "b", "c")), + (nw_v1.Float64, [5.0, 2.0, 0.0, 1.0]), + ( + nw_v1.Datetime("ns"), + deque([datetime(2005, 1, 1, 10), datetime(2002, 1, 1, 10, 43)]), + ), + ], + ids=str, +) +def test_series_from_iterable( + eager_backend: EagerAllowed, dtype: IntoDType | None, expected: Sequence[Any] +) -> None: + data = expected + name = "abc" + result = nw_v1.Series.from_iterable(name, data, backend=eager_backend, dtype=dtype) + assert result._version is Version.V1 + assert isinstance(result, nw_v1.Series) + if dtype: + assert result.dtype == dtype + assert_equal_series(result, expected, name) diff --git a/tests/v2_test.py b/tests/v2_test.py index 507daf11b9..c560d37100 100644 --- a/tests/v2_test.py +++ b/tests/v2_test.py @@ -2,6 +2,8 @@ from __future__ import annotations +from collections import deque +from datetime import datetime from typing import TYPE_CHECKING, Any import numpy as np @@ -10,12 +12,21 @@ import narwhals.stable.v2 as nw_v2 from narwhals.utils import Version -from tests.utils import PANDAS_VERSION, Constructor, assert_equal_data +from tests.utils import ( + PANDAS_VERSION, + Constructor, + assert_equal_data, + assert_equal_series, +) if TYPE_CHECKING: + from collections.abc import Sequence + from typing_extensions import assert_type + from narwhals._namespace import EagerAllowed from narwhals.stable.v2.typing import IntoDataFrameT + from narwhals.typing import IntoDType def test_toplevel() -> None: @@ -332,3 +343,41 @@ def test_imports() -> None: from narwhals.stable.v2.dtypes import Enum # noqa: F401 from narwhals.stable.v2.selectors import datetime # noqa: F401 from narwhals.stable.v2.typing import IntoDataFrame # noqa: F401 + + +@pytest.mark.parametrize( + ("dtype", "expected"), + [ + (None, [5, 2, 0, 1]), + (nw_v2.Int64, [5, 2, 0, 1]), + (nw_v2.String, ("a", "b", "c")), + (nw_v2.Float64, [5.0, 2.0, 0.0, 1.0]), + ( + nw_v2.Datetime("ms"), + deque([datetime(2005, 1, 
1, 10), datetime(2002, 1, 1, 10, 43)]), + ), + ], + ids=str, +) +def test_series_from_iterable( + eager_backend: EagerAllowed, + dtype: IntoDType | None, + expected: Sequence[Any], + request: pytest.FixtureRequest, +) -> None: + data = expected + name = "abc" + result = nw_v2.Series.from_iterable(name, data, backend=eager_backend, dtype=dtype) + assert result._version is Version.V2 + assert isinstance(result, nw_v2.Series) + if dtype: + request.applymarker( + pytest.mark.xfail( + result.implementation.is_pandas_like() + and dtype.is_temporal() + and PANDAS_VERSION < (2,), + reason='Pandas does not support "ms" or "us" time units before version 2.0', + ) + ) + assert result.dtype == dtype + assert_equal_series(result, expected, name) diff --git a/utils/check_api_reference.py b/utils/check_api_reference.py index 15f7876653..cff6ede54c 100644 --- a/utils/check_api_reference.py +++ b/utils/check_api_reference.py @@ -66,6 +66,7 @@ def read_documented_members(source: str | Path) -> list[str]: "arg_min", "arg_true", "dtype", + "from_iterable", "from_numpy", "gather_every", "implementation",