diff --git a/.github/workflows/extremes.yml b/.github/workflows/extremes.yml index 6749db244a..ae69bfe39b 100644 --- a/.github/workflows/extremes.yml +++ b/.github/workflows/extremes.yml @@ -25,7 +25,7 @@ jobs: cache-suffix: ${{ matrix.python-version }} cache-dependency-glob: "pyproject.toml" - name: install-minimum-versions - run: uv pip install pipdeptree tox virtualenv setuptools pandas==0.25.3 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" scipy==1.5.0 scikit-learn==1.1.0 duckdb==1.0 tzdata --system + run: uv pip install pipdeptree tox virtualenv setuptools pandas==0.25.3 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" scipy==1.5.0 scikit-learn==1.1.0 duckdb==1.0 tzdata backports.zoneinfo --system - name: install-reqs run: | uv pip install -e ".[tests]" --system @@ -62,7 +62,7 @@ jobs: cache-suffix: ${{ matrix.python-version }} cache-dependency-glob: "pyproject.toml" - name: install-pretty-old-versions - run: uv pip install pipdeptree tox virtualenv setuptools pandas==1.1.5 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" pyspark==3.5.0 scipy==1.5.0 scikit-learn==1.1.0 duckdb==1.0 tzdata --system + run: uv pip install pipdeptree tox virtualenv setuptools pandas==1.1.5 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" pyspark==3.5.0 scipy==1.5.0 scikit-learn==1.1.0 duckdb==1.0 tzdata backports.zoneinfo --system - name: install-reqs run: uv pip install -e ".[tests]" --system - name: show-deps diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 6bb570e536..790e4565b8 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -28,7 +28,7 @@ jobs: cache-dependency-glob: "pyproject.toml" - name: install-reqs # Python3.8 is technically at end-of-life, so we don't test everything - run: uv pip install -e ".[tests, core]" --system + run: uv pip install -e ".[tests, core]" backports.zoneinfo --system - name: show-deps run: uv pip freeze - name: Run pytest diff --git a/narwhals/_duckdb/expr_str.py b/narwhals/_duckdb/expr_str.py index b7804662f4..6f29898494 100644 --- a/narwhals/_duckdb/expr_str.py +++ b/narwhals/_duckdb/expr_str.py @@ -108,7 +108,7 @@ def replace(self: Self, pattern: str, value: str, *, literal: bool, n: int) -> N def to_datetime(self: Self, format: str | None) -> DuckDBExpr: # noqa: A002 if format is None: - msg = "Cannot infer format with DuckDB backend" + msg = "Cannot infer format with DuckDB backend, please specify `format` explicitly." raise NotImplementedError(msg) return self._compliant_expr._from_call( diff --git a/narwhals/_duckdb/utils.py b/narwhals/_duckdb/utils.py index 5fa205279e..bffccac4f5 100644 --- a/narwhals/_duckdb/utils.py +++ b/narwhals/_duckdb/utils.py @@ -83,6 +83,10 @@ def native_to_narwhals_dtype(duckdb_dtype: str, version: Version) -> DType: return dtypes.Date() if duckdb_dtype == "TIMESTAMP": return dtypes.Datetime() + if duckdb_dtype == "TIMESTAMP WITH TIME ZONE": + # TODO(marco): is UTC correct, or should we be getting the connection timezone? + # https://github.com/narwhals-dev/narwhals/issues/2165 + return dtypes.Datetime(time_zone="UTC") if duckdb_dtype == "BOOLEAN": return dtypes.Boolean() if duckdb_dtype == "INTERVAL": diff --git a/narwhals/_pandas_like/series_str.py b/narwhals/_pandas_like/series_str.py index 5fae5867e3..6e75908489 100644 --- a/narwhals/_pandas_like/series_str.py +++ b/narwhals/_pandas_like/series_str.py @@ -85,11 +85,23 @@ def split(self: Self, by: str) -> PandasLikeSeries: ) def to_datetime(self: Self, format: str | None) -> PandasLikeSeries: # noqa: A002 - return self._compliant_series._from_native_series( - to_datetime(self._compliant_series._implementation)( + if format is not None and any(x in format for x in ("%z", "Z")): + # We know that the inputs are timezone-aware, so we can directly pass + # `utc=True` for better performance. + return self._compliant_series._from_native_series( + to_datetime(self._compliant_series._implementation, utc=True)( + self._compliant_series._native_series, format=format + ) + ) + result = self._compliant_series._from_native_series( + to_datetime(self._compliant_series._implementation, utc=False)( self._compliant_series._native_series, format=format ) ) + result_time_zone = result.dtype.time_zone # type: ignore[attr-defined] + if result_time_zone is not None and result_time_zone != "UTC": + result = result.dt.convert_time_zone("UTC") + return result def to_uppercase(self: Self) -> PandasLikeSeries: return self._compliant_series._from_native_series( diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 51fa5108c8..a442c64029 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -1,9 +1,9 @@ from __future__ import annotations +import functools import re import warnings from contextlib import suppress -from functools import lru_cache from typing import TYPE_CHECKING from typing import Any from typing import Iterable @@ -356,7 +356,7 @@ def rename( return obj.rename(*args, **kwargs, copy=False) # type: ignore[attr-defined] -@lru_cache(maxsize=16) +@functools.lru_cache(maxsize=16) def non_object_native_to_narwhals_dtype(dtype: str, version: Version) -> DType: dtypes = import_dtypes_module(version) if dtype in {"int64", "Int64", "Int64[pyarrow]", "int64[pyarrow]"}: @@ -679,9 +679,11 @@ def align_series_full_broadcast( return reindexed -def to_datetime(implementation: Implementation) -> Any: +def to_datetime(implementation: Implementation, *, utc: bool) -> Any: if implementation in PANDAS_LIKE_IMPLEMENTATION: - return implementation.to_native_namespace().to_datetime + return functools.partial( + implementation.to_native_namespace().to_datetime, utc=utc + ) else: # pragma: no cover msg = f"Expected pandas-like implementation ({PANDAS_LIKE_IMPLEMENTATION}), found {implementation}" diff --git a/narwhals/_spark_like/utils.py b/narwhals/_spark_like/utils.py index 6332f35d47..c02a100c92 100644 --- a/narwhals/_spark_like/utils.py +++ b/narwhals/_spark_like/utils.py @@ -55,6 +55,8 @@ def native_to_narwhals_dtype( if isinstance(dtype, native.TimestampNTZType): return dtypes.Datetime() if isinstance(dtype, native.TimestampType): + # TODO(marco): is UTC correct, or should we be getting the connection timezone? + # https://github.com/narwhals-dev/narwhals/issues/2165 return dtypes.Datetime(time_zone="UTC") if isinstance(dtype, native.DecimalType): return dtypes.Decimal() diff --git a/narwhals/expr_str.py b/narwhals/expr_str.py index 9206009691..3b837779ba 100644 --- a/narwhals/expr_str.py +++ b/narwhals/expr_str.py @@ -370,6 +370,13 @@ def tail(self: Self, n: int = 5) -> ExprT: def to_datetime(self: Self, format: str | None = None) -> ExprT: # noqa: A002 """Convert to Datetime dtype. + Notes: + - pandas defaults to nanosecond time unit, Polars to microsecond. + Prior to pandas 2.0, nanoseconds were the only time unit supported + in pandas, with no ability to set any other one. The ability to + set the time unit in pandas, if the version permits, will arrive. + - timezone-aware strings are all converted to and parsed as UTC. + Warning: As different backends auto-infer format in different ways, if `format=None` there is no guarantee that the result will be equal. @@ -381,12 +388,6 @@ def to_datetime(self: Self, format: str | None = None) -> ExprT: # noqa: A002 Returns: A new expression. - Notes: - pandas defaults to nanosecond time unit, Polars to microsecond. - Prior to pandas 2.0, nanoseconds were the only time unit supported - in pandas, with no ability to set any other one. The ability to - set the time unit in pandas, if the version permits, will arrive. - Examples: >>> import polars as pl >>> import narwhals as nw diff --git a/narwhals/series_str.py b/narwhals/series_str.py index 98e4280753..1169e0da55 100644 --- a/narwhals/series_str.py +++ b/narwhals/series_str.py @@ -377,10 +377,11 @@ def to_datetime(self: Self, format: str | None = None) -> SeriesT: # noqa: A002 """Parse Series with strings to a Series with Datetime dtype. Notes: - pandas defaults to nanosecond time unit, Polars to microsecond. - Prior to pandas 2.0, nanoseconds were the only time unit supported - in pandas, with no ability to set any other one. The ability to - set the time unit in pandas, if the version permits, will arrive. + - pandas defaults to nanosecond time unit, Polars to microsecond. + Prior to pandas 2.0, nanoseconds were the only time unit supported + in pandas, with no ability to set any other one. The ability to + set the time unit in pandas, if the version permits, will arrive. + - timezone-aware strings are all converted to and parsed as UTC. Warning: As different backends auto-infer format in different ways, if `format=None` diff --git a/tests/expr_and_series/str/to_datetime_test.py b/tests/expr_and_series/str/to_datetime_test.py index 24687af9ec..e5ac785095 100644 --- a/tests/expr_and_series/str/to_datetime_test.py +++ b/tests/expr_and_series/str/to_datetime_test.py @@ -1,5 +1,6 @@ from __future__ import annotations +from contextlib import nullcontext as does_not_raise from datetime import datetime from datetime import timezone from typing import TYPE_CHECKING @@ -9,7 +10,10 @@ import narwhals.stable.v1 as nw from narwhals._arrow.utils import parse_datetime_format +from tests.utils import PANDAS_VERSION +from tests.utils import PYARROW_VERSION from tests.utils import assert_equal_data +from tests.utils import is_pyarrow_windows_no_tzdata if TYPE_CHECKING: from tests.utils import Constructor @@ -28,10 +32,16 @@ def test_to_datetime(constructor: Constructor) -> None: nw.from_native(constructor(data)) .lazy() .select(b=nw.col("a").str.to_datetime(format="%Y-%m-%dT%H:%M:%S")) - .collect() - .item(row=0, column="b") ) - assert str(result) == expected + result_schema = result.collect_schema() + assert isinstance(result_schema["b"], nw.Datetime) + if "sqlframe" in str(constructor): + # https://github.com/eakmanrq/sqlframe/issues/326 + assert result_schema["b"].time_zone == "UTC" # pyright: ignore[reportAttributeAccessIssue] + else: + assert result_schema["b"].time_zone is None # pyright: ignore[reportAttributeAccessIssue] + result_item = result.collect().item(row=0, column="b") + assert str(result_item) == expected def test_to_datetime_series(constructor_eager: ConstructorEager) -> None: @@ -190,3 +200,39 @@ def test_pyarrow_infer_datetime_raise_inconsistent_date_fmt( ) -> None: with pytest.raises(ValueError, match="Unable to infer datetime format. "): parse_datetime_format(pa.chunked_array([data])) + + +@pytest.mark.parametrize("format", [None, "%Y-%m-%dT%H:%M:%S%z"]) +def test_to_datetime_tz_aware( + constructor: Constructor, + request: pytest.FixtureRequest, + format: str | None, # noqa: A002 +) -> None: + if "pyarrow_table" in str(constructor) and PYARROW_VERSION < (13,): + # bugged + pytest.skip() + if "pandas" in str(constructor) and PANDAS_VERSION < (1,): + # "Cannot pass a tz argument when parsing strings with timezone information." + pytest.skip() + if is_pyarrow_windows_no_tzdata(constructor): + pytest.skip() + if "sqlframe" in str(constructor): + # https://github.com/eakmanrq/sqlframe/issues/325 + request.applymarker(pytest.mark.xfail) + context = ( + pytest.raises(NotImplementedError) + if any(x in str(constructor) for x in ("duckdb", "sqlframe")) and format is None + else does_not_raise() + ) + df = nw.from_native(constructor({"a": ["2020-01-01T01:02:03+0100"]})) + with context: + result = df.with_columns(b=nw.col("a").str.to_datetime(format)) + assert isinstance(result.collect_schema()["b"], nw.Datetime) + result_schema = result.lazy().collect().schema + assert result_schema["a"] == nw.String + assert isinstance(result_schema["b"], nw.Datetime) + expected = { + "a": ["2020-01-01T01:02:03+0100"], + "b": [datetime(2020, 1, 1, 0, 2, 3, tzinfo=timezone.utc)], + } + assert_equal_data(result, expected)