diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index 0f7f36d970..337413ca31 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -10,6 +10,7 @@ from narwhals._duckdb.utils import ( DeferredTimeZone, F, + catch_duckdb_exception, col, evaluate_exprs, lit, @@ -167,11 +168,17 @@ def simple_select(self, *column_names: str) -> Self: def aggregate(self, *exprs: DuckDBExpr) -> Self: selection = [val.alias(name) for name, val in evaluate_exprs(self, *exprs)] - return self._with_native(self.native.aggregate(selection)) # type: ignore[arg-type] + try: + return self._with_native(self.native.aggregate(selection)) # type: ignore[arg-type] + except Exception as e: # noqa: BLE001 + raise catch_duckdb_exception(e, self) from None def select(self, *exprs: DuckDBExpr) -> Self: selection = (val.alias(name) for name, val in evaluate_exprs(self, *exprs)) - return self._with_native(self.native.select(*selection)) + try: + return self._with_native(self.native.select(*selection)) + except Exception as e: # noqa: BLE001 + raise catch_duckdb_exception(e, self) from None def drop(self, columns: Sequence[str], *, strict: bool) -> Self: columns_to_drop = parse_columns_to_drop(self, columns, strict=strict) @@ -197,12 +204,18 @@ def with_columns(self, *exprs: DuckDBExpr) -> Self: for name in self.columns ] result.extend(value.alias(name) for name, value in new_columns_map.items()) - return self._with_native(self.native.select(*result)) + try: + return self._with_native(self.native.select(*result)) + except Exception as e: # noqa: BLE001 + raise catch_duckdb_exception(e, self) from None def filter(self, predicate: DuckDBExpr) -> Self: # `[0]` is safe as the predicate's expression only returns a single column mask = predicate(self)[0] - return self._with_native(self.native.filter(mask)) + try: + return self._with_native(self.native.filter(mask)) + except Exception as e: # noqa: BLE001 + raise catch_duckdb_exception(e, self) from None @property def schema(self) -> dict[str, DType]: diff --git a/narwhals/_duckdb/utils.py b/narwhals/_duckdb/utils.py index 5de537394d..02effe7978 100644 --- a/narwhals/_duckdb/utils.py +++ b/narwhals/_duckdb/utils.py @@ -6,6 +6,7 @@ import duckdb from narwhals._utils import Version, isinstance_or_issubclass +from narwhals.exceptions import ColumnNotFoundError if TYPE_CHECKING: from collections.abc import Sequence @@ -13,11 +14,13 @@ from duckdb import DuckDBPyRelation, Expression from duckdb.typing import DuckDBPyType + from narwhals._compliant.typing import CompliantLazyFrameAny from narwhals._duckdb.dataframe import DuckDBLazyFrame from narwhals._duckdb.expr import DuckDBExpr from narwhals.dtypes import DType from narwhals.typing import IntoDType + UNITS_DICT = { "y": "year", "q": "quarter", @@ -349,3 +352,20 @@ def window_expression( func = f"{str(expr).removesuffix(')')} ignore nulls)" if ignore_nulls else str(expr) return SQLExpression(f"{func} over ({pb} {ob} {rows})") + + +def catch_duckdb_exception( + exception: Exception, frame: CompliantLazyFrameAny, / +) -> ColumnNotFoundError | Exception: + if isinstance(exception, duckdb.BinderException) and any( + msg in str(exception) + for msg in ( + "not found in FROM clause", + "this column cannot be referenced before it is defined", + ) + ): + return ColumnNotFoundError.from_available_column_names( + available_columns=frame.columns + ) + # Just return exception as-is. + return exception diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index 4a80679b37..ca2779677b 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -7,6 +7,8 @@ from narwhals._namespace import is_native_spark_like from narwhals._spark_like.utils import ( + catch_pyspark_connect_exception, + catch_pyspark_sql_exception, evaluate_exprs, import_functions, import_native_dtypes, @@ -200,7 +202,7 @@ def columns(self) -> list[str]: ) return self._cached_columns - def collect( + def _collect( self, backend: ModuleType | Implementation | str | None, **kwargs: Any ) -> CompliantDataFrameAny: if backend is Implementation.PANDAS: @@ -238,6 +240,16 @@ def collect( msg = f"Unsupported `backend` value: {backend}" # pragma: no cover raise ValueError(msg) # pragma: no cover + def collect( + self, backend: ModuleType | Implementation | str | None, **kwargs: Any + ) -> CompliantDataFrameAny: + if self._implementation.is_pyspark_connect(): + try: + return self._collect(backend, **kwargs) + except Exception as e: # noqa: BLE001 + raise catch_pyspark_connect_exception(e) from None + return self._collect(backend, **kwargs) + def simple_select(self, *column_names: str) -> Self: return self._with_native(self.native.select(*column_names)) @@ -245,22 +257,42 @@ def aggregate(self, *exprs: SparkLikeExpr) -> Self: new_columns = evaluate_exprs(self, *exprs) new_columns_list = [col.alias(col_name) for col_name, col in new_columns] + if self._implementation.is_pyspark(): + try: + return self._with_native(self.native.agg(*new_columns_list)) + except Exception as e: # noqa: BLE001 + raise catch_pyspark_sql_exception(e, self) from None return self._with_native(self.native.agg(*new_columns_list)) def select(self, *exprs: SparkLikeExpr) -> Self: new_columns = evaluate_exprs(self, *exprs) new_columns_list = [col.alias(col_name) for (col_name, col) in new_columns] + if self._implementation.is_pyspark(): # pragma: no cover + try: + return self._with_native(self.native.select(*new_columns_list)) + except Exception as e: # noqa: BLE001 + raise catch_pyspark_sql_exception(e, self) from None return self._with_native(self.native.select(*new_columns_list)) def with_columns(self, *exprs: SparkLikeExpr) -> Self: new_columns = evaluate_exprs(self, *exprs) + if self._implementation.is_pyspark(): # pragma: no cover + try: + return self._with_native(self.native.withColumns(dict(new_columns))) + except Exception as e: # noqa: BLE001 + raise catch_pyspark_sql_exception(e, self) from None + return self._with_native(self.native.withColumns(dict(new_columns))) def filter(self, predicate: SparkLikeExpr) -> Self: # `[0]` is safe as the predicate's expression only returns a single column condition = predicate._call(self)[0] - spark_df = self.native.where(condition) - return self._with_native(spark_df) + if self._implementation.is_pyspark(): + try: + return self._with_native(self.native.where(condition)) + except Exception as e: # noqa: BLE001 + raise catch_pyspark_sql_exception(e, self) from None + return self._with_native(self.native.where(condition)) @property def schema(self) -> dict[str, DType]: diff --git a/narwhals/_spark_like/utils.py b/narwhals/_spark_like/utils.py index 15502e072f..f87da9f403 100644 --- a/narwhals/_spark_like/utils.py +++ b/narwhals/_spark_like/utils.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Any, overload from narwhals._utils import Implementation, isinstance_or_issubclass -from narwhals.exceptions import UnsupportedDTypeError +from narwhals.exceptions import ColumnNotFoundError, UnsupportedDTypeError if TYPE_CHECKING: from types import ModuleType @@ -16,6 +16,7 @@ from sqlframe.base.session import _BaseSession as Session from typing_extensions import TypeAlias + from narwhals._compliant.typing import CompliantLazyFrameAny from narwhals._spark_like.dataframe import SparkLikeLazyFrame from narwhals._spark_like.expr import SparkLikeExpr from narwhals._utils import Version @@ -292,3 +293,31 @@ def true_divide(F: Any, left: Column, right: Column) -> Column: # noqa: N803 # PySpark before 3.5 doesn't have `try_divide`, SQLFrame doesn't have it. divide = getattr(F, "try_divide", operator.truediv) return divide(left, right) + + +def catch_pyspark_sql_exception( + exception: Exception, frame: CompliantLazyFrameAny, / +) -> ColumnNotFoundError | Exception: # pragma: no cover + from pyspark.errors import AnalysisException + + if isinstance(exception, AnalysisException) and str(exception).startswith( + "[UNRESOLVED_COLUMN.WITH_SUGGESTION]" + ): + return ColumnNotFoundError.from_available_column_names( + available_columns=frame.columns + ) + # Just return exception as-is. + return exception + + +def catch_pyspark_connect_exception( + exception: Exception, / +) -> ColumnNotFoundError | Exception: # pragma: no cover + from pyspark.errors.exceptions.connect import AnalysisException + + if isinstance(exception, AnalysisException) and str(exception).startswith( + "[UNRESOLVED_COLUMN.WITH_SUGGESTION]" + ): + return ColumnNotFoundError(str(exception)) + # Just return exception as-is. + return exception diff --git a/narwhals/exceptions.py b/narwhals/exceptions.py index e5b4c0331f..d18f5d0dd4 100644 --- a/narwhals/exceptions.py +++ b/narwhals/exceptions.py @@ -43,6 +43,16 @@ def from_missing_and_available_column_names( ) return ColumnNotFoundError(message) + @classmethod + def from_available_column_names( + cls, available_columns: Collection[str] + ) -> ColumnNotFoundError: + message = ( + "The selected columns were not found." + f"\n\nHint: Did you mean one of these columns: {list(available_columns)}?" + ) + return ColumnNotFoundError(message) + class ComputeError(NarwhalsError): """Exception raised when the underlying computation could not be evaluated.""" diff --git a/tests/conftest.py b/tests/conftest.py index 26c8f114a7..56363ec9b5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -25,7 +25,7 @@ from narwhals._spark_like.dataframe import SQLFrameDataFrame from narwhals.typing import NativeFrame, NativeLazyFrame - from tests.utils import Constructor, ConstructorEager + from tests.utils import Constructor, ConstructorEager, ConstructorLazy Data: TypeAlias = "dict[str, list[Any]]" @@ -246,7 +246,7 @@ def ibis_lazy_constructor(obj: Data) -> ibis.Table: # pragma: no cover "cudf": cudf_constructor, "polars[eager]": polars_eager_constructor, } -LAZY_CONSTRUCTORS: dict[str, Constructor] = { +LAZY_CONSTRUCTORS: dict[str, ConstructorLazy] = { "dask": dask_lazy_p2_constructor, "polars[lazy]": polars_lazy_constructor, "duckdb": duckdb_lazy_constructor, diff --git a/tests/frame/drop_test.py b/tests/frame/drop_test.py index 4194184914..233ab3003b 100644 --- a/tests/frame/drop_test.py +++ b/tests/frame/drop_test.py @@ -25,16 +25,9 @@ def test_drop(constructor: Constructor, to_drop: list[str], expected: list[str]) assert df.drop(*to_drop).collect_schema().names() == expected -@pytest.mark.parametrize( - ("strict", "context"), - [(True, pytest.raises(ColumnNotFoundError, match="z")), (False, does_not_raise())], -) +@pytest.mark.parametrize("strict", [True, False]) def test_drop_strict( - request: pytest.FixtureRequest, - constructor: Constructor, - context: Any, - *, - strict: bool, + request: pytest.FixtureRequest, constructor: Constructor, *, strict: bool ) -> None: if "polars_lazy" in str(constructor) and POLARS_VERSION < (1, 0, 0) and strict: request.applymarker(pytest.mark.xfail) @@ -44,6 +37,19 @@ def test_drop_strict( df = nw.from_native(constructor(data)) + context: Any + if strict: + if "polars_lazy" in str(constructor): + msg = r"^(\"z\"|z)" + else: + msg = ( + r"The following columns were not found: \[.*\]" + r"\n\nHint: Did you mean one of these columns: \['a', 'b'\]?" + ) + context = pytest.raises(ColumnNotFoundError, match=msg) + else: + context = does_not_raise() + with context: names_out = df.drop(to_drop, strict=strict).collect_schema().names() assert names_out == ["b"] diff --git a/tests/frame/filter_test.py b/tests/frame/filter_test.py index d6190425e7..e94aee1be4 100644 --- a/tests/frame/filter_test.py +++ b/tests/frame/filter_test.py @@ -5,7 +5,7 @@ import pytest import narwhals as nw -from narwhals.exceptions import InvalidOperationError +from narwhals.exceptions import ColumnNotFoundError, InvalidOperationError from tests.utils import Constructor, ConstructorEager, assert_equal_data @@ -65,3 +65,30 @@ def test_filter_with_constrains(constructor: Constructor) -> None: expected_expr = {"a": [1, 2], "b": [4, 6]} assert_equal_data(result_expr, expected_expr) + + +def test_filter_missing_column( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + constructor_id = str(request.node.callspec.id) + if any(id_ == constructor_id for id_ in ("sqlframe", "pyspark[connect]", "ibis")): + request.applymarker(pytest.mark.xfail) + data = {"a": [1, 2], "b": [3, 4]} + df = nw.from_native(constructor(data)) + + if "polars" in str(constructor): + msg = r"^unable to find column \"c\"; valid columns: \[\"a\", \"b\"\]" + elif any(id_ == constructor_id for id_ in ("duckdb", "pyspark")): + msg = r"\n\nHint: Did you mean one of these columns: \['a', 'b'\]?" + else: + msg = ( + r"The following columns were not found: \[.*\]" + r"\n\nHint: Did you mean one of these columns: \['a', 'b'\]?" + ) + + if "polars_lazy" in str(constructor) and isinstance(df, nw.LazyFrame): + with pytest.raises(ColumnNotFoundError, match=msg): + df.filter(c=5).collect() + else: + with pytest.raises(ColumnNotFoundError, match=msg): + df.filter(c=5) diff --git a/tests/frame/select_test.py b/tests/frame/select_test.py index e39acaab69..2496ba7f02 100644 --- a/tests/frame/select_test.py +++ b/tests/frame/select_test.py @@ -11,10 +11,10 @@ from tests.utils import ( DASK_VERSION, DUCKDB_VERSION, - POLARS_VERSION, Constructor, ConstructorEager, assert_equal_data, + maybe_collect, ) @@ -86,49 +86,43 @@ def test_comparison_with_list_error_message() -> None: def test_missing_columns( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if ( - ("pyspark" in str(constructor)) - or "duckdb" in str(constructor) - or "ibis" in str(constructor) - ): + constructor_id = str(request.node.callspec.id) + if any(id_ == constructor_id for id_ in ("sqlframe", "ibis")): + # `sqlframe` raises a different error depending on its underlying backend request.applymarker(pytest.mark.xfail) + data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]} df = nw.from_native(constructor(data)) selected_columns = ["a", "e", "f"] - msg = ( - r"The following columns were not found: \[.*\]" - r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?" - ) - if "polars" in str(constructor): + + if constructor_id == "polars[lazy]": # In the lazy case, Polars only errors when we call `collect`, # and we have no way to recover exactly which columns the user # tried selecting. So, we just emit their message (which varies # across versions...) - msg = "e|f" - if isinstance(df, nw.LazyFrame): - with pytest.raises(ColumnNotFoundError, match=msg): - df.select(selected_columns).collect() - else: - with pytest.raises(ColumnNotFoundError, match=msg): - df.select(selected_columns) - if POLARS_VERSION >= (1,): - # Old Polars versions wouldn't raise an error - # at all here - if isinstance(df, nw.LazyFrame): - with pytest.raises(ColumnNotFoundError, match=msg): - df.drop(selected_columns, strict=True).collect() - else: - with pytest.raises(ColumnNotFoundError, match=msg): - df.drop(selected_columns, strict=True) - else: # pragma: no cover - pass + msg = r"^e" + elif constructor_id == "pyspark[connect]": # pragma: no cover + msg = r"^\[UNRESOLVED_COLUMN.WITH_SUGGESTION\]" + elif any(id_ == constructor_id for id_ in ("duckdb", "pyspark")): + msg = r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?" else: - with pytest.raises(ColumnNotFoundError, match=msg): - df.select(selected_columns) - with pytest.raises(ColumnNotFoundError, match=msg): - df.drop(selected_columns, strict=True) - with pytest.raises(ColumnNotFoundError, match=msg): - df.select(nw.col("fdfa")) + msg = ( + r"The following columns were not found: \[.*\]" + r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?" + ) + + with pytest.raises(ColumnNotFoundError, match=msg): + maybe_collect(df.select(selected_columns)) + + # for the next two cases the error message is different in Polars + if constructor_id == "polars[lazy]": + msg = r"^fdfa" + elif constructor_id == "polars[eager]": + msg = r"\n\nHint: Did you mean one of these columns: \['a', 'b', 'z'\]?" + with pytest.raises(ColumnNotFoundError, match=msg): + maybe_collect(df.select(nw.col("fdfa"))) + with pytest.raises(ColumnNotFoundError, match=msg): + maybe_collect(df.select(nw.col("fdfa").sum())) def test_left_to_right_broadcasting(constructor: Constructor) -> None: diff --git a/tests/frame/with_columns_test.py b/tests/frame/with_columns_test.py index 49f4ccd1b9..9658d4e678 100644 --- a/tests/frame/with_columns_test.py +++ b/tests/frame/with_columns_test.py @@ -5,8 +5,14 @@ import pytest import narwhals as nw -from narwhals.exceptions import ShapeError -from tests.utils import PYARROW_VERSION, Constructor, ConstructorEager, assert_equal_data +from narwhals.exceptions import ColumnNotFoundError, ShapeError +from tests.utils import ( + PYARROW_VERSION, + Constructor, + ConstructorEager, + assert_equal_data, + maybe_collect, +) def test_with_columns_int_col_name_pandas() -> None: @@ -78,3 +84,29 @@ def test_with_columns_series_shape_mismatch(constructor_eager: ConstructorEager) ] with pytest.raises(ShapeError): df1.with_columns(second=second) + + +def test_with_columns_missing_column( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + constructor_id = str(request.node.callspec.id) + if any(id_ == constructor_id for id_ in ("sqlframe", "ibis")): + # `sqlframe` raises a different error depending on its underlying backend + request.applymarker(pytest.mark.xfail) + data = {"a": [1, 2], "b": [3, 4]} + df = nw.from_native(constructor(data)) + + if "polars" in str(constructor): + msg = r"^c" + elif any(id_ == constructor_id for id_ in ("duckdb", "pyspark")): + msg = r"\n\nHint: Did you mean one of these columns: \['a', 'b'\]?" + elif constructor_id == "pyspark[connect]": # pragma: no cover + msg = r"^\[UNRESOLVED_COLUMN.WITH_SUGGESTION\]" + else: + msg = ( + r"The following columns were not found: \[.*\]" + r"\n\nHint: Did you mean one of these columns: \['a', 'b'\]?" + ) + + with pytest.raises(ColumnNotFoundError, match=msg): + maybe_collect(df.with_columns(d=nw.col("c") + 1)) diff --git a/tests/utils.py b/tests/utils.py index 13b3c0d2fd..482e4362e1 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -20,7 +20,7 @@ from typing_extensions import TypeAlias - from narwhals.typing import DataFrameLike, NativeFrame, NativeLazyFrame + from narwhals.typing import DataFrameLike, Frame, NativeFrame, NativeLazyFrame def get_module_version_as_tuple(module_name: str) -> tuple[int, ...]: @@ -42,6 +42,7 @@ def get_module_version_as_tuple(module_name: str) -> tuple[int, ...]: Constructor: TypeAlias = Callable[[Any], "NativeLazyFrame | NativeFrame | DataFrameLike"] ConstructorEager: TypeAlias = Callable[[Any], "NativeFrame | DataFrameLike"] +ConstructorLazy: TypeAlias = Callable[[Any], "NativeLazyFrame"] def zip_strict(left: Sequence[Any], right: Sequence[Any]) -> Iterator[Any]: @@ -178,3 +179,14 @@ def uses_pyarrow_backend(constructor: Constructor | ConstructorEager) -> bool: "pandas_pyarrow_constructor", "modin_pyarrow_constructor", } + + +def maybe_collect(df: Frame) -> Frame: + """Collect the DataFrame if it is a LazyFrame. + + Use this function to test specific behaviors during collection. + For example, Polars only errors when we call `collect` in the lazy case. + """ + if isinstance(df, nw.LazyFrame): + return df.collect() + return df # pragma: no cover