-
Notifications
You must be signed in to change notification settings - Fork 179
feat: Add support for spark connect #2417
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We'll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6a6e420
27b66f7
6531ced
cdacf63
5f4fe4e
d32d5dc
0e5cc73
8b61581
05e0bcc
73ff548
2a6d742
c646706
97b31ef
9c5a5de
769dbef
bcbec71
2f09e15
94e650e
67eaac3
f297ac4
adc698c
71135cb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -141,43 +141,53 @@ def _with_native(self, df: SQLFrameDataFrame) -> Self: | |
| implementation=self._implementation, | ||
| ) | ||
|
|
||
| def _to_arrow_schema(self) -> pa.Schema: # pragma: no cover | ||
dangotbanned marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| import pyarrow as pa # ignore-banned-import | ||
|
|
||
| from narwhals._arrow.utils import narwhals_to_native_dtype | ||
|
|
||
| schema: list[tuple[str, pa.DataType]] = [] | ||
| nw_schema = self.collect_schema() | ||
| native_schema = self.native.schema | ||
| for key, value in nw_schema.items(): | ||
| try: | ||
| native_dtype = narwhals_to_native_dtype(value, self._version) | ||
| except Exception as exc: # noqa: BLE001,PERF203 | ||
| native_spark_dtype = native_schema[key].dataType # type: ignore[index] | ||
|
Comment on lines
+152
to
+156
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @FBruzzesi Could you address this performance issue ( I'd personally
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In principle we could first try to check if
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @dangotbanned sorry for the direct ping - let's figure out what to write - to me the explanation in ln158 is quite good. I will need your approval to merge.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hey, sorry I lost this @FBruzzesi. I started trying to address it, but couldn't get the tests working locally. |
||
| # If we can't convert the type, just set it to `pa.null`, and warn. | ||
| # Avoid the warning if we're starting from PySpark's void type. | ||
| # We can avoid the check when we introduce `nw.Null` dtype. | ||
| null_type = self._native_dtypes.NullType # pyright: ignore[reportAttributeAccessIssue] | ||
| if not isinstance(native_spark_dtype, null_type): | ||
| warnings.warn( | ||
| f"Could not convert dtype {native_spark_dtype} to PyArrow dtype, {exc!r}", | ||
| stacklevel=find_stacklevel(), | ||
| ) | ||
| schema.append((key, pa.null())) | ||
| else: | ||
| schema.append((key, native_dtype)) | ||
| return pa.schema(schema) | ||
|
|
||
| def _collect_to_arrow(self) -> pa.Table: | ||
| if self._implementation is Implementation.PYSPARK and self._backend_version < ( | ||
| 4, | ||
| ): | ||
| if self._implementation.is_pyspark() and self._backend_version < (4,): | ||
| import pyarrow as pa # ignore-banned-import | ||
|
|
||
| try: | ||
| return pa.Table.from_batches(self.native._collect_as_arrow()) | ||
| except ValueError as exc: | ||
| if "at least one RecordBatch" in str(exc): | ||
| # Empty dataframe | ||
| from narwhals._arrow.utils import narwhals_to_native_dtype | ||
|
|
||
| data: dict[str, list[Any]] = {} | ||
| schema: list[tuple[str, pa.DataType]] = [] | ||
| current_schema = self.collect_schema() | ||
| for key, value in current_schema.items(): | ||
| data[key] = [] | ||
| try: | ||
| native_dtype = narwhals_to_native_dtype(value, self._version) | ||
| except Exception as exc: # noqa: BLE001 | ||
| native_spark_dtype = self.native.schema[key].dataType # type: ignore[index] | ||
| # If we can't convert the type, just set it to `pa.null`, and warn. | ||
| # Avoid the warning if we're starting from PySpark's void type. | ||
| # We can avoid the check when we introduce `nw.Null` dtype. | ||
| null_type = self._native_dtypes.NullType # pyright: ignore[reportAttributeAccessIssue] | ||
| if not isinstance(native_spark_dtype, null_type): | ||
| warnings.warn( | ||
| f"Could not convert dtype {native_spark_dtype} to PyArrow dtype, {exc!r}", | ||
| stacklevel=find_stacklevel(), | ||
| ) | ||
| schema.append((key, pa.null())) | ||
| else: | ||
| schema.append((key, native_dtype)) | ||
| return pa.Table.from_pydict(data, schema=pa.schema(schema)) | ||
|
|
||
| data: dict[str, list[Any]] = {k: [] for k in self.columns} | ||
| pa_schema = self._to_arrow_schema() | ||
| return pa.Table.from_pydict(data, schema=pa_schema) | ||
| else: # pragma: no cover | ||
| raise | ||
| elif self._implementation.is_pyspark_connect() and self._backend_version < (4,): | ||
| import pyarrow as pa # ignore-banned-import | ||
|
|
||
| pa_schema = self._to_arrow_schema() | ||
| return pa.Table.from_pandas(self.native.toPandas(), schema=pa_schema) | ||
| else: | ||
| return self.native.toArrow() | ||
|
|
||
|
|
@@ -293,7 +303,7 @@ def drop(self, columns: Sequence[str], *, strict: bool) -> Self: | |
| return self._with_native(self.native.drop(*columns_to_drop)) | ||
|
|
||
| def head(self, n: int) -> Self: | ||
| return self._with_native(self.native.limit(num=n)) | ||
| return self._with_native(self.native.limit(n)) | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No argument named |
||
|
|
||
| def group_by( | ||
| self, keys: Sequence[str] | Sequence[SparkLikeExpr], *, drop_null_keys: bool | ||
|
|
@@ -445,7 +455,7 @@ def explode(self, columns: Sequence[str]) -> Self: | |
| ) | ||
| raise NotImplementedError(msg) | ||
|
|
||
| if self._implementation.is_pyspark(): | ||
| if self._implementation.is_pyspark() or self._implementation.is_pyspark_connect(): | ||
| return self._with_native( | ||
| self.native.select( | ||
| *[ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -75,7 +75,8 @@ def broadcast(self, kind: Literal[ExprKind.AGGREGATION, ExprKind.LITERAL]) -> Se | |
|
|
||
| def func(df: SparkLikeLazyFrame) -> Sequence[Column]: | ||
| return [ | ||
| result.over(df._Window().partitionBy(df._F.lit(1))) for result in self(df) | ||
| result.over(self._Window().partitionBy(self._F.lit(1))) | ||
| for result in self(df) | ||
| ] | ||
|
|
||
| return self.__class__( | ||
|
|
@@ -438,7 +439,8 @@ def mean(self) -> Self: | |
| def median(self) -> Self: | ||
| def _median(_input: Column) -> Column: | ||
| if ( | ||
| self._implementation.is_pyspark() | ||
| self._implementation | ||
| in {Implementation.PYSPARK, Implementation.PYSPARK_CONNECT} | ||
| and (pyspark := get_pyspark()) is not None | ||
| and parse_version(pyspark) < (3, 4) | ||
| ): # pragma: no cover | ||
|
|
@@ -772,7 +774,7 @@ def _rank(_input: Column) -> Column: | |
| else: | ||
| order_by_cols = [self._F.asc_nulls_last(_input)] | ||
|
|
||
| window = self._Window().orderBy(order_by_cols) | ||
| window = self._Window().partitionBy(self._F.lit(1)).orderBy(order_by_cols) | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Left over from #2429 |
||
| count_window = self._Window().partitionBy(_input) | ||
|
|
||
| if method == "max": | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nice