Add more checks for spark-connect linter (#2092)
## Changes

Add more checks to detect code incompatibilities with UC Shared Clusters:

- use of Python UDF unsupported eval types
- `spark.catalog.X` APIs on DBR < 14.3
- use of `commandContext`

### Tests

- [ ] manually tested
- [x] added unit tests
- [ ] added integration tests
- [ ] verified on staging environment (screenshot attached)
1 parent b2a2ae4, commit b564fe3. Showing 15 changed files with 288 additions and 42 deletions.
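The new functional-test samples below follow ucx's comment-driven convention, as the diffs themselves suggest: a `# ucx[session-state] {...}` comment pins simulated session state such as the DBR version, and a `# ucx[<advice-code>:+1:<start-col>:+1:<end-col>] <message>` comment asserts that the linter flags the span on the following line with exactly that advice. A minimal illustration of the shape (the advice code and message here are placeholders, not part of this commit):

```python
# ucx[session-state] {"dbr_version": [13, 3]}  # pin the simulated DBR version for this sample
# ucx[example-advice-code:+1:0:+1:13] example message the linter is expected to emit
spark.catalog.listDatabases()  # the +1 offsets point at this line; columns 0-13 span `spark.catalog`
```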
### tests/unit/source_code/samples/functional/spark-connect/catalog-api_13_3.py (new file, 23 additions)
```python
# ucx[session-state] {"dbr_version": [13, 3]}
# ucx[catalog-api-in-shared-clusters:+1:0:+1:13] spark.catalog functions require DBR 14.3 LTS or above on UC Shared Clusters
spark.catalog.tableExists("table")
# ucx[catalog-api-in-shared-clusters:+1:0:+1:13] spark.catalog functions require DBR 14.3 LTS or above on UC Shared Clusters
spark.catalog.listDatabases()


def catalog():
    pass


catalog()


class Fatalog:
    def tableExists(self, x): ...
class Foo:
    def catalog(self):
        Fatalog()


x = Foo()
x.catalog.tableExists("...")
```
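Where code has to keep running on UC Shared Clusters below DBR 14.3, the flagged calls can usually be replaced with SQL statements; a minimal sketch of that workaround (assuming a live `spark` session; not part of this commit):

```python
# Enumerate databases without spark.catalog.listDatabases();
# the result column name varies across Spark versions, so index positionally.
databases = [row[0] for row in spark.sql("SHOW DATABASES").collect()]

# Probe for a table without spark.catalog.tableExists()
table_exists = spark.sql("SHOW TABLES LIKE 'table'").count() > 0
```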
### tests/unit/source_code/samples/functional/spark-connect/catalog-api_14_3.py (new file, 10 additions)
```python
# ucx[session-state] {"dbr_version": [14, 3]}
spark.catalog.tableExists("table")
spark.catalog.listDatabases()


def catalog():
    pass


catalog()
```
### tests/unit/source_code/samples/functional/spark-connect/command-context.py (new file, 6 additions)
```python
# ucx[to-json-in-shared-clusters:+1:6:+1:80] toJson() is not available on UC Shared Clusters. Use toSafeJson() on DBR 13.3 LTS or above to get a subset of command context information.
print(dbutils.notebook.entry_point.getDbutils().notebook().getContext().toJson())
dbutils.notebook.entry_point.getDbutils().notebook().getContext().toSafeJson()
notebook = dbutils.notebook.entry_point.getDbutils().notebook()
# ucx[to-json-in-shared-clusters:+1:0:+1:30] toJson() is not available on UC Shared Clusters. Use toSafeJson() on DBR 13.3 LTS or above to get a subset of command context information.
notebook.getContext().toJson()
```
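`toSafeJson()` returns the permitted subset of the command context as a JSON string, so callers typically decode it; a minimal sketch (assuming a notebook environment where `dbutils` is defined; not part of this commit):

```python
import json

# Decode the safe subset of command-context fields into a dict
context = json.loads(dbutils.notebook.entry_point.getDbutils().notebook().getContext().toSafeJson())
print(context)
```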
File renamed without changes.
### tests/unit/source_code/samples/functional/spark-connect/python-udfs_13_3.py (new file, 53 additions)
```python
# ucx[session-state] {"dbr_version": [13, 3]}
from pyspark.sql.functions import udf, udtf, lit
import pandas as pd


@udf(returnType='int')
def slen(s):
    return len(s)


# ucx[python-udf-in-shared-clusters:+1:1:+1:37] Arrow UDFs require DBR 14.3 LTS or above on UC Shared Clusters
@udf(returnType='int', useArrow=True)
def arrow_slen(s):
    return len(s)


df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age"))
df.select(slen("name"), arrow_slen("name")).show()

slen1 = udf(lambda s: len(s), returnType='int')
# ucx[python-udf-in-shared-clusters:+1:14:+1:68] Arrow UDFs require DBR 14.3 LTS or above on UC Shared Clusters
arrow_slen1 = udf(lambda s: len(s), returnType='int', useArrow=True)

df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age"))

df.select(slen1("name"), arrow_slen1("name")).show()

df = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v"))


def subtract_mean(pdf: pd.DataFrame) -> pd.DataFrame:
    v = pdf.v
    return pdf.assign(v=v - v.mean())


# ucx[python-udf-in-shared-clusters:+1:0:+1:73] applyInPandas require DBR 14.3 LTS or above on UC Shared Clusters
df.groupby("id").applyInPandas(subtract_mean, schema="id long, v double").show()


class SquareNumbers:
    def eval(self, start: int, end: int):
        for num in range(start, end + 1):
            yield (num, num * num)


# ucx[python-udf-in-shared-clusters:+1:13:+1:69] udtf require DBR 14.3 LTS or above on UC Shared Clusters
square_num = udtf(SquareNumbers, returnType="num: int, squared: int")
square_num(lit(1), lit(3)).show()

from pyspark.sql.types import IntegerType

# ucx[python-udf-in-shared-clusters:+1:0:+1:73] Cannot register Java UDF from Python code on UC Shared Clusters. Use a %scala cell to register the Scala UDF using spark.udf.register.
spark.udf.registerJavaFunction("func", "org.example.func", IntegerType())
```
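Code that must run both below and at DBR 14.3 can gate the Arrow optimization on the runtime version; a minimal sketch (the `DATABRICKS_RUNTIME_VERSION` environment variable and its `major.minor` format are assumptions here, not part of this commit):

```python
import os

from pyspark.sql.functions import udf

# e.g. "13.3" or "14.3" inside a Databricks runtime (assumed format); empty elsewhere
_dbr = os.environ.get("DATABRICKS_RUNTIME_VERSION", "")
_parts = [int(p) for p in _dbr.split(".")[:2] if p.isdigit()]

# Only request the Arrow-optimized path where UC Shared Clusters support it
use_arrow = len(_parts) == 2 and tuple(_parts) >= (14, 3)

slen_compat = udf(lambda s: len(s), returnType='int', useArrow=use_arrow)
```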