-
Notifications
You must be signed in to change notification settings - Fork 87
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add more checks for spark-connect linter #2092
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,7 +21,13 @@ | |
|
||
|
||
class LinterContext: | ||
def __init__(self, index: MigrationIndex | None = None, session_state: CurrentSessionState | None = None): | ||
def __init__( | ||
self, | ||
index: MigrationIndex | None = None, | ||
session_state: CurrentSessionState | None = None, | ||
dbr_version: tuple[int, int] | None = None, | ||
is_serverless: bool = False, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. can we move |
||
): | ||
self._index = index | ||
session_state = CurrentSessionState() if not session_state else session_state | ||
|
||
|
@@ -41,8 +47,8 @@ def __init__(self, index: MigrationIndex | None = None, session_state: CurrentSe | |
|
||
python_linters += [ | ||
DBFSUsageLinter(session_state), | ||
DBRv8d0Linter(dbr_version=None), | ||
SparkConnectLinter(is_serverless=False), | ||
DBRv8d0Linter(dbr_version=dbr_version), | ||
SparkConnectLinter(dbr_version=dbr_version, is_serverless=is_serverless), | ||
DbutilsLinter(session_state), | ||
] | ||
sql_linters.append(FromDbfsFolder()) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,7 @@ | |
from collections.abc import Iterator | ||
from dataclasses import dataclass | ||
|
||
from astroid import Attribute, Call, Name, NodeNG # type: ignore | ||
from astroid import Attribute, Call, Const, Name, NodeNG # type: ignore | ||
from databricks.labs.ucx.source_code.base import ( | ||
Advice, | ||
Failure, | ||
|
@@ -172,14 +172,84 @@ def _match_jvm_log(self, node: NodeNG) -> Iterator[Advice]: | |
) | ||
|
||
|
||
@dataclass | ||
class UDFMatcher(SharedClusterMatcher): | ||
_DBR_14_2_BELOW_NOT_SUPPORTED = ["applyInPandas", "mapInPandas", "applyInPandasWithState", "udtf", "pandas_udf"] | ||
|
||
dbr_version: tuple[int, int] | None | ||
|
||
def lint(self, node: NodeNG) -> Iterator[Advice]: | ||
if not isinstance(node, Call): | ||
return | ||
function_name = Tree.get_function_name(node) | ||
|
||
if function_name == 'registerJavaFunction': | ||
yield Failure.from_node( | ||
code='python-udf-in-shared-clusters', | ||
message=f'Cannot register Java UDF from Python code on {self._cluster_type_str()}. ' | ||
f'Use a %scala cell to register the Scala UDF using spark.udf.register.', | ||
node=node, | ||
) | ||
|
||
if ( | ||
function_name in UDFMatcher._DBR_14_2_BELOW_NOT_SUPPORTED | ||
and self.dbr_version | ||
and self.dbr_version < (14, 3) | ||
): | ||
yield Failure.from_node( | ||
code='python-udf-in-shared-clusters', | ||
message=f'{function_name} require DBR 14.3 LTS or above on {self._cluster_type_str()}', | ||
node=node, | ||
) | ||
|
||
if function_name == 'udf' and self.dbr_version and self.dbr_version < (14, 3): | ||
for keyword in node.keywords: | ||
if keyword.arg == 'useArrow' and isinstance(keyword.value, Const) and keyword.value.value: | ||
yield Failure.from_node( | ||
code='python-udf-in-shared-clusters', | ||
message=f'Arrow UDFs require DBR 14.3 LTS or above on {self._cluster_type_str()}', | ||
node=node, | ||
) | ||
|
||
|
||
class CatalogApiMatcher(SharedClusterMatcher): | ||
def lint(self, node: NodeNG) -> Iterator[Advice]: | ||
if not isinstance(node, Attribute): | ||
return | ||
if node.attrname == 'catalog' and Tree.get_full_attribute_name(node).endswith('spark.catalog'): | ||
yield Failure.from_node( | ||
code='catalog-api-in-shared-clusters', | ||
message=f'spark.catalog functions require DBR 14.3 LTS or above on {self._cluster_type_str()}', | ||
node=node, | ||
) | ||
|
||
|
||
class CommandContextMatcher(SharedClusterMatcher): | ||
def lint(self, node: NodeNG) -> Iterator[Advice]: | ||
if not isinstance(node, Call): | ||
return | ||
function_name = Tree.get_full_function_name(node) | ||
if function_name and function_name.endswith('getContext.toJson'): | ||
yield Failure.from_node( | ||
code='toJson-in-shared-clusters', | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. use lowercase for codes, as I suspect that's how LSP would normalise everything. |
||
message=f'toJson() is not available on {self._cluster_type_str()}. ' | ||
f'Use toSafeJson() on DBR 13.3 LTS or above to get a subset of command context information.', | ||
node=node, | ||
) | ||
|
||
|
||
class SparkConnectLinter(PythonLinter): | ||
def __init__(self, is_serverless: bool = False): | ||
def __init__(self, dbr_version: tuple[int, int] | None = None, is_serverless: bool = False): | ||
self._matchers = [ | ||
JvmAccessMatcher(is_serverless=is_serverless), | ||
RDDApiMatcher(is_serverless=is_serverless), | ||
SparkSqlContextMatcher(is_serverless=is_serverless), | ||
LoggingMatcher(is_serverless=is_serverless), | ||
UDFMatcher(is_serverless=is_serverless, dbr_version=dbr_version), | ||
CommandContextMatcher(is_serverless=is_serverless), | ||
] | ||
if dbr_version and dbr_version < (14, 3): | ||
self._matchers.append(CatalogApiMatcher(is_serverless=is_serverless)) | ||
|
||
def lint_tree(self, tree: Tree) -> Iterator[Advice]: | ||
for matcher in self._matchers: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# ucx[catalog-api-in-shared-clusters:+1:0:+1:13] spark.catalog functions require DBR 14.3 LTS or above on UC Shared Clusters | ||
spark.catalog.tableExists("table") | ||
# ucx[catalog-api-in-shared-clusters:+1:0:+1:13] spark.catalog functions require DBR 14.3 LTS or above on UC Shared Clusters | ||
spark.catalog.listDatabases() | ||
|
||
|
||
def catalog(): | ||
nfx marked this conversation as resolved.
Show resolved
Hide resolved
|
||
pass | ||
|
||
|
||
catalog() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
spark.catalog.tableExists("table") | ||
spark.catalog.listDatabases() | ||
|
||
|
||
def catalog(): | ||
pass | ||
|
||
|
||
catalog() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# ucx[toJson-in-shared-clusters:+1:6:+1:80] toJson() is not available on UC Shared Clusters. Use toSafeJson() on DBR 13.3 LTS or above to get a subset of command context information. | ||
print(dbutils.notebook.entry_point.getDbutils().notebook().getContext().toJson()) | ||
dbutils.notebook.entry_point.getDbutils().notebook().getContext().toSafeJson() | ||
notebook = dbutils.notebook.entry_point.getDbutils().notebook() | ||
# ucx[toJson-in-shared-clusters:+1:0:+1:30] toJson() is not available on UC Shared Clusters. Use toSafeJson() on DBR 13.3 LTS or above to get a subset of command context information. | ||
notebook.getContext().toJson() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
from pyspark.sql.functions import udf, udtf, lit | ||
import pandas as pd | ||
|
||
|
||
@udf(returnType='int') | ||
def slen(s): | ||
return len(s) | ||
|
||
|
||
# ucx[python-udf-in-shared-clusters:+1:1:+1:37] Arrow UDFs require DBR 14.3 LTS or above on UC Shared Clusters | ||
@udf(returnType='int', useArrow=True) | ||
def arrow_slen(s): | ||
return len(s) | ||
|
||
|
||
df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) | ||
df.select(slen("name"), arrow_slen("name")).show() | ||
|
||
slen1 = udf(lambda s: len(s), returnType='int') | ||
# ucx[python-udf-in-shared-clusters:+1:14:+1:68] Arrow UDFs require DBR 14.3 LTS or above on UC Shared Clusters | ||
arrow_slen1 = udf(lambda s: len(s), returnType='int', useArrow=True) | ||
|
||
df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) | ||
|
||
df.select(slen1("name"), arrow_slen1("name")).show() | ||
|
||
df = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v")) | ||
|
||
|
||
def subtract_mean(pdf: pd.DataFrame) -> pd.DataFrame: | ||
v = pdf.v | ||
return pdf.assign(v=v - v.mean()) | ||
|
||
|
||
# ucx[python-udf-in-shared-clusters:+1:0:+1:73] applyInPandas require DBR 14.3 LTS or above on UC Shared Clusters | ||
df.groupby("id").applyInPandas(subtract_mean, schema="id long, v double").show() | ||
|
||
|
||
class SquareNumbers: | ||
def eval(self, start: int, end: int): | ||
for num in range(start, end + 1): | ||
yield (num, num * num) | ||
|
||
|
||
# ucx[python-udf-in-shared-clusters:+1:13:+1:69] udtf require DBR 14.3 LTS or above on UC Shared Clusters | ||
square_num = udtf(SquareNumbers, returnType="num: int, squared: int") | ||
square_num(lit(1), lit(3)).show() | ||
|
||
from pyspark.sql.types import IntegerType | ||
|
||
# ucx[python-udf-in-shared-clusters:+1:0:+1:73] Cannot register Java UDF from Python code on UC Shared Clusters. Use a %scala cell to register the Scala UDF using spark.udf.register. | ||
spark.udf.registerJavaFunction("func", "org.example.func", IntegerType()) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
from pyspark.sql.functions import udf, udtf, lit | ||
import pandas as pd | ||
|
||
|
||
@udf(returnType='int') | ||
def slen(s): | ||
return len(s) | ||
|
||
|
||
@udf(returnType='int', useArrow=True) | ||
def arrow_slen(s): | ||
return len(s) | ||
|
||
|
||
df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) | ||
df.select(slen("name"), arrow_slen("name")).show() | ||
|
||
slen1 = udf(lambda s: len(s), returnType='int') | ||
arrow_slen1 = udf(lambda s: len(s), returnType='int', useArrow=True) | ||
|
||
df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) | ||
|
||
df.select(slen1("name"), arrow_slen1("name")).show() | ||
|
||
df = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v")) | ||
|
||
|
||
def subtract_mean(pdf: pd.DataFrame) -> pd.DataFrame: | ||
v = pdf.v | ||
return pdf.assign(v=v - v.mean()) | ||
|
||
|
||
df.groupby("id").applyInPandas(subtract_mean, schema="id long, v double").show() | ||
|
||
from pyspark.sql.types import IntegerType | ||
|
||
# ucx[python-udf-in-shared-clusters:+1:0:+1:73] Cannot register Java UDF from Python code on UC Shared Clusters. Use a %scala cell to register the Scala UDF using spark.udf.register. | ||
spark.udf.registerJavaFunction("func", "org.example.func", IntegerType()) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -62,6 +62,13 @@ class Functional: | |
) | ||
_location = Path(__file__).parent / 'samples/functional' | ||
|
||
TEST_DBR_VERSION = { | ||
'python-udfs_13_3.py': (13, 3), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. files can be moved and renamed by people unaware of this property. don't you want to introduce technically, we can make it into a processing instruction and be in the parser flow, but it's more complicated. |
||
'catalog-api_13_3.py': (13, 3), | ||
'python-udfs_14_3.py': (14, 3), | ||
'catalog-api_14_3.py': (14, 3), | ||
} | ||
|
||
@classmethod | ||
def all(cls) -> list['Functional']: | ||
return [Functional(_) for _ in cls._location.glob('**/*.py')] | ||
|
@@ -104,7 +111,9 @@ def _lint(self) -> Iterable[Advice]: | |
) | ||
session_state = CurrentSessionState() | ||
session_state.named_parameters = {"my-widget": "my-path.py"} | ||
ctx = LinterContext(migration_index, session_state) | ||
ctx = LinterContext( | ||
migration_index, session_state, dbr_version=Functional.TEST_DBR_VERSION.get(self.path.name, None) | ||
) | ||
linter = FileLinter(ctx, self.path) | ||
return linter.lint() | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
make dbr version a property of
CurrentSessionState
as well. we'll have few more and we should have just one container to pass along, not many parameters we may forget about.