Skip to content

Commit

Permalink
Support inferred values when linting DBFS mounts (#1868)
Browse files Browse the repository at this point in the history
## Changes
Implement value inference
Improve consistency of generated advices

### Linked issues
Progresses #1205

---------

Co-authored-by: Eric Vergnaud <[email protected]>
  • Loading branch information
ericvergnaud and ericvergnaud authored Jun 10, 2024
1 parent b9ca2e5 commit 6c67a13
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 56 deletions.
54 changes: 32 additions & 22 deletions src/databricks/labs/ucx/source_code/linters/dbfs.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
import logging
from collections.abc import Iterable

from astroid import Call, Const # type: ignore
from astroid import Call, Const, InferenceError, NodeNG # type: ignore
import sqlglot
from sqlglot.expressions import Table

from databricks.labs.ucx.source_code.base import Advice, Linter, Advisory, Deprecation
from databricks.labs.ucx.source_code.base import Advice, Linter, Deprecation
from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeVisitor


logger = logging.getLogger(__file__)


class DetectDbfsVisitor(TreeVisitor):
"""
Visitor that detects file system paths in Python code and checks them
Expand All @@ -21,30 +25,36 @@ def __init__(self) -> None:

def visit_call(self, node: Call):
for arg in node.args:
if isinstance(arg, Const) and isinstance(arg.value, str):
value = arg.value
if any(value.startswith(prefix) for prefix in self._fs_prefixes):
deprecation = Deprecation.from_node(
code='dbfs-usage', message=f"Deprecated file system path in call to: {value}", node=arg
)
self._advices.append(deprecation.replace(end_col=arg.col_offset + len(value)))
# Record the location of the reported constant, so we do not double report
self._reported_locations.add((arg.lineno, arg.col_offset))
self._visit_arg(arg)

def _visit_arg(self, arg: NodeNG):
try:
for inferred in arg.inferred():
if isinstance(inferred, Const) and isinstance(inferred.value, str):
self._check_str_constant(arg, inferred)
except InferenceError as e:
logger.debug(f"Could not infer value of {arg.as_string()}", exc_info=e)

def visit_const(self, node: Const):
# Constant strings yield Advisories
if isinstance(node.value, str):
self._check_str_constant(node)

def _check_str_constant(self, node: Const):
# Check if the location has been reported before
if (node.lineno, node.col_offset) not in self._reported_locations:
value = node.value
if any(value.startswith(prefix) for prefix in self._fs_prefixes):
advisory = Advisory.from_node(
code='dbfs-usage', message=f"Possible deprecated file system path: {value}", node=node
)
self._advices.append(advisory.replace(end_col=node.col_offset + len(value)))
self._check_str_constant(node, node)

def _check_str_constant(self, source_node, const_node: Const):
if self._already_reported(source_node, const_node):
return
value = const_node.value
if any(value.startswith(prefix) for prefix in self._fs_prefixes):
advisory = Deprecation.from_node(
code='dbfs-usage', message=f"Deprecated file system path: {value}", node=source_node
)
self._advices.append(advisory)

def _already_reported(self, *nodes: NodeNG):
reported = any((node.lineno, node.col_offset) in self._reported_locations for node in nodes)
for node in nodes:
self._reported_locations.add((node.lineno, node.col_offset))
return reported

def get_advices(self) -> Iterable[Advice]:
yield from self._advices
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/source_code/test_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,8 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job,
expected_messages = {
"second_notebook:3 [direct-filesystem-access] The use of default dbfs: references is deprecated: /mnt/something",
"some_file.py:0 [direct-filesystem-access] The use of default dbfs: references is deprecated: /mnt/foo/bar",
"some_file.py:0 [dbfs-usage] Deprecated file system path in call to: /mnt/foo/bar",
"second_notebook:3 [dbfs-usage] Deprecated file system path in call to: /mnt/something",
"some_file.py:0 [dbfs-usage] Deprecated file system path: /mnt/foo/bar",
"second_notebook:3 [dbfs-usage] Deprecated file system path: /mnt/something",
}

entrypoint = WorkspacePath(ws, f"~/linter-{make_random(4)}").expanduser()
Expand Down
14 changes: 11 additions & 3 deletions tests/unit/source_code/linters/test_dbfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class TestDetectDBFS:
('SOME_CONSTANT = 42; load_data(SOME_CONSTANT)', 0),
],
)
def test_detects_dbfs_str_const_paths(self, code, expected):
def test_detects_dbfs_paths(self, code, expected):
linter = DBFSUsageLinter()
advices = list(linter.lint(code))
for advice in advices:
Expand All @@ -33,8 +33,16 @@ def test_detects_dbfs_str_const_paths(self, code, expected):
('spark.read.parquet("/mnt/foo/bar")', 1),
('spark.read.parquet("dbfs:/mnt/foo/bar")', 1),
('spark.read.parquet("dbfs://mnt/foo/bar")', 1),
# Would need a stateful linter to detect this next one
('DBFS="dbfs:/mnt/foo/bar"; spark.read.parquet(DBFS)', 0),
('DBFS="dbfs:/mnt/foo/bar"; spark.read.parquet(DBFS)', 1),
(
"""
DBFS1="dbfs:/mnt/foo/bar1"
systems=[DBFS1, "dbfs:/mnt/foo/bar2"]
for system in systems:
spark.read.parquet(system)
""",
2,
),
],
)
def test_dbfs_usage_linter(self, code, expected):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

# COMMAND ----------

# ucx[dbfs-usage:+2:23:+2:40] Deprecated file system path in call to: /mnt/things/e/f/g
# ucx[dbfs-usage:+2:23:+2:42] Deprecated file system path: /mnt/things/e/f/g
# ucx[direct-filesystem-access:+1:8:+1:43] The use of default dbfs: references is deprecated: /mnt/things/e/f/g
display(spark.read.csv('/mnt/things/e/f/g'))

Expand Down
56 changes: 28 additions & 28 deletions tests/unit/source_code/test_notebook_linter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from databricks.sdk.service.workspace import Language

from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex
from databricks.labs.ucx.source_code.base import Deprecation, Advisory, Advice
from databricks.labs.ucx.source_code.base import Deprecation, Advice
from databricks.labs.ucx.source_code.notebooks.sources import NotebookLinter

index = MigrationIndex([])
Expand Down Expand Up @@ -41,19 +41,19 @@
),
Deprecation(
code='direct-filesystem-access',
message='The use of default dbfs: references is deprecated: ' '/mnt/things/e/f/g',
message='The use of default dbfs: references is deprecated: /mnt/things/e/f/g',
start_line=14,
start_col=8,
end_line=14,
end_col=43,
),
Deprecation(
code='dbfs-usage',
message='Deprecated file system path in call to: /mnt/things/e/f/g',
message='Deprecated file system path: /mnt/things/e/f/g',
start_line=14,
start_col=23,
end_line=14,
end_col=40,
end_col=42,
),
],
),
Expand Down Expand Up @@ -100,11 +100,11 @@
),
Deprecation(
code='dbfs-usage',
message='Deprecated file system path in call to: /mnt/things/e/f/g',
message='Deprecated file system path: /mnt/things/e/f/g',
start_line=5,
start_col=23,
end_line=5,
end_col=40,
end_col=42,
),
Deprecation(
code='dbfs-query',
Expand Down Expand Up @@ -194,77 +194,77 @@
end_line=17,
end_col=40,
),
Advisory(
Deprecation(
code='dbfs-usage',
message='Possible deprecated file system path: dbfs:/...',
message='Deprecated file system path: dbfs:/...',
start_line=7,
start_col=7,
end_line=7,
end_col=16,
end_col=18,
),
Advisory(
Deprecation(
code='dbfs-usage',
message='Possible deprecated file system path: /dbfs/mnt',
message='Deprecated file system path: /dbfs/mnt',
start_line=8,
start_col=7,
end_line=8,
end_col=16,
end_col=18,
),
Advisory(
Deprecation(
code='dbfs-usage',
message='Possible deprecated file system path: /mnt/',
message='Deprecated file system path: /mnt/',
start_line=9,
start_col=7,
end_line=9,
end_col=12,
end_col=14,
),
Advisory(
Deprecation(
code='dbfs-usage',
message='Possible deprecated file system path: dbfs:/...',
message='Deprecated file system path: dbfs:/...',
start_line=10,
start_col=7,
end_line=10,
end_col=16,
end_col=18,
),
Deprecation(
code='dbfs-usage',
message='Deprecated file system path in call to: /dbfs/mnt/data',
message='Deprecated file system path: /dbfs/mnt/data',
start_line=11,
start_col=10,
end_line=11,
end_col=24,
end_col=26,
),
Deprecation(
code='dbfs-usage',
message='Deprecated file system path in call to: /dbfs/mnt/data',
message='Deprecated file system path: /dbfs/mnt/data',
start_line=13,
start_col=10,
end_line=13,
end_col=24,
end_col=26,
),
Deprecation(
code='dbfs-usage',
message='Deprecated file system path in call to: /mnt/foo/bar',
message='Deprecated file system path: /mnt/foo/bar',
start_line=15,
start_col=19,
end_line=15,
end_col=31,
end_col=33,
),
Deprecation(
code='dbfs-usage',
message='Deprecated file system path in call to: dbfs:/mnt/foo/bar',
message='Deprecated file system path: dbfs:/mnt/foo/bar',
start_line=16,
start_col=19,
end_line=16,
end_col=36,
end_col=38,
),
Deprecation(
code='dbfs-usage',
message='Deprecated file system path in call to: dbfs://mnt/foo/bar',
message='Deprecated file system path: dbfs://mnt/foo/bar',
start_line=17,
start_col=19,
end_line=17,
end_col=37,
end_col=39,
),
Deprecation(
code='dbfs-query',
Expand Down

0 comments on commit 6c67a13

Please sign in to comment.