From 6c67a1315b36f7174ded5e075b9373f6520e0347 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 10 Jun 2024 19:32:03 +0200 Subject: [PATCH] Support inferred values when linting DBFS mounts (#1868) ## Changes Implement value inference Improve consistency of generated advices ### Linked issues Progresses #1205 --------- Co-authored-by: Eric Vergnaud --- .../labs/ucx/source_code/linters/dbfs.py | 54 ++++++++++-------- tests/integration/source_code/test_jobs.py | 4 +- tests/unit/source_code/linters/test_dbfs.py | 14 ++++- .../functional/file-access/direct-fs.py | 2 +- .../unit/source_code/test_notebook_linter.py | 56 +++++++++---------- 5 files changed, 74 insertions(+), 56 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/dbfs.py b/src/databricks/labs/ucx/source_code/linters/dbfs.py index b7f709a06b..0ab732d21e 100644 --- a/src/databricks/labs/ucx/source_code/linters/dbfs.py +++ b/src/databricks/labs/ucx/source_code/linters/dbfs.py @@ -1,13 +1,17 @@ +import logging from collections.abc import Iterable -from astroid import Call, Const # type: ignore +from astroid import Call, Const, InferenceError, NodeNG # type: ignore import sqlglot from sqlglot.expressions import Table -from databricks.labs.ucx.source_code.base import Advice, Linter, Advisory, Deprecation +from databricks.labs.ucx.source_code.base import Advice, Linter, Deprecation from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeVisitor +logger = logging.getLogger(__file__) + + class DetectDbfsVisitor(TreeVisitor): """ Visitor that detects file system paths in Python code and checks them @@ -21,30 +25,36 @@ def __init__(self) -> None: def visit_call(self, node: Call): for arg in node.args: - if isinstance(arg, Const) and isinstance(arg.value, str): - value = arg.value - if any(value.startswith(prefix) for prefix in self._fs_prefixes): - deprecation = Deprecation.from_node( - code='dbfs-usage', message=f"Deprecated file system path in call to: {value}", node=arg - ) - self._advices.append(deprecation.replace(end_col=arg.col_offset + len(value))) - # Record the location of the reported constant, so we do not double report - self._reported_locations.add((arg.lineno, arg.col_offset)) + self._visit_arg(arg) + + def _visit_arg(self, arg: NodeNG): + try: + for inferred in arg.inferred(): + if isinstance(inferred, Const) and isinstance(inferred.value, str): + self._check_str_constant(arg, inferred) + except InferenceError as e: + logger.debug(f"Could not infer value of {arg.as_string()}", exc_info=e) def visit_const(self, node: Const): # Constant strings yield Advisories if isinstance(node.value, str): - self._check_str_constant(node) - - def _check_str_constant(self, node: Const): - # Check if the location has been reported before - if (node.lineno, node.col_offset) not in self._reported_locations: - value = node.value - if any(value.startswith(prefix) for prefix in self._fs_prefixes): - advisory = Advisory.from_node( - code='dbfs-usage', message=f"Possible deprecated file system path: {value}", node=node - ) - self._advices.append(advisory.replace(end_col=node.col_offset + len(value))) + self._check_str_constant(node, node) + + def _check_str_constant(self, source_node, const_node: Const): + if self._already_reported(source_node, const_node): + return + value = const_node.value + if any(value.startswith(prefix) for prefix in self._fs_prefixes): + advisory = Deprecation.from_node( + code='dbfs-usage', message=f"Deprecated file system path: {value}", node=source_node + ) + self._advices.append(advisory) + + def _already_reported(self, *nodes: NodeNG): + reported = any((node.lineno, node.col_offset) in self._reported_locations for node in nodes) + for node in nodes: + self._reported_locations.add((node.lineno, node.col_offset)) + return reported def get_advices(self) -> Iterable[Advice]: yield from self._advices diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index 751329c823..20cea4a1c4 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -114,8 +114,8 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, expected_messages = { "second_notebook:3 [direct-filesystem-access] The use of default dbfs: references is deprecated: /mnt/something", "some_file.py:0 [direct-filesystem-access] The use of default dbfs: references is deprecated: /mnt/foo/bar", - "some_file.py:0 [dbfs-usage] Deprecated file system path in call to: /mnt/foo/bar", - "second_notebook:3 [dbfs-usage] Deprecated file system path in call to: /mnt/something", + "some_file.py:0 [dbfs-usage] Deprecated file system path: /mnt/foo/bar", + "second_notebook:3 [dbfs-usage] Deprecated file system path: /mnt/something", } entrypoint = WorkspacePath(ws, f"~/linter-{make_random(4)}").expanduser() diff --git a/tests/unit/source_code/linters/test_dbfs.py b/tests/unit/source_code/linters/test_dbfs.py index fc2f562fd2..88c8b8f805 100644 --- a/tests/unit/source_code/linters/test_dbfs.py +++ b/tests/unit/source_code/linters/test_dbfs.py @@ -16,7 +16,7 @@ class TestDetectDBFS: ('SOME_CONSTANT = 42; load_data(SOME_CONSTANT)', 0), ], ) - def test_detects_dbfs_str_const_paths(self, code, expected): + def test_detects_dbfs_paths(self, code, expected): linter = DBFSUsageLinter() advices = list(linter.lint(code)) for advice in advices: @@ -33,8 +33,16 @@ def test_detects_dbfs_str_const_paths(self, code, expected): ('spark.read.parquet("/mnt/foo/bar")', 1), ('spark.read.parquet("dbfs:/mnt/foo/bar")', 1), ('spark.read.parquet("dbfs://mnt/foo/bar")', 1), - # Would need a stateful linter to detect this next one - ('DBFS="dbfs:/mnt/foo/bar"; spark.read.parquet(DBFS)', 0), + ('DBFS="dbfs:/mnt/foo/bar"; spark.read.parquet(DBFS)', 1), + ( + """ +DBFS1="dbfs:/mnt/foo/bar1" +systems=[DBFS1, "dbfs:/mnt/foo/bar2"] +for system in systems: + spark.read.parquet(system) +""", + 2, + ), ], ) def test_dbfs_usage_linter(self, code, expected): diff --git a/tests/unit/source_code/samples/functional/file-access/direct-fs.py b/tests/unit/source_code/samples/functional/file-access/direct-fs.py index 7d2eccf4a8..9c3f83228d 100644 --- a/tests/unit/source_code/samples/functional/file-access/direct-fs.py +++ b/tests/unit/source_code/samples/functional/file-access/direct-fs.py @@ -3,7 +3,7 @@ # COMMAND ---------- -# ucx[dbfs-usage:+2:23:+2:40] Deprecated file system path in call to: /mnt/things/e/f/g +# ucx[dbfs-usage:+2:23:+2:42] Deprecated file system path: /mnt/things/e/f/g # ucx[direct-filesystem-access:+1:8:+1:43] The use of default dbfs: references is deprecated: /mnt/things/e/f/g display(spark.read.csv('/mnt/things/e/f/g')) diff --git a/tests/unit/source_code/test_notebook_linter.py b/tests/unit/source_code/test_notebook_linter.py index 8879c4135a..d507f03b38 100644 --- a/tests/unit/source_code/test_notebook_linter.py +++ b/tests/unit/source_code/test_notebook_linter.py @@ -2,7 +2,7 @@ from databricks.sdk.service.workspace import Language from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex -from databricks.labs.ucx.source_code.base import Deprecation, Advisory, Advice +from databricks.labs.ucx.source_code.base import Deprecation, Advice from databricks.labs.ucx.source_code.notebooks.sources import NotebookLinter index = MigrationIndex([]) @@ -41,7 +41,7 @@ ), Deprecation( code='direct-filesystem-access', - message='The use of default dbfs: references is deprecated: ' '/mnt/things/e/f/g', + message='The use of default dbfs: references is deprecated: /mnt/things/e/f/g', start_line=14, start_col=8, end_line=14, @@ -49,11 +49,11 @@ ), Deprecation( code='dbfs-usage', - message='Deprecated file system path in call to: /mnt/things/e/f/g', + message='Deprecated file system path: /mnt/things/e/f/g', start_line=14, start_col=23, end_line=14, - end_col=40, + end_col=42, ), ], ), @@ -100,11 +100,11 @@ ), Deprecation( code='dbfs-usage', - message='Deprecated file system path in call to: /mnt/things/e/f/g', + message='Deprecated file system path: /mnt/things/e/f/g', start_line=5, start_col=23, end_line=5, - end_col=40, + end_col=42, ), Deprecation( code='dbfs-query', @@ -194,77 +194,77 @@ end_line=17, end_col=40, ), - Advisory( + Deprecation( code='dbfs-usage', - message='Possible deprecated file system path: dbfs:/...', + message='Deprecated file system path: dbfs:/...', start_line=7, start_col=7, end_line=7, - end_col=16, + end_col=18, ), - Advisory( + Deprecation( code='dbfs-usage', - message='Possible deprecated file system path: /dbfs/mnt', + message='Deprecated file system path: /dbfs/mnt', start_line=8, start_col=7, end_line=8, - end_col=16, + end_col=18, ), - Advisory( + Deprecation( code='dbfs-usage', - message='Possible deprecated file system path: /mnt/', + message='Deprecated file system path: /mnt/', start_line=9, start_col=7, end_line=9, - end_col=12, + end_col=14, ), - Advisory( + Deprecation( code='dbfs-usage', - message='Possible deprecated file system path: dbfs:/...', + message='Deprecated file system path: dbfs:/...', start_line=10, start_col=7, end_line=10, - end_col=16, + end_col=18, ), Deprecation( code='dbfs-usage', - message='Deprecated file system path in call to: /dbfs/mnt/data', + message='Deprecated file system path: /dbfs/mnt/data', start_line=11, start_col=10, end_line=11, - end_col=24, + end_col=26, ), Deprecation( code='dbfs-usage', - message='Deprecated file system path in call to: /dbfs/mnt/data', + message='Deprecated file system path: /dbfs/mnt/data', start_line=13, start_col=10, end_line=13, - end_col=24, + end_col=26, ), Deprecation( code='dbfs-usage', - message='Deprecated file system path in call to: /mnt/foo/bar', + message='Deprecated file system path: /mnt/foo/bar', start_line=15, start_col=19, end_line=15, - end_col=31, + end_col=33, ), Deprecation( code='dbfs-usage', - message='Deprecated file system path in call to: dbfs:/mnt/foo/bar', + message='Deprecated file system path: dbfs:/mnt/foo/bar', start_line=16, start_col=19, end_line=16, - end_col=36, + end_col=38, ), Deprecation( code='dbfs-usage', - message='Deprecated file system path in call to: dbfs://mnt/foo/bar', + message='Deprecated file system path: dbfs://mnt/foo/bar', start_line=17, start_col=19, end_line=17, - end_col=37, + end_col=39, ), Deprecation( code='dbfs-query',