From 2f7ec5835ffe669f6fcc6bb059f1a61ddc85daf6 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 13 Jun 2024 17:47:33 +0200 Subject: [PATCH 01/14] support magic commands in python cells --- src/databricks/labs/ucx/source_code/graph.py | 80 ++++++++++++++++++- .../labs/ucx/source_code/linters/imports.py | 10 +-- .../labs/ucx/source_code/notebooks/cells.py | 30 +------ .../ucx/source_code/notebooks/commands.py | 48 +++++++++++ .../unit/source_code/notebooks/test_cells.py | 41 +++------- .../source_code/notebooks/test_commands.py | 32 ++++++++ 6 files changed, 176 insertions(+), 65 deletions(-) create mode 100644 src/databricks/labs/ucx/source_code/notebooks/commands.py create mode 100644 tests/unit/source_code/notebooks/test_commands.py diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index f5f3691bcd..be0bf65b04 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -1,13 +1,20 @@ from __future__ import annotations import abc +import base64 import logging from dataclasses import dataclass from pathlib import Path from collections.abc import Callable - -from astroid import ImportFrom, NodeNG # type: ignore - +from typing import TypeVar, cast + +from astroid import ( # type: ignore + Call, + Const, + ImportFrom, + Name, + NodeNG, +) from databricks.labs.ucx.source_code.base import Advisory from databricks.labs.ucx.source_code.linters.imports import ( DbutilsLinter, @@ -15,6 +22,7 @@ NotebookRunCall, SysPathChange, UnresolvedPath, + ProblemFactory, ) from databricks.labs.ucx.source_code.linters.python_ast import Tree, NodeBase from databricks.labs.ucx.source_code.path_lookup import PathLookup @@ -171,6 +179,7 @@ def build_graph_from_python_source(self, python_code: str) -> list[DependencyPro Returns: A list of dependency problems; position information is relative to the python code itself. """ + python_code = self.convert_magic_commands(python_code) problems: list[DependencyProblem] = [] try: tree = Tree.parse(python_code) @@ -183,7 +192,9 @@ def build_graph_from_python_source(self, python_code: str) -> list[DependencyPro import_problems: list[DependencyProblem] import_sources, import_problems = ImportSource.extract_from_tree(tree, DependencyProblem.from_node) problems.extend(import_problems) - nodes = syspath_changes + run_calls + import_sources + magic_commands, command_problems = MagicCommand.extract_from_tree(tree, DependencyProblem.from_node) + problems.extend(command_problems) + nodes = syspath_changes + run_calls + import_sources + magic_commands # need to execute things in intertwined sequence so concat and sort for base_node in sorted(nodes, key=lambda node: (node.node.lineno, node.node.col_offset)): for problem in self._process_node(base_node): @@ -197,6 +208,16 @@ def build_graph_from_python_source(self, python_code: str) -> list[DependencyPro problems.append(problem) return problems + @classmethod + def convert_magic_commands(cls, python_code: str): + lines = python_code.split("\n") + for i, line in enumerate(lines): + if not line.startswith("%"): + continue + content = base64.b64encode(line.encode()) # use base64 to simplify escaping + lines[i] = f"magic_command('{content.decode('ascii')}')" + return "\n".join(lines) + def _process_node(self, base_node: NodeBase): if isinstance(base_node, SysPathChange): yield from self._mutate_path_lookup(base_node) @@ -204,9 +225,14 @@ def _process_node(self, base_node: NodeBase): yield from self._register_notebook(base_node) elif isinstance(base_node, ImportSource): yield from self._register_import(base_node) + elif isinstance(base_node, MagicCommand): + yield from self._execute_command(base_node) else: logger.warning(f"Can't process {NodeBase.__name__} of type {type(base_node).__name__}") + def _execute_command(self, base_node: MagicCommand): + return base_node.execute(return_type=list[DependencyProblem], command="build_dependency_graph", graph=self) + def _register_import(self, base_node: ImportSource): prefix = "" if isinstance(base_node.node, ImportFrom) and base_node.node.level is not None: @@ -480,3 +506,49 @@ class MaybeGraph: @property def failed(self): return len(self.problems) > 0 + + +T = TypeVar("T") + +# non-top-level import is required to avoid cyclic dependency +# pylint: disable=wrong-import-position, cyclic-import +from databricks.labs.ucx.source_code.notebooks.commands import PipCommand # noqa: E402 + + +class MagicCommand(NodeBase): + + @classmethod + def extract_from_tree( + cls, tree: Tree, problem_factory: ProblemFactory + ) -> tuple[list[MagicCommand], list[DependencyProblem]]: + problems: list[DependencyProblem] = [] + commands: list[MagicCommand] = [] + try: + nodes = tree.locate(Call, [("magic_command", Name)]) + for command in cls._make_commands_for_magic_command_call_nodes(nodes): + commands.append(command) + return commands, problems + except Exception as e: # pylint: disable=broad-except + problem = problem_factory('internal-error', f"While checking magic commands: {e}", tree.root) + problems.append(problem) + return [], problems + + @classmethod + def _make_commands_for_magic_command_call_nodes(cls, nodes: list[Call]): + for node in nodes: + arg = node.args[0] + if isinstance(arg, Const): + yield MagicCommand(node, arg.value) + + def __init__(self, node: NodeNG, command: str): + super().__init__(node) + data = base64.b64decode(command) + self._command = data.decode("ascii") + + def execute(self, return_type: T, **kwargs) -> T: + """so many different things can happen here that we give up type hints""" + if self._command.startswith("%pip"): + cmd = PipCommand(self._command) + return cmd.execute(return_type, **kwargs) + logger.debug(f"Unsupported magic command {self._command[:self._command.index(' ')]}") + return cast(T, None) diff --git a/src/databricks/labs/ucx/source_code/linters/imports.py b/src/databricks/labs/ucx/source_code/linters/imports.py index a9a7e60c90..e5405f8e9e 100644 --- a/src/databricks/labs/ucx/source_code/linters/imports.py +++ b/src/databricks/labs/ucx/source_code/linters/imports.py @@ -21,15 +21,15 @@ logger = logging.getLogger(__name__) -P = TypeVar("P") -ProblemFactory = Callable[[str, str, NodeNG], P] +T = TypeVar("T") +ProblemFactory = Callable[[str, str, NodeNG], T] class ImportSource(NodeBase): @classmethod - def extract_from_tree(cls, tree: Tree, problem_factory: ProblemFactory) -> tuple[list[ImportSource], list[P]]: - problems: list[P] = [] + def extract_from_tree(cls, tree: Tree, problem_factory: ProblemFactory) -> tuple[list[ImportSource], list[T]]: + problems: list[T] = [] sources: list[ImportSource] = [] try: # pylint: disable=too-many-try-statements nodes = tree.locate(Import, []) @@ -61,7 +61,7 @@ def _make_sources_for_import_from_nodes(cls, nodes: list[ImportFrom]) -> Iterabl yield ImportSource(node, node.modname) @classmethod - def _make_sources_for_import_call_nodes(cls, nodes: list[Call], problem_factory: ProblemFactory, problems: list[P]): + def _make_sources_for_import_call_nodes(cls, nodes: list[Call], problem_factory: ProblemFactory, problems: list[T]): for node in nodes: arg = node.args[0] if isinstance(arg, Const): diff --git a/src/databricks/labs/ucx/source_code/notebooks/cells.py b/src/databricks/labs/ucx/source_code/notebooks/cells.py index 41298c6dbb..e147a50b99 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/cells.py +++ b/src/databricks/labs/ucx/source_code/notebooks/cells.py @@ -1,16 +1,16 @@ from __future__ import annotations import logging -import re -import shlex from abc import ABC, abstractmethod from ast import parse as parse_python from enum import Enum from pathlib import Path + from sqlglot import parse as parse_sql, ParseError as SQLParseError from databricks.sdk.service.workspace import Language from databricks.labs.ucx.source_code.graph import DependencyGraph, DependencyProblem +from databricks.labs.ucx.source_code.notebooks.commands import PipCommand # use a specific logger for sqlglot warnings so we can disable them selectively sqlglot_logger = logging.getLogger(f"{__name__}.sqlglot") @@ -203,32 +203,8 @@ def language(self): def is_runnable(self) -> bool: return True # TODO - @staticmethod - def _split(code) -> list[str]: - """Split pip cell code into multiple arguments - - Note: - PipCell should be a pip command, i.e. single line possible spanning multilines escaped with backslashes. - - Sources: - https://docs.databricks.com/en/libraries/notebooks-python-libraries.html#manage-libraries-with-pip-commands - """ - match = re.search(r"(? list[DependencyProblem]: - argv = self._split(self.original_code) - if len(argv) == 1: - return [DependencyProblem("library-install-failed", "Missing command after '%pip'")] - if argv[1] != "install": - return [DependencyProblem("library-install-failed", f"Unsupported %pip command: {argv[1]}")] - if len(argv) == 2: - return [DependencyProblem("library-install-failed", "Missing arguments after '%pip install'")] - return graph.register_library(*argv[2:]) # Skipping %pip install + return PipCommand(self.original_code).build_dependency_graph(graph) class CellLanguage(Enum): diff --git a/src/databricks/labs/ucx/source_code/notebooks/commands.py b/src/databricks/labs/ucx/source_code/notebooks/commands.py new file mode 100644 index 0000000000..ce012622e1 --- /dev/null +++ b/src/databricks/labs/ucx/source_code/notebooks/commands.py @@ -0,0 +1,48 @@ +import re +import shlex +from typing import TypeVar, cast + +from databricks.labs.ucx.source_code.graph import DependencyGraph, DependencyProblem + +T = TypeVar("T") + + +class PipCommand: + + def __init__(self, code: str): + self._code = code + + def execute(self, _: T, **kwargs) -> T: + command = kwargs["command"] + if command == "build_dependency_graph" and "graph" in kwargs: + graph = kwargs["graph"] + assert isinstance(graph, DependencyGraph) + return cast(T, self.build_dependency_graph(graph)) + return cast(T, None) + + def build_dependency_graph(self, graph: DependencyGraph) -> list[DependencyProblem]: + argv = self._split(self._code) + if len(argv) == 1: + return [DependencyProblem("library-install-failed", "Missing command after '%pip'")] + if argv[1] != "install": + return [DependencyProblem("library-install-failed", f"Unsupported %pip command: {argv[1]}")] + if len(argv) == 2: + return [DependencyProblem("library-install-failed", "Missing arguments after '%pip install'")] + return graph.register_library(*argv[2:]) # Skipping %pip install + + @staticmethod + def _split(code) -> list[str]: + """Split pip cell code into multiple arguments + + Note: + PipCell should be a pip command, i.e. single line possible spanning multilines escaped with backslashes. + + Sources: + https://docs.databricks.com/en/libraries/notebooks-python-libraries.html#manage-libraries-with-pip-commands + """ + match = re.search(r"(? Date: Fri, 14 Jun 2024 10:49:15 +0200 Subject: [PATCH 02/14] address comments --- src/databricks/labs/ucx/source_code/graph.py | 67 ++------------- .../labs/ucx/source_code/notebooks/cells.py | 4 +- .../ucx/source_code/notebooks/commands.py | 48 ----------- .../labs/ucx/source_code/notebooks/magic.py | 82 +++++++++++++++++++ .../source_code/notebooks/test_commands.py | 4 +- 5 files changed, 95 insertions(+), 110 deletions(-) delete mode 100644 src/databricks/labs/ucx/source_code/notebooks/commands.py create mode 100644 src/databricks/labs/ucx/source_code/notebooks/magic.py diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index be0bf65b04..06bb533350 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -6,13 +6,9 @@ from dataclasses import dataclass from pathlib import Path from collections.abc import Callable -from typing import TypeVar, cast from astroid import ( # type: ignore - Call, - Const, ImportFrom, - Name, NodeNG, ) from databricks.labs.ucx.source_code.base import Advisory @@ -22,7 +18,6 @@ NotebookRunCall, SysPathChange, UnresolvedPath, - ProblemFactory, ) from databricks.labs.ucx.source_code.linters.python_ast import Tree, NodeBase from databricks.labs.ucx.source_code.path_lookup import PathLookup @@ -144,7 +139,7 @@ def local_dependencies(self) -> set[Dependency]: def all_paths(self) -> set[Path]: # TODO: remove this public method, as it'll throw false positives # for package imports, like certifi. a WorkflowTask is also a dependency, - # but it does not exist on a filesyste + # but it does not exist on a filesystem return {d.path for d in self.all_dependencies} def all_relative_names(self) -> set[str]: @@ -179,7 +174,7 @@ def build_graph_from_python_source(self, python_code: str) -> list[DependencyPro Returns: A list of dependency problems; position information is relative to the python code itself. """ - python_code = self.convert_magic_commands(python_code) + python_code = self._convert_magic_commands(python_code) problems: list[DependencyProblem] = [] try: tree = Tree.parse(python_code) @@ -209,13 +204,13 @@ def build_graph_from_python_source(self, python_code: str) -> list[DependencyPro return problems @classmethod - def convert_magic_commands(cls, python_code: str): + def _convert_magic_commands(cls, python_code: str): lines = python_code.split("\n") for i, line in enumerate(lines): if not line.startswith("%"): continue - content = base64.b64encode(line.encode()) # use base64 to simplify escaping - lines[i] = f"magic_command('{content.decode('ascii')}')" + data = base64.b64encode(line.encode()) + lines[i] = f"magic_command(b'{data.decode('ascii')}')" return "\n".join(lines) def _process_node(self, base_node: NodeBase): @@ -226,13 +221,10 @@ def _process_node(self, base_node: NodeBase): elif isinstance(base_node, ImportSource): yield from self._register_import(base_node) elif isinstance(base_node, MagicCommand): - yield from self._execute_command(base_node) + yield from base_node.build_dependency_graph(self) else: logger.warning(f"Can't process {NodeBase.__name__} of type {type(base_node).__name__}") - def _execute_command(self, base_node: MagicCommand): - return base_node.execute(return_type=list[DependencyProblem], command="build_dependency_graph", graph=self) - def _register_import(self, base_node: ImportSource): prefix = "" if isinstance(base_node.node, ImportFrom) and base_node.node.level is not None: @@ -446,7 +438,7 @@ class DependencyProblem: code: str message: str source_path: Path = Path(MISSING_SOURCE_PATH) - # Lines and colums are both 0-based: the first line is line 0. + # Lines and columns are both 0-based: the first line is line 0. start_line: int = -1 start_col: int = -1 end_line: int = -1 @@ -494,7 +486,7 @@ def from_node(code: str, message: str, node: NodeNG) -> DependencyProblem: start_line=(node.lineno or 1) - 1, start_col=node.col_offset, end_line=(node.end_lineno or 1) - 1, - end_col=(node.end_col_offset), + end_col=(node.end_col_offset or 0), ) @@ -508,47 +500,6 @@ def failed(self): return len(self.problems) > 0 -T = TypeVar("T") - # non-top-level import is required to avoid cyclic dependency # pylint: disable=wrong-import-position, cyclic-import -from databricks.labs.ucx.source_code.notebooks.commands import PipCommand # noqa: E402 - - -class MagicCommand(NodeBase): - - @classmethod - def extract_from_tree( - cls, tree: Tree, problem_factory: ProblemFactory - ) -> tuple[list[MagicCommand], list[DependencyProblem]]: - problems: list[DependencyProblem] = [] - commands: list[MagicCommand] = [] - try: - nodes = tree.locate(Call, [("magic_command", Name)]) - for command in cls._make_commands_for_magic_command_call_nodes(nodes): - commands.append(command) - return commands, problems - except Exception as e: # pylint: disable=broad-except - problem = problem_factory('internal-error', f"While checking magic commands: {e}", tree.root) - problems.append(problem) - return [], problems - - @classmethod - def _make_commands_for_magic_command_call_nodes(cls, nodes: list[Call]): - for node in nodes: - arg = node.args[0] - if isinstance(arg, Const): - yield MagicCommand(node, arg.value) - - def __init__(self, node: NodeNG, command: str): - super().__init__(node) - data = base64.b64decode(command) - self._command = data.decode("ascii") - - def execute(self, return_type: T, **kwargs) -> T: - """so many different things can happen here that we give up type hints""" - if self._command.startswith("%pip"): - cmd = PipCommand(self._command) - return cmd.execute(return_type, **kwargs) - logger.debug(f"Unsupported magic command {self._command[:self._command.index(' ')]}") - return cast(T, None) +from databricks.labs.ucx.source_code.notebooks.commands import MagicCommand # noqa: E402 diff --git a/src/databricks/labs/ucx/source_code/notebooks/cells.py b/src/databricks/labs/ucx/source_code/notebooks/cells.py index e147a50b99..56d27efde4 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/cells.py +++ b/src/databricks/labs/ucx/source_code/notebooks/cells.py @@ -10,7 +10,7 @@ from databricks.sdk.service.workspace import Language from databricks.labs.ucx.source_code.graph import DependencyGraph, DependencyProblem -from databricks.labs.ucx.source_code.notebooks.commands import PipCommand +from databricks.labs.ucx.source_code.notebooks.commands import PipMagic # use a specific logger for sqlglot warnings so we can disable them selectively sqlglot_logger = logging.getLogger(f"{__name__}.sqlglot") @@ -204,7 +204,7 @@ def is_runnable(self) -> bool: return True # TODO def build_dependency_graph(self, graph: DependencyGraph) -> list[DependencyProblem]: - return PipCommand(self.original_code).build_dependency_graph(graph) + return PipMagic(self.original_code).build_dependency_graph(graph) class CellLanguage(Enum): diff --git a/src/databricks/labs/ucx/source_code/notebooks/commands.py b/src/databricks/labs/ucx/source_code/notebooks/commands.py deleted file mode 100644 index ce012622e1..0000000000 --- a/src/databricks/labs/ucx/source_code/notebooks/commands.py +++ /dev/null @@ -1,48 +0,0 @@ -import re -import shlex -from typing import TypeVar, cast - -from databricks.labs.ucx.source_code.graph import DependencyGraph, DependencyProblem - -T = TypeVar("T") - - -class PipCommand: - - def __init__(self, code: str): - self._code = code - - def execute(self, _: T, **kwargs) -> T: - command = kwargs["command"] - if command == "build_dependency_graph" and "graph" in kwargs: - graph = kwargs["graph"] - assert isinstance(graph, DependencyGraph) - return cast(T, self.build_dependency_graph(graph)) - return cast(T, None) - - def build_dependency_graph(self, graph: DependencyGraph) -> list[DependencyProblem]: - argv = self._split(self._code) - if len(argv) == 1: - return [DependencyProblem("library-install-failed", "Missing command after '%pip'")] - if argv[1] != "install": - return [DependencyProblem("library-install-failed", f"Unsupported %pip command: {argv[1]}")] - if len(argv) == 2: - return [DependencyProblem("library-install-failed", "Missing arguments after '%pip install'")] - return graph.register_library(*argv[2:]) # Skipping %pip install - - @staticmethod - def _split(code) -> list[str]: - """Split pip cell code into multiple arguments - - Note: - PipCell should be a pip command, i.e. single line possible spanning multilines escaped with backslashes. - - Sources: - https://docs.databricks.com/en/libraries/notebooks-python-libraries.html#manage-libraries-with-pip-commands - """ - match = re.search(r"(? tuple[list[MagicCommand], list[DependencyProblem]]: + problems: list[DependencyProblem] = [] + commands: list[MagicCommand] = [] + try: + nodes = tree.locate(Call, [("magic_command", Name)]) + for command in cls._make_commands_for_magic_command_call_nodes(nodes): + commands.append(command) + return commands, problems + except Exception as e: # pylint: disable=broad-except + problem = problem_factory('internal-error', f"While checking magic commands: {e}", tree.root) + problems.append(problem) + return [], problems + + @classmethod + def _make_commands_for_magic_command_call_nodes(cls, nodes: list[Call]): + for node in nodes: + arg = node.args[0] + if isinstance(arg, Const): + yield MagicCommand(node, arg.value) + + def __init__(self, node: NodeNG, command: str): + super().__init__(node) + data = base64.b64decode(command) + self._command = data.decode("ascii") + + def build_dependency_graph(self, graph: DependencyGraph) -> list[DependencyProblem]: + if self._command.startswith("%pip"): + cmd = PipMagic(self._command) + return cmd.build_dependency_graph(graph) + logger.debug(f"Unsupported magic command {self._command[:self._command.index(' ')]}") + return [] + + +class PipMagic: + + def __init__(self, code: str): + self._code = code + + def build_dependency_graph(self, graph: DependencyGraph) -> list[DependencyProblem]: + argv = self._split(self._code) + if len(argv) == 1: + return [DependencyProblem("library-install-failed", "Missing command after '%pip'")] + if argv[1] != "install": + return [DependencyProblem("library-install-failed", f"Unsupported %pip command: {argv[1]}")] + if len(argv) == 2: + return [DependencyProblem("library-install-failed", "Missing arguments after '%pip install'")] + return graph.register_library(*argv[2:]) # Skipping %pip install + + @staticmethod + def _split(code) -> list[str]: + """Split pip cell code into multiple arguments + + Note: + PipCell should be a pip command, i.e. single line possible spanning multilines escaped with backslashes. + + Sources: + https://docs.databricks.com/en/libraries/notebooks-python-libraries.html#manage-libraries-with-pip-commands + """ + match = re.search(r"(? Date: Fri, 14 Jun 2024 11:07:13 +0200 Subject: [PATCH 03/14] address comments --- src/databricks/labs/ucx/source_code/graph.py | 6 ++---- src/databricks/labs/ucx/source_code/notebooks/cells.py | 2 +- src/databricks/labs/ucx/source_code/notebooks/magic.py | 6 ++---- .../notebooks/{test_commands.py => test_magic.py} | 2 +- 4 files changed, 6 insertions(+), 10 deletions(-) rename tests/unit/source_code/notebooks/{test_commands.py => test_magic.py} (95%) diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index 06bb533350..63547c7942 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -1,7 +1,6 @@ from __future__ import annotations import abc -import base64 import logging from dataclasses import dataclass from pathlib import Path @@ -209,8 +208,7 @@ def _convert_magic_commands(cls, python_code: str): for i, line in enumerate(lines): if not line.startswith("%"): continue - data = base64.b64encode(line.encode()) - lines[i] = f"magic_command(b'{data.decode('ascii')}')" + lines[i] = f"magic_command({line.encode()!r})" return "\n".join(lines) def _process_node(self, base_node: NodeBase): @@ -502,4 +500,4 @@ def failed(self): # non-top-level import is required to avoid cyclic dependency # pylint: disable=wrong-import-position, cyclic-import -from databricks.labs.ucx.source_code.notebooks.commands import MagicCommand # noqa: E402 +from databricks.labs.ucx.source_code.notebooks.magic import MagicCommand # noqa: E402 diff --git a/src/databricks/labs/ucx/source_code/notebooks/cells.py b/src/databricks/labs/ucx/source_code/notebooks/cells.py index 56d27efde4..68e4936763 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/cells.py +++ b/src/databricks/labs/ucx/source_code/notebooks/cells.py @@ -10,7 +10,7 @@ from databricks.sdk.service.workspace import Language from databricks.labs.ucx.source_code.graph import DependencyGraph, DependencyProblem -from databricks.labs.ucx.source_code.notebooks.commands import PipMagic +from databricks.labs.ucx.source_code.notebooks.magic import PipMagic # use a specific logger for sqlglot warnings so we can disable them selectively sqlglot_logger = logging.getLogger(f"{__name__}.sqlglot") diff --git a/src/databricks/labs/ucx/source_code/notebooks/magic.py b/src/databricks/labs/ucx/source_code/notebooks/magic.py index e3506ea655..e811fe5169 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/magic.py +++ b/src/databricks/labs/ucx/source_code/notebooks/magic.py @@ -1,6 +1,5 @@ from __future__ import annotations -import base64 import re import shlex @@ -36,10 +35,9 @@ def _make_commands_for_magic_command_call_nodes(cls, nodes: list[Call]): if isinstance(arg, Const): yield MagicCommand(node, arg.value) - def __init__(self, node: NodeNG, command: str): + def __init__(self, node: NodeNG, command: bytes): super().__init__(node) - data = base64.b64decode(command) - self._command = data.decode("ascii") + self._command = command.decode() def build_dependency_graph(self, graph: DependencyGraph) -> list[DependencyProblem]: if self._command.startswith("%pip"): diff --git a/tests/unit/source_code/notebooks/test_commands.py b/tests/unit/source_code/notebooks/test_magic.py similarity index 95% rename from tests/unit/source_code/notebooks/test_commands.py rename to tests/unit/source_code/notebooks/test_magic.py index 1f47913e8d..741275e5f9 100644 --- a/tests/unit/source_code/notebooks/test_commands.py +++ b/tests/unit/source_code/notebooks/test_magic.py @@ -1,6 +1,6 @@ import pytest -from databricks.labs.ucx.source_code.notebooks.commands import PipMagic +from databricks.labs.ucx.source_code.notebooks.magic import PipMagic @pytest.mark.parametrize( From f24ab05b22e8f60e1676b51710c11dfe0377d9e0 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 14 Jun 2024 11:14:46 +0200 Subject: [PATCH 04/14] fix merge issues --- tests/unit/source_code/notebooks/test_cells.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/source_code/notebooks/test_cells.py b/tests/unit/source_code/notebooks/test_cells.py index a39402d45d..1254a7d48a 100644 --- a/tests/unit/source_code/notebooks/test_cells.py +++ b/tests/unit/source_code/notebooks/test_cells.py @@ -172,6 +172,6 @@ def test_parses_python_cell_with_magic_commands(simple_dependency_resolver, mock """ cell = PythonCell(code, original_offset=1) dependency = Dependency(FileLoader(), Path("")) - graph = DependencyGraph(dependency, None, simple_dependency_resolver, mock_path_lookup) + graph = DependencyGraph(dependency, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState()) problems = cell.build_dependency_graph(graph) assert not problems From 5433d90a770838844b0f8e355fc4dbe3221c428e Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 14 Jun 2024 12:06:34 +0200 Subject: [PATCH 05/14] raise problem with unsupported magics --- src/databricks/labs/ucx/source_code/graph.py | 11 +--------- .../labs/ucx/source_code/notebooks/magic.py | 19 ++++++++++++++--- .../unit/source_code/notebooks/test_magic.py | 21 ++++++++++++++++++- 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index e9f0d67bc3..e7f5bc348f 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -175,7 +175,7 @@ def build_graph_from_python_source(self, python_code: str) -> list[DependencyPro Returns: A list of dependency problems; position information is relative to the python code itself. """ - python_code = self._convert_magic_commands(python_code) + python_code = MagicCommand.convert_magic_lines_to_magic_commands(python_code) problems: list[DependencyProblem] = [] try: tree = Tree.parse(python_code) @@ -204,15 +204,6 @@ def build_graph_from_python_source(self, python_code: str) -> list[DependencyPro problems.append(problem) return problems - @classmethod - def _convert_magic_commands(cls, python_code: str): - lines = python_code.split("\n") - for i, line in enumerate(lines): - if not line.startswith("%"): - continue - lines[i] = f"magic_command({line.encode()!r})" - return "\n".join(lines) - def _process_node(self, base_node: NodeBase): if isinstance(base_node, SysPathChange): yield from self._mutate_path_lookup(base_node) diff --git a/src/databricks/labs/ucx/source_code/notebooks/magic.py b/src/databricks/labs/ucx/source_code/notebooks/magic.py index e811fe5169..4a4beb1a2e 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/magic.py +++ b/src/databricks/labs/ucx/source_code/notebooks/magic.py @@ -5,13 +5,22 @@ from astroid import Call, Name, Const, NodeNG # type: ignore -from databricks.labs.ucx.source_code.graph import DependencyGraph, DependencyProblem, logger +from databricks.labs.ucx.source_code.graph import DependencyGraph, DependencyProblem from databricks.labs.ucx.source_code.linters.imports import ProblemFactory from databricks.labs.ucx.source_code.linters.python_ast import NodeBase, Tree class MagicCommand(NodeBase): + @classmethod + def convert_magic_lines_to_magic_commands(cls, python_code: str): + lines = python_code.split("\n") + for i, line in enumerate(lines): + if not line.startswith("%"): + continue + lines[i] = f"magic_command({line.encode()!r})" + return "\n".join(lines) + @classmethod def extract_from_tree( cls, tree: Tree, problem_factory: ProblemFactory @@ -43,8 +52,12 @@ def build_dependency_graph(self, graph: DependencyGraph) -> list[DependencyProbl if self._command.startswith("%pip"): cmd = PipMagic(self._command) return cmd.build_dependency_graph(graph) - logger.debug(f"Unsupported magic command {self._command[:self._command.index(' ')]}") - return [] + problem = DependencyProblem.from_node( + code='unsupported-magic-line', + message=f"magic line '{self._command}' is not supported yet", + node=self.node + ) + return [problem] class PipMagic: diff --git a/tests/unit/source_code/notebooks/test_magic.py b/tests/unit/source_code/notebooks/test_magic.py index 741275e5f9..dc193ba35d 100644 --- a/tests/unit/source_code/notebooks/test_magic.py +++ b/tests/unit/source_code/notebooks/test_magic.py @@ -1,6 +1,12 @@ +from pathlib import Path + import pytest -from databricks.labs.ucx.source_code.notebooks.magic import PipMagic +from databricks.labs.ucx.source_code.base import CurrentSessionState +from databricks.labs.ucx.source_code.graph import DependencyProblem, DependencyGraph, Dependency +from databricks.labs.ucx.source_code.linters.files import FileLoader +from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.notebooks.magic import PipMagic, MagicCommand @pytest.mark.parametrize( @@ -30,3 +36,16 @@ ) def test_pip_command_split(code, split): assert PipMagic._split(code) == split # pylint: disable=protected-access + + +def test_unsupported_magic_raises_problem(simple_dependency_resolver, mock_path_lookup): + source = """ +%unsupported stuff '"%#@! +""" + converted = MagicCommand.convert_magic_lines_to_magic_commands(source) + tree = Tree.parse(converted) + commands, _ = MagicCommand.extract_from_tree(tree, DependencyProblem.from_node) + dependency = Dependency(FileLoader(), Path("")) + graph = DependencyGraph(dependency, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState()) + problems = commands[0].build_dependency_graph(graph) + assert problems[0].code == "unsupported-magic-line" From 13258d5434290ed1462d8f483b7f0f515ede8813 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 14 Jun 2024 18:58:04 +0200 Subject: [PATCH 06/14] parse magic commands when linting --- src/databricks/labs/ucx/source_code/linters/dbfs.py | 2 ++ src/databricks/labs/ucx/source_code/linters/imports.py | 5 +++++ src/databricks/labs/ucx/source_code/linters/pyspark.py | 2 ++ src/databricks/labs/ucx/source_code/linters/spark_connect.py | 2 ++ .../labs/ucx/source_code/linters/table_creation.py | 3 ++- src/databricks/labs/ucx/source_code/notebooks/magic.py | 4 +--- 6 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/dbfs.py b/src/databricks/labs/ucx/source_code/linters/dbfs.py index 21c7f012c0..aae70bef98 100644 --- a/src/databricks/labs/ucx/source_code/linters/dbfs.py +++ b/src/databricks/labs/ucx/source_code/linters/dbfs.py @@ -7,6 +7,7 @@ from databricks.labs.ucx.source_code.base import Advice, Linter, Deprecation, CurrentSessionState from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeVisitor, InferredValue +from databricks.labs.ucx.source_code.notebooks.magic import MagicCommand logger = logging.getLogger(__name__) @@ -80,6 +81,7 @@ def lint(self, code: str) -> Iterable[Advice]: """ Lints the code looking for file system paths that are deprecated """ + code = MagicCommand.convert_magic_lines_to_magic_commands(code) tree = Tree.parse(code) visitor = DetectDbfsVisitor(self._session_state) visitor.visit(tree.node) diff --git a/src/databricks/labs/ucx/source_code/linters/imports.py b/src/databricks/labs/ucx/source_code/linters/imports.py index bdeb9dd02f..b4a1900661 100644 --- a/src/databricks/labs/ucx/source_code/linters/imports.py +++ b/src/databricks/labs/ucx/source_code/linters/imports.py @@ -116,6 +116,11 @@ def __init__(self, session_state: CurrentSessionState): self._session_state = session_state def lint(self, code: str) -> Iterable[Advice]: + # can't find a way to avoid this cyclic import + # pylint: disable=import-outside-toplevel, cyclic-import + from databricks.labs.ucx.source_code.notebooks.magic import MagicCommand + + code = MagicCommand.convert_magic_lines_to_magic_commands(code) tree = Tree.parse(code) nodes = self.list_dbutils_notebook_run_calls(tree) for node in nodes: diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index 12201fd488..09081edb99 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -13,6 +13,7 @@ Failure, CurrentSessionState, ) +from databricks.labs.ucx.source_code.notebooks.magic import MagicCommand from databricks.labs.ucx.source_code.queries import FromTable from databricks.labs.ucx.source_code.linters.python_ast import Tree, InferredValue @@ -332,6 +333,7 @@ def name(self) -> str: def lint(self, code: str) -> Iterable[Advice]: try: + code = MagicCommand.convert_magic_lines_to_magic_commands(code) tree = Tree.parse(code) except AstroidSyntaxError as e: yield Failure('syntax-error', str(e), 0, 0, 0, 0) diff --git a/src/databricks/labs/ucx/source_code/linters/spark_connect.py b/src/databricks/labs/ucx/source_code/linters/spark_connect.py index 35f5230a7a..b9a2cf3303 100644 --- a/src/databricks/labs/ucx/source_code/linters/spark_connect.py +++ b/src/databricks/labs/ucx/source_code/linters/spark_connect.py @@ -9,6 +9,7 @@ Linter, ) from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.notebooks.magic import MagicCommand @dataclass @@ -182,6 +183,7 @@ def __init__(self, is_serverless: bool = False): ] def lint(self, code: str) -> Iterator[Advice]: + code = MagicCommand.convert_magic_lines_to_magic_commands(code) tree = Tree.parse(code) for matcher in self._matchers: yield from matcher.lint_tree(tree.node) diff --git a/src/databricks/labs/ucx/source_code/linters/table_creation.py b/src/databricks/labs/ucx/source_code/linters/table_creation.py index 95f00903f8..133b17bdcc 100644 --- a/src/databricks/labs/ucx/source_code/linters/table_creation.py +++ b/src/databricks/labs/ucx/source_code/linters/table_creation.py @@ -10,6 +10,7 @@ Linter, ) from databricks.labs.ucx.source_code.linters.python_ast import Tree +from databricks.labs.ucx.source_code.notebooks.magic import MagicCommand @dataclass @@ -114,7 +115,7 @@ def __init__(self, dbr_version: tuple[int, int] | None): def lint(self, code: str) -> Iterable[Advice]: if self._skip_dbr: return - + code = MagicCommand.convert_magic_lines_to_magic_commands(code) tree = Tree.parse(code) for node in tree.walk(): yield from self._linter.lint(node) diff --git a/src/databricks/labs/ucx/source_code/notebooks/magic.py b/src/databricks/labs/ucx/source_code/notebooks/magic.py index 4a4beb1a2e..6c93c46deb 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/magic.py +++ b/src/databricks/labs/ucx/source_code/notebooks/magic.py @@ -53,9 +53,7 @@ def build_dependency_graph(self, graph: DependencyGraph) -> list[DependencyProbl cmd = PipMagic(self._command) return cmd.build_dependency_graph(graph) problem = DependencyProblem.from_node( - code='unsupported-magic-line', - message=f"magic line '{self._command}' is not supported yet", - node=self.node + code='unsupported-magic-line', message=f"magic line '{self._command}' is not supported yet", node=self.node ) return [problem] From 532765658d166d04d5d43e725e0501b14f48de1e Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 17 Jun 2024 16:16:31 +0200 Subject: [PATCH 07/14] remove cyclic imports --- src/databricks/labs/ucx/source_code/graph.py | 101 ++--------------- .../labs/ucx/source_code/linters/dbfs.py | 3 +- .../labs/ucx/source_code/linters/files.py | 6 +- .../labs/ucx/source_code/linters/imports.py | 6 +- .../labs/ucx/source_code/linters/pyspark.py | 3 +- .../ucx/source_code/linters/python_ast.py | 9 ++ .../ucx/source_code/linters/spark_connect.py | 3 +- .../ucx/source_code/linters/table_creation.py | 3 +- .../labs/ucx/source_code/notebooks/cells.py | 103 +++++++++++++++++- .../labs/ucx/source_code/notebooks/magic.py | 9 -- .../source_code/linters/test_python_ast.py | 14 +++ .../unit/source_code/notebooks/test_magic.py | 2 +- 12 files changed, 142 insertions(+), 120 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index e7f5bc348f..0197bd2267 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -7,18 +7,9 @@ from collections.abc import Callable from astroid import ( # type: ignore - ImportFrom, NodeNG, ) from databricks.labs.ucx.source_code.base import Advisory, CurrentSessionState -from databricks.labs.ucx.source_code.linters.imports import ( - DbutilsLinter, - ImportSource, - NotebookRunCall, - SysPathChange, - UnresolvedPath, -) -from databricks.labs.ucx.source_code.linters.python_ast import Tree, NodeBase from databricks.labs.ucx.source_code.path_lookup import PathLookup logger = logging.Logger(__name__) @@ -169,89 +160,20 @@ def visit(self, visit_node: Callable[[DependencyGraph], bool | None], visited: s return True return False - def build_graph_from_python_source(self, python_code: str) -> list[DependencyProblem]: - """Check python code for dependency-related problems. - - Returns: - A list of dependency problems; position information is relative to the python code itself. - """ - python_code = MagicCommand.convert_magic_lines_to_magic_commands(python_code) - problems: list[DependencyProblem] = [] - try: - tree = Tree.parse(python_code) - except Exception as e: # pylint: disable=broad-except - problems.append(DependencyProblem('parse-error', f"Could not parse Python code: {e}")) - return problems - syspath_changes = SysPathChange.extract_from_tree(self._session_state, tree) - run_calls = DbutilsLinter.list_dbutils_notebook_run_calls(tree) - import_sources: list[ImportSource] - import_problems: list[DependencyProblem] - import_sources, import_problems = ImportSource.extract_from_tree(tree, DependencyProblem.from_node) - problems.extend(import_problems) - magic_commands, command_problems = MagicCommand.extract_from_tree(tree, DependencyProblem.from_node) - problems.extend(command_problems) - nodes = syspath_changes + run_calls + import_sources + magic_commands - # need to execute things in intertwined sequence so concat and sort - for base_node in sorted(nodes, key=lambda node: (node.node.lineno, node.node.col_offset)): - for problem in self._process_node(base_node): - # Astroid line numbers are 1-based. - problem = problem.replace( - start_line=base_node.node.lineno - 1, - start_col=base_node.node.col_offset, - end_line=(base_node.node.end_lineno or 1) - 1, - end_col=base_node.node.end_col_offset or 0, - ) - problems.append(problem) - return problems - - def _process_node(self, base_node: NodeBase): - if isinstance(base_node, SysPathChange): - yield from self._mutate_path_lookup(base_node) - elif isinstance(base_node, NotebookRunCall): - yield from self._register_notebook(base_node) - elif isinstance(base_node, ImportSource): - yield from self._register_import(base_node) - elif isinstance(base_node, MagicCommand): - yield from base_node.build_dependency_graph(self) - else: - logger.warning(f"Can't process {NodeBase.__name__} of type {type(base_node).__name__}") - - def _register_import(self, base_node: ImportSource): - prefix = "" - if isinstance(base_node.node, ImportFrom) and base_node.node.level is not None: - prefix = "." * base_node.node.level - name = base_node.name or "" - yield from self.register_import(prefix + name) - - def _register_notebook(self, base_node: NotebookRunCall): - has_unresolved, paths = base_node.get_notebook_paths(self._session_state) - if has_unresolved: - yield DependencyProblem( - 'dependency-cannot-compute', - f"Can't check dependency from {base_node.node.as_string()} because the expression cannot be computed", - ) - for path in paths: - yield from self.register_notebook(Path(path)) - - def _mutate_path_lookup(self, change: SysPathChange): - if isinstance(change, UnresolvedPath): - yield DependencyProblem( - 'sys-path-cannot-compute', - f"Can't update sys.path from {change.node.as_string()} because the expression cannot be computed", - ) - return - path = Path(change.path) - if not path.is_absolute(): - path = self._path_lookup.cwd / path - if change.is_append: - self._path_lookup.append_path(path) - return - self._path_lookup.prepend_path(path) + def new_graph_builder_context(self): + return GraphBuilderContext(parent=self, path_lookup=self._path_lookup, session_state=self._session_state) def __repr__(self): return f"" +@dataclass +class GraphBuilderContext: + parent: DependencyGraph + path_lookup: PathLookup + session_state: CurrentSessionState + + class Dependency(abc.ABC): def __init__(self, loader: DependencyLoader, path: Path): @@ -489,8 +411,3 @@ class MaybeGraph: @property def failed(self): return len(self.problems) > 0 - - -# non-top-level import is required to avoid cyclic dependency -# pylint: disable=wrong-import-position, cyclic-import -from databricks.labs.ucx.source_code.notebooks.magic import MagicCommand # noqa: E402 diff --git a/src/databricks/labs/ucx/source_code/linters/dbfs.py b/src/databricks/labs/ucx/source_code/linters/dbfs.py index aae70bef98..682dd51fb0 100644 --- a/src/databricks/labs/ucx/source_code/linters/dbfs.py +++ b/src/databricks/labs/ucx/source_code/linters/dbfs.py @@ -7,7 +7,6 @@ from databricks.labs.ucx.source_code.base import Advice, Linter, Deprecation, CurrentSessionState from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeVisitor, InferredValue -from databricks.labs.ucx.source_code.notebooks.magic import MagicCommand logger = logging.getLogger(__name__) @@ -81,7 +80,7 @@ def lint(self, code: str) -> Iterable[Advice]: """ Lints the code looking for file system paths that are deprecated """ - code = MagicCommand.convert_magic_lines_to_magic_commands(code) + code = Tree.convert_magic_lines_to_magic_commands(code) tree = Tree.parse(code) visitor = DetectDbfsVisitor(self._session_state) visitor.visit(tree.node) diff --git a/src/databricks/labs/ucx/source_code/linters/files.py b/src/databricks/labs/ucx/source_code/linters/files.py index da05636dbe..c3e6e2962c 100644 --- a/src/databricks/labs/ucx/source_code/linters/files.py +++ b/src/databricks/labs/ucx/source_code/linters/files.py @@ -14,7 +14,7 @@ from databricks.labs.blueprint.tui import Prompts from databricks.labs.ucx.source_code.linters.context import LinterContext -from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage +from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage, GraphBuilder from databricks.labs.ucx.source_code.graph import ( BaseImportResolver, BaseFileResolver, @@ -40,7 +40,9 @@ def __init__(self, path: Path, source: str, language: Language): def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: if self._language is CellLanguage.PYTHON: - return parent.build_graph_from_python_source(self._original_code) + context = parent.new_graph_builder_context() + builder = GraphBuilder(context) + return builder.build_graph_from_python_source(self._original_code) # supported language that does not generate dependencies if self._language is CellLanguage.SQL: return [] diff --git a/src/databricks/labs/ucx/source_code/linters/imports.py b/src/databricks/labs/ucx/source_code/linters/imports.py index b4a1900661..0eec688771 100644 --- a/src/databricks/labs/ucx/source_code/linters/imports.py +++ b/src/databricks/labs/ucx/source_code/linters/imports.py @@ -116,11 +116,7 @@ def __init__(self, session_state: CurrentSessionState): self._session_state = session_state def lint(self, code: str) -> Iterable[Advice]: - # can't find a way to avoid this cyclic import - # pylint: disable=import-outside-toplevel, cyclic-import - from databricks.labs.ucx.source_code.notebooks.magic import MagicCommand - - code = MagicCommand.convert_magic_lines_to_magic_commands(code) + code = Tree.convert_magic_lines_to_magic_commands(code) tree = Tree.parse(code) nodes = self.list_dbutils_notebook_run_calls(tree) for node in nodes: diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index 09081edb99..8eeb4193bb 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -13,7 +13,6 @@ Failure, CurrentSessionState, ) -from databricks.labs.ucx.source_code.notebooks.magic import MagicCommand from databricks.labs.ucx.source_code.queries import FromTable from databricks.labs.ucx.source_code.linters.python_ast import Tree, InferredValue @@ -333,7 +332,7 @@ def name(self) -> str: def lint(self, code: str) -> Iterable[Advice]: try: - code = MagicCommand.convert_magic_lines_to_magic_commands(code) + code = Tree.convert_magic_lines_to_magic_commands(code) tree = Tree.parse(code) except AstroidSyntaxError as e: yield Failure('syntax-error', str(e), 0, 0, 0, 0) diff --git a/src/databricks/labs/ucx/source_code/linters/python_ast.py b/src/databricks/labs/ucx/source_code/linters/python_ast.py index 40766e6290..c9f6c5bd50 100644 --- a/src/databricks/labs/ucx/source_code/linters/python_ast.py +++ b/src/databricks/labs/ucx/source_code/linters/python_ast.py @@ -27,6 +27,15 @@ def parse(code: str): root = parse(code) return Tree(root) + @classmethod + def convert_magic_lines_to_magic_commands(cls, python_code: str): + lines = python_code.split("\n") + for i, line in enumerate(lines): + if not line.startswith("%"): + continue + lines[i] = f"magic_command({line.encode()!r})" + return "\n".join(lines) + def __init__(self, node: NodeNG): self._node: NodeNG = node diff --git a/src/databricks/labs/ucx/source_code/linters/spark_connect.py b/src/databricks/labs/ucx/source_code/linters/spark_connect.py index b9a2cf3303..8c9057a3fd 100644 --- a/src/databricks/labs/ucx/source_code/linters/spark_connect.py +++ b/src/databricks/labs/ucx/source_code/linters/spark_connect.py @@ -9,7 +9,6 @@ Linter, ) from databricks.labs.ucx.source_code.linters.python_ast import Tree -from databricks.labs.ucx.source_code.notebooks.magic import MagicCommand @dataclass @@ -183,7 +182,7 @@ def __init__(self, is_serverless: bool = False): ] def lint(self, code: str) -> Iterator[Advice]: - code = MagicCommand.convert_magic_lines_to_magic_commands(code) + code = Tree.convert_magic_lines_to_magic_commands(code) tree = Tree.parse(code) for matcher in self._matchers: yield from matcher.lint_tree(tree.node) diff --git a/src/databricks/labs/ucx/source_code/linters/table_creation.py b/src/databricks/labs/ucx/source_code/linters/table_creation.py index 133b17bdcc..7e68f4d353 100644 --- a/src/databricks/labs/ucx/source_code/linters/table_creation.py +++ b/src/databricks/labs/ucx/source_code/linters/table_creation.py @@ -10,7 +10,6 @@ Linter, ) from databricks.labs.ucx.source_code.linters.python_ast import Tree -from databricks.labs.ucx.source_code.notebooks.magic import MagicCommand @dataclass @@ -115,7 +114,7 @@ def __init__(self, dbr_version: tuple[int, int] | None): def lint(self, code: str) -> Iterable[Advice]: if self._skip_dbr: return - code = MagicCommand.convert_magic_lines_to_magic_commands(code) + code = Tree.convert_magic_lines_to_magic_commands(code) tree = Tree.parse(code) for node in tree.walk(): yield from self._linter.lint(node) diff --git a/src/databricks/labs/ucx/source_code/notebooks/cells.py b/src/databricks/labs/ucx/source_code/notebooks/cells.py index 68e4936763..eab1709645 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/cells.py +++ b/src/databricks/labs/ucx/source_code/notebooks/cells.py @@ -6,14 +6,24 @@ from enum import Enum from pathlib import Path +from astroid import ImportFrom # type: ignore from sqlglot import parse as parse_sql, ParseError as SQLParseError from databricks.sdk.service.workspace import Language -from databricks.labs.ucx.source_code.graph import DependencyGraph, DependencyProblem -from databricks.labs.ucx.source_code.notebooks.magic import PipMagic +from databricks.labs.ucx.source_code.graph import DependencyGraph, DependencyProblem, GraphBuilderContext +from databricks.labs.ucx.source_code.linters.imports import ( + SysPathChange, + DbutilsLinter, + ImportSource, + NotebookRunCall, + UnresolvedPath, +) +from databricks.labs.ucx.source_code.linters.python_ast import Tree, NodeBase +from databricks.labs.ucx.source_code.notebooks.magic import PipMagic, MagicCommand # use a specific logger for sqlglot warnings so we can disable them selectively sqlglot_logger = logging.getLogger(f"{__name__}.sqlglot") +logger = logging.getLogger(__name__) NOTEBOOK_HEADER = "Databricks notebook source" CELL_SEPARATOR = "COMMAND ----------" @@ -88,7 +98,9 @@ def is_runnable(self) -> bool: return True def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: - python_dependency_problems = parent.build_graph_from_python_source(self._original_code) + context = parent.new_graph_builder_context() + builder = GraphBuilder(context) + python_dependency_problems = builder.build_graph_from_python_source(self._original_code) # Position information for the Python code is within the code and needs to be mapped to the location within the parent nodebook. return [ problem.replace( @@ -363,3 +375,88 @@ def wrap_with_magic(self, code: str, cell_language: CellLanguage) -> str: if code.endswith('./'): lines.append('\n') return "\n".join(lines) + + +class GraphBuilder: + + def __init__(self, context: GraphBuilderContext): + self._context = context + + def build_graph_from_python_source(self, python_code: str) -> list[DependencyProblem]: + """Check python code for dependency-related problems. + + Returns: + A list of dependency problems; position information is relative to the python code itself. + """ + problems: list[DependencyProblem] = [] + try: + python_code = Tree.convert_magic_lines_to_magic_commands(python_code) + tree = Tree.parse(python_code) + except Exception as e: # pylint: disable=broad-except + problems.append(DependencyProblem('parse-error', f"Could not parse Python code: {e}")) + return problems + syspath_changes = SysPathChange.extract_from_tree(self._context.session_state, tree) + run_calls = DbutilsLinter.list_dbutils_notebook_run_calls(tree) + import_sources: list[ImportSource] + import_problems: list[DependencyProblem] + import_sources, import_problems = ImportSource.extract_from_tree(tree, DependencyProblem.from_node) + problems.extend(import_problems) + magic_commands, command_problems = MagicCommand.extract_from_tree(tree, DependencyProblem.from_node) + problems.extend(command_problems) + nodes = syspath_changes + run_calls + import_sources + magic_commands + # need to execute things in intertwined sequence so concat and sort + for base_node in sorted(nodes, key=lambda node: (node.node.lineno, node.node.col_offset)): + for problem in self._process_node(base_node): + # Astroid line numbers are 1-based. + problem = problem.replace( + start_line=base_node.node.lineno - 1, + start_col=base_node.node.col_offset, + end_line=(base_node.node.end_lineno or 1) - 1, + end_col=base_node.node.end_col_offset or 0, + ) + problems.append(problem) + return problems + + def _process_node(self, base_node: NodeBase): + if isinstance(base_node, SysPathChange): + yield from self._mutate_path_lookup(base_node) + elif isinstance(base_node, NotebookRunCall): + yield from self._register_notebook(base_node) + elif isinstance(base_node, ImportSource): + yield from self._register_import(base_node) + elif isinstance(base_node, MagicCommand): + yield from base_node.build_dependency_graph(self._context.parent) + else: + logger.warning(f"Can't process {NodeBase.__name__} of type {type(base_node).__name__}") + + def _register_import(self, base_node: ImportSource): + prefix = "" + if isinstance(base_node.node, ImportFrom) and base_node.node.level is not None: + prefix = "." * base_node.node.level + name = base_node.name or "" + yield from self._context.parent.register_import(prefix + name) + + def _register_notebook(self, base_node: NotebookRunCall): + has_unresolved, paths = base_node.get_notebook_paths(self._context.session_state) + if has_unresolved: + yield DependencyProblem( + 'dependency-cannot-compute', + f"Can't check dependency from {base_node.node.as_string()} because the expression cannot be computed", + ) + for path in paths: + yield from self._context.parent.register_notebook(Path(path)) + + def _mutate_path_lookup(self, change: SysPathChange): + if isinstance(change, UnresolvedPath): + yield DependencyProblem( + 'sys-path-cannot-compute', + f"Can't update sys.path from {change.node.as_string()} because the expression cannot be computed", + ) + return + path = Path(change.path) + if not path.is_absolute(): + path = self._context.path_lookup.cwd / path + if change.is_append: + self._context.path_lookup.append_path(path) + return + self._context.path_lookup.prepend_path(path) diff --git a/src/databricks/labs/ucx/source_code/notebooks/magic.py b/src/databricks/labs/ucx/source_code/notebooks/magic.py index 6c93c46deb..c9014ef50e 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/magic.py +++ b/src/databricks/labs/ucx/source_code/notebooks/magic.py @@ -12,15 +12,6 @@ class MagicCommand(NodeBase): - @classmethod - def convert_magic_lines_to_magic_commands(cls, python_code: str): - lines = python_code.split("\n") - for i, line in enumerate(lines): - if not line.startswith("%"): - continue - lines[i] = f"magic_command({line.encode()!r})" - return "\n".join(lines) - @classmethod def extract_from_tree( cls, tree: Tree, problem_factory: ProblemFactory diff --git a/tests/unit/source_code/linters/test_python_ast.py b/tests/unit/source_code/linters/test_python_ast.py index b25bd744de..89ee615e76 100644 --- a/tests/unit/source_code/linters/test_python_ast.py +++ b/tests/unit/source_code/linters/test_python_ast.py @@ -137,6 +137,20 @@ def test_infers_fstring_value(): assert strings == ["Hello abc!"] +def test_infers_string_format_value(): + source = """ +value = "abc" +fstring = "Hello {0}!".format(value) +""" + tree = Tree.parse(source) + nodes = tree.locate(Assign, []) + tree = Tree(nodes[1].value) # value of fstring = ... + values = list(tree.infer_values()) + assert all(value.is_inferred() for value in values) + strings = list(value.as_string() for value in values) + assert strings == ["Hello abc!"] + + def test_infers_fstring_values(): source = """ values_1 = ["abc", "def"] diff --git a/tests/unit/source_code/notebooks/test_magic.py b/tests/unit/source_code/notebooks/test_magic.py index dc193ba35d..3d253cce12 100644 --- a/tests/unit/source_code/notebooks/test_magic.py +++ b/tests/unit/source_code/notebooks/test_magic.py @@ -42,7 +42,7 @@ def test_unsupported_magic_raises_problem(simple_dependency_resolver, mock_path_ source = """ %unsupported stuff '"%#@! """ - converted = MagicCommand.convert_magic_lines_to_magic_commands(source) + converted = Tree.convert_magic_lines_to_magic_commands(source) tree = Tree.parse(converted) commands, _ = MagicCommand.extract_from_tree(tree, DependencyProblem.from_node) dependency = Dependency(FileLoader(), Path("")) From 330bcc6153de4416e4257535ea421dbfa297f5cc Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 17 Jun 2024 16:25:49 +0200 Subject: [PATCH 08/14] move 'magic command' code to 'cells' --- .../labs/ucx/source_code/notebooks/cells.py | 79 +++++++++++++++++- .../labs/ucx/source_code/notebooks/magic.py | 82 ------------------- .../unit/source_code/notebooks/test_cells.py | 49 ++++++++++- .../unit/source_code/notebooks/test_magic.py | 51 ------------ 4 files changed, 124 insertions(+), 137 deletions(-) delete mode 100644 src/databricks/labs/ucx/source_code/notebooks/magic.py delete mode 100644 tests/unit/source_code/notebooks/test_magic.py diff --git a/src/databricks/labs/ucx/source_code/notebooks/cells.py b/src/databricks/labs/ucx/source_code/notebooks/cells.py index eab1709645..314fc1fd8c 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/cells.py +++ b/src/databricks/labs/ucx/source_code/notebooks/cells.py @@ -1,15 +1,19 @@ from __future__ import annotations import logging +import re +import shlex from abc import ABC, abstractmethod from ast import parse as parse_python +from collections.abc import Callable from enum import Enum from pathlib import Path -from astroid import ImportFrom # type: ignore +from astroid import Call, Const, ImportFrom, Name, NodeNG # type: ignore from sqlglot import parse as parse_sql, ParseError as SQLParseError from databricks.sdk.service.workspace import Language + from databricks.labs.ucx.source_code.graph import DependencyGraph, DependencyProblem, GraphBuilderContext from databricks.labs.ucx.source_code.linters.imports import ( SysPathChange, @@ -19,7 +23,6 @@ UnresolvedPath, ) from databricks.labs.ucx.source_code.linters.python_ast import Tree, NodeBase -from databricks.labs.ucx.source_code.notebooks.magic import PipMagic, MagicCommand # use a specific logger for sqlglot warnings so we can disable them selectively sqlglot_logger = logging.getLogger(f"{__name__}.sqlglot") @@ -460,3 +463,75 @@ def _mutate_path_lookup(self, change: SysPathChange): self._context.path_lookup.append_path(path) return self._context.path_lookup.prepend_path(path) + + +class MagicCommand(NodeBase): + + @classmethod + def extract_from_tree( + cls, tree: Tree, problem_factory: Callable[[str, str, NodeNG], DependencyProblem] + ) -> tuple[list[MagicCommand], list[DependencyProblem]]: + problems: list[DependencyProblem] = [] + commands: list[MagicCommand] = [] + try: + nodes = tree.locate(Call, [("magic_command", Name)]) + for command in cls._make_commands_for_magic_command_call_nodes(nodes): + commands.append(command) + return commands, problems + except Exception as e: # pylint: disable=broad-except + problem = problem_factory('internal-error', f"While checking magic commands: {e}", tree.root) + problems.append(problem) + return [], problems + + @classmethod + def _make_commands_for_magic_command_call_nodes(cls, nodes: list[Call]): + for node in nodes: + arg = node.args[0] + if isinstance(arg, Const): + yield MagicCommand(node, arg.value) + + def __init__(self, node: NodeNG, command: bytes): + super().__init__(node) + self._command = command.decode() + + def build_dependency_graph(self, graph: DependencyGraph) -> list[DependencyProblem]: + if self._command.startswith("%pip"): + cmd = PipMagic(self._command) + return cmd.build_dependency_graph(graph) + problem = DependencyProblem.from_node( + code='unsupported-magic-line', message=f"magic line '{self._command}' is not supported yet", node=self.node + ) + return [problem] + + +class PipMagic: + + def __init__(self, code: str): + self._code = code + + def build_dependency_graph(self, graph: DependencyGraph) -> list[DependencyProblem]: + argv = self._split(self._code) + if len(argv) == 1: + return [DependencyProblem("library-install-failed", "Missing command after '%pip'")] + if argv[1] != "install": + return [DependencyProblem("library-install-failed", f"Unsupported %pip command: {argv[1]}")] + if len(argv) == 2: + return [DependencyProblem("library-install-failed", "Missing arguments after '%pip install'")] + return graph.register_library(*argv[2:]) # Skipping %pip install + + @staticmethod + def _split(code) -> list[str]: + """Split pip cell code into multiple arguments + + Note: + PipCell should be a pip command, i.e. single line possible spanning multilines escaped with backslashes. + + Sources: + https://docs.databricks.com/en/libraries/notebooks-python-libraries.html#manage-libraries-with-pip-commands + """ + match = re.search(r"(? tuple[list[MagicCommand], list[DependencyProblem]]: - problems: list[DependencyProblem] = [] - commands: list[MagicCommand] = [] - try: - nodes = tree.locate(Call, [("magic_command", Name)]) - for command in cls._make_commands_for_magic_command_call_nodes(nodes): - commands.append(command) - return commands, problems - except Exception as e: # pylint: disable=broad-except - problem = problem_factory('internal-error', f"While checking magic commands: {e}", tree.root) - problems.append(problem) - return [], problems - - @classmethod - def _make_commands_for_magic_command_call_nodes(cls, nodes: list[Call]): - for node in nodes: - arg = node.args[0] - if isinstance(arg, Const): - yield MagicCommand(node, arg.value) - - def __init__(self, node: NodeNG, command: bytes): - super().__init__(node) - self._command = command.decode() - - def build_dependency_graph(self, graph: DependencyGraph) -> list[DependencyProblem]: - if self._command.startswith("%pip"): - cmd = PipMagic(self._command) - return cmd.build_dependency_graph(graph) - problem = DependencyProblem.from_node( - code='unsupported-magic-line', message=f"magic line '{self._command}' is not supported yet", node=self.node - ) - return [problem] - - -class PipMagic: - - def __init__(self, code: str): - self._code = code - - def build_dependency_graph(self, graph: DependencyGraph) -> list[DependencyProblem]: - argv = self._split(self._code) - if len(argv) == 1: - return [DependencyProblem("library-install-failed", "Missing command after '%pip'")] - if argv[1] != "install": - return [DependencyProblem("library-install-failed", f"Unsupported %pip command: {argv[1]}")] - if len(argv) == 2: - return [DependencyProblem("library-install-failed", "Missing arguments after '%pip install'")] - return graph.register_library(*argv[2:]) # Skipping %pip install - - @staticmethod - def _split(code) -> list[str]: - """Split pip cell code into multiple arguments - - Note: - PipCell should be a pip command, i.e. single line possible spanning multilines escaped with backslashes. - - Sources: - https://docs.databricks.com/en/libraries/notebooks-python-libraries.html#manage-libraries-with-pip-commands - """ - match = re.search(r"(? Date: Tue, 18 Jun 2024 12:27:50 +0200 Subject: [PATCH 09/14] support ! as line magic prefix --- src/databricks/labs/ucx/source_code/linters/python_ast.py | 3 ++- src/databricks/labs/ucx/source_code/notebooks/cells.py | 8 ++++---- tests/unit/source_code/notebooks/test_cells.py | 8 ++++---- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/python_ast.py b/src/databricks/labs/ucx/source_code/linters/python_ast.py index c9f6c5bd50..bb98620001 100644 --- a/src/databricks/labs/ucx/source_code/linters/python_ast.py +++ b/src/databricks/labs/ucx/source_code/linters/python_ast.py @@ -30,8 +30,9 @@ def parse(code: str): @classmethod def convert_magic_lines_to_magic_commands(cls, python_code: str): lines = python_code.split("\n") + magic_markers = {"%", "!"} for i, line in enumerate(lines): - if not line.startswith("%"): + if len(line) == 0 or line[0] not in magic_markers: continue lines[i] = f"magic_command({line.encode()!r})" return "\n".join(lines) diff --git a/src/databricks/labs/ucx/source_code/notebooks/cells.py b/src/databricks/labs/ucx/source_code/notebooks/cells.py index 314fc1fd8c..1a55570a7f 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/cells.py +++ b/src/databricks/labs/ucx/source_code/notebooks/cells.py @@ -495,7 +495,7 @@ def __init__(self, node: NodeNG, command: bytes): self._command = command.decode() def build_dependency_graph(self, graph: DependencyGraph) -> list[DependencyProblem]: - if self._command.startswith("%pip"): + if self._command.startswith("%pip") or self._command.startswith("!pip"): cmd = PipMagic(self._command) return cmd.build_dependency_graph(graph) problem = DependencyProblem.from_node( @@ -512,11 +512,11 @@ def __init__(self, code: str): def build_dependency_graph(self, graph: DependencyGraph) -> list[DependencyProblem]: argv = self._split(self._code) if len(argv) == 1: - return [DependencyProblem("library-install-failed", "Missing command after '%pip'")] + return [DependencyProblem("library-install-failed", "Missing command after 'pip'")] if argv[1] != "install": - return [DependencyProblem("library-install-failed", f"Unsupported %pip command: {argv[1]}")] + return [DependencyProblem("library-install-failed", f"Unsupported 'pip' command: {argv[1]}")] if len(argv) == 2: - return [DependencyProblem("library-install-failed", "Missing arguments after '%pip install'")] + return [DependencyProblem("library-install-failed", "Missing arguments after 'pip install'")] return graph.register_library(*argv[2:]) # Skipping %pip install @staticmethod diff --git a/tests/unit/source_code/notebooks/test_cells.py b/tests/unit/source_code/notebooks/test_cells.py index 8e01f6e796..bc30f03ae1 100644 --- a/tests/unit/source_code/notebooks/test_cells.py +++ b/tests/unit/source_code/notebooks/test_cells.py @@ -69,7 +69,7 @@ def test_pip_cell_build_dependency_graph_pip_registers_missing_library(): assert len(problems) == 1 assert problems[0].code == "library-install-failed" - assert problems[0].message == "Missing arguments after '%pip install'" + assert problems[0].message == "Missing arguments after 'pip install'" graph.register_library.assert_not_called() @@ -90,14 +90,14 @@ def test_pip_cell_build_dependency_graph_reports_incorrect_syntax(): def test_pip_cell_build_dependency_graph_reports_unsupported_command(): graph = create_autospec(DependencyGraph) - code = "%pip freeze" + code = "!pip freeze" cell = PipCell(code, original_offset=1) problems = cell.build_dependency_graph(graph) assert len(problems) == 1 assert problems[0].code == "library-install-failed" - assert problems[0].message == "Unsupported %pip command: freeze" + assert problems[0].message == "Unsupported 'pip' command: freeze" graph.register_library.assert_not_called() @@ -111,7 +111,7 @@ def test_pip_cell_build_dependency_graph_reports_missing_command(): assert len(problems) == 1 assert problems[0].code == "library-install-failed" - assert problems[0].message == "Missing command after '%pip'" + assert problems[0].message == "Missing command after 'pip'" graph.register_library.assert_not_called() From f0075d23d6800022bd2ba4535b925ba7ddb06965 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 18 Jun 2024 18:11:24 +0200 Subject: [PATCH 10/14] fix-indents-before-parsing --- .../labs/ucx/source_code/linters/dbfs.py | 3 +- .../labs/ucx/source_code/linters/imports.py | 3 +- .../labs/ucx/source_code/linters/pyspark.py | 3 +- .../ucx/source_code/linters/python_ast.py | 33 ++++++++++++++++++- .../ucx/source_code/linters/spark_connect.py | 3 +- .../ucx/source_code/linters/table_creation.py | 3 +- .../labs/ucx/source_code/notebooks/cells.py | 3 +- .../source_code/linters/test_python_ast.py | 28 +++++++++++++++- .../unit/source_code/notebooks/test_cells.py | 3 +- 9 files changed, 66 insertions(+), 16 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/dbfs.py b/src/databricks/labs/ucx/source_code/linters/dbfs.py index beaaf4acae..c92fff5d33 100644 --- a/src/databricks/labs/ucx/source_code/linters/dbfs.py +++ b/src/databricks/labs/ucx/source_code/linters/dbfs.py @@ -80,8 +80,7 @@ def lint(self, code: str) -> Iterable[Advice]: """ Lints the code looking for file system paths that are deprecated """ - code = Tree.convert_magic_lines_to_magic_commands(code) - tree = Tree.parse(code) + tree = Tree.normalize_and_parse(code) visitor = DetectDbfsVisitor(self._session_state) visitor.visit(tree.node) yield from visitor.get_advices() diff --git a/src/databricks/labs/ucx/source_code/linters/imports.py b/src/databricks/labs/ucx/source_code/linters/imports.py index 0eec688771..21ccbdcac3 100644 --- a/src/databricks/labs/ucx/source_code/linters/imports.py +++ b/src/databricks/labs/ucx/source_code/linters/imports.py @@ -116,8 +116,7 @@ def __init__(self, session_state: CurrentSessionState): self._session_state = session_state def lint(self, code: str) -> Iterable[Advice]: - code = Tree.convert_magic_lines_to_magic_commands(code) - tree = Tree.parse(code) + tree = Tree.normalize_and_parse(code) nodes = self.list_dbutils_notebook_run_calls(tree) for node in nodes: yield from self._raise_advice_if_unresolved(node.node, self._session_state) diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index a77bf7c0f2..49a3d2d9d4 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -330,8 +330,7 @@ def name(self) -> str: def lint(self, code: str) -> Iterable[Advice]: try: - code = Tree.convert_magic_lines_to_magic_commands(code) - tree = Tree.parse(code) + tree = Tree.normalize_and_parse(code) except AstroidSyntaxError as e: yield Failure('syntax-error', str(e), 0, 0, 0, 0) return diff --git a/src/databricks/labs/ucx/source_code/linters/python_ast.py b/src/databricks/labs/ucx/source_code/linters/python_ast.py index bb98620001..ee9bb05e52 100644 --- a/src/databricks/labs/ucx/source_code/linters/python_ast.py +++ b/src/databricks/labs/ucx/source_code/linters/python_ast.py @@ -28,7 +28,38 @@ def parse(code: str): return Tree(root) @classmethod - def convert_magic_lines_to_magic_commands(cls, python_code: str): + def normalize_and_parse(cls, code: str): + code = cls.normalize(code) + root = parse(code) + return Tree(root) + + @classmethod + def normalize(cls, code: str): + code = cls._normalize_indents(code) + code = cls._convert_magic_lines_to_magic_commands(code) + return code + + @classmethod + def _normalize_indents(cls, python_code: str): + lines = python_code.split("\n") + for line in lines: + # skip leading ws and comments + if len(line) == 0 or line.startswith('#'): + continue + if not line.startswith(' '): + # first line of code is correctly indented + return python_code + # first line of code is indented when it shouldn't + prefix_count = len(line) - len(line.lstrip(' ')) + prefix_str = ' ' * prefix_count + for i, line_to_fix in enumerate(lines): + if line_to_fix.startswith(prefix_str): + lines[i] = line_to_fix[prefix_count:] + return "\n".join(lines) + return python_code + + @classmethod + def _convert_magic_lines_to_magic_commands(cls, python_code: str): lines = python_code.split("\n") magic_markers = {"%", "!"} for i, line in enumerate(lines): diff --git a/src/databricks/labs/ucx/source_code/linters/spark_connect.py b/src/databricks/labs/ucx/source_code/linters/spark_connect.py index 8c9057a3fd..6b3b2f0d28 100644 --- a/src/databricks/labs/ucx/source_code/linters/spark_connect.py +++ b/src/databricks/labs/ucx/source_code/linters/spark_connect.py @@ -182,7 +182,6 @@ def __init__(self, is_serverless: bool = False): ] def lint(self, code: str) -> Iterator[Advice]: - code = Tree.convert_magic_lines_to_magic_commands(code) - tree = Tree.parse(code) + tree = Tree.normalize_and_parse(code) for matcher in self._matchers: yield from matcher.lint_tree(tree.node) diff --git a/src/databricks/labs/ucx/source_code/linters/table_creation.py b/src/databricks/labs/ucx/source_code/linters/table_creation.py index 7e68f4d353..5d2b27044a 100644 --- a/src/databricks/labs/ucx/source_code/linters/table_creation.py +++ b/src/databricks/labs/ucx/source_code/linters/table_creation.py @@ -114,7 +114,6 @@ def __init__(self, dbr_version: tuple[int, int] | None): def lint(self, code: str) -> Iterable[Advice]: if self._skip_dbr: return - code = Tree.convert_magic_lines_to_magic_commands(code) - tree = Tree.parse(code) + tree = Tree.normalize_and_parse(code) for node in tree.walk(): yield from self._linter.lint(node) diff --git a/src/databricks/labs/ucx/source_code/notebooks/cells.py b/src/databricks/labs/ucx/source_code/notebooks/cells.py index 1a55570a7f..9cd9d43a0b 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/cells.py +++ b/src/databricks/labs/ucx/source_code/notebooks/cells.py @@ -393,8 +393,7 @@ def build_graph_from_python_source(self, python_code: str) -> list[DependencyPro """ problems: list[DependencyProblem] = [] try: - python_code = Tree.convert_magic_lines_to_magic_commands(python_code) - tree = Tree.parse(python_code) + tree = Tree.normalize_and_parse(python_code) except Exception as e: # pylint: disable=broad-except problems.append(DependencyProblem('parse-error', f"Could not parse Python code: {e}")) return problems diff --git a/tests/unit/source_code/linters/test_python_ast.py b/tests/unit/source_code/linters/test_python_ast.py index 89ee615e76..bf0f105d7d 100644 --- a/tests/unit/source_code/linters/test_python_ast.py +++ b/tests/unit/source_code/linters/test_python_ast.py @@ -1,5 +1,5 @@ import pytest -from astroid import Assign, Attribute, Call, Const, Expr # type: ignore +from astroid import Assign, AstroidSyntaxError, Attribute, Call, Const, Expr # type: ignore from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.linters.python_ast import Tree @@ -254,3 +254,29 @@ def test_infers_externally_defined_value_set(): values = list(tree.infer_values(state)) strings = list(value.as_string() for value in values) assert strings == ["my-value"] + + +def test_parses_incorrectly_indented_code(): + source = """# DBTITLE 1,Get Sales Data for Analysis + sales = ( + spark + .table('retail_sales') + .join( # limit data to CY 2021 and 2022 + spark.table('date').select('dateKey','date','year').filter('year between 2021 and 2022'), + on='dateKey' + ) + .join( # get product fields needed for analysis + spark.table('product').select('productKey','brandValue','packSizeValueUS'), + on='productKey' + ) + .join( # get brand fields needed for analysis + spark.table('brand_name_mapping').select('brandValue','brandName'), + on='brandValue' + ) + ) +""" + # ensure it would fail if not normalized + with pytest.raises(AstroidSyntaxError): + Tree.parse(source) + Tree.normalize_and_parse(source) + assert True diff --git a/tests/unit/source_code/notebooks/test_cells.py b/tests/unit/source_code/notebooks/test_cells.py index bc30f03ae1..b4d14dc055 100644 --- a/tests/unit/source_code/notebooks/test_cells.py +++ b/tests/unit/source_code/notebooks/test_cells.py @@ -213,8 +213,7 @@ def test_unsupported_magic_raises_problem(simple_dependency_resolver, mock_path_ source = """ %unsupported stuff '"%#@! """ - converted = Tree.convert_magic_lines_to_magic_commands(source) - tree = Tree.parse(converted) + tree = Tree.normalize_and_parse(source) commands, _ = MagicCommand.extract_from_tree(tree, DependencyProblem.from_node) dependency = Dependency(FileLoader(), Path("")) graph = DependencyGraph(dependency, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState()) From aa8c726e57e1ef22a138a7db1644a28e624be2a7 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 18 Jun 2024 18:39:45 +0200 Subject: [PATCH 11/14] skip insignificant lines --- src/databricks/labs/ucx/source_code/linters/python_ast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/source_code/linters/python_ast.py b/src/databricks/labs/ucx/source_code/linters/python_ast.py index ee9bb05e52..7ece439fa2 100644 --- a/src/databricks/labs/ucx/source_code/linters/python_ast.py +++ b/src/databricks/labs/ucx/source_code/linters/python_ast.py @@ -44,7 +44,7 @@ def _normalize_indents(cls, python_code: str): lines = python_code.split("\n") for line in lines: # skip leading ws and comments - if len(line) == 0 or line.startswith('#'): + if len(line.strip()) == 0 or line.startswith('#'): continue if not line.startswith(' '): # first line of code is correctly indented From e3974f28b1e0209d2d0605f54258cff275e0f830 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 20 Jun 2024 11:54:21 +0200 Subject: [PATCH 12/14] ignore magic line markers with muli-line comments --- .../labs/ucx/source_code/linters/python_ast.py | 12 ++++++++++-- tests/unit/source_code/linters/test_python_ast.py | 11 +++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/linters/python_ast.py b/src/databricks/labs/ucx/source_code/linters/python_ast.py index 7ece439fa2..3a9ca4553f 100644 --- a/src/databricks/labs/ucx/source_code/linters/python_ast.py +++ b/src/databricks/labs/ucx/source_code/linters/python_ast.py @@ -2,6 +2,7 @@ from abc import ABC import logging +import re from collections.abc import Iterable, Iterator, Generator from typing import Any, TypeVar @@ -62,10 +63,17 @@ def _normalize_indents(cls, python_code: str): def _convert_magic_lines_to_magic_commands(cls, python_code: str): lines = python_code.split("\n") magic_markers = {"%", "!"} + in_multi_line_comment = False + pattern = re.compile('"""') for i, line in enumerate(lines): - if len(line) == 0 or line[0] not in magic_markers: + if len(line) == 0: continue - lines[i] = f"magic_command({line.encode()!r})" + if not in_multi_line_comment and line[0] in magic_markers: + lines[i] = f"magic_command({line.encode()!r})" + continue + matches = re.findall(pattern, line) + if len(matches) & 1: + in_multi_line_comment = not in_multi_line_comment return "\n".join(lines) def __init__(self, node: NodeNG): diff --git a/tests/unit/source_code/linters/test_python_ast.py b/tests/unit/source_code/linters/test_python_ast.py index bf0f105d7d..51e0f7ed78 100644 --- a/tests/unit/source_code/linters/test_python_ast.py +++ b/tests/unit/source_code/linters/test_python_ast.py @@ -280,3 +280,14 @@ def test_parses_incorrectly_indented_code(): Tree.parse(source) Tree.normalize_and_parse(source) assert True + + +def test_ignores_magic_marker_in_multiline_comment(): + source = """message_unformatted = u\""" +%s is only supported in Python %s and above.\""" +name="name" +version="version" +formatted=message_unformatted % (name, version) +""" + Tree.normalize_and_parse(source) + assert True From dabc242e0be5b14c719b68aaa4197e00bbd90b98 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 20 Jun 2024 17:24:53 +0200 Subject: [PATCH 13/14] detect utf-8 encoding with bom --- .../labs/ucx/source_code/notebooks/sources.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 1e8fcf4f76..6974f73bf2 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -1,6 +1,8 @@ from __future__ import annotations +import codecs import locale +import os from collections.abc import Iterable from functools import cached_property from pathlib import Path @@ -167,8 +169,16 @@ def __init__(self, ctx: LinterContext, path: Path, content: str | None = None): @cached_property def _source_code(self) -> str: - encoding = locale.getpreferredencoding(False) - return self._path.read_text(encoding) if self._content is None else self._content + return self._path.read_text(self._guess_encoding()) if self._content is None else self._content + + def _guess_encoding(self): + path = self._path.as_posix() + count = min(32, os.path.getsize(path)) + with open(path, 'rb') as _file: + raw = _file.read(count) + if raw.startswith(codecs.BOM_UTF8): + return 'utf-8-sig' + return locale.getpreferredencoding(False) def _file_language(self): return SUPPORTED_EXTENSION_LANGUAGES.get(self._path.suffix.lower()) From 00df7e2573afc1063ec9d3ac16e6e2fbfb6c1153 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 2 Jul 2024 15:14:21 +0200 Subject: [PATCH 14/14] fix merge issue --- tests/unit/source_code/notebooks/test_cells.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/unit/source_code/notebooks/test_cells.py b/tests/unit/source_code/notebooks/test_cells.py index d20139d626..9c1bb9a2f6 100644 --- a/tests/unit/source_code/notebooks/test_cells.py +++ b/tests/unit/source_code/notebooks/test_cells.py @@ -9,7 +9,7 @@ from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver, DependencyProblem from databricks.labs.ucx.source_code.linters.files import FileLoader, ImportFileResolver from databricks.labs.ucx.source_code.linters.python_ast import Tree -from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage, PipCell, PythonCell, PipMagic, MagicCommand +from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage, PipCell, PipMagic, MagicCommand from databricks.labs.ucx.source_code.notebooks.loaders import ( NotebookResolver, NotebookLoader, @@ -192,5 +192,17 @@ def test_pip_cell_build_dependency_graph_handles_multiline_code(): ), ], ) -def test_pip_cell_split(code, split): - assert PipCell._split(code) == split # pylint: disable=protected-access +def test_pip_magic_split(code, split): + assert PipMagic._split(code) == split # pylint: disable=protected-access + + +def test_unsupported_magic_raises_problem(simple_dependency_resolver, mock_path_lookup): + source = """ +%unsupported stuff '"%#@! +""" + tree = Tree.normalize_and_parse(source) + commands, _ = MagicCommand.extract_from_tree(tree, DependencyProblem.from_node) + dependency = Dependency(FileLoader(), Path("")) + graph = DependencyGraph(dependency, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState()) + problems = commands[0].build_dependency_graph(graph) + assert problems[0].code == "unsupported-magic-line"