databrickslabs · nfx · Jul 10, 2024 · Jun 13, 2024 · Jun 14, 2024 · Jun 14, 2024
@@ -10,6 +10,7 @@
 from pathlib import Path
 
 from astroid import Call, Const, ImportFrom, Name, NodeNG  # type: ignore
+from astroid.exceptions import AstroidSyntaxError  # type: ignore
 from sqlglot import parse as parse_sql, ParseError as SQLParseError
 
 from databricks.sdk.service.workspace import Language
@@ -403,7 +404,8 @@ def build_graph_from_python_source(self, python_code: str) -> list[DependencyPro
         problems: list[DependencyProblem] = []
         try:
             tree = Tree.normalize_and_parse(python_code)
-        except Exception as e:  # pylint: disable=broad-except
+        except AstroidSyntaxError as e:
+            logger.debug(f"Could not parse Python code: {python_code}", exc_info=True)
             problems.append(DependencyProblem('parse-error', f"Could not parse Python code: {e}"))
             return problems
         syspath_changes = SysPathChange.extract_from_tree(self._context.session_state, tree)
@@ -485,11 +487,11 @@ def extract_from_tree(
             nodes = tree.locate(Call, [("magic_command", Name)])
             for command in cls._make_commands_for_magic_command_call_nodes(nodes):
                 commands.append(command)
-            return commands, problems
         except Exception as e:  # pylint: disable=broad-except
+            logger.debug(f"Internal error while checking magic commands in tree: {tree.root}", exc_info=True)
             problem = problem_factory('internal-error', f"While checking magic commands: {e}", tree.root)
             problems.append(problem)
-            return [], problems
+        return commands, problems
 
     @classmethod
     def _make_commands_for_magic_command_call_nodes(cls, nodes: list[Call]):
@@ -527,8 +529,11 @@ def build_dependency_graph(self, graph: DependencyGraph) -> list[DependencyProbl
             return [DependencyProblem("library-install-failed", "Missing arguments after 'pip install'")]
         return graph.register_library(*argv[2:])  # Skipping %pip install
 
-    @staticmethod
-    def _split(code) -> list[str]:
+    # Cache re-used regex (and ensure issues are raised during class init instead of upon first use).
+    _splitter = re.compile(r"(?<!\\)\n")
+
+    @classmethod
+    def _split(cls, code: str) -> list[str]:
         """Split pip cell code into multiple arguments
 
         Note:
@@ -537,7 +542,7 @@ def _split(code) -> list[str]:
         Sources:
             https://docs.databricks.com/en/libraries/notebooks-python-libraries.html#manage-libraries-with-pip-commands
         """
-        match = re.search(r"(?<!\\)\n", code)
+        match = cls._splitter.search(code)
         if match:
             code = code[: match.start()]  # Remove code after non-escaped newline
         code = code.replace("\\\n", " ")

@@ -97,7 +97,7 @@ def __repr__(self):
 
 
 @pytest.fixture
-def mock_path_lookup():
+def mock_path_lookup() -> PathLookup:
     return MockPathLookup()
 
 

@@ -8,6 +8,7 @@
 from databricks.labs.ucx.source_code.known import KnownList
 from databricks.labs.ucx.source_code.linters.files import ImportFileResolver, FileLoader
 from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader, NotebookResolver
+from databricks.labs.ucx.source_code.path_lookup import PathLookup
 from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver
 
 
@@ -48,7 +49,7 @@ def extended_test_index():
 
 
 @pytest.fixture
-def simple_dependency_resolver(mock_path_lookup):
+def simple_dependency_resolver(mock_path_lookup: PathLookup) -> DependencyResolver:
     allow_list = KnownList()
     library_resolver = PythonLibraryResolver(allow_list)
     notebook_resolver = NotebookResolver(NotebookLoader())

@@ -10,10 +10,15 @@
 from databricks.labs.ucx.source_code.linters.files import FileLoader, ImportFileResolver
 from databricks.labs.ucx.source_code.linters.python_ast import Tree
 from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage, PipCell, PipMagic, MagicCommand
+from databricks.labs.ucx.source_code.notebooks.cells import (
+    GraphBuilder,
+    PythonCell,
+)
 from databricks.labs.ucx.source_code.notebooks.loaders import (
     NotebookResolver,
     NotebookLoader,
 )
+from databricks.labs.ucx.source_code.path_lookup import PathLookup
 from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver
 from databricks.labs.ucx.source_code.known import KnownList
 
@@ -167,6 +172,58 @@ def test_pip_cell_build_dependency_graph_handles_multiline_code():
     graph.register_library.assert_called_once_with("databricks")
 
 
+def test_graph_builder_parse_error(
+    simple_dependency_resolver: DependencyResolver, mock_path_lookup: PathLookup
+) -> None:
+    """Check that internal parsing errors are caught and logged."""
+    # Fixture.
+    dependency = Dependency(FileLoader(), Path(""))
+    graph = DependencyGraph(dependency, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState())
+    graph.new_graph_builder_context()
+    builder = GraphBuilder(graph.new_graph_builder_context())
+
+    # Run the test.
+    problems = builder.build_graph_from_python_source("this is not valid python")
+
+    # Check results.
+    assert [
+        problem
+        for problem in problems
+        if problem.code == "parse-error" and problem.message.startswith("Could not parse Python code")
+    ]
+
+
+def test_parses_python_cell_with_magic_commands(simple_dependency_resolver, mock_path_lookup):
+    code = """
+a = 'something'
+%pip install databricks
+b = 'else'
+"""
+    cell = PythonCell(code, original_offset=1)
+    dependency = Dependency(FileLoader(), Path(""))
+    graph = DependencyGraph(dependency, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState())
+    problems = cell.build_dependency_graph(graph)
+    assert not problems
+
+
+@pytest.mark.xfail(reason="Line-magic as an expression is not supported ", strict=True)
+def test_python_cell_with_expression_magic(
+    simple_dependency_resolver: DependencyResolver, mock_path_lookup: PathLookup
+) -> None:
+    """Line magic (%) can be used in places where expressions are expected; check that this is handled."""
+    # Fixture
+    code = "current_directory = %pwd"
+    cell = PythonCell(code, original_offset=1)
+    dependency = Dependency(FileLoader(), Path(""))
+    graph = DependencyGraph(dependency, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState())
+
+    # Run the test
+    problems = cell.build_dependency_graph(graph)
+
+    # Verify there were no problems.
+    assert not problems
+
+
 @pytest.mark.parametrize(
     "code,split",
     [
@@ -193,6 +250,12 @@ def test_pip_cell_build_dependency_graph_handles_multiline_code():
     ],
 )
 def test_pip_magic_split(code, split):
+    # Avoid direct protected access to the _split method.
+    class _PipMagicFriend(PipMagic):
+        @classmethod
+        def split(cls, code: str) -> list[str]:
+            return cls._split(code)
+
     assert PipMagic._split(code) == split  # pylint: disable=protected-access