Infer values from child notebook in run cell #2075

Merged 14 commits on Jul 9, 2024
Changes from 13 commits
11 changes: 11 additions & 0 deletions src/databricks/labs/ucx/source_code/base.py
@@ -200,3 +200,14 @@ def lint(self, code: str) -> Iterable[Advice]:
yield from linter.lint_tree(tree)
except AstroidSyntaxError as e:
yield Failure('syntax-error', str(e), 0, 0, 0, 0)

def process_child_cell(self, code: str):
try:
tree = Tree.normalize_and_parse(code)
if self._tree is None:
self._tree = tree
else:
self._tree.append_statements(tree)
except AstroidSyntaxError as e:
# error already reported when linting enclosing notebook
logger.warning(f"Failed to parse Python cell: {code}", exc_info=e)
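The hook above is what lets the sequential Python linter accumulate code from a %run child before linting later parent cells. A minimal usage sketch, assuming the linter is obtained from a LinterContext as in notebooks/sources.py below; the empty migration index and the literal cell contents are illustrative assumptions, not part of this diff:

    from databricks.sdk.service.workspace import Language

    from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex
    from databricks.labs.ucx.source_code.linters.context import LinterContext

    # Hedged sketch: class and method names come from this PR; the data is made up.
    context = LinterContext(MigrationIndex([]))
    python_linter = context.linter(Language.PYTHON)            # PythonSequentialLinter
    python_linter.process_child_cell("a = 12")                 # cell taken from the %run child
    advices = list(python_linter.lint("spark.table(f'{a}')"))  # `a` is now inferable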
4 changes: 2 additions & 2 deletions src/databricks/labs/ucx/source_code/jobs.py
@@ -377,11 +377,11 @@ def _lint_task(self, task: jobs.Task, job: jobs.Job):
yield from self._lint_file(container, ctx)

def _lint_file(self, file: LocalFile, ctx: LinterContext):
linter = FileLinter(ctx, file.path)
linter = FileLinter(ctx, self._path_lookup, file.path)
for advice in linter.lint():
yield file.path, advice

def _lint_notebook(self, notebook: Notebook, ctx: LinterContext):
linter = NotebookLinter(ctx, notebook)
linter = NotebookLinter(ctx, self._path_lookup, notebook)
for advice in linter.lint():
yield notebook.path, advice
2 changes: 1 addition & 1 deletion src/databricks/labs/ucx/source_code/known.py
@@ -151,7 +151,7 @@ def _analyze_file(cls, known_distributions, library_root, dist_info, module_path):
module_ref = module_ref[: -len(suffix)]
logger.info(f"Processing module: {module_ref}")
ctx = LinterContext(empty_index)
linter = FileLinter(ctx, module_path)
linter = FileLinter(ctx, PathLookup.from_sys_path(module_path.parent), module_path)
known_problems = set()
for problem in linter.lint():
known_problems.add(KnownProblem(problem.code, problem.message))
2 changes: 1 addition & 1 deletion src/databricks/labs/ucx/source_code/linters/files.py
@@ -139,7 +139,7 @@ def _lint_one(self, path: Path) -> Iterable[LocatedAdvice]:
if path.is_dir():
return []
ctx = self._new_linter_context()
linter = FileLinter(ctx, path)
linter = FileLinter(ctx, self._path_lookup, path)
return [advice.for_path(path) for advice in linter.lint()]


37 changes: 21 additions & 16 deletions src/databricks/labs/ucx/source_code/notebooks/cells.py
@@ -168,22 +168,14 @@ def is_runnable(self) -> bool:
return True # TODO

def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]:
command = f'{LANGUAGE_PREFIX}{self.language.magic_name}'
lines = self._original_code.split('\n')
for idx, line in enumerate(lines):
start = line.index(command)
if start >= 0:
path = line[start + len(command) :]
path = path.strip().strip("'").strip('"')
if len(path) == 0:
continue
notebook_path = Path(path)
start_line = self._original_offset + idx
problems = parent.register_notebook(notebook_path)
return [
problem.replace(start_line=start_line, start_col=0, end_line=start_line, end_col=len(line))
for problem in problems
]
path, idx, line = self.read_notebook_path()
if path is not None:
start_line = self._original_offset + idx
problems = parent.register_notebook(path)
return [
problem.replace(start_line=start_line, start_col=0, end_line=start_line, end_col=len(line))
for problem in problems
]
start_line = self._original_offset
problem = DependencyProblem(
'invalid-run-cell',
@@ -195,6 +187,19 @@ def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]:
)
return [problem]

def read_notebook_path(self):
Collaborator commented:

Suggested change:

    def read_notebook_path(self):

→

    def maybe_notebook_path() -> Path | None:
        ... = self._read_notebook_path()

    def _read_notebook_path(self):

The return type interface exposes too much unnecessary private information; make this method private and wrap it with a public method that exposes only what we need. The call site only needs the path:

    def _load_source_from_run_cell(self, run_cell: RunCell):
        path, _, _ = run_cell.read_notebook_path()

(See the sketch after this file's diff.)

Contributor (author) replied: done.

command = f'{LANGUAGE_PREFIX}{self.language.magic_name}'
lines = self._original_code.split('\n')
for idx, line in enumerate(lines):
start = line.index(command)
if start >= 0:
path = line[start + len(command) :]
path = path.strip().strip("'").strip('"')
if len(path) == 0:
continue
return Path(path), idx, line
return None, 0, ""

def migrate_notebook_path(self):
pass

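Following up on the review thread above, a sketch of what the suggested public/private split could look like. It assumes the methods live on RunCell, with LANGUAGE_PREFIX and self._original_code available as in this file, and it is not necessarily the code that was merged:

    def maybe_notebook_path(self) -> Path | None:
        path, _, _ = self._read_notebook_path()
        return path

    def _read_notebook_path(self) -> tuple[Path | None, int, str]:
        command = f'{LANGUAGE_PREFIX}{self.language.magic_name}'
        lines = self._original_code.split('\n')
        for idx, line in enumerate(lines):
            start = line.find(command)  # find() returns -1 when the magic is absent
            if start >= 0:
                path = line[start + len(command):].strip().strip("'").strip('"')
                if len(path) == 0:
                    continue
                return Path(path), idx, line
        return None, 0, ""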
88 changes: 63 additions & 25 deletions src/databricks/labs/ucx/source_code/notebooks/sources.py
@@ -5,15 +5,24 @@
from collections.abc import Iterable
from functools import cached_property
from pathlib import Path
from typing import cast

from databricks.sdk.service.workspace import Language

from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex
from databricks.labs.ucx.source_code.base import Advice, Failure, Linter
from databricks.labs.ucx.source_code.base import Advice, Failure, Linter, PythonSequentialLinter

from databricks.labs.ucx.source_code.graph import SourceContainer, DependencyGraph, DependencyProblem
from databricks.labs.ucx.source_code.linters.context import LinterContext
from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage, Cell, CELL_SEPARATOR, NOTEBOOK_HEADER
from databricks.labs.ucx.source_code.notebooks.cells import (
CellLanguage,
Cell,
CELL_SEPARATOR,
NOTEBOOK_HEADER,
RunCell,
PythonCell,
)
from databricks.labs.ucx.source_code.path_lookup import PathLookup


class Notebook(SourceContainer):
@@ -84,22 +93,27 @@ class NotebookLinter:
to the code cells according to the language of the cell.
"""

def __init__(self, langs: LinterContext, notebook: Notebook):
self._languages: LinterContext = langs
def __init__(self, context: LinterContext, path_lookup: PathLookup, notebook: Notebook):
self._context: LinterContext = context
self._path_lookup = path_lookup
self._notebook: Notebook = notebook
# reuse Python linter, which accumulates statements for improved inference
self._python_linter = langs.linter(Language.PYTHON)
self._python_linter: PythonSequentialLinter = cast(PythonSequentialLinter, context.linter(Language.PYTHON))

@classmethod
def from_source(cls, index: MigrationIndex, source: str, default_language: Language) -> 'NotebookLinter':
def from_source(
cls, index: MigrationIndex, path_lookup: PathLookup, source: str, default_language: Language
) -> NotebookLinter:
ctx = LinterContext(index)
notebook = Notebook.parse(Path(""), source, default_language)
assert notebook is not None
return cls(ctx, notebook)
return cls(ctx, path_lookup, notebook)

def lint(self) -> Iterable[Advice]:
for cell in self._notebook.cells:
if not self._languages.is_supported(cell.language.language):
if isinstance(cell, RunCell):
self._load_source_from_run_cell(cell)
if not self._context.is_supported(cell.language.language):
continue
linter = self._linter(cell.language.language)
for advice in linter.lint(cell.original_code):
@@ -111,7 +125,29 @@ def lint(self) -> Iterable[Advice]:
def _linter(self, language: Language) -> Linter:
if language is Language.PYTHON:
return self._python_linter
return self._languages.linter(language)
return self._context.linter(language)

def _load_source_from_run_cell(self, run_cell: RunCell):
path, _, _ = run_cell.read_notebook_path()
if path is None:
return # malformed run cell already reported
resolved = self._path_lookup.resolve(path)
if resolved is None:
return # already reported during dependency building
# TODO deal with workspace notebooks
Collaborator commented: resolved already handles workspace notebooks, because of WorkspacePath.

Contributor (author) replied: noted.

language = SUPPORTED_EXTENSION_LANGUAGES.get(resolved.suffix.lower(), None)
# we only support Python for now
if language is not Language.PYTHON:
return
source = resolved.read_text(_guess_encoding(resolved))
notebook = Notebook.parse(path, source, language)
for cell in notebook.cells:
if isinstance(cell, RunCell):
self._load_source_from_run_cell(cell)
continue
if not isinstance(cell, PythonCell):
continue
self._python_linter.process_child_cell(cell.original_code)

@staticmethod
def name() -> str:
@@ -124,6 +160,20 @@ def name() -> str:
}


def _guess_encoding(path: Path):
# some files encode a unicode BOM (byte-order-mark), so let's use that if available
with path.open('rb') as _file:
raw = _file.read(4)
if raw.startswith(codecs.BOM_UTF32_LE) or raw.startswith(codecs.BOM_UTF32_BE):
return 'utf-32'
if raw.startswith(codecs.BOM_UTF16_LE) or raw.startswith(codecs.BOM_UTF16_BE):
return 'utf-16'
if raw.startswith(codecs.BOM_UTF8):
return 'utf-8-sig'
# no BOM, let's use default encoding
return locale.getpreferredencoding(False)


class FileLinter:
_NOT_YET_SUPPORTED_SUFFIXES = {
'.scala',
@@ -168,30 +218,18 @@ class FileLinter:
'zip-safe',
}

def __init__(self, ctx: LinterContext, path: Path, content: str | None = None):
def __init__(self, ctx: LinterContext, path_lookup: PathLookup, path: Path, content: str | None = None):
self._ctx: LinterContext = ctx
self._path_lookup = path_lookup
self._path: Path = path
self._content = content

@cached_property
def _source_code(self) -> str:
if self._content is None:
self._content = self._path.read_text(self._guess_encoding())
self._content = self._path.read_text(_guess_encoding(self._path))
return self._content

def _guess_encoding(self):
# some files encode a unicode BOM (byte-order-mark), so let's use that if available
with self._path.open('rb') as _file:
raw = _file.read(4)
if raw.startswith(codecs.BOM_UTF32_LE) or raw.startswith(codecs.BOM_UTF32_BE):
return 'utf-32'
if raw.startswith(codecs.BOM_UTF16_LE) or raw.startswith(codecs.BOM_UTF16_BE):
return 'utf-16'
if raw.startswith(codecs.BOM_UTF8):
return 'utf-8-sig'
# no BOM, let's use default encoding
return locale.getpreferredencoding(False)

def _file_language(self):
return SUPPORTED_EXTENSION_LANGUAGES.get(self._path.suffix.lower())

@@ -243,5 +281,5 @@ def _lint_file(self):

def _lint_notebook(self):
notebook = Notebook.parse(self._path, self._source_code, self._file_language())
notebook_linter = NotebookLinter(self._ctx, notebook)
notebook_linter = NotebookLinter(self._ctx, self._path_lookup, notebook)
yield from notebook_linter.lint()
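Because every FileLinter and NotebookLinter call site in this PR now threads a PathLookup through, construction looks roughly like the following. This is a hedged sketch patterned on known.py above; the empty migration index and the sample path are assumptions, not part of the diff:

    from pathlib import Path

    from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex
    from databricks.labs.ucx.source_code.linters.context import LinterContext
    from databricks.labs.ucx.source_code.notebooks.sources import FileLinter
    from databricks.labs.ucx.source_code.path_lookup import PathLookup

    notebook_path = Path("tests/unit/source_code/samples/values_across_notebooks.py")
    ctx = LinterContext(MigrationIndex([]))
    lookup = PathLookup.from_sys_path(notebook_path.parent)  # as in known.py
    linter = FileLinter(ctx, lookup, notebook_path)
    for advice in linter.lint():
        print(advice)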
32 changes: 16 additions & 16 deletions tests/unit/source_code/notebooks/test_sources.py
@@ -10,15 +10,15 @@


@pytest.mark.parametrize("path, content", [("xyz.py", "a = 3"), ("xyz.sql", "select * from dual")])
def test_file_linter_lints_supported_language(path, content, migration_index):
linter = FileLinter(LinterContext(migration_index), Path(path), content)
def test_file_linter_lints_supported_language(path, content, migration_index, mock_path_lookup):
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, Path(path), content)
advices = list(linter.lint())
assert not advices


@pytest.mark.parametrize("path", ["xyz.scala", "xyz.r", "xyz.sh"])
def test_file_linter_lints_not_yet_supported_language(path, migration_index):
linter = FileLinter(LinterContext(migration_index), Path(path), "")
def test_file_linter_lints_not_yet_supported_language(path, migration_index, mock_path_lookup):
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, Path(path), "")
advices = list(linter.lint())
assert [advice.code for advice in advices] == ["unsupported-language"]

@@ -29,8 +29,8 @@ def source_code(self):
return self._source_code


def test_checks_encoding_of_pseudo_file(migration_index):
linter = FriendFileLinter(LinterContext(migration_index), Path("whatever"), "a=b")
def test_checks_encoding_of_pseudo_file(migration_index, mock_path_lookup):
linter = FriendFileLinter(LinterContext(migration_index), mock_path_lookup, Path("whatever"), "a=b")
assert linter.source_code() == "a=b"


@@ -44,10 +44,10 @@ def test_checks_encoding_of_pseudo_file(migration_index):
(codecs.BOM_UTF32_BE, "utf-32-be"),
],
)
def test_checks_encoding_of_file_with_bom(migration_index, bom, encoding, tmp_path):
def test_checks_encoding_of_file_with_bom(migration_index, bom, encoding, tmp_path, mock_path_lookup):
path = tmp_path / "file.py"
path.write_bytes(bom + "a = 12".encode(encoding))
linter = FriendFileLinter(LinterContext(migration_index), path)
linter = FriendFileLinter(LinterContext(migration_index), mock_path_lookup, path)
assert linter.source_code() == "a = 12"


@@ -70,16 +70,16 @@ def test_checks_encoding_of_file_with_bom(migration_index, bom, encoding, tmp_path):
".DS_Store", # on MacOS
],
)
def test_file_linter_lints_ignorable_language(path, migration_index):
linter = FileLinter(LinterContext(migration_index), Path(path), "")
def test_file_linter_lints_ignorable_language(path, migration_index, mock_path_lookup):
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, Path(path), "")
advices = list(linter.lint())
assert not advices


def test_file_linter_lints_non_ascii_encoded_file(migration_index):
def test_file_linter_lints_non_ascii_encoded_file(migration_index, mock_path_lookup):
preferred_encoding = locale.getpreferredencoding(False)
non_ascii_encoded_file = Path(__file__).parent.parent / "samples" / "nonascii.py"
linter = FileLinter(LinterContext(migration_index), non_ascii_encoded_file)
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, non_ascii_encoded_file)

advices = list(linter.lint())

@@ -88,11 +88,11 @@ def test_file_linter_lints_non_ascii_encoded_file(migration_index):
assert advices[0].message == f"File without {preferred_encoding} encoding is not supported {non_ascii_encoded_file}"


def test_file_linter_lints_file_with_missing_file(migration_index):
def test_file_linter_lints_file_with_missing_file(migration_index, mock_path_lookup):
path = create_autospec(Path)
path.suffix = ".py"
path.read_text.side_effect = FileNotFoundError("No such file or directory: 'test.py'")
linter = FileLinter(LinterContext(migration_index), path)
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, path)

advices = list(linter.lint())

@@ -101,11 +101,11 @@ def test_file_linter_lints_file_with_missing_file(migration_index):
assert advices[0].message == f"File not found: {path}"


def test_file_linter_lints_file_with_missing_read_permission(migration_index):
def test_file_linter_lints_file_with_missing_read_permission(migration_index, mock_path_lookup):
path = create_autospec(Path)
path.suffix = ".py"
path.read_text.side_effect = PermissionError("Permission denied")
linter = FileLinter(LinterContext(migration_index), path)
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, path)

advices = list(linter.lint())

9 changes: 9 additions & 0 deletions tests/unit/source_code/samples/values_across_notebooks.py
@@ -0,0 +1,9 @@
# Databricks notebook source

# COMMAND ----------

# MAGIC %run "./values_across_notebooks_child.py"

# COMMAND ----------

spark.table(f"{a}")
3 changes: 3 additions & 0 deletions tests/unit/source_code/samples/values_across_notebooks_child.py
@@ -0,0 +1,3 @@
# Databricks notebook source

a = 12
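Taken together, the two samples exercise the new behaviour: the %run cell pulls in the child notebook, the child's Python cells are fed to the sequential linter via process_child_cell, and the parent cell is then linted as if the code had been concatenated. Roughly, the linter reasons over something like the following (illustrative only; spark is the usual notebook global):

    a = 12               # from values_across_notebooks_child.py
    spark.table(f"{a}")  # parent cell; the value of `a`, and hence the table name, is now inferable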