From df071c6c8a327747c5d059cbc42d17f56494a9a8 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Tue, 9 Jul 2024 11:17:42 +0200 Subject: [PATCH] Infer values from child notebook in run cell (#2075) ## Changes Infer values from child notebook in run cell ### Linked issues Progresses #1901 Progresses #1205 Fixes #1927 ### Functionality None ### Tests - [x] added unit tests Solves 87 'not computed' advices when running `make solacc` --------- Co-authored-by: Eric Vergnaud --- src/databricks/labs/ucx/source_code/base.py | 11 +++ src/databricks/labs/ucx/source_code/jobs.py | 4 +- src/databricks/labs/ucx/source_code/known.py | 2 +- .../labs/ucx/source_code/linters/files.py | 2 +- .../labs/ucx/source_code/notebooks/cells.py | 41 +++++---- .../labs/ucx/source_code/notebooks/sources.py | 88 +++++++++++++------ .../source_code/notebooks/test_sources.py | 32 +++---- .../samples/values_across_notebooks.py | 9 ++ .../samples/values_across_notebooks_child.py | 3 + tests/unit/source_code/test_functional.py | 19 ++-- .../unit/source_code/test_notebook_linter.py | 32 +++++-- 11 files changed, 166 insertions(+), 77 deletions(-) create mode 100644 tests/unit/source_code/samples/values_across_notebooks.py create mode 100644 tests/unit/source_code/samples/values_across_notebooks_child.py diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index 8704541a91..aa4d2524ef 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -214,3 +214,14 @@ def lint(self, code: str) -> Iterable[Advice]: yield from linter.lint_tree(tree) except AstroidSyntaxError as e: yield Failure('syntax-error', str(e), 0, 0, 0, 0) + + def process_child_cell(self, code: str): + try: + tree = Tree.normalize_and_parse(code) + if self._tree is None: + self._tree = tree + else: + self._tree.append_statements(tree) + except AstroidSyntaxError as e: + # error already reported when linting enclosing notebook + logger.warning(f"Failed to parse Python cell: {code}", exc_info=e) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index a39e1e3168..2d00db8a76 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -381,11 +381,11 @@ def _lint_task(self, task: jobs.Task, job: jobs.Job): yield from self._lint_file(container, ctx) def _lint_file(self, file: LocalFile, ctx: LinterContext): - linter = FileLinter(ctx, file.path) + linter = FileLinter(ctx, self._path_lookup, file.path) for advice in linter.lint(): yield file.path, advice def _lint_notebook(self, notebook: Notebook, ctx: LinterContext): - linter = NotebookLinter(ctx, notebook) + linter = NotebookLinter(ctx, self._path_lookup, notebook) for advice in linter.lint(): yield notebook.path, advice diff --git a/src/databricks/labs/ucx/source_code/known.py b/src/databricks/labs/ucx/source_code/known.py index b9a08a4f1d..9e91343c58 100644 --- a/src/databricks/labs/ucx/source_code/known.py +++ b/src/databricks/labs/ucx/source_code/known.py @@ -173,7 +173,7 @@ def _analyze_file(cls, known_distributions, library_root, dist_info, module_path module_ref = module_ref[: -len(suffix)] logger.info(f"Processing module: {module_ref}") ctx = LinterContext(empty_index) - linter = FileLinter(ctx, module_path) + linter = FileLinter(ctx, PathLookup.from_sys_path(module_path.parent), module_path) known_problems = set() for problem in linter.lint(): known_problems.add(KnownProblem(problem.code, problem.message)) diff --git a/src/databricks/labs/ucx/source_code/linters/files.py b/src/databricks/labs/ucx/source_code/linters/files.py index eb09a6eadc..51c6f3fa6a 100644 --- a/src/databricks/labs/ucx/source_code/linters/files.py +++ b/src/databricks/labs/ucx/source_code/linters/files.py @@ -139,7 +139,7 @@ def _lint_one(self, path: Path) -> Iterable[LocatedAdvice]: if path.is_dir(): return [] ctx = self._new_linter_context() - linter = FileLinter(ctx, path) + linter = FileLinter(ctx, self._path_lookup, path) return [advice.for_path(path) for advice in linter.lint()] diff --git a/src/databricks/labs/ucx/source_code/notebooks/cells.py b/src/databricks/labs/ucx/source_code/notebooks/cells.py index c68af323da..497aca7052 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/cells.py +++ b/src/databricks/labs/ucx/source_code/notebooks/cells.py @@ -168,22 +168,14 @@ def is_runnable(self) -> bool: return True # TODO def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: - command = f'{LANGUAGE_PREFIX}{self.language.magic_name}' - lines = self._original_code.split('\n') - for idx, line in enumerate(lines): - start = line.index(command) - if start >= 0: - path = line[start + len(command) :] - path = path.strip().strip("'").strip('"') - if len(path) == 0: - continue - notebook_path = Path(path) - start_line = self._original_offset + idx - problems = parent.register_notebook(notebook_path) - return [ - problem.replace(start_line=start_line, start_col=0, end_line=start_line, end_col=len(line)) - for problem in problems - ] + path, idx, line = self._read_notebook_path() + if path is not None: + start_line = self._original_offset + idx + problems = parent.register_notebook(path) + return [ + problem.replace(start_line=start_line, start_col=0, end_line=start_line, end_col=len(line)) + for problem in problems + ] start_line = self._original_offset problem = DependencyProblem( 'invalid-run-cell', @@ -195,6 +187,23 @@ def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProb ) return [problem] + def maybe_notebook_path(self) -> Path | None: + path, _, _ = self._read_notebook_path() + return path + + def _read_notebook_path(self): + command = f'{LANGUAGE_PREFIX}{self.language.magic_name}' + lines = self._original_code.split('\n') + for idx, line in enumerate(lines): + start = line.index(command) + if start >= 0: + path = line[start + len(command) :] + path = path.strip().strip("'").strip('"') + if len(path) == 0: + continue + return Path(path), idx, line + return None, 0, "" + def migrate_notebook_path(self): pass diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 1da4afaf73..f025cab13b 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -5,15 +5,24 @@ from collections.abc import Iterable from functools import cached_property from pathlib import Path +from typing import cast from databricks.sdk.service.workspace import Language from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex -from databricks.labs.ucx.source_code.base import Advice, Failure, Linter +from databricks.labs.ucx.source_code.base import Advice, Failure, Linter, PythonSequentialLinter from databricks.labs.ucx.source_code.graph import SourceContainer, DependencyGraph, DependencyProblem from databricks.labs.ucx.source_code.linters.context import LinterContext -from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage, Cell, CELL_SEPARATOR, NOTEBOOK_HEADER +from databricks.labs.ucx.source_code.notebooks.cells import ( + CellLanguage, + Cell, + CELL_SEPARATOR, + NOTEBOOK_HEADER, + RunCell, + PythonCell, +) +from databricks.labs.ucx.source_code.path_lookup import PathLookup class Notebook(SourceContainer): @@ -84,22 +93,27 @@ class NotebookLinter: to the code cells according to the language of the cell. """ - def __init__(self, langs: LinterContext, notebook: Notebook): - self._languages: LinterContext = langs + def __init__(self, context: LinterContext, path_lookup: PathLookup, notebook: Notebook): + self._context: LinterContext = context + self._path_lookup = path_lookup self._notebook: Notebook = notebook # reuse Python linter, which accumulates statements for improved inference - self._python_linter = langs.linter(Language.PYTHON) + self._python_linter: PythonSequentialLinter = cast(PythonSequentialLinter, context.linter(Language.PYTHON)) @classmethod - def from_source(cls, index: MigrationIndex, source: str, default_language: Language) -> 'NotebookLinter': + def from_source( + cls, index: MigrationIndex, path_lookup: PathLookup, source: str, default_language: Language + ) -> NotebookLinter: ctx = LinterContext(index) notebook = Notebook.parse(Path(""), source, default_language) assert notebook is not None - return cls(ctx, notebook) + return cls(ctx, path_lookup, notebook) def lint(self) -> Iterable[Advice]: for cell in self._notebook.cells: - if not self._languages.is_supported(cell.language.language): + if isinstance(cell, RunCell): + self._load_source_from_run_cell(cell) + if not self._context.is_supported(cell.language.language): continue linter = self._linter(cell.language.language) for advice in linter.lint(cell.original_code): @@ -111,7 +125,29 @@ def lint(self) -> Iterable[Advice]: def _linter(self, language: Language) -> Linter: if language is Language.PYTHON: return self._python_linter - return self._languages.linter(language) + return self._context.linter(language) + + def _load_source_from_run_cell(self, run_cell: RunCell): + path = run_cell.maybe_notebook_path() + if path is None: + return # malformed run cell already reported + resolved = self._path_lookup.resolve(path) + if resolved is None: + return # already reported during dependency building + # transient workspace notebook suffix is inferred from object info + language = SUPPORTED_EXTENSION_LANGUAGES.get(resolved.suffix.lower(), None) + # we only support Python for now + if language is not Language.PYTHON: + return + source = resolved.read_text(_guess_encoding(resolved)) + notebook = Notebook.parse(path, source, language) + for cell in notebook.cells: + if isinstance(cell, RunCell): + self._load_source_from_run_cell(cell) + continue + if not isinstance(cell, PythonCell): + continue + self._python_linter.process_child_cell(cell.original_code) @staticmethod def name() -> str: @@ -124,6 +160,20 @@ def name() -> str: } +def _guess_encoding(path: Path): + # some files encode a unicode BOM (byte-order-mark), so let's use that if available + with path.open('rb') as _file: + raw = _file.read(4) + if raw.startswith(codecs.BOM_UTF32_LE) or raw.startswith(codecs.BOM_UTF32_BE): + return 'utf-32' + if raw.startswith(codecs.BOM_UTF16_LE) or raw.startswith(codecs.BOM_UTF16_BE): + return 'utf-16' + if raw.startswith(codecs.BOM_UTF8): + return 'utf-8-sig' + # no BOM, let's use default encoding + return locale.getpreferredencoding(False) + + class FileLinter: _NOT_YET_SUPPORTED_SUFFIXES = { '.scala', @@ -168,30 +218,18 @@ class FileLinter: 'zip-safe', } - def __init__(self, ctx: LinterContext, path: Path, content: str | None = None): + def __init__(self, ctx: LinterContext, path_lookup: PathLookup, path: Path, content: str | None = None): self._ctx: LinterContext = ctx + self._path_lookup = path_lookup self._path: Path = path self._content = content @cached_property def _source_code(self) -> str: if self._content is None: - self._content = self._path.read_text(self._guess_encoding()) + self._content = self._path.read_text(_guess_encoding(self._path)) return self._content - def _guess_encoding(self): - # some files encode a unicode BOM (byte-order-mark), so let's use that if available - with self._path.open('rb') as _file: - raw = _file.read(4) - if raw.startswith(codecs.BOM_UTF32_LE) or raw.startswith(codecs.BOM_UTF32_BE): - return 'utf-32' - if raw.startswith(codecs.BOM_UTF16_LE) or raw.startswith(codecs.BOM_UTF16_BE): - return 'utf-16' - if raw.startswith(codecs.BOM_UTF8): - return 'utf-8-sig' - # no BOM, let's use default encoding - return locale.getpreferredencoding(False) - def _file_language(self): return SUPPORTED_EXTENSION_LANGUAGES.get(self._path.suffix.lower()) @@ -243,5 +281,5 @@ def _lint_file(self): def _lint_notebook(self): notebook = Notebook.parse(self._path, self._source_code, self._file_language()) - notebook_linter = NotebookLinter(self._ctx, notebook) + notebook_linter = NotebookLinter(self._ctx, self._path_lookup, notebook) yield from notebook_linter.lint() diff --git a/tests/unit/source_code/notebooks/test_sources.py b/tests/unit/source_code/notebooks/test_sources.py index 390737f5ab..0f9fc694c0 100644 --- a/tests/unit/source_code/notebooks/test_sources.py +++ b/tests/unit/source_code/notebooks/test_sources.py @@ -10,15 +10,15 @@ @pytest.mark.parametrize("path, content", [("xyz.py", "a = 3"), ("xyz.sql", "select * from dual")]) -def test_file_linter_lints_supported_language(path, content, migration_index): - linter = FileLinter(LinterContext(migration_index), Path(path), content) +def test_file_linter_lints_supported_language(path, content, migration_index, mock_path_lookup): + linter = FileLinter(LinterContext(migration_index), mock_path_lookup, Path(path), content) advices = list(linter.lint()) assert not advices @pytest.mark.parametrize("path", ["xyz.scala", "xyz.r", "xyz.sh"]) -def test_file_linter_lints_not_yet_supported_language(path, migration_index): - linter = FileLinter(LinterContext(migration_index), Path(path), "") +def test_file_linter_lints_not_yet_supported_language(path, migration_index, mock_path_lookup): + linter = FileLinter(LinterContext(migration_index), mock_path_lookup, Path(path), "") advices = list(linter.lint()) assert [advice.code for advice in advices] == ["unsupported-language"] @@ -29,8 +29,8 @@ def source_code(self): return self._source_code -def test_checks_encoding_of_pseudo_file(migration_index): - linter = FriendFileLinter(LinterContext(migration_index), Path("whatever"), "a=b") +def test_checks_encoding_of_pseudo_file(migration_index, mock_path_lookup): + linter = FriendFileLinter(LinterContext(migration_index), mock_path_lookup, Path("whatever"), "a=b") assert linter.source_code() == "a=b" @@ -44,10 +44,10 @@ def test_checks_encoding_of_pseudo_file(migration_index): (codecs.BOM_UTF32_BE, "utf-32-be"), ], ) -def test_checks_encoding_of_file_with_bom(migration_index, bom, encoding, tmp_path): +def test_checks_encoding_of_file_with_bom(migration_index, bom, encoding, tmp_path, mock_path_lookup): path = tmp_path / "file.py" path.write_bytes(bom + "a = 12".encode(encoding)) - linter = FriendFileLinter(LinterContext(migration_index), path) + linter = FriendFileLinter(LinterContext(migration_index), mock_path_lookup, path) assert linter.source_code() == "a = 12" @@ -70,16 +70,16 @@ def test_checks_encoding_of_file_with_bom(migration_index, bom, encoding, tmp_pa ".DS_Store", # on MacOS ], ) -def test_file_linter_lints_ignorable_language(path, migration_index): - linter = FileLinter(LinterContext(migration_index), Path(path), "") +def test_file_linter_lints_ignorable_language(path, migration_index, mock_path_lookup): + linter = FileLinter(LinterContext(migration_index), mock_path_lookup, Path(path), "") advices = list(linter.lint()) assert not advices -def test_file_linter_lints_non_ascii_encoded_file(migration_index): +def test_file_linter_lints_non_ascii_encoded_file(migration_index, mock_path_lookup): preferred_encoding = locale.getpreferredencoding(False) non_ascii_encoded_file = Path(__file__).parent.parent / "samples" / "nonascii.py" - linter = FileLinter(LinterContext(migration_index), non_ascii_encoded_file) + linter = FileLinter(LinterContext(migration_index), mock_path_lookup, non_ascii_encoded_file) advices = list(linter.lint()) @@ -88,11 +88,11 @@ def test_file_linter_lints_non_ascii_encoded_file(migration_index): assert advices[0].message == f"File without {preferred_encoding} encoding is not supported {non_ascii_encoded_file}" -def test_file_linter_lints_file_with_missing_file(migration_index): +def test_file_linter_lints_file_with_missing_file(migration_index, mock_path_lookup): path = create_autospec(Path) path.suffix = ".py" path.read_text.side_effect = FileNotFoundError("No such file or directory: 'test.py'") - linter = FileLinter(LinterContext(migration_index), path) + linter = FileLinter(LinterContext(migration_index), mock_path_lookup, path) advices = list(linter.lint()) @@ -101,11 +101,11 @@ def test_file_linter_lints_file_with_missing_file(migration_index): assert advices[0].message == f"File not found: {path}" -def test_file_linter_lints_file_with_missing_read_permission(migration_index): +def test_file_linter_lints_file_with_missing_read_permission(migration_index, mock_path_lookup): path = create_autospec(Path) path.suffix = ".py" path.read_text.side_effect = PermissionError("Permission denied") - linter = FileLinter(LinterContext(migration_index), path) + linter = FileLinter(LinterContext(migration_index), mock_path_lookup, path) advices = list(linter.lint()) diff --git a/tests/unit/source_code/samples/values_across_notebooks.py b/tests/unit/source_code/samples/values_across_notebooks.py new file mode 100644 index 0000000000..fbc16d970a --- /dev/null +++ b/tests/unit/source_code/samples/values_across_notebooks.py @@ -0,0 +1,9 @@ +# Databricks notebook source + +# COMMAND ---------- + +# MAGIC %run "./values_across_notebooks_child.py" + +# COMMAND ---------- + +spark.table(f"{a}") diff --git a/tests/unit/source_code/samples/values_across_notebooks_child.py b/tests/unit/source_code/samples/values_across_notebooks_child.py new file mode 100644 index 0000000000..2b1e35c366 --- /dev/null +++ b/tests/unit/source_code/samples/values_across_notebooks_child.py @@ -0,0 +1,3 @@ +# Databricks notebook source + +a = 12 diff --git a/tests/unit/source_code/test_functional.py b/tests/unit/source_code/test_functional.py index 52a59ba6e0..382cdaf3c7 100644 --- a/tests/unit/source_code/test_functional.py +++ b/tests/unit/source_code/test_functional.py @@ -15,6 +15,7 @@ from databricks.labs.ucx.source_code.base import Advice, CurrentSessionState from databricks.labs.ucx.source_code.linters.context import LinterContext from databricks.labs.ucx.source_code.notebooks.sources import FileLinter +from databricks.labs.ucx.source_code.path_lookup import PathLookup @dataclass(frozen=True, kw_only=True, slots=True) @@ -67,19 +68,19 @@ class Functional: _location = Path(__file__).parent / 'samples/functional' @classmethod - def all(cls) -> list['Functional']: - return [Functional(_) for _ in cls._location.glob('**/*.py')] + def all(cls) -> list[Functional]: + return [Functional(path) for path in cls._location.glob('**/*.py')] @classmethod - def test_id(cls, sample: 'Functional') -> str: + def test_id(cls, sample: Functional) -> str: return sample.path.relative_to(cls._location).as_posix() def __init__(self, path: Path) -> None: self.path = path - def verify(self) -> None: + def verify(self, path_lookup: PathLookup) -> None: expected_problems = list(self._expected_problems()) - actual_advices = list(self._lint()) + actual_advices = list(self._lint(path_lookup)) # Convert the actual problems to the same type as our expected problems for easier comparison. actual_problems = [Expectation.from_advice(advice) for advice in actual_advices] @@ -99,7 +100,7 @@ def verify(self) -> None: assert no_errors, "\n".join(errors) # TODO: output annotated file with comments for quick fixing - def _lint(self) -> Iterable[Advice]: + def _lint(self, path_lookup: PathLookup) -> Iterable[Advice]: migration_index = MigrationIndex( [ MigrationStatus('old', 'things', dst_catalog='brand', dst_schema='new', dst_table='stuff'), @@ -110,7 +111,7 @@ def _lint(self) -> Iterable[Advice]: print(str(session_state)) session_state.named_parameters = {"my-widget": "my-path.py"} ctx = LinterContext(migration_index, session_state) - linter = FileLinter(ctx, self.path) + linter = FileLinter(ctx, path_lookup, self.path) return linter.lint() def _regex_match(self, regex: re.Pattern[str]) -> Generator[tuple[Comment, dict[str, Any]], None, None]: @@ -164,5 +165,5 @@ def _comments(f) -> Generator[Comment, None, None]: @pytest.mark.parametrize("sample", Functional.all(), ids=Functional.test_id) -def test_functional(sample: Functional) -> None: - sample.verify() +def test_functional(sample: Functional, mock_path_lookup) -> None: + sample.verify(mock_path_lookup) diff --git a/tests/unit/source_code/test_notebook_linter.py b/tests/unit/source_code/test_notebook_linter.py index 18b4956cc6..d3bab80847 100644 --- a/tests/unit/source_code/test_notebook_linter.py +++ b/tests/unit/source_code/test_notebook_linter.py @@ -313,19 +313,19 @@ # Add more test cases here ], ) -def test_notebook_linter(lang, source, expected): +def test_notebook_linter(lang, source, expected, mock_path_lookup): # SQLGlot does not propagate tokens yet. See https://github.com/tobymao/sqlglot/issues/3159 # Hence SQL statement advice offsets can be wrong because of comments and statements # over multiple lines. - linter = NotebookLinter.from_source(index, source, lang) + linter = NotebookLinter.from_source(index, mock_path_lookup, source, lang) assert linter is not None gathered = list(linter.lint()) assert gathered == expected -def test_notebook_linter_name(): +def test_notebook_linter_name(mock_path_lookup): source = """-- Databricks notebook source""" - linter = NotebookLinter.from_source(index, source, Language.SQL) + linter = NotebookLinter.from_source(index, mock_path_lookup, source, Language.SQL) assert linter.name() == "notebook-linter" @@ -544,8 +544,8 @@ def test_notebook_linter_name(): ), ], ) -def test_notebook_linter_tracks_use(extended_test_index, lang, source, expected): - linter = NotebookLinter.from_source(extended_test_index, source, lang) +def test_notebook_linter_tracks_use(extended_test_index, lang, source, expected, mock_path_lookup): + linter = NotebookLinter.from_source(extended_test_index, mock_path_lookup, source, lang) assert linter is not None advices = list(linter.lint()) assert advices == expected @@ -554,7 +554,7 @@ def test_notebook_linter_tracks_use(extended_test_index, lang, source, expected) def test_computes_values_across_cells(extended_test_index, mock_path_lookup): path = mock_path_lookup.resolve(Path("values_across_cells.py")) source = path.read_text() - linter = NotebookLinter.from_source(extended_test_index, source, Language.PYTHON) + linter = NotebookLinter.from_source(extended_test_index, mock_path_lookup, source, Language.PYTHON) advices = list(linter.lint()) expected = [ Advice( @@ -567,3 +567,21 @@ def test_computes_values_across_cells(extended_test_index, mock_path_lookup): ) ] assert advices == expected + + +def test_computes_values_across_notebooks_using_run(extended_test_index, mock_path_lookup): + path = mock_path_lookup.resolve(Path("values_across_notebooks.py")) + source = path.read_text() + linter = NotebookLinter.from_source(extended_test_index, mock_path_lookup, source, Language.PYTHON) + advices = list(linter.lint()) + expected = [ + Advice( + code='table-migrate', + message='The default format changed in Databricks Runtime 8.0, from Parquet to Delta', + start_line=8, + start_col=0, + end_line=8, + end_col=19, + ) + ] + assert advices == expected