Infer values from child notebook in run cell (#2075)
## Changes
Infer values from the child notebook referenced by a `%run` cell, so that names defined in the child resolve when linting the parent notebook.

### Linked issues
Progresses #1901 
Progresses #1205 
Fixes #1927 

### Functionality 
None

### Tests
- [x] added unit tests

Resolves 87 'not computed' advices when running `make solacc`.
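
For illustration, a minimal pair of notebooks mirroring the samples added under `tests/unit/source_code/samples` (the resolved table name `"12"` is what value inference now makes possible):

```python
# values_across_notebooks.py (parent notebook)
# MAGIC %run "./values_across_notebooks_child.py"
spark.table(f"{a}")  # `a` is now inferred from the child, so the table name resolves to "12"

# values_across_notebooks_child.py (child notebook)
a = 12
```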

---------

Co-authored-by: Eric Vergnaud <[email protected]>
ericvergnaud authored Jul 9, 2024
1 parent 2f02467 commit df071c6
Showing 11 changed files with 166 additions and 77 deletions.
11 changes: 11 additions & 0 deletions src/databricks/labs/ucx/source_code/base.py
@@ -214,3 +214,14 @@ def lint(self, code: str) -> Iterable[Advice]:
yield from linter.lint_tree(tree)
except AstroidSyntaxError as e:
yield Failure('syntax-error', str(e), 0, 0, 0, 0)

def process_child_cell(self, code: str):
try:
tree = Tree.normalize_and_parse(code)
if self._tree is None:
self._tree = tree
else:
self._tree.append_statements(tree)
except AstroidSyntaxError as e:
# error already reported when linting enclosing notebook
logger.warning(f"Failed to parse Python cell: {code}", exc_info=e)
4 changes: 2 additions & 2 deletions src/databricks/labs/ucx/source_code/jobs.py
@@ -381,11 +381,11 @@ def _lint_task(self, task: jobs.Task, job: jobs.Job):
yield from self._lint_file(container, ctx)

def _lint_file(self, file: LocalFile, ctx: LinterContext):
linter = FileLinter(ctx, file.path)
linter = FileLinter(ctx, self._path_lookup, file.path)
for advice in linter.lint():
yield file.path, advice

def _lint_notebook(self, notebook: Notebook, ctx: LinterContext):
linter = NotebookLinter(ctx, notebook)
linter = NotebookLinter(ctx, self._path_lookup, notebook)
for advice in linter.lint():
yield notebook.path, advice
2 changes: 1 addition & 1 deletion src/databricks/labs/ucx/source_code/known.py
@@ -173,7 +173,7 @@ def _analyze_file(cls, known_distributions, library_root, dist_info, module_path
module_ref = module_ref[: -len(suffix)]
logger.info(f"Processing module: {module_ref}")
ctx = LinterContext(empty_index)
linter = FileLinter(ctx, module_path)
linter = FileLinter(ctx, PathLookup.from_sys_path(module_path.parent), module_path)
known_problems = set()
for problem in linter.lint():
known_problems.add(KnownProblem(problem.code, problem.message))
2 changes: 1 addition & 1 deletion src/databricks/labs/ucx/source_code/linters/files.py
@@ -139,7 +139,7 @@ def _lint_one(self, path: Path) -> Iterable[LocatedAdvice]:
if path.is_dir():
return []
ctx = self._new_linter_context()
linter = FileLinter(ctx, path)
linter = FileLinter(ctx, self._path_lookup, path)
return [advice.for_path(path) for advice in linter.lint()]


41 changes: 25 additions & 16 deletions src/databricks/labs/ucx/source_code/notebooks/cells.py
@@ -168,22 +168,14 @@ def is_runnable(self) -> bool:
return True # TODO

def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]:
command = f'{LANGUAGE_PREFIX}{self.language.magic_name}'
lines = self._original_code.split('\n')
for idx, line in enumerate(lines):
start = line.index(command)
if start >= 0:
path = line[start + len(command) :]
path = path.strip().strip("'").strip('"')
if len(path) == 0:
continue
notebook_path = Path(path)
start_line = self._original_offset + idx
problems = parent.register_notebook(notebook_path)
return [
problem.replace(start_line=start_line, start_col=0, end_line=start_line, end_col=len(line))
for problem in problems
]
path, idx, line = self._read_notebook_path()
if path is not None:
start_line = self._original_offset + idx
problems = parent.register_notebook(path)
return [
problem.replace(start_line=start_line, start_col=0, end_line=start_line, end_col=len(line))
for problem in problems
]
start_line = self._original_offset
problem = DependencyProblem(
'invalid-run-cell',
@@ -195,6 +187,23 @@ def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProb
)
return [problem]

def maybe_notebook_path(self) -> Path | None:
path, _, _ = self._read_notebook_path()
return path

def _read_notebook_path(self):
command = f'{LANGUAGE_PREFIX}{self.language.magic_name}'
lines = self._original_code.split('\n')
for idx, line in enumerate(lines):
start = line.index(command)
if start >= 0:
path = line[start + len(command) :]
path = path.strip().strip("'").strip('"')
if len(path) == 0:
continue
return Path(path), idx, line
return None, 0, ""

def migrate_notebook_path(self):
pass
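
A standalone sketch of the path extraction performed by `_read_notebook_path` above (the helper name and the default `%run` command string are hypothetical; the real code builds the command from `LANGUAGE_PREFIX` and the cell's `magic_name`):

```python
from pathlib import Path

def read_run_path(code: str, command: str = "%run") -> Path | None:
    for line in code.split("\n"):
        start = line.find(command)  # find() rather than index() so non-run lines are skipped
        if start < 0:
            continue
        path = line[start + len(command):].strip().strip("'").strip('"')
        if path:
            return Path(path)
    return None

print(read_run_path('%run "./values_across_notebooks_child.py"'))
# values_across_notebooks_child.py
```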

88 changes: 63 additions & 25 deletions src/databricks/labs/ucx/source_code/notebooks/sources.py
@@ -5,15 +5,24 @@
from collections.abc import Iterable
from functools import cached_property
from pathlib import Path
from typing import cast

from databricks.sdk.service.workspace import Language

from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex
from databricks.labs.ucx.source_code.base import Advice, Failure, Linter
from databricks.labs.ucx.source_code.base import Advice, Failure, Linter, PythonSequentialLinter

from databricks.labs.ucx.source_code.graph import SourceContainer, DependencyGraph, DependencyProblem
from databricks.labs.ucx.source_code.linters.context import LinterContext
from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage, Cell, CELL_SEPARATOR, NOTEBOOK_HEADER
from databricks.labs.ucx.source_code.notebooks.cells import (
CellLanguage,
Cell,
CELL_SEPARATOR,
NOTEBOOK_HEADER,
RunCell,
PythonCell,
)
from databricks.labs.ucx.source_code.path_lookup import PathLookup


class Notebook(SourceContainer):
@@ -84,22 +93,27 @@ class NotebookLinter:
to the code cells according to the language of the cell.
"""

def __init__(self, langs: LinterContext, notebook: Notebook):
self._languages: LinterContext = langs
def __init__(self, context: LinterContext, path_lookup: PathLookup, notebook: Notebook):
self._context: LinterContext = context
self._path_lookup = path_lookup
self._notebook: Notebook = notebook
# reuse Python linter, which accumulates statements for improved inference
self._python_linter = langs.linter(Language.PYTHON)
self._python_linter: PythonSequentialLinter = cast(PythonSequentialLinter, context.linter(Language.PYTHON))

@classmethod
def from_source(cls, index: MigrationIndex, source: str, default_language: Language) -> 'NotebookLinter':
def from_source(
cls, index: MigrationIndex, path_lookup: PathLookup, source: str, default_language: Language
) -> NotebookLinter:
ctx = LinterContext(index)
notebook = Notebook.parse(Path(""), source, default_language)
assert notebook is not None
return cls(ctx, notebook)
return cls(ctx, path_lookup, notebook)

def lint(self) -> Iterable[Advice]:
for cell in self._notebook.cells:
if not self._languages.is_supported(cell.language.language):
if isinstance(cell, RunCell):
self._load_source_from_run_cell(cell)
if not self._context.is_supported(cell.language.language):
continue
linter = self._linter(cell.language.language)
for advice in linter.lint(cell.original_code):
@@ -111,7 +125,29 @@ def _linter(self, language: Language) -> Linter:
def _linter(self, language: Language) -> Linter:
if language is Language.PYTHON:
return self._python_linter
return self._languages.linter(language)
return self._context.linter(language)

def _load_source_from_run_cell(self, run_cell: RunCell):
path = run_cell.maybe_notebook_path()
if path is None:
return # malformed run cell already reported
resolved = self._path_lookup.resolve(path)
if resolved is None:
return # already reported during dependency building
# transient workspace notebook suffix is inferred from object info
language = SUPPORTED_EXTENSION_LANGUAGES.get(resolved.suffix.lower(), None)
# we only support Python for now
if language is not Language.PYTHON:
return
source = resolved.read_text(_guess_encoding(resolved))
notebook = Notebook.parse(path, source, language)
for cell in notebook.cells:
if isinstance(cell, RunCell):
self._load_source_from_run_cell(cell)
continue
if not isinstance(cell, PythonCell):
continue
self._python_linter.process_child_cell(cell.original_code)
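
Because `_load_source_from_run_cell` calls itself for run cells of the child, values defined several notebooks deep are still picked up. A self-contained toy illustration of that recursion (plain dicts and a regex stand in for `PathLookup` and the notebook parser, which this sketch does not use):

```python
import re

workspace = {
    "parent.py": '%run "./child.py"\nspark.table(f"{a}")',
    "child.py": '%run "./grandchild.py"',
    "grandchild.py": "a = 12",
}

def collect_python_code(name: str, acc: list[str]) -> None:
    for line in workspace[name].splitlines():
        match = re.match(r'\s*%run\s+["\']?\.?/?([\w./]+)["\']?', line)
        if match:
            collect_python_code(match.group(1), acc)  # follow the child before continuing
        else:
            acc.append(line)

code: list[str] = []
collect_python_code("parent.py", code)
print("\n".join(code))  # grandchild's `a = 12` precedes the parent's spark.table call
```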

@staticmethod
def name() -> str:
@@ -124,6 +160,20 @@ def name() -> str:
}


def _guess_encoding(path: Path):
# some files encode a unicode BOM (byte-order-mark), so let's use that if available
with path.open('rb') as _file:
raw = _file.read(4)
if raw.startswith(codecs.BOM_UTF32_LE) or raw.startswith(codecs.BOM_UTF32_BE):
return 'utf-32'
if raw.startswith(codecs.BOM_UTF16_LE) or raw.startswith(codecs.BOM_UTF16_BE):
return 'utf-16'
if raw.startswith(codecs.BOM_UTF8):
return 'utf-8-sig'
# no BOM, let's use default encoding
return locale.getpreferredencoding(False)
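
A minimal usage sketch of the module-level helper above (the sample file name is hypothetical):

```python
import codecs
from pathlib import Path

sample = Path("bom_sample.py")                  # hypothetical scratch file
sample.write_bytes(codecs.BOM_UTF8 + b"a = 12")
encoding = _guess_encoding(sample)              # helper defined above in notebooks/sources.py
print(encoding)                                 # 'utf-8-sig'
print(sample.read_text(encoding))               # 'a = 12' (BOM stripped)
```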


class FileLinter:
_NOT_YET_SUPPORTED_SUFFIXES = {
'.scala',
@@ -168,30 +218,18 @@ class FileLinter:
'zip-safe',
}

def __init__(self, ctx: LinterContext, path: Path, content: str | None = None):
def __init__(self, ctx: LinterContext, path_lookup: PathLookup, path: Path, content: str | None = None):
self._ctx: LinterContext = ctx
self._path_lookup = path_lookup
self._path: Path = path
self._content = content

@cached_property
def _source_code(self) -> str:
if self._content is None:
self._content = self._path.read_text(self._guess_encoding())
self._content = self._path.read_text(_guess_encoding(self._path))
return self._content

def _guess_encoding(self):
# some files encode a unicode BOM (byte-order-mark), so let's use that if available
with self._path.open('rb') as _file:
raw = _file.read(4)
if raw.startswith(codecs.BOM_UTF32_LE) or raw.startswith(codecs.BOM_UTF32_BE):
return 'utf-32'
if raw.startswith(codecs.BOM_UTF16_LE) or raw.startswith(codecs.BOM_UTF16_BE):
return 'utf-16'
if raw.startswith(codecs.BOM_UTF8):
return 'utf-8-sig'
# no BOM, let's use default encoding
return locale.getpreferredencoding(False)

def _file_language(self):
return SUPPORTED_EXTENSION_LANGUAGES.get(self._path.suffix.lower())

@@ -243,5 +281,5 @@ def _lint_file(self):

def _lint_notebook(self):
notebook = Notebook.parse(self._path, self._source_code, self._file_language())
notebook_linter = NotebookLinter(self._ctx, notebook)
notebook_linter = NotebookLinter(self._ctx, self._path_lookup, notebook)
yield from notebook_linter.lint()
32 changes: 16 additions & 16 deletions tests/unit/source_code/notebooks/test_sources.py
@@ -10,15 +10,15 @@


@pytest.mark.parametrize("path, content", [("xyz.py", "a = 3"), ("xyz.sql", "select * from dual")])
def test_file_linter_lints_supported_language(path, content, migration_index):
linter = FileLinter(LinterContext(migration_index), Path(path), content)
def test_file_linter_lints_supported_language(path, content, migration_index, mock_path_lookup):
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, Path(path), content)
advices = list(linter.lint())
assert not advices


@pytest.mark.parametrize("path", ["xyz.scala", "xyz.r", "xyz.sh"])
def test_file_linter_lints_not_yet_supported_language(path, migration_index):
linter = FileLinter(LinterContext(migration_index), Path(path), "")
def test_file_linter_lints_not_yet_supported_language(path, migration_index, mock_path_lookup):
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, Path(path), "")
advices = list(linter.lint())
assert [advice.code for advice in advices] == ["unsupported-language"]

@@ -29,8 +29,8 @@ def source_code(self):
return self._source_code


def test_checks_encoding_of_pseudo_file(migration_index):
linter = FriendFileLinter(LinterContext(migration_index), Path("whatever"), "a=b")
def test_checks_encoding_of_pseudo_file(migration_index, mock_path_lookup):
linter = FriendFileLinter(LinterContext(migration_index), mock_path_lookup, Path("whatever"), "a=b")
assert linter.source_code() == "a=b"


@@ -44,10 +44,10 @@ def test_checks_encoding_of_pseudo_file(migration_index):
(codecs.BOM_UTF32_BE, "utf-32-be"),
],
)
def test_checks_encoding_of_file_with_bom(migration_index, bom, encoding, tmp_path):
def test_checks_encoding_of_file_with_bom(migration_index, bom, encoding, tmp_path, mock_path_lookup):
path = tmp_path / "file.py"
path.write_bytes(bom + "a = 12".encode(encoding))
linter = FriendFileLinter(LinterContext(migration_index), path)
linter = FriendFileLinter(LinterContext(migration_index), mock_path_lookup, path)
assert linter.source_code() == "a = 12"


@@ -70,16 +70,16 @@ def test_checks_encoding_of_file_with_bom(migration_index, bom, encoding, tmp_pa
".DS_Store", # on MacOS
],
)
def test_file_linter_lints_ignorable_language(path, migration_index):
linter = FileLinter(LinterContext(migration_index), Path(path), "")
def test_file_linter_lints_ignorable_language(path, migration_index, mock_path_lookup):
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, Path(path), "")
advices = list(linter.lint())
assert not advices


def test_file_linter_lints_non_ascii_encoded_file(migration_index):
def test_file_linter_lints_non_ascii_encoded_file(migration_index, mock_path_lookup):
preferred_encoding = locale.getpreferredencoding(False)
non_ascii_encoded_file = Path(__file__).parent.parent / "samples" / "nonascii.py"
linter = FileLinter(LinterContext(migration_index), non_ascii_encoded_file)
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, non_ascii_encoded_file)

advices = list(linter.lint())

@@ -88,11 +88,11 @@ def test_checks_encoding_of_file_with_bom(migration_index, bom, encoding, tmp_pa
assert advices[0].message == f"File without {preferred_encoding} encoding is not supported {non_ascii_encoded_file}"


def test_file_linter_lints_file_with_missing_file(migration_index):
def test_file_linter_lints_file_with_missing_file(migration_index, mock_path_lookup):
path = create_autospec(Path)
path.suffix = ".py"
path.read_text.side_effect = FileNotFoundError("No such file or directory: 'test.py'")
linter = FileLinter(LinterContext(migration_index), path)
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, path)

advices = list(linter.lint())

@@ -101,11 +101,11 @@ def test_file_linter_lints_file_with_missing_file(migration_index):
assert advices[0].message == f"File not found: {path}"


def test_file_linter_lints_file_with_missing_read_permission(migration_index):
def test_file_linter_lints_file_with_missing_read_permission(migration_index, mock_path_lookup):
path = create_autospec(Path)
path.suffix = ".py"
path.read_text.side_effect = PermissionError("Permission denied")
linter = FileLinter(LinterContext(migration_index), path)
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, path)

advices = list(linter.lint())

9 changes: 9 additions & 0 deletions tests/unit/source_code/samples/values_across_notebooks.py
@@ -0,0 +1,9 @@
# Databricks notebook source

# COMMAND ----------

# MAGIC %run "./values_across_notebooks_child.py"

# COMMAND ----------

spark.table(f"{a}")
3 changes: 3 additions & 0 deletions tests/unit/source_code/samples/values_across_notebooks_child.py
@@ -0,0 +1,3 @@
# Databricks notebook source

a = 12
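
A hedged end-to-end sketch tying the two samples together (the empty `MigrationIndex` and a `PathLookup` rooted at the samples directory are assumptions standing in for the `migration_index` and `mock_path_lookup` fixtures used by the unit tests):

```python
from pathlib import Path

from databricks.sdk.service.workspace import Language

from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex
from databricks.labs.ucx.source_code.notebooks.sources import NotebookLinter
from databricks.labs.ucx.source_code.path_lookup import PathLookup

samples = Path("tests/unit/source_code/samples")
lookup = PathLookup.from_sys_path(samples)  # assumption: resolves the child notebook next to the parent
source = (samples / "values_across_notebooks.py").read_text()

linter = NotebookLinter.from_source(MigrationIndex([]), lookup, source, Language.PYTHON)
for advice in linter.lint():
    print(advice)  # with cross-notebook inference, `a` resolves to 12 instead of 'not computed'
```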