Infer values from child notebook in run cell #2075

Merged 14 commits on Jul 9, 2024
Changes from 13 commits
11 changes: 11 additions & 0 deletions src/databricks/labs/ucx/source_code/base.py
@@ -200,3 +200,14 @@ def lint(self, code: str) -> Iterable[Advice]:
yield from linter.lint_tree(tree)
except AstroidSyntaxError as e:
yield Failure('syntax-error', str(e), 0, 0, 0, 0)

def process_child_cell(self, code: str):
try:
tree = Tree.normalize_and_parse(code)
if self._tree is None:
self._tree = tree
else:
self._tree.append_statements(tree)
except AstroidSyntaxError as e:
# error already reported when linting enclosing notebook
logger.warning(f"Failed to parse Python cell: {code}", exc_info=e)
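The hook above is what lets the sequential Python linter accumulate code from a %run child before linting later parent cells. A minimal usage sketch, assuming the linter is obtained from a LinterContext as in notebooks/sources.py below; the empty migration index and the literal cell contents are illustrative assumptions, not part of this diff:

    from databricks.sdk.service.workspace import Language

    from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex
    from databricks.labs.ucx.source_code.linters.context import LinterContext

    # Hedged sketch: class and method names come from this PR; the data is made up.
    context = LinterContext(MigrationIndex([]))
    python_linter = context.linter(Language.PYTHON)            # PythonSequentialLinter
    python_linter.process_child_cell("a = 12")                 # cell taken from the %run child
    advices = list(python_linter.lint("spark.table(f'{a}')"))  # `a` is now inferable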
4 changes: 2 additions & 2 deletions src/databricks/labs/ucx/source_code/jobs.py
@@ -377,11 +377,11 @@ def _lint_task(self, task: jobs.Task, job: jobs.Job):
yield from self._lint_file(container, ctx)

def _lint_file(self, file: LocalFile, ctx: LinterContext):
linter = FileLinter(ctx, file.path)
linter = FileLinter(ctx, self._path_lookup, file.path)
for advice in linter.lint():
yield file.path, advice

def _lint_notebook(self, notebook: Notebook, ctx: LinterContext):
linter = NotebookLinter(ctx, notebook)
linter = NotebookLinter(ctx, self._path_lookup, notebook)
for advice in linter.lint():
yield notebook.path, advice
2 changes: 1 addition & 1 deletion src/databricks/labs/ucx/source_code/known.py
@@ -151,7 +151,7 @@ def _analyze_file(cls, known_distributions, library_root, dist_info, module_path):
module_ref = module_ref[: -len(suffix)]
logger.info(f"Processing module: {module_ref}")
ctx = LinterContext(empty_index)
linter = FileLinter(ctx, module_path)
linter = FileLinter(ctx, PathLookup.from_sys_path(module_path.parent), module_path)
known_problems = set()
for problem in linter.lint():
known_problems.add(KnownProblem(problem.code, problem.message))
2 changes: 1 addition & 1 deletion src/databricks/labs/ucx/source_code/linters/files.py
@@ -139,7 +139,7 @@ def _lint_one(self, path: Path) -> Iterable[LocatedAdvice]:
if path.is_dir():
return []
ctx = self._new_linter_context()
linter = FileLinter(ctx, path)
linter = FileLinter(ctx, self._path_lookup, path)
return [advice.for_path(path) for advice in linter.lint()]


37 changes: 21 additions & 16 deletions src/databricks/labs/ucx/source_code/notebooks/cells.py
@@ -168,22 +168,14 @@ def is_runnable(self) -> bool:
return True # TODO

def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]:
command = f'{LANGUAGE_PREFIX}{self.language.magic_name}'
lines = self._original_code.split('\n')
for idx, line in enumerate(lines):
start = line.index(command)
if start >= 0:
path = line[start + len(command) :]
path = path.strip().strip("'").strip('"')
if len(path) == 0:
continue
notebook_path = Path(path)
start_line = self._original_offset + idx
problems = parent.register_notebook(notebook_path)
return [
problem.replace(start_line=start_line, start_col=0, end_line=start_line, end_col=len(line))
for problem in problems
]
path, idx, line = self.read_notebook_path()
if path is not None:
start_line = self._original_offset + idx
problems = parent.register_notebook(path)
return [
problem.replace(start_line=start_line, start_col=0, end_line=start_line, end_col=len(line))
for problem in problems
]
start_line = self._original_offset
problem = DependencyProblem(
'invalid-run-cell',
@@ -195,6 +187,19 @@ def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]:
)
return [problem]

def read_notebook_path(self):
Collaborator commented:

Suggested change:

    def read_notebook_path(self):

→

    def maybe_notebook_path() -> Path | None:
        ... = self._read_notebook_path()

    def _read_notebook_path(self):

The return type interface exposes too much unnecessary private information; make this method private and wrap it with a public method that exposes only what we need. The call site only needs the path:

    def _load_source_from_run_cell(self, run_cell: RunCell):
        path, _, _ = run_cell.read_notebook_path()

(See the sketch after this file's diff.)

Contributor (author) replied: done.

command = f'{LANGUAGE_PREFIX}{self.language.magic_name}'
lines = self._original_code.split('\n')
for idx, line in enumerate(lines):
start = line.index(command)
if start >= 0:
path = line[start + len(command) :]
path = path.strip().strip("'").strip('"')
if len(path) == 0:
continue
return Path(path), idx, line
return None, 0, ""

def migrate_notebook_path(self):
pass

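Following up on the review thread above, a sketch of what the suggested public/private split could look like. It assumes the methods live on RunCell, with LANGUAGE_PREFIX and self._original_code available as in this file, and it is not necessarily the code that was merged:

    def maybe_notebook_path(self) -> Path | None:
        path, _, _ = self._read_notebook_path()
        return path

    def _read_notebook_path(self) -> tuple[Path | None, int, str]:
        command = f'{LANGUAGE_PREFIX}{self.language.magic_name}'
        lines = self._original_code.split('\n')
        for idx, line in enumerate(lines):
            start = line.find(command)  # find() returns -1 when the magic is absent
            if start >= 0:
                path = line[start + len(command):].strip().strip("'").strip('"')
                if len(path) == 0:
                    continue
                return Path(path), idx, line
        return None, 0, ""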
88 changes: 63 additions & 25 deletions src/databricks/labs/ucx/source_code/notebooks/sources.py
@@ -5,15 +5,24 @@
from collections.abc import Iterable
from functools import cached_property
from pathlib import Path
from typing import cast

from databricks.sdk.service.workspace import Language

from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex
from databricks.labs.ucx.source_code.base import Advice, Failure, Linter
from databricks.labs.ucx.source_code.base import Advice, Failure, Linter, PythonSequentialLinter

from databricks.labs.ucx.source_code.graph import SourceContainer, DependencyGraph, DependencyProblem
from databricks.labs.ucx.source_code.linters.context import LinterContext
from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage, Cell, CELL_SEPARATOR, NOTEBOOK_HEADER
from databricks.labs.ucx.source_code.notebooks.cells import (
CellLanguage,
Cell,
CELL_SEPARATOR,
NOTEBOOK_HEADER,
RunCell,
PythonCell,
)
from databricks.labs.ucx.source_code.path_lookup import PathLookup


class Notebook(SourceContainer):
@@ -84,22 +93,27 @@ class NotebookLinter:
to the code cells according to the language of the cell.
"""

def __init__(self, langs: LinterContext, notebook: Notebook):
self._languages: LinterContext = langs
def __init__(self, context: LinterContext, path_lookup: PathLookup, notebook: Notebook):
self._context: LinterContext = context
self._path_lookup = path_lookup
self._notebook: Notebook = notebook
# reuse Python linter, which accumulates statements for improved inference
self._python_linter = langs.linter(Language.PYTHON)
self._python_linter: PythonSequentialLinter = cast(PythonSequentialLinter, context.linter(Language.PYTHON))

@classmethod
def from_source(cls, index: MigrationIndex, source: str, default_language: Language) -> 'NotebookLinter':
def from_source(
cls, index: MigrationIndex, path_lookup: PathLookup, source: str, default_language: Language
) -> NotebookLinter:
ctx = LinterContext(index)
notebook = Notebook.parse(Path(""), source, default_language)
assert notebook is not None
return cls(ctx, notebook)
return cls(ctx, path_lookup, notebook)

def lint(self) -> Iterable[Advice]:
for cell in self._notebook.cells:
if not self._languages.is_supported(cell.language.language):
if isinstance(cell, RunCell):
self._load_source_from_run_cell(cell)
if not self._context.is_supported(cell.language.language):
continue
linter = self._linter(cell.language.language)
for advice in linter.lint(cell.original_code):
@@ -111,7 +125,29 @@ def lint(self) -> Iterable[Advice]:
def _linter(self, language: Language) -> Linter:
if language is Language.PYTHON:
return self._python_linter
return self._languages.linter(language)
return self._context.linter(language)

def _load_source_from_run_cell(self, run_cell: RunCell):
path, _, _ = run_cell.read_notebook_path()
if path is None:
return # malformed run cell already reported
resolved = self._path_lookup.resolve(path)
if resolved is None:
return # already reported during dependency building
# TODO deal with workspace notebooks
Collaborator commented: resolved already handles workspace notebooks, because of WorkspacePath.

Contributor (author) replied: noted.

language = SUPPORTED_EXTENSION_LANGUAGES.get(resolved.suffix.lower(), None)
# we only support Python for now
if language is not Language.PYTHON:
return
source = resolved.read_text(_guess_encoding(resolved))
notebook = Notebook.parse(path, source, language)
for cell in notebook.cells:
if isinstance(cell, RunCell):
self._load_source_from_run_cell(cell)
continue
if not isinstance(cell, PythonCell):
continue
self._python_linter.process_child_cell(cell.original_code)

@staticmethod
def name() -> str:
@@ -124,6 +160,20 @@ def name() -> str:
}


def _guess_encoding(path: Path):
# some files encode a unicode BOM (byte-order-mark), so let's use that if available
with path.open('rb') as _file:
raw = _file.read(4)
if raw.startswith(codecs.BOM_UTF32_LE) or raw.startswith(codecs.BOM_UTF32_BE):
return 'utf-32'
if raw.startswith(codecs.BOM_UTF16_LE) or raw.startswith(codecs.BOM_UTF16_BE):
return 'utf-16'
if raw.startswith(codecs.BOM_UTF8):
return 'utf-8-sig'
# no BOM, let's use default encoding
return locale.getpreferredencoding(False)


class FileLinter:
_NOT_YET_SUPPORTED_SUFFIXES = {
'.scala',
@@ -168,30 +218,18 @@ class FileLinter:
'zip-safe',
}

def __init__(self, ctx: LinterContext, path: Path, content: str | None = None):
def __init__(self, ctx: LinterContext, path_lookup: PathLookup, path: Path, content: str | None = None):
self._ctx: LinterContext = ctx
self._path_lookup = path_lookup
self._path: Path = path
self._content = content

@cached_property
def _source_code(self) -> str:
if self._content is None:
self._content = self._path.read_text(self._guess_encoding())
self._content = self._path.read_text(_guess_encoding(self._path))
return self._content

def _guess_encoding(self):
# some files encode a unicode BOM (byte-order-mark), so let's use that if available
with self._path.open('rb') as _file:
raw = _file.read(4)
if raw.startswith(codecs.BOM_UTF32_LE) or raw.startswith(codecs.BOM_UTF32_BE):
return 'utf-32'
if raw.startswith(codecs.BOM_UTF16_LE) or raw.startswith(codecs.BOM_UTF16_BE):
return 'utf-16'
if raw.startswith(codecs.BOM_UTF8):
return 'utf-8-sig'
# no BOM, let's use default encoding
return locale.getpreferredencoding(False)

def _file_language(self):
return SUPPORTED_EXTENSION_LANGUAGES.get(self._path.suffix.lower())

@@ -243,5 +281,5 @@ def _lint_file(self):

def _lint_notebook(self):
notebook = Notebook.parse(self._path, self._source_code, self._file_language())
notebook_linter = NotebookLinter(self._ctx, notebook)
notebook_linter = NotebookLinter(self._ctx, self._path_lookup, notebook)
yield from notebook_linter.lint()
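Because every FileLinter and NotebookLinter call site in this PR now threads a PathLookup through, construction looks roughly like the following. This is a hedged sketch patterned on known.py above; the empty migration index and the sample path are assumptions, not part of the diff:

    from pathlib import Path

    from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex
    from databricks.labs.ucx.source_code.linters.context import LinterContext
    from databricks.labs.ucx.source_code.notebooks.sources import FileLinter
    from databricks.labs.ucx.source_code.path_lookup import PathLookup

    notebook_path = Path("tests/unit/source_code/samples/values_across_notebooks.py")
    ctx = LinterContext(MigrationIndex([]))
    lookup = PathLookup.from_sys_path(notebook_path.parent)  # as in known.py
    linter = FileLinter(ctx, lookup, notebook_path)
    for advice in linter.lint():
        print(advice)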
32 changes: 16 additions & 16 deletions tests/unit/source_code/notebooks/test_sources.py
@@ -10,15 +10,15 @@


@pytest.mark.parametrize("path, content", [("xyz.py", "a = 3"), ("xyz.sql", "select * from dual")])
def test_file_linter_lints_supported_language(path, content, migration_index):
linter = FileLinter(LinterContext(migration_index), Path(path), content)
def test_file_linter_lints_supported_language(path, content, migration_index, mock_path_lookup):
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, Path(path), content)
advices = list(linter.lint())
assert not advices


@pytest.mark.parametrize("path", ["xyz.scala", "xyz.r", "xyz.sh"])
def test_file_linter_lints_not_yet_supported_language(path, migration_index):
linter = FileLinter(LinterContext(migration_index), Path(path), "")
def test_file_linter_lints_not_yet_supported_language(path, migration_index, mock_path_lookup):
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, Path(path), "")
advices = list(linter.lint())
assert [advice.code for advice in advices] == ["unsupported-language"]

@@ -29,8 +29,8 @@ def source_code(self):
return self._source_code


def test_checks_encoding_of_pseudo_file(migration_index):
linter = FriendFileLinter(LinterContext(migration_index), Path("whatever"), "a=b")
def test_checks_encoding_of_pseudo_file(migration_index, mock_path_lookup):
linter = FriendFileLinter(LinterContext(migration_index), mock_path_lookup, Path("whatever"), "a=b")
assert linter.source_code() == "a=b"


@@ -44,10 +44,10 @@ def test_checks_encoding_of_pseudo_file(migration_index):
(codecs.BOM_UTF32_BE, "utf-32-be"),
],
)
def test_checks_encoding_of_file_with_bom(migration_index, bom, encoding, tmp_path):
def test_checks_encoding_of_file_with_bom(migration_index, bom, encoding, tmp_path, mock_path_lookup):
path = tmp_path / "file.py"
path.write_bytes(bom + "a = 12".encode(encoding))
linter = FriendFileLinter(LinterContext(migration_index), path)
linter = FriendFileLinter(LinterContext(migration_index), mock_path_lookup, path)
assert linter.source_code() == "a = 12"


@@ -70,16 +70,16 @@ def test_checks_encoding_of_file_with_bom(migration_index, bom, encoding, tmp_path):
".DS_Store", # on MacOS
],
)
def test_file_linter_lints_ignorable_language(path, migration_index):
linter = FileLinter(LinterContext(migration_index), Path(path), "")
def test_file_linter_lints_ignorable_language(path, migration_index, mock_path_lookup):
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, Path(path), "")
advices = list(linter.lint())
assert not advices


def test_file_linter_lints_non_ascii_encoded_file(migration_index):
def test_file_linter_lints_non_ascii_encoded_file(migration_index, mock_path_lookup):
preferred_encoding = locale.getpreferredencoding(False)
non_ascii_encoded_file = Path(__file__).parent.parent / "samples" / "nonascii.py"
linter = FileLinter(LinterContext(migration_index), non_ascii_encoded_file)
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, non_ascii_encoded_file)

advices = list(linter.lint())

@@ -88,11 +88,11 @@ def test_file_linter_lints_non_ascii_encoded_file(migration_index):
assert advices[0].message == f"File without {preferred_encoding} encoding is not supported {non_ascii_encoded_file}"


def test_file_linter_lints_file_with_missing_file(migration_index):
def test_file_linter_lints_file_with_missing_file(migration_index, mock_path_lookup):
path = create_autospec(Path)
path.suffix = ".py"
path.read_text.side_effect = FileNotFoundError("No such file or directory: 'test.py'")
linter = FileLinter(LinterContext(migration_index), path)
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, path)

advices = list(linter.lint())

@@ -101,11 +101,11 @@ def test_file_linter_lints_file_with_missing_file(migration_index):
assert advices[0].message == f"File not found: {path}"


def test_file_linter_lints_file_with_missing_read_permission(migration_index):
def test_file_linter_lints_file_with_missing_read_permission(migration_index, mock_path_lookup):
path = create_autospec(Path)
path.suffix = ".py"
path.read_text.side_effect = PermissionError("Permission denied")
linter = FileLinter(LinterContext(migration_index), path)
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, path)

advices = list(linter.lint())

9 changes: 9 additions & 0 deletions tests/unit/source_code/samples/values_across_notebooks.py
@@ -0,0 +1,9 @@
# Databricks notebook source

# COMMAND ----------

# MAGIC %run "./values_across_notebooks_child.py"

# COMMAND ----------

spark.table(f"{a}")
3 changes: 3 additions & 0 deletions tests/unit/source_code/samples/values_across_notebooks_child.py
@@ -0,0 +1,3 @@
# Databricks notebook source

a = 12
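Taken together, the two samples exercise the new behaviour: the %run cell pulls in the child notebook, the child's Python cells are fed to the sequential linter via process_child_cell, and the parent cell is then linted as if the code had been concatenated. Roughly, the linter reasons over something like the following (illustrative only; spark is the usual notebook global):

    a = 12               # from values_across_notebooks_child.py
    spark.table(f"{a}")  # parent cell; the value of `a`, and hence the table name, is now inferable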