Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Infer values from child notebook loaded via dbutils.notebook.run() call #2083

Closed
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
7a3170f
append statements
ericvergnaud Jul 1, 2024
e56dda6
reuse python linters across cells
ericvergnaud Jul 1, 2024
fa9fb00
resolve cyclic import
ericvergnaud Jul 1, 2024
c8d05b0
fix merge issue
ericvergnaud Jul 2, 2024
f364b02
Merge branch 'main' into infer-values-across-notebook-cells
ericvergnaud Jul 3, 2024
c3cbc7b
successfully infers values from child notebook
ericvergnaud Jul 3, 2024
56e3530
successfully infers values from child notebook in run cell
ericvergnaud Jul 3, 2024
d9d3172
formatting
ericvergnaud Jul 3, 2024
1e5feff
formatting
ericvergnaud Jul 3, 2024
257fea6
PythonSequentialLinter is stateful so instantiate it when required
ericvergnaud Jul 4, 2024
dacf50a
Merge branch 'main' into infer-values-across-notebook-cells
ericvergnaud Jul 4, 2024
65072d6
Merge branch 'infer-values-across-notebook-cells' into infer-values-f…
ericvergnaud Jul 4, 2024
d244a5a
more tests
ericvergnaud Jul 3, 2024
1e9ed6a
peogress
ericvergnaud Jul 4, 2024
d6d9032
infer values from child notebooks loaded via dbutils.notebook.run
ericvergnaud Jul 4, 2024
d8089f0
fix crasher
ericvergnaud Jul 4, 2024
6c7f715
fix crasher
ericvergnaud Jul 4, 2024
e00d9fd
rename method
ericvergnaud Jul 8, 2024
14943fb
Merge branch 'main' into infer-values-from-child-notebook-in-dbutils-…
ericvergnaud Jul 8, 2024
e4fd7be
Merge branch 'main' into infer-values-from-child-notebook-in-dbutils-…
ericvergnaud Jul 9, 2024
fb358bf
Merge branch 'main' into infer-values-from-child-notebook-in-dbutils-…
ericvergnaud Jul 9, 2024
a6ac282
Merge branch 'main' into infer-values-from-child-notebook-in-dbutils-…
ericvergnaud Jul 10, 2024
5a8512d
Merge branch 'main' into infer-values-from-child-notebook-in-dbutils-…
ericvergnaud Jul 10, 2024
bea4042
address comments
ericvergnaud Jul 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 50 additions & 1 deletion src/databricks/labs/ucx/source_code/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@
from dataclasses import dataclass
from pathlib import Path

from astroid import NodeNG # type: ignore
from astroid import AstroidSyntaxError, Module, NodeNG # type: ignore

from databricks.sdk.service import compute

from databricks.labs.ucx.source_code.linters.python_ast import Tree

# Code mapping between LSP, PyLint, and our own diagnostics:
# | LSP | PyLint | Our |
Expand Down Expand Up @@ -130,6 +131,16 @@ class Linter:
def lint(self, code: str) -> Iterable[Advice]: ...


class PythonLinter(Linter):

def lint(self, code: str) -> Iterable[Advice]:
tree = Tree.normalize_and_parse(code)
yield from self.lint_tree(tree)

@abstractmethod
def lint_tree(self, tree: Tree) -> Iterable[Advice]: ...


class Fixer:
@abstractmethod
def name(self) -> str: ...
Expand Down Expand Up @@ -170,3 +181,41 @@ def __init__(self, linters: list[Linter]):
def lint(self, code: str) -> Iterable[Advice]:
for linter in self._linters:
yield from linter.lint(code)


class PythonSequentialLinter(Linter):

def __init__(self, linters: list[PythonLinter]):
self._linters = linters
self._tree: Tree | None = None

def lint(self, code: str) -> Iterable[Advice]:
try:
tree = self._parse_and_append(code)
yield from self.lint_tree(tree)
except AstroidSyntaxError as e:
yield Failure('syntax-error', str(e), 0, 0, 0, 0)

def lint_tree(self, tree: Tree) -> Iterable[Advice]:
for linter in self._linters:
yield from linter.lint_tree(tree)

def _parse_and_append(self, code: str) -> Tree:
tree = Tree.normalize_and_parse(code)
self.append_tree(tree)
return tree

def append_tree(self, tree):
nfx marked this conversation as resolved.
Show resolved Hide resolved
nfx marked this conversation as resolved.
Show resolved Hide resolved
if self._tree is None:
self._tree = Tree(Module("root"))
self._tree.append_tree(tree)

def append_nodes(self, nodes: list[NodeNG]):
if self._tree is None:
self._tree = Tree(Module("root"))
self._tree.append_nodes(nodes)

def append_globals(self, globs: dict):
if self._tree is None:
self._tree = Tree(Module("root"))
self._tree.append_globals(globs)
12 changes: 6 additions & 6 deletions src/databricks/labs/ucx/source_code/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,16 +372,16 @@ def _lint_task(self, task: jobs.Task, job: jobs.Job):
if not container:
continue
if isinstance(container, Notebook):
yield from self._lint_notebook(container, ctx)
yield from self._lint_notebook(container, ctx, session_state)
if isinstance(container, LocalFile):
yield from self._lint_file(container, ctx)
yield from self._lint_file(container, ctx, session_state)

def _lint_file(self, file: LocalFile, ctx: LinterContext):
linter = FileLinter(ctx, file.path)
def _lint_file(self, file: LocalFile, ctx: LinterContext, session_state: CurrentSessionState):
linter = FileLinter(ctx, self._path_lookup, session_state, file.path)
for advice in linter.lint():
yield file.path, advice

def _lint_notebook(self, notebook: Notebook, ctx: LinterContext):
linter = NotebookLinter(ctx, notebook)
def _lint_notebook(self, notebook: Notebook, ctx: LinterContext, session_state: CurrentSessionState):
linter = NotebookLinter(ctx, self._path_lookup, session_state, notebook)
for advice in linter.lint():
yield notebook.path, advice
61 changes: 58 additions & 3 deletions src/databricks/labs/ucx/source_code/known.json
Original file line number Diff line number Diff line change
Expand Up @@ -1265,7 +1265,7 @@
"code": "dbfs-usage",
"message": "Deprecated file system path: dbfs:/"
},
{
{
"code": "table-migrate",
"message": "The default format changed in Databricks Runtime 8.0, from Parquet to Delta"
}
Expand Down Expand Up @@ -2038,6 +2038,14 @@
"dockerpycreds.utils": [],
"dockerpycreds.version": []
},
"docstring-to-markdown": {
"docstring_to_markdown": [],
"docstring_to_markdown._utils": [],
"docstring_to_markdown.cpython": [],
"docstring_to_markdown.google": [],
"docstring_to_markdown.plain": [],
"docstring_to_markdown.rst": []
},
"entrypoints": {
"entrypoints": []
},
Expand Down Expand Up @@ -21248,6 +21256,53 @@
"python-dateutil": {
"dateutil": []
},
"python-lsp-jsonrpc": {
"pylsp_jsonrpc": [],
"pylsp_jsonrpc._version": [],
"pylsp_jsonrpc.dispatchers": [],
"pylsp_jsonrpc.endpoint": [],
"pylsp_jsonrpc.exceptions": [],
"pylsp_jsonrpc.streams": []
},
"python-lsp-server": {
"pylsp": [],
"pylsp._utils": [],
"pylsp._version": [],
"pylsp.config": [],
"pylsp.config.config": [],
"pylsp.config.flake8_conf": [],
"pylsp.config.pycodestyle_conf": [],
"pylsp.config.source": [],
"pylsp.hookspecs": [],
"pylsp.lsp": [],
"pylsp.plugins": [],
"pylsp.plugins._resolvers": [],
"pylsp.plugins._rope_task_handle": [],
"pylsp.plugins.autopep8_format": [],
"pylsp.plugins.definition": [],
"pylsp.plugins.flake8_lint": [],
"pylsp.plugins.folding": [],
"pylsp.plugins.highlight": [],
"pylsp.plugins.hover": [],
"pylsp.plugins.jedi_completion": [],
"pylsp.plugins.jedi_rename": [],
"pylsp.plugins.mccabe_lint": [],
"pylsp.plugins.preload_imports": [],
"pylsp.plugins.pycodestyle_lint": [],
"pylsp.plugins.pydocstyle_lint": [],
"pylsp.plugins.pyflakes_lint": [],
"pylsp.plugins.pylint_lint": [],
"pylsp.plugins.references": [],
"pylsp.plugins.rope_autoimport": [],
"pylsp.plugins.rope_completion": [],
"pylsp.plugins.signature": [],
"pylsp.plugins.symbols": [],
"pylsp.plugins.yapf_format": [],
"pylsp.python_lsp": [],
"pylsp.text_edit": [],
"pylsp.uris": [],
"pylsp.workspace": []
},
"pytz": {
"pytz": []
},
Expand Down Expand Up @@ -24445,6 +24500,7 @@
"tzdata": {
"tzdata": []
},
"ujson": {},
"umap": {
"umap": [],
"umap.get": []
Expand Down Expand Up @@ -25246,5 +25302,4 @@
"zipp.compat.py310": [],
"zipp.glob": []
}
}

}
6 changes: 4 additions & 2 deletions src/databricks/labs/ucx/source_code/known.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from databricks.labs.blueprint.entrypoint import get_logger

from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex
from databricks.labs.ucx.source_code.base import CurrentSessionState
from databricks.labs.ucx.source_code.graph import DependencyProblem
from databricks.labs.ucx.source_code.linters.context import LinterContext
from databricks.labs.ucx.source_code.notebooks.sources import FileLinter
Expand Down Expand Up @@ -150,8 +151,9 @@ def _analyze_file(cls, known_distributions, library_root, dist_info, module_path
if module_ref.endswith(suffix):
module_ref = module_ref[: -len(suffix)]
logger.info(f"Processing module: {module_ref}")
ctx = LinterContext(empty_index)
linter = FileLinter(ctx, module_path)
session_state = CurrentSessionState()
ctx = LinterContext(empty_index, session_state)
linter = FileLinter(ctx, PathLookup.from_sys_path(module_path.parent), session_state, module_path)
known_problems = set()
for problem in linter.lint():
known_problems.add(KnownProblem(problem.code, problem.message))
Expand Down
23 changes: 17 additions & 6 deletions src/databricks/labs/ucx/source_code/linters/context.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,16 @@
from typing import cast

from databricks.sdk.service.workspace import Language

from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex
from databricks.labs.ucx.source_code.base import Fixer, Linter, SequentialLinter, CurrentSessionState
from databricks.labs.ucx.source_code.base import (
Fixer,
Linter,
SequentialLinter,
CurrentSessionState,
PythonSequentialLinter,
PythonLinter,
)
from databricks.labs.ucx.source_code.linters.dbfs import FromDbfsFolder, DBFSUsageLinter
from databricks.labs.ucx.source_code.linters.imports import DbutilsLinter

Expand All @@ -16,7 +25,7 @@ def __init__(self, index: MigrationIndex | None = None, session_state: CurrentSe
self._index = index
session_state = CurrentSessionState() if not session_state else session_state

python_linters: list[Linter] = []
python_linters: list[PythonLinter] = []
python_fixers: list[Fixer] = []

sql_linters: list[Linter] = []
Expand All @@ -38,9 +47,9 @@ def __init__(self, index: MigrationIndex | None = None, session_state: CurrentSe
]
sql_linters.append(FromDbfsFolder())

self._linters = {
Language.PYTHON: SequentialLinter(python_linters),
Language.SQL: SequentialLinter(sql_linters),
self._linters: dict[Language, list[Linter] | list[PythonLinter]] = {
Language.PYTHON: python_linters,
Language.SQL: sql_linters,
}
self._fixers: dict[Language, list[Fixer]] = {
Language.PYTHON: python_fixers,
Expand All @@ -53,7 +62,9 @@ def is_supported(self, language: Language) -> bool:
def linter(self, language: Language) -> Linter:
if language not in self._linters:
raise ValueError(f"Unsupported language: {language}")
return self._linters[language]
if language is Language.PYTHON:
return PythonSequentialLinter(cast(list[PythonLinter], self._linters[language]))
return SequentialLinter(cast(list[Linter], self._linters[language]))

def fixer(self, language: Language, diagnostic_code: str) -> Fixer | None:
if language not in self._fixers:
Expand Down
12 changes: 6 additions & 6 deletions src/databricks/labs/ucx/source_code/linters/dbfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
from sqlglot import Expression, parse as parse_sql, ParseError as SqlParseError
from sqlglot.expressions import Table

from databricks.labs.ucx.source_code.base import Advice, Linter, Deprecation, CurrentSessionState, Failure
from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeVisitor, InferredValue
from databricks.labs.ucx.source_code.base import Advice, Linter, Deprecation, CurrentSessionState, Failure, PythonLinter
from databricks.labs.ucx.source_code.linters.python_ast import Tree, TreeVisitor
from databricks.labs.ucx.source_code.linters.python_infer import InferredValue

logger = logging.getLogger(__name__)

Expand All @@ -29,7 +30,7 @@ def visit_call(self, node: Call):

def _visit_arg(self, arg: NodeNG):
try:
for inferred in Tree(arg).infer_values(self._session_state):
for inferred in InferredValue.infer_from_node(arg, self._session_state):
if not inferred.is_inferred():
logger.debug(f"Could not infer value of {arg.as_string()}")
continue
Expand Down Expand Up @@ -64,7 +65,7 @@ def get_advices(self) -> Iterable[Advice]:
yield from self._advices


class DBFSUsageLinter(Linter):
class DBFSUsageLinter(PythonLinter):

def __init__(self, session_state: CurrentSessionState):
self._session_state = session_state
Expand All @@ -76,11 +77,10 @@ def name() -> str:
"""
return 'dbfs-usage'

def lint(self, code: str) -> Iterable[Advice]:
def lint_tree(self, tree: Tree) -> Iterable[Advice]:
"""
Lints the code looking for file system paths that are deprecated
"""
tree = Tree.normalize_and_parse(code)
visitor = DetectDbfsVisitor(self._session_state)
visitor.visit(tree.node)
yield from visitor.get_advices()
Expand Down
2 changes: 1 addition & 1 deletion src/databricks/labs/ucx/source_code/linters/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def _lint_one(self, path: Path) -> Iterable[LocatedAdvice]:
if path.is_dir():
return []
ctx = self._new_linter_context()
linter = FileLinter(ctx, path)
linter = FileLinter(ctx, self._path_lookup, self._session_state, path)
return [advice.for_path(path) for advice in linter.lint()]


Expand Down
31 changes: 17 additions & 14 deletions src/databricks/labs/ucx/source_code/linters/imports.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import abc
import logging
from collections.abc import Callable, Iterable
from pathlib import Path
from typing import TypeVar, cast

from astroid import ( # type: ignore
Expand All @@ -16,8 +17,10 @@
NodeNG,
)

from databricks.labs.ucx.source_code.base import Linter, Advice, Advisory, CurrentSessionState
from databricks.labs.ucx.source_code.linters.python_ast import Tree, NodeBase, TreeVisitor, InferredValue
from databricks.labs.ucx.source_code.base import Advice, Advisory, CurrentSessionState, PythonLinter
from databricks.labs.ucx.source_code.linters.python_ast import Tree, NodeBase, TreeVisitor
from databricks.labs.ucx.source_code.linters.python_infer import InferredValue
from databricks.labs.ucx.source_code.path_lookup import PathLookup

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -90,7 +93,7 @@ def get_notebook_paths(self, session_state: CurrentSessionState) -> tuple[bool,
"""
arg = DbutilsLinter.get_dbutils_notebook_run_path_arg(self.node)
try:
all_inferred = Tree(arg).infer_values(session_state)
all_inferred = InferredValue.infer_from_node(arg, session_state)
return self._get_notebook_paths(all_inferred)
except InferenceError:
logger.debug(f"Can't infer value(s) of {arg.as_string()}")
Expand All @@ -110,13 +113,12 @@ def _get_notebook_paths(cls, all_inferred: Iterable[InferredValue]) -> tuple[boo
return has_unresolved, paths


class DbutilsLinter(Linter):
class DbutilsLinter(PythonLinter):

def __init__(self, session_state: CurrentSessionState):
self._session_state = session_state

def lint(self, code: str) -> Iterable[Advice]:
tree = Tree.normalize_and_parse(code)
def lint_tree(self, tree: Tree) -> Iterable[Advice]:
nodes = self.list_dbutils_notebook_run_calls(tree)
for node in nodes:
yield from self._raise_advice_if_unresolved(node.node, self._session_state)
Expand Down Expand Up @@ -161,17 +163,18 @@ def __init__(self, node: NodeNG, path: str, is_append: bool):
self._path = path
self._is_append = is_append

@property
def node(self):
return self._node

@property
def path(self):
return self._path

@property
def is_append(self):
return self._is_append
def mutate_path_lookup(self, path_lookup: PathLookup):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def mutate_path_lookup(self, path_lookup: PathLookup):
def apply_to(self, path_lookup: PathLookup):

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

path = Path(self._path)
if not path.is_absolute():
path = path_lookup.cwd / path
if self._is_append:
path_lookup.append_path(path)
return
path_lookup.prepend_path(path)


class AbsolutePath(SysPathChange):
Expand Down Expand Up @@ -229,7 +232,7 @@ def visit_call(self, node: Call):
relative = True
changed = changed.args[0]
try:
for inferred in Tree(changed).infer_values(self._session_state):
for inferred in InferredValue.infer_from_node(changed, self._session_state):
self._visit_inferred(changed, inferred, relative, is_append)
except InferenceError:
self.sys_path_changes.append(UnresolvedPath(changed, changed.as_string(), is_append))
Expand Down
Loading
Loading