-
Notifications
You must be signed in to change notification settings - Fork 87
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Detect dbutils.notebook.run #1284
Changes from 35 commits
e47b965
a0c1e16
a2808ce
4066042
12e075c
ee88135
c428e18
c7e0065
d0ce635
38e18cc
c3c160e
9fd6a11
8e36094
cece88d
3a39712
63b1d82
2aaedf8
946d998
da61c2b
b43dd5f
81fffd3
6cdcfd4
80c07a5
59d1ece
fa785fe
036d5fc
8796eec
ac13e63
c6aaa2e
94fc41c
cd58d7a
f0770af
12c8e30
33495a1
c005c46
ea181b4
1ddd452
fa6a8cf
967f956
e1fcb4f
2224c3f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,58 @@ | ||||||||||||||||
import ast | ||||||||||||||||
import logging | ||||||||||||||||
|
||||||||||||||||
logger = logging.getLogger(__name__) | ||||||||||||||||
|
||||||||||||||||
|
||||||||||||||||
class MatchingVisitor(ast.NodeVisitor): | ||||||||||||||||
|
||||||||||||||||
def __init__(self, node_type: type, match_nodes: list[tuple[str, type]]): | ||||||||||||||||
self.matched_nodes: list[ast.AST] = [] | ||||||||||||||||
self._node_type = node_type | ||||||||||||||||
self._match_nodes = match_nodes | ||||||||||||||||
|
||||||||||||||||
# pylint: disable=invalid-name | ||||||||||||||||
nfx marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||||||||||||||||
def visit_Call(self, node: ast.Call): | ||||||||||||||||
if self._node_type is not ast.Call: | ||||||||||||||||
return | ||||||||||||||||
try: | ||||||||||||||||
if self._matches(node.func, 0): | ||||||||||||||||
self.matched_nodes.append(node) | ||||||||||||||||
except NotImplementedError as e: | ||||||||||||||||
logger.warning(f"Missing implementation: {e.args[0]}") | ||||||||||||||||
|
||||||||||||||||
def _matches(self, node: ast.AST, depth: int): | ||||||||||||||||
if depth >= len(self._match_nodes): | ||||||||||||||||
return False | ||||||||||||||||
pair = self._match_nodes[depth] | ||||||||||||||||
if not isinstance(node, pair[1]): | ||||||||||||||||
return False | ||||||||||||||||
next_node: ast.AST | None = None | ||||||||||||||||
if isinstance(node, ast.Attribute): | ||||||||||||||||
if node.attr != pair[0]: | ||||||||||||||||
return False | ||||||||||||||||
next_node = node.value | ||||||||||||||||
elif isinstance(node, ast.Name): | ||||||||||||||||
if node.id != pair[0]: | ||||||||||||||||
return False | ||||||||||||||||
else: | ||||||||||||||||
raise NotImplementedError(str(type(node))) | ||||||||||||||||
if next_node is None: | ||||||||||||||||
# is this the last node to match ? | ||||||||||||||||
return len(self._match_nodes) - 1 == depth | ||||||||||||||||
return self._matches(next_node, depth + 1) | ||||||||||||||||
|
||||||||||||||||
|
||||||||||||||||
class ASTLinter: | ||||||||||||||||
|
||||||||||||||||
def __init__(self): | ||||||||||||||||
self._module: ast.Module | None = None | ||||||||||||||||
|
||||||||||||||||
def parse(self, code: str): | ||||||||||||||||
self._module = ast.parse(code) | ||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
so that we either fail initialising or have a valid state. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I respectfully disagree. A constructor should not perform processing, especially not when that processing may fail under uncontrolled conditions (source code received from outside) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Factory is just fine There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. actually the reason for this is that ASTLinter is a base class for PythonLinter, which also needs to follow Linter conventions... changing that now There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||||||||||||||||
|
||||||||||||||||
def locate(self, node_type: type, match_nodes: list[tuple[str, type]]) -> list[ast.AST]: | ||||||||||||||||
assert self._module is not None | ||||||||||||||||
visitor = MatchingVisitor(node_type, match_nodes) | ||||||||||||||||
visitor.visit(self._module) | ||||||||||||||||
return visitor.matched_nodes |
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
@@ -1,18 +1,30 @@ | ||||||||
from __future__ import annotations # for type hints | ||||||||
|
||||||||
import ast | ||||||||
import logging | ||||||||
from abc import ABC, abstractmethod | ||||||||
from ast import parse as parse_python | ||||||||
from collections.abc import Callable | ||||||||
from collections.abc import Callable, Iterable | ||||||||
from enum import Enum | ||||||||
|
||||||||
from sqlglot import ParseError as SQLParseError | ||||||||
from sqlglot import parse as parse_sql | ||||||||
from databricks.sdk.service.workspace import Language | ||||||||
|
||||||||
NOTEBOOK_HEADER = " Databricks notebook source" | ||||||||
CELL_SEPARATOR = " COMMAND ----------" | ||||||||
MAGIC_PREFIX = ' MAGIC' | ||||||||
LANGUAGE_PREFIX = ' %' | ||||||||
from databricks.labs.ucx.source_code.astlinter import ASTLinter | ||||||||
from databricks.labs.ucx.source_code.base import Linter, Advice, Advisory | ||||||||
|
||||||||
|
||||||||
logger = logging.getLogger(__name__) | ||||||||
# use a specific logger for sqlglot warnings so we can disable them selectively | ||||||||
sqlglot_logger = logging.getLogger(f"{__name__}.sqlglot") | ||||||||
|
||||||||
NOTEBOOK_HEADER = "Databricks notebook source" | ||||||||
CELL_SEPARATOR = "COMMAND ----------" | ||||||||
MAGIC_PREFIX = 'MAGIC' | ||||||||
LANGUAGE_PREFIX = '%' | ||||||||
LANGUAGE_PI = 'LANGUAGE' | ||||||||
COMMENT_PI = 'COMMENT' | ||||||||
|
||||||||
|
||||||||
class Cell(ABC): | ||||||||
|
@@ -22,6 +34,7 @@ | |||||||
|
||||||||
@property | ||||||||
def migrated_code(self): | ||||||||
# this property is for reading the migrated code, not for generating it | ||||||||
return self._original_code # for now since we're not doing any migration yet | ||||||||
|
||||||||
@property | ||||||||
|
@@ -38,6 +51,42 @@ | |||||||
raise NotImplementedError() | ||||||||
|
||||||||
|
||||||||
class PythonLinter(ASTLinter, Linter): | ||||||||
def lint(self, code: str) -> Iterable[Advice]: | ||||||||
self.parse(code) | ||||||||
nodes = self.locate(ast.Call, [("run", ast.Attribute), ("notebook", ast.Attribute), ("dbutils", ast.Name)]) | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what if the code is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree it's theoretically possible to do that but I can't think of any benefit for users to keep a local private copy of a public API ? To your point, we can't address this edge cases for now. Tbh, I can think of thousands of them, such as:
I have created ticket #1334 for that |
||||||||
return [ self._convert_dbutils_notebook_run_to_advice(node) for node in nodes ] | ||||||||
|
||||||||
@classmethod | ||||||||
def _convert_dbutils_notebook_run_to_advice(cls, node: ast.AST) -> Advisory: | ||||||||
assert isinstance(node, ast.Call) | ||||||||
path = cls.get_dbutils_notebook_run_path_arg(node) | ||||||||
if isinstance(path, ast.Constant): | ||||||||
return Advisory( | ||||||||
'notebook-auto-migrate', | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||||||||
"Call to 'dbutils.notebook.run' will be migrated automatically", | ||||||||
node.lineno, | ||||||||
node.col_offset, | ||||||||
node.end_lineno or 0, | ||||||||
node.end_col_offset or 0, | ||||||||
) | ||||||||
return Advisory( | ||||||||
'notebook-manual-migrate', | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||||||||
"Path for 'dbutils.notebook.run' is not a constant and requires adjusting the notebook path", | ||||||||
node.lineno, | ||||||||
node.col_offset, | ||||||||
node.end_lineno or 0, | ||||||||
node.end_col_offset or 0, | ||||||||
) | ||||||||
|
||||||||
@staticmethod | ||||||||
def get_dbutils_notebook_run_path_arg(node: ast.Call): | ||||||||
if len(node.args) > 0: | ||||||||
return node.args[0] | ||||||||
arg = next(kw for kw in node.keywords if kw.arg == "path") | ||||||||
return arg.value if arg is not None else None | ||||||||
|
||||||||
|
||||||||
class PythonCell(Cell): | ||||||||
|
||||||||
@property | ||||||||
|
@@ -46,15 +95,22 @@ | |||||||
|
||||||||
def is_runnable(self) -> bool: | ||||||||
try: | ||||||||
ast = parse_python(self._original_code) | ||||||||
return ast is not None | ||||||||
tree = parse_python(self._original_code) | ||||||||
return tree is not None | ||||||||
except SyntaxError: | ||||||||
return False | ||||||||
return True | ||||||||
|
||||||||
def build_dependency_graph(self, parent: DependencyGraph): | ||||||||
# TODO https://github.com/databrickslabs/ucx/issues/1200 | ||||||||
# TODO https://github.com/databrickslabs/ucx/issues/1202 | ||||||||
pass | ||||||||
linter = ASTLinter() | ||||||||
linter.parse(self._original_code) | ||||||||
nodes = linter.locate(ast.Call, [("run", ast.Attribute), ("notebook", ast.Attribute), ("dbutils", ast.Name)]) | ||||||||
for node in nodes: | ||||||||
assert isinstance(node, ast.Call) | ||||||||
path = PythonLinter.get_dbutils_notebook_run_path_arg(node) | ||||||||
if isinstance(path, ast.Constant): | ||||||||
parent.register_dependency(path.value.strip("'").strip('"')) | ||||||||
|
||||||||
|
||||||||
|
||||||||
class RCell(Cell): | ||||||||
|
@@ -94,7 +150,8 @@ | |||||||
statements = parse_sql(self._original_code) | ||||||||
return len(statements) > 0 | ||||||||
except SQLParseError: | ||||||||
return False | ||||||||
sqlglot_logger.warning(f"Failed to parse SQL using 'sqlglot': {self._original_code}") | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let's also log the sqlglot error, so that we can create issues over there. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||||||||
return True | ||||||||
|
||||||||
def build_dependency_graph(self, parent: DependencyGraph): | ||||||||
pass # not in scope | ||||||||
|
@@ -123,7 +180,7 @@ | |||||||
return True # TODO | ||||||||
|
||||||||
def build_dependency_graph(self, parent: DependencyGraph): | ||||||||
command = f'{LANGUAGE_PREFIX}{self.language.magic_name}'.strip() | ||||||||
command = f'{LANGUAGE_PREFIX}{self.language.magic_name}' | ||||||||
lines = self._original_code.split('\n') | ||||||||
for line in lines: | ||||||||
start = line.index(command) | ||||||||
|
@@ -136,19 +193,23 @@ | |||||||
|
||||||||
class CellLanguage(Enum): | ||||||||
# long magic_names must come first to avoid shorter ones being matched | ||||||||
PYTHON = Language.PYTHON, 'python', '#', PythonCell | ||||||||
SCALA = Language.SCALA, 'scala', '//', ScalaCell | ||||||||
SQL = Language.SQL, 'sql', '--', SQLCell | ||||||||
RUN = None, 'run', None, RunCell | ||||||||
MARKDOWN = None, 'md', None, MarkdownCell | ||||||||
R = Language.R, 'r', '#', RCell | ||||||||
PYTHON = Language.PYTHON, 'python', '#', True, PythonCell | ||||||||
SCALA = Language.SCALA, 'scala', '//', True, ScalaCell | ||||||||
SQL = Language.SQL, 'sql', '--', True, SQLCell | ||||||||
RUN = None, 'run', '', False, RunCell | ||||||||
# see https://spec.commonmark.org/0.31.2/#html-comment | ||||||||
MARKDOWN = None, 'md', "<!--->", False, MarkdownCell | ||||||||
R = Language.R, 'r', '#', True, RCell | ||||||||
|
||||||||
def __init__(self, *args): | ||||||||
super().__init__() | ||||||||
self._language = args[0] | ||||||||
self._magic_name = args[1] | ||||||||
self._comment_prefix = args[2] | ||||||||
self._new_cell = args[3] | ||||||||
# PI stands for Processing Instruction | ||||||||
# pylint: disable=invalid-name | ||||||||
self._requires_isolated_PI = args[3] | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
:) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed the warning, but I guess people looking into this code would be informed enough by the comment ? |
||||||||
self._new_cell = args[4] | ||||||||
|
||||||||
@property | ||||||||
def language(self) -> Language: | ||||||||
|
@@ -162,6 +223,10 @@ | |||||||
def comment_prefix(self) -> str: | ||||||||
return self._comment_prefix | ||||||||
|
||||||||
@property | ||||||||
def requires_isolated_pi(self) -> str: | ||||||||
return self._requires_isolated_PI | ||||||||
|
||||||||
@classmethod | ||||||||
def of_language(cls, language: Language) -> CellLanguage: | ||||||||
return next((cl for cl in CellLanguage if cl.language == language)) | ||||||||
|
@@ -171,7 +236,7 @@ | |||||||
return next((cl for cl in CellLanguage if magic_name.startswith(cl.magic_name)), None) | ||||||||
|
||||||||
def read_cell_language(self, lines: list[str]) -> CellLanguage | None: | ||||||||
magic_prefix = f'{self.comment_prefix}{MAGIC_PREFIX}' | ||||||||
magic_prefix = f'{self.comment_prefix} {MAGIC_PREFIX} ' | ||||||||
magic_language_prefix = f'{magic_prefix}{LANGUAGE_PREFIX}' | ||||||||
for line in lines: | ||||||||
# if we find a non-comment then we're done | ||||||||
|
@@ -191,26 +256,28 @@ | |||||||
|
||||||||
def extract_cells(self, source: str) -> list[Cell] | None: | ||||||||
lines = source.split('\n') | ||||||||
header = f"{self.comment_prefix}{NOTEBOOK_HEADER}" | ||||||||
header = f"{self.comment_prefix} {NOTEBOOK_HEADER}" | ||||||||
if not lines[0].startswith(header): | ||||||||
raise ValueError("Not a Databricks notebook source!") | ||||||||
|
||||||||
def make_cell(lines_: list[str]): | ||||||||
def make_cell(cell_lines: list[str]): | ||||||||
# trim leading blank lines | ||||||||
while len(lines_) > 0 and len(lines_[0]) == 0: | ||||||||
lines_.pop(0) | ||||||||
while len(cell_lines) > 0 and len(cell_lines[0]) == 0: | ||||||||
cell_lines.pop(0) | ||||||||
# trim trailing blank lines | ||||||||
while len(lines_) > 0 and len(lines_[-1]) == 0: | ||||||||
lines_.pop(-1) | ||||||||
cell_language = self.read_cell_language(lines_) | ||||||||
while len(cell_lines) > 0 and len(cell_lines[-1]) == 0: | ||||||||
cell_lines.pop(-1) | ||||||||
cell_language = self.read_cell_language(cell_lines) | ||||||||
if cell_language is None: | ||||||||
cell_language = self | ||||||||
cell_source = '\n'.join(lines_) | ||||||||
else: | ||||||||
self._make_runnable(cell_lines, cell_language) | ||||||||
cell_source = '\n'.join(cell_lines) | ||||||||
return cell_language.new_cell(cell_source) | ||||||||
|
||||||||
cells = [] | ||||||||
cell_lines: list[str] = [] | ||||||||
separator = f"{self.comment_prefix}{CELL_SEPARATOR}" | ||||||||
separator = f"{self.comment_prefix} {CELL_SEPARATOR}" | ||||||||
for i in range(1, len(lines)): | ||||||||
line = lines[i].strip() | ||||||||
if line.startswith(separator): | ||||||||
|
@@ -225,6 +292,41 @@ | |||||||
|
||||||||
return cells | ||||||||
|
||||||||
def _make_runnable(self, lines: list[str], cell_language: CellLanguage): | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
i think it's a better name for this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||||||||
prefix = f"{self.comment_prefix} {MAGIC_PREFIX} " | ||||||||
prefix_len = len(prefix) | ||||||||
# pylint: disable=too-many-nested-blocks | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
this one can be avoided There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||||||||
for i, line in enumerate(lines): | ||||||||
if line.startswith(prefix): | ||||||||
line = line[prefix_len:] | ||||||||
if cell_language.requires_isolated_pi: | ||||||||
if line.startswith(LANGUAGE_PREFIX): | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
line = f"{cell_language.comment_prefix} {LANGUAGE_PI}" | ||||||||
lines[i] = line | ||||||||
continue | ||||||||
if line.startswith(self.comment_prefix): | ||||||||
line = f"{cell_language.comment_prefix} {COMMENT_PI}{line}" | ||||||||
lines[i] = line | ||||||||
|
||||||||
def make_unrunnable(self, code: str, cell_language: CellLanguage) -> str: | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
might be a better name There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||||||||
language_pi_prefix = f"{cell_language.comment_prefix} {LANGUAGE_PI}" | ||||||||
comment_pi_prefix = f"{cell_language.comment_prefix} {COMMENT_PI}" | ||||||||
comment_pi_prefix_len = len(comment_pi_prefix) | ||||||||
lines = code.split('\n') | ||||||||
for i, line in enumerate(lines): | ||||||||
if line.startswith(language_pi_prefix): | ||||||||
line = f"{self.comment_prefix} {MAGIC_PREFIX} {LANGUAGE_PREFIX}{cell_language.magic_name}" | ||||||||
lines[i] = line | ||||||||
continue | ||||||||
if line.startswith(comment_pi_prefix): | ||||||||
lines[i] = line[comment_pi_prefix_len:] | ||||||||
continue | ||||||||
line = f"{self.comment_prefix} {MAGIC_PREFIX} {line}" | ||||||||
lines[i] = line | ||||||||
if code.endswith('./'): | ||||||||
lines.append('\n') | ||||||||
return "\n".join(lines) | ||||||||
|
||||||||
|
||||||||
class DependencyGraph: | ||||||||
|
||||||||
|
@@ -313,13 +415,16 @@ | |||||||
|
||||||||
def to_migrated_code(self): | ||||||||
default_language = CellLanguage.of_language(self._language) | ||||||||
header = f"{default_language.comment_prefix}{NOTEBOOK_HEADER}" | ||||||||
header = f"{default_language.comment_prefix} {NOTEBOOK_HEADER}" | ||||||||
sources = [header] | ||||||||
for i, cell in enumerate(self._cells): | ||||||||
sources.append(cell.migrated_code) | ||||||||
migrated_code = cell.migrated_code | ||||||||
if cell.language is not default_language: | ||||||||
migrated_code = default_language.make_unrunnable(migrated_code, cell.language) | ||||||||
sources.append(migrated_code) | ||||||||
if i < len(self._cells) - 1: | ||||||||
sources.append('') | ||||||||
sources.append(f'{default_language.comment_prefix}{CELL_SEPARATOR}') | ||||||||
sources.append(f'{default_language.comment_prefix} {CELL_SEPARATOR}') | ||||||||
sources.append('') | ||||||||
if self._ends_with_lf: | ||||||||
sources.append('') # following join will append lf | ||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Databricks notebook source | ||
# time horizon | ||
days = 300 | ||
|
||
# volatility | ||
sigma = 0.04 | ||
|
||
# drift (average growth rate) | ||
mu = 0.05 | ||
|
||
# initial starting price | ||
start_price = 10 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Databricks notebook source | ||
dbutils.notebook.run("./leaf3.py.txt") |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
import datetime | ||
# Updated List of Notebooks | ||
notebooks_list = [ | ||
'/Production/data_solutions/accounts/companyA_10011111/de/companyA_de_report', | ||
'/Production/data_solutions/accounts/companyB_10022222/de/companyB_de_report', | ||
'/Production/data_solutions/accounts/companyC_10033333/de/companyC_de_report', | ||
'/Production/data_solutions/accounts/companyD_10044444/de/companyD_de_report', | ||
] | ||
# Execution: | ||
for notebook in notebooks_list: | ||
try: | ||
start_time = datetime.datetime.now() | ||
print("Running the report of " + str(notebook).split('/')[len(str(notebook).split('/'))-1]) | ||
status = dbutils.notebook.run(notebook,100000) | ||
end_time = datetime.datetime.now() | ||
print("Finished, time taken: " + str(start_time-end_time)) | ||
except: | ||
print("The notebook {0} failed to run".format(notebook)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if something is public, let's expose this as a method - would be easier to refactor.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done