databrickslabs · ericvergnaud · Apr 4, 2024 · Apr 4, 2024 · Apr 4, 2024 · Apr 4, 2024
diff --git a/src/databricks/labs/ucx/source_code/astlinter.py b/src/databricks/labs/ucx/source_code/astlinter.py
@@ -0,0 +1,58 @@
+import ast
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class MatchingVisitor(ast.NodeVisitor):
+
+    def __init__(self, node_type: type, match_nodes: list[tuple[str, type]]):
+        self.matched_nodes: list[ast.AST] = []
+        self._node_type = node_type
+        self._match_nodes = match_nodes
+
+    # pylint: disable=invalid-name
-    # pylint: disable=invalid-name
+    # visit_Call is the invalid naming convention, but it is required for NodeVisitor
+    # pylint: disable=invalid-name
-    # pylint: disable=invalid-name
+    # visit_Call is the invalid naming convention, but it is required for NodeVisitor
+    # pylint: disable=invalid-name
+    def visit_Call(self, node: ast.Call):
+        if self._node_type is not ast.Call:
+            return
+        try:
+            if self._matches(node.func, 0):
+                self.matched_nodes.append(node)
+        except NotImplementedError as e:
+            logger.warning(f"Missing implementation: {e.args[0]}")
+
+    def _matches(self, node: ast.AST, depth: int):
+        if depth >= len(self._match_nodes):
+            return False
+        pair = self._match_nodes[depth]
+        if not isinstance(node, pair[1]):
+            return False
+        next_node: ast.AST | None = None
+        if isinstance(node, ast.Attribute):
+            if node.attr != pair[0]:
+                return False
+            next_node = node.value
+        elif isinstance(node, ast.Name):
+            if node.id != pair[0]:
+                return False
+        else:
+            raise NotImplementedError(str(type(node)))
+        if next_node is None:
+            # is this the last node to match ?
+            return len(self._match_nodes) - 1 == depth
+        return self._matches(next_node, depth + 1)
+
+
+class ASTLinter:
+
+    def __init__(self):
+        self._module: ast.Module | None = None
+
+    def parse(self, code: str):
+        self._module = ast.parse(code)
-    def __init__(self):
-        self._module: ast.Module | None = None
-
-    def parse(self, code: str):
-        self._module = ast.parse(code)
+    def __init__(self, code: str):
+        self._module = ast.parse(code)
-    def __init__(self):
-        self._module: ast.Module | None = None
-
-    def parse(self, code: str):
-        self._module = ast.parse(code)
+    def __init__(self, code: str):
+        self._module = ast.parse(code)
+
+    def locate(self, node_type: type, match_nodes: list[tuple[str, type]]) -> list[ast.AST]:
+        assert self._module is not None
+        visitor = MatchingVisitor(node_type, match_nodes)
+        visitor.visit(self._module)
+        return visitor.matched_nodes
diff --git a/src/databricks/labs/ucx/source_code/notebook.py b/src/databricks/labs/ucx/source_code/notebook.py
@@ -1,18 +1,30 @@
 from __future__ import annotations  # for type hints
 
+import ast
+import logging
 from abc import ABC, abstractmethod
 from ast import parse as parse_python
-from collections.abc import Callable
+from collections.abc import Callable, Iterable
 from enum import Enum
 
 from sqlglot import ParseError as SQLParseError
 from sqlglot import parse as parse_sql
 from databricks.sdk.service.workspace import Language
 
-NOTEBOOK_HEADER = " Databricks notebook source"
-CELL_SEPARATOR = " COMMAND ----------"
-MAGIC_PREFIX = ' MAGIC'
-LANGUAGE_PREFIX = ' %'
+from databricks.labs.ucx.source_code.astlinter import ASTLinter
+from databricks.labs.ucx.source_code.base import Linter, Advice, Advisory
+
+
+logger = logging.getLogger(__name__)
+# use a specific logger for sqlglot warnings so we can disable them selectively
+sqlglot_logger = logging.getLogger(f"{__name__}.sqlglot")
+
+NOTEBOOK_HEADER = "Databricks notebook source"
+CELL_SEPARATOR = "COMMAND ----------"
+MAGIC_PREFIX = 'MAGIC'
+LANGUAGE_PREFIX = '%'
+LANGUAGE_PI = 'LANGUAGE'
+COMMENT_PI = 'COMMENT'
 
 
 class Cell(ABC):
@@ -22,6 +34,7 @@
 
     @property
     def migrated_code(self):
+        # this property is for reading the migrated code, not for generating it
         return self._original_code  # for now since we're not doing any migration yet
 
     @property
@@ -38,6 +51,42 @@
         raise NotImplementedError()
 
 
+class PythonLinter(ASTLinter, Linter):
+    def lint(self, code: str) -> Iterable[Advice]:
+        self.parse(code)
+        nodes = self.locate(ast.Call, [("run", ast.Attribute), ("notebook", ast.Attribute), ("dbutils", ast.Name)])
+        return [ self._convert_dbutils_notebook_run_to_advice(node) for node in nodes ]
+
+    @classmethod
+    def _convert_dbutils_notebook_run_to_advice(cls, node: ast.AST) -> Advisory:
+        assert isinstance(node, ast.Call)
+        path = cls.get_dbutils_notebook_run_path_arg(node)
+        if isinstance(path, ast.Constant):
+            return Advisory(
+                'notebook-auto-migrate',
-                'notebook-auto-migrate',
+                'dbutils-notebook-run-literal',
-                'notebook-auto-migrate',
+                'dbutils-notebook-run-literal',
+                "Call to 'dbutils.notebook.run' will be migrated automatically",
+                node.lineno,
+                node.col_offset,
+                node.end_lineno or 0,
+                node.end_col_offset or 0,
+            )
+        return Advisory(
+            'notebook-manual-migrate',
-            'notebook-manual-migrate',
+            'dbutils-notebook-run-dynamic',
-            'notebook-manual-migrate',
+            'dbutils-notebook-run-dynamic',
+            "Path for 'dbutils.notebook.run' is not a constant and requires adjusting the notebook path",
+            node.lineno,
+            node.col_offset,
+            node.end_lineno or 0,
+            node.end_col_offset or 0,
+        )
+
+    @staticmethod
+    def get_dbutils_notebook_run_path_arg(node: ast.Call):
+        if len(node.args) > 0:
+            return node.args[0]
+        arg = next(kw for kw in node.keywords if kw.arg == "path")
+        return arg.value if arg is not None else None
+
+
 class PythonCell(Cell):
 
     @property
@@ -46,15 +95,22 @@
 
     def is_runnable(self) -> bool:
         try:
-            ast = parse_python(self._original_code)
-            return ast is not None
+            tree = parse_python(self._original_code)
+            return tree is not None
         except SyntaxError:
-            return False
+            return True
 
     def build_dependency_graph(self, parent: DependencyGraph):
-        # TODO https://github.com/databrickslabs/ucx/issues/1200
         # TODO https://github.com/databrickslabs/ucx/issues/1202
-        pass
+        linter = ASTLinter()
+        linter.parse(self._original_code)
+        nodes = linter.locate(ast.Call, [("run", ast.Attribute), ("notebook", ast.Attribute), ("dbutils", ast.Name)])
+        for node in nodes:
+            assert isinstance(node, ast.Call)
+            path = PythonLinter.get_dbutils_notebook_run_path_arg(node)
+            if isinstance(path, ast.Constant):
+                parent.register_dependency(path.value.strip("'").strip('"'))
+
 
 
 class RCell(Cell):
@@ -94,7 +150,8 @@
             statements = parse_sql(self._original_code)
             return len(statements) > 0
         except SQLParseError:
-            return False
+            sqlglot_logger.warning(f"Failed to parse SQL using 'sqlglot': {self._original_code}")
+            return True
 
     def build_dependency_graph(self, parent: DependencyGraph):
         pass  # not in scope
@@ -123,7 +180,7 @@
         return True  # TODO
 
     def build_dependency_graph(self, parent: DependencyGraph):
-        command = f'{LANGUAGE_PREFIX}{self.language.magic_name}'.strip()
+        command = f'{LANGUAGE_PREFIX}{self.language.magic_name}'
         lines = self._original_code.split('\n')
         for line in lines:
             start = line.index(command)
@@ -136,19 +193,23 @@
 
 class CellLanguage(Enum):
     # long magic_names must come first to avoid shorter ones being matched
-    PYTHON = Language.PYTHON, 'python', '#', PythonCell
-    SCALA = Language.SCALA, 'scala', '//', ScalaCell
-    SQL = Language.SQL, 'sql', '--', SQLCell
-    RUN = None, 'run', None, RunCell
-    MARKDOWN = None, 'md', None, MarkdownCell
-    R = Language.R, 'r', '#', RCell
+    PYTHON = Language.PYTHON, 'python', '#', True, PythonCell
+    SCALA = Language.SCALA, 'scala', '//', True, ScalaCell
+    SQL = Language.SQL, 'sql', '--', True, SQLCell
+    RUN = None, 'run', '', False, RunCell
+    # see https://spec.commonmark.org/0.31.2/#html-comment
+    MARKDOWN = None, 'md', "<!--->", False, MarkdownCell
+    R = Language.R, 'r', '#', True, RCell
 
     def __init__(self, *args):
         super().__init__()
         self._language = args[0]
         self._magic_name = args[1]
         self._comment_prefix = args[2]
-        self._new_cell = args[3]
+        # PI stands for Processing Instruction
+        # pylint: disable=invalid-name
+        self._requires_isolated_PI = args[3]
-        self._requires_isolated_PI = args[3]
+        self._requires_isolated_processing_instruction = args[3]
-        self._requires_isolated_PI = args[3]
+        self._requires_isolated_processing_instruction = args[3]
+        self._new_cell = args[4]
 
     @property
     def language(self) -> Language:
@@ -162,6 +223,10 @@
     def comment_prefix(self) -> str:
         return self._comment_prefix
 
+    @property
+    def requires_isolated_pi(self) -> str:
+        return self._requires_isolated_PI
+
     @classmethod
     def of_language(cls, language: Language) -> CellLanguage:
         return next((cl for cl in CellLanguage if cl.language == language))
@@ -171,7 +236,7 @@
         return next((cl for cl in CellLanguage if magic_name.startswith(cl.magic_name)), None)
 
     def read_cell_language(self, lines: list[str]) -> CellLanguage | None:
-        magic_prefix = f'{self.comment_prefix}{MAGIC_PREFIX}'
+        magic_prefix = f'{self.comment_prefix} {MAGIC_PREFIX} '
         magic_language_prefix = f'{magic_prefix}{LANGUAGE_PREFIX}'
         for line in lines:
             # if we find a non-comment then we're done
@@ -191,26 +256,28 @@
 
     def extract_cells(self, source: str) -> list[Cell] | None:
         lines = source.split('\n')
-        header = f"{self.comment_prefix}{NOTEBOOK_HEADER}"
+        header = f"{self.comment_prefix} {NOTEBOOK_HEADER}"
         if not lines[0].startswith(header):
             raise ValueError("Not a Databricks notebook source!")
 
-        def make_cell(lines_: list[str]):
+        def make_cell(cell_lines: list[str]):
             # trim leading blank lines
-            while len(lines_) > 0 and len(lines_[0]) == 0:
-                lines_.pop(0)
+            while len(cell_lines) > 0 and len(cell_lines[0]) == 0:
+                cell_lines.pop(0)
             # trim trailing blank lines
-            while len(lines_) > 0 and len(lines_[-1]) == 0:
-                lines_.pop(-1)
-            cell_language = self.read_cell_language(lines_)
+            while len(cell_lines) > 0 and len(cell_lines[-1]) == 0:
+                cell_lines.pop(-1)
+            cell_language = self.read_cell_language(cell_lines)
             if cell_language is None:
                 cell_language = self
-            cell_source = '\n'.join(lines_)
+            else:
+                self._make_runnable(cell_lines, cell_language)
+            cell_source = '\n'.join(cell_lines)
             return cell_language.new_cell(cell_source)
 
         cells = []
         cell_lines: list[str] = []
-        separator = f"{self.comment_prefix}{CELL_SEPARATOR}"
+        separator = f"{self.comment_prefix} {CELL_SEPARATOR}"
         for i in range(1, len(lines)):
             line = lines[i].strip()
             if line.startswith(separator):
@@ -225,6 +292,41 @@
 
         return cells
 
+    def _make_runnable(self, lines: list[str], cell_language: CellLanguage):
-    def _make_runnable(self, lines: list[str], cell_language: CellLanguage):
+    def _unwrap_magic(self, lines: list[str], cell_language: CellLanguage):
-    def _make_runnable(self, lines: list[str], cell_language: CellLanguage):
+    def _unwrap_magic(self, lines: list[str], cell_language: CellLanguage):
+        prefix = f"{self.comment_prefix} {MAGIC_PREFIX} "
+        prefix_len = len(prefix)
+        # pylint: disable=too-many-nested-blocks
-        # pylint: disable=too-many-nested-blocks
-        # pylint: disable=too-many-nested-blocks
+        for i, line in enumerate(lines):
+            if line.startswith(prefix):
+                line = line[prefix_len:]
+                if cell_language.requires_isolated_pi:
+                    if line.startswith(LANGUAGE_PREFIX):
-                if cell_language.requires_isolated_pi:
-                    if line.startswith(LANGUAGE_PREFIX):
+                if cell_language.requires_isolated_pi and line.startswith(LANGUAGE_PREFIX):
-                if cell_language.requires_isolated_pi:
-                    if line.startswith(LANGUAGE_PREFIX):
+                if cell_language.requires_isolated_pi and line.startswith(LANGUAGE_PREFIX):
+                        line = f"{cell_language.comment_prefix} {LANGUAGE_PI}"
+                lines[i] = line
+                continue
+            if line.startswith(self.comment_prefix):
+                line = f"{cell_language.comment_prefix} {COMMENT_PI}{line}"
+                lines[i] = line
+
+    def make_unrunnable(self, code: str, cell_language: CellLanguage) -> str:
-    def make_unrunnable(self, code: str, cell_language: CellLanguage) -> str:
+    def as_magic(self, code: str, cell_language: CellLanguage) -> str:
-    def make_unrunnable(self, code: str, cell_language: CellLanguage) -> str:
+    def as_magic(self, code: str, cell_language: CellLanguage) -> str:
+        language_pi_prefix = f"{cell_language.comment_prefix} {LANGUAGE_PI}"
+        comment_pi_prefix = f"{cell_language.comment_prefix} {COMMENT_PI}"
+        comment_pi_prefix_len = len(comment_pi_prefix)
+        lines = code.split('\n')
+        for i, line in enumerate(lines):
+            if line.startswith(language_pi_prefix):
+                line = f"{self.comment_prefix} {MAGIC_PREFIX} {LANGUAGE_PREFIX}{cell_language.magic_name}"
+                lines[i] = line
+                continue
+            if line.startswith(comment_pi_prefix):
+                lines[i] = line[comment_pi_prefix_len:]
+                continue
+            line = f"{self.comment_prefix} {MAGIC_PREFIX} {line}"
+            lines[i] = line
+        if code.endswith('./'):
+            lines.append('\n')
+        return "\n".join(lines)
+
 
 class DependencyGraph:
 
@@ -313,13 +415,16 @@
 
     def to_migrated_code(self):
         default_language = CellLanguage.of_language(self._language)
-        header = f"{default_language.comment_prefix}{NOTEBOOK_HEADER}"
+        header = f"{default_language.comment_prefix} {NOTEBOOK_HEADER}"
         sources = [header]
         for i, cell in enumerate(self._cells):
-            sources.append(cell.migrated_code)
+            migrated_code = cell.migrated_code
+            if cell.language is not default_language:
+                migrated_code = default_language.make_unrunnable(migrated_code, cell.language)
+            sources.append(migrated_code)
             if i < len(self._cells) - 1:
                 sources.append('')
-                sources.append(f'{default_language.comment_prefix}{CELL_SEPARATOR}')
+                sources.append(f'{default_language.comment_prefix} {CELL_SEPARATOR}')
                 sources.append('')
         if self._ends_with_lf:
             sources.append('')  # following join will append lf

diff --git a/tests/unit/source_code/notebooks/leaf3.py.txt b/tests/unit/source_code/notebooks/leaf3.py.txt
@@ -0,0 +1,12 @@
+# Databricks notebook source
+# time horizon
+days = 300
+
+# volatility
+sigma = 0.04
+
+# drift (average growth rate)
+mu = 0.05
+
+# initial starting price
+start_price = 10
diff --git a/tests/unit/source_code/notebooks/root4.py.txt b/tests/unit/source_code/notebooks/root4.py.txt
@@ -0,0 +1,2 @@
+# Databricks notebook source
+dbutils.notebook.run("./leaf3.py.txt")
diff --git a/tests/unit/source_code/notebooks/run_notebooks.py.txt b/tests/unit/source_code/notebooks/run_notebooks.py.txt
@@ -0,0 +1,18 @@
+import datetime
+# Updated List of Notebooks
+notebooks_list = [
+                  '/Production/data_solutions/accounts/companyA_10011111/de/companyA_de_report',
+                  '/Production/data_solutions/accounts/companyB_10022222/de/companyB_de_report',
+                  '/Production/data_solutions/accounts/companyC_10033333/de/companyC_de_report',
+                  '/Production/data_solutions/accounts/companyD_10044444/de/companyD_de_report',  
+                  ]
+# Execution:
+for notebook in notebooks_list:
+  try:
+    start_time = datetime.datetime.now()
+    print("Running the report of " + str(notebook).split('/')[len(str(notebook).split('/'))-1])
+    status = dbutils.notebook.run(notebook,100000)
+    end_time = datetime.datetime.now()
+    print("Finished, time taken: " + str(start_time-end_time))
+  except:
+    print("The notebook {0} failed to run".format(notebook))
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# Databricks notebook source
		dbutils.notebook.run("./leaf3.py.txt")