From f41c1a67ae7c48244444ae161b9f31430c4436fa Mon Sep 17 00:00:00 2001
From: Serge Smertin
Date: Fri, 3 May 2024 00:36:41 +0200
Subject: [PATCH] ... skeleton

---
 .../labs/ucx/contexts/application.py          | 10 ++
 src/databricks/labs/ucx/source_code/graph.py  | 11 +++
 src/databricks/labs/ucx/source_code/jobs.py   | 91 +++++++++++++++++--
 .../labs/ucx/source_code/notebooks/loaders.py |  2 +-
 .../labs/ucx/source_code/notebooks/sources.py |  7 +-
 5 files changed, 108 insertions(+), 13 deletions(-)

diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py
index 6bf15af62e..1509ee1bca 100644
--- a/src/databricks/labs/ucx/contexts/application.py
+++ b/src/databricks/labs/ucx/contexts/application.py
@@ -31,6 +31,7 @@
 from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler
 from databricks.labs.ucx.hive_metastore.verification import VerifyHasMetastore
 from databricks.labs.ucx.installer.workflows import DeployedWorkflows
+from databricks.labs.ucx.source_code.jobs import WorkflowLinter
 from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader, WorkspaceNotebookLoader
 from databricks.labs.ucx.source_code.files import FileLoader, LocalFileResolver, SysPathProvider
 from databricks.labs.ucx.source_code.graph import DependencyResolver, DependencyGraphBuilder
@@ -392,3 +393,12 @@ def dependency_resolver(self):
     @cached_property
     def dependency_graph_builder(self):
         return DependencyGraphBuilder(self.dependency_resolver)
+
+    @cached_property
+    def workflow_linter(self):
+        return WorkflowLinter(
+            self.workspace_client,
+            self.dependency_graph_builder,
+            self.tables_migrator.index(),
+            self.whitelist,
+        )
diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py
index 19d582c1a9..8f6e1a6ab2 100644
--- a/src/databricks/labs/ucx/source_code/graph.py
+++ b/src/databricks/labs/ucx/source_code/graph.py
@@ -6,6 +6,7 @@
 from pathlib import Path
 from collections.abc import Callable
 
+from databricks.labs.ucx.source_code.base import Advisory
 from databricks.labs.ucx.source_code.python_linter import ASTLinter, PythonLinter
 
 
@@ -311,6 +312,16 @@ def replace(
             end_col if end_col is not None else self.end_col,
         )
 
+    def as_advisory(self) -> 'Advisory':
+        return Advisory(
+            code=self.code,
+            message=self.message,
+            start_line=self.start_line,
+            start_col=self.start_col,
+            end_line=self.end_line,
+            end_col=self.end_col,
+        )
+
 
 @dataclass
 class MaybeGraph:
diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py
index 8f51f614cf..c15989f060 100644
--- a/src/databricks/labs/ucx/source_code/jobs.py
+++ b/src/databricks/labs/ucx/source_code/jobs.py
@@ -1,20 +1,93 @@
+import collections
+from pathlib import Path
+
 from databricks.sdk import WorkspaceClient
 
-from databricks.labs.ucx.source_code.graph import DependencyResolver
-from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader
+from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex
+from databricks.labs.ucx.mixins.wspath import WorkspacePath
+from databricks.labs.ucx.source_code.base import Advice
+from databricks.labs.ucx.source_code.graph import DependencyGraphBuilder
+from databricks.labs.ucx.source_code.languages import Languages
+from databricks.labs.ucx.source_code.notebooks.sources import Notebook, NotebookLinter
+from databricks.labs.ucx.source_code.whitelist import Whitelist
 
 
 class WorkflowLinter:
-    def __init__(self, ws: WorkspaceClient, dependency_resolver: DependencyResolver):
+    def __init__(
+        self,
+        ws: WorkspaceClient,
+        builder: DependencyGraphBuilder,
+        migration_index: MigrationIndex,
+        whitelist: Whitelist,
+    ):
         self._ws = ws
-        self._dependency_resolver = dependency_resolver
+        self._builder = builder
+        self._migration_index = migration_index
+        self._whitelist = whitelist
 
     def lint(self, job_id: int):
         job = self._ws.jobs.get(job_id)
+        problems: dict[Path, list[Advice]] = collections.defaultdict(list)
         for task in job.settings.tasks:
             if task.notebook_task:
-                self._lint_notebook(job_id)
-
-        notebook = NotebookLoader(self._ws).load(job_id)
-        graph = self._dependency_graph_builder.build(notebook)
-        return graph.lint()
+                path = WorkspacePath(self._ws, task.notebook_task.notebook_path)
+                maybe = self._builder.build_notebook_dependency_graph(path)
+                for problem in maybe.problems:
+                    problems[problem.source_path].append(problem.as_advisory())
+                for dependency in maybe.graph.all_dependencies:
+                    container = dependency.load()
+                    if not container:
+                        continue
+                    if isinstance(container, Notebook):
+                        languages = Languages(self._migration_index)
+                        linter = NotebookLinter(languages, container)
+                        for problem in linter.lint():
+                            problems[container.path].append(problem)
+            if task.spark_python_task:
+                # TODO: ... load from dbfs
+                continue
+            if task.spark_jar_task:
+                # TODO: ...
+                continue
+            if task.spark_submit_task:
+                # TODO: ... --py-files
+                continue
+            if task.python_wheel_task:
+                # TODO: ... load wheel
+                continue
+            if task.pipeline_task:
+                # TODO: load pipeline notebooks
+                continue
+            if task.run_job_task:
+                # TODO: load other job and lint
+                continue
+            if task.existing_cluster_id:
+                # TODO: load pypi and dbfs libraries
+                continue
+            if task.libraries:
+                for library in task.libraries:
+                    # TODO: load library
+                    continue
+                continue
+        for job_cluster in job.settings.job_clusters:
+            for library in job_cluster.libraries:
+                if library.pypi:
+                    # TODO: whitelist
+                    continue
+                if library.dbfs:
+                    # TODO: load from DBFS
+                    continue
+                if library.jar:
+                    # TODO: warning
+                    continue
+                if library.egg:
+                    # TODO: load and lint
+                    continue
+                if library.whl:
+                    # TODO: load and lint
+                    continue
+                if library.requirements:
+                    # TODO: load and lint
+                    continue
+                # TODO: ..
+                continue
diff --git a/src/databricks/labs/ucx/source_code/notebooks/loaders.py b/src/databricks/labs/ucx/source_code/notebooks/loaders.py
index 2e12d8b8c2..2db18ac6f3 100644
--- a/src/databricks/labs/ucx/source_code/notebooks/loaders.py
+++ b/src/databricks/labs/ucx/source_code/notebooks/loaders.py
@@ -56,7 +56,7 @@ def _load_notebook(self, object_info: ObjectInfo) -> SourceContainer:
         assert object_info.path is not None
         assert object_info.language is not None
         source = self._load_source(object_info)
-        return Notebook.parse(object_info.path, source, object_info.language)
+        return Notebook.parse(Path(object_info.path), source, object_info.language)
 
     def _load_source(self, object_info: ObjectInfo) -> str:
         assert object_info.path is not None
diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py
index 58f282df04..85664aeba5 100644
--- a/src/databricks/labs/ucx/source_code/notebooks/sources.py
+++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from collections.abc import Iterable
+from pathlib import Path
 
 from databricks.sdk.service.workspace import Language
 
@@ -15,14 +16,14 @@
 class Notebook(SourceContainer):
 
     @staticmethod
-    def parse(path: str, source: str, default_language: Language) -> Notebook:
+    def parse(path: Path, source: str, default_language: Language) -> Notebook:
         default_cell_language = CellLanguage.of_language(default_language)
         cells = default_cell_language.extract_cells(source)
         if cells is None:
             raise ValueError(f"Could not parse Notebook: {path}")
         return Notebook(path, source, default_language, cells, source.endswith('\n'))
 
-    def __init__(self, path: str, source: str, language: Language, cells: list[Cell], ends_with_lf):
+    def __init__(self, path: Path, source: str, language: Language, cells: list[Cell], ends_with_lf):
         self._path = path
         self._source = source
         self._language = language
@@ -30,7 +31,7 @@ def __init__(self, path: str, source: str, language: Language, cells: list[Cell]
         self._ends_with_lf = ends_with_lf
 
     @property
-    def path(self) -> str:
+    def path(self) -> Path:
         return self._path
 
     @property
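
Not part of the patch itself: a minimal usage sketch of the skeleton above. It assumes
the collaborators (a DependencyGraphBuilder, a MigrationIndex and a Whitelist) are wired
up elsewhere, as the new workflow_linter cached property in contexts/application.py does;
the lint_job helper below is hypothetical and only illustrates how WorkflowLinter.lint is
meant to be driven.

    from databricks.sdk import WorkspaceClient

    from databricks.labs.ucx.source_code.jobs import WorkflowLinter


    def lint_job(job_id: int, builder, migration_index, whitelist) -> None:
        # WorkspaceClient() picks up authentication from the environment / config profile.
        ws = WorkspaceClient()
        linter = WorkflowLinter(ws, builder, migration_index, whitelist)
        # In this skeleton, lint() walks only notebook tasks: it builds a dependency graph,
        # converts DependencyProblem entries via as_advisory(), runs NotebookLinter on each
        # loaded notebook, and collects everything into a local dict[Path, list[Advice]]
        # that it does not yet return; the other task and library types are still TODO.
        linter.lint(job_id)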