Skip to content

Commit

Permalink
... skeleton
Browse files Browse the repository at this point in the history
  • Loading branch information
nfx committed May 2, 2024
1 parent 19a1bd4 commit f41c1a6
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 13 deletions.
10 changes: 10 additions & 0 deletions src/databricks/labs/ucx/contexts/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler
from databricks.labs.ucx.hive_metastore.verification import VerifyHasMetastore
from databricks.labs.ucx.installer.workflows import DeployedWorkflows
from databricks.labs.ucx.source_code.jobs import WorkflowLinter
from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader, WorkspaceNotebookLoader
from databricks.labs.ucx.source_code.files import FileLoader, LocalFileResolver, SysPathProvider
from databricks.labs.ucx.source_code.graph import DependencyResolver, DependencyGraphBuilder
Expand Down Expand Up @@ -392,3 +393,12 @@ def dependency_resolver(self):
@cached_property
def dependency_graph_builder(self):
return DependencyGraphBuilder(self.dependency_resolver)

@cached_property
def workflow_linter(self):
return WorkflowLinter(
self.workspace_client,
self.dependency_graph_builder,
self.tables_migrator.index(),
self.whitelist,
)
11 changes: 11 additions & 0 deletions src/databricks/labs/ucx/source_code/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from pathlib import Path
from collections.abc import Callable

from databricks.labs.ucx.source_code.base import Advisory
from databricks.labs.ucx.source_code.python_linter import ASTLinter, PythonLinter


Expand Down Expand Up @@ -311,6 +312,16 @@ def replace(
end_col if end_col is not None else self.end_col,
)

def as_advisory(self) -> 'Advisory':
return Advisory(
code=self.code,
message=self.message,
start_line=self.start_line,
start_col=self.start_col,
end_line=self.end_line,
end_col=self.end_col,
)


@dataclass
class MaybeGraph:
Expand Down
91 changes: 82 additions & 9 deletions src/databricks/labs/ucx/source_code/jobs.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,93 @@
import collections
from pathlib import Path

from databricks.sdk import WorkspaceClient

from databricks.labs.ucx.source_code.graph import DependencyResolver
from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader
from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex
from databricks.labs.ucx.mixins.wspath import WorkspacePath
from databricks.labs.ucx.source_code.base import Advice
from databricks.labs.ucx.source_code.graph import DependencyGraphBuilder
from databricks.labs.ucx.source_code.languages import Languages
from databricks.labs.ucx.source_code.notebooks.sources import Notebook, NotebookLinter
from databricks.labs.ucx.source_code.whitelist import Whitelist


class WorkflowLinter:
def __init__(self, ws: WorkspaceClient, dependency_resolver: DependencyResolver):
def __init__(
self,
ws: WorkspaceClient,
builder: DependencyGraphBuilder,
migration_index: MigrationIndex,
whitelist: Whitelist,
):
self._ws = ws
self._dependency_resolver = dependency_resolver
self._builder = builder
self._migration_index = migration_index
self._whitelist = whitelist

def lint(self, job_id: int):
job = self._ws.jobs.get(job_id)
problems: dict[Path, list[Advice]] = collections.defaultdict(list)
for task in job.settings.tasks:
if task.notebook_task:
self._lint_notebook(job_id)

notebook = NotebookLoader(self._ws).load(job_id)
graph = self._dependency_graph_builder.build(notebook)
return graph.lint()
path = WorkspacePath(self._ws, task.notebook_task.notebook_path)
maybe = self._builder.build_notebook_dependency_graph(path)
for problem in maybe.problems:
problems[problem.source_path].append(problem.as_advisory())
for dependency in maybe.graph.all_dependencies:
container = dependency.load()
if not container:
continue
if isinstance(container, Notebook):
languages = Languages(self._migration_index)
linter = NotebookLinter(languages, container)
for problem in linter.lint():
problems[container.path].append(problem)
if task.spark_python_task:
# TODO: ... load from dbfs
continue
if task.spark_jar_task:
# TODO: ...
continue
if task.spark_submit_task:
# TODO: ... --py-files
continue
if task.python_wheel_task:
# TODO: ... load wheel
continue
if task.pipeline_task:
# TODO: load pipeline notebooks
continue
if task.run_job_task:
# TODO: load other job and lint
continue
if task.existing_cluster_id:
# TODO: load pypi and dbfs libraries
continue
if task.libraries:
for library in task.libraries:
# TODO: load library
continue
continue
for job_cluster in job.settings.job_clusters:
for library in job_cluster.libraries:
if library.pypi:
# TODO: whitelist
continue
if library.dbfs:
# TODO: load from DBFS
continue
if library.jar:
# TODO: warning
continue
if library.egg:
# TODO: load and lint
continue
if library.whl:
# TODO: load and lint
continue
if library.requirements:
# TODO: load and lint
continue
# TODO: ..
continue
2 changes: 1 addition & 1 deletion src/databricks/labs/ucx/source_code/notebooks/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def _load_notebook(self, object_info: ObjectInfo) -> SourceContainer:
assert object_info.path is not None
assert object_info.language is not None
source = self._load_source(object_info)
return Notebook.parse(object_info.path, source, object_info.language)
return Notebook.parse(Path(object_info.path), source, object_info.language)

def _load_source(self, object_info: ObjectInfo) -> str:
assert object_info.path is not None
Expand Down
7 changes: 4 additions & 3 deletions src/databricks/labs/ucx/source_code/notebooks/sources.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

from collections.abc import Iterable
from pathlib import Path

from databricks.sdk.service.workspace import Language

Expand All @@ -15,22 +16,22 @@
class Notebook(SourceContainer):

@staticmethod
def parse(path: str, source: str, default_language: Language) -> Notebook:
def parse(path: Path, source: str, default_language: Language) -> Notebook:
default_cell_language = CellLanguage.of_language(default_language)
cells = default_cell_language.extract_cells(source)
if cells is None:
raise ValueError(f"Could not parse Notebook: {path}")
return Notebook(path, source, default_language, cells, source.endswith('\n'))

def __init__(self, path: str, source: str, language: Language, cells: list[Cell], ends_with_lf):
def __init__(self, path: Path, source: str, language: Language, cells: list[Cell], ends_with_lf):
self._path = path
self._source = source
self._language = language
self._cells = cells
self._ends_with_lf = ends_with_lf

@property
def path(self) -> str:
def path(self) -> Path:
return self._path

@property
Expand Down

0 comments on commit f41c1a6

Please sign in to comment.