diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index b463b05bd6..d0dd01338f 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -25,6 +25,7 @@ AwsACL, ) from databricks.labs.ucx.hive_metastore.mapping import TableMapping +from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex from databricks.labs.ucx.hive_metastore.table_migrate import ( MigrationStatusRefresher, TablesMigrator, @@ -33,12 +34,16 @@ from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler from databricks.labs.ucx.hive_metastore.verification import VerifyHasMetastore from databricks.labs.ucx.installer.workflows import DeployedWorkflows -from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader, WorkspaceNotebookLoader +from databricks.labs.ucx.source_code.jobs import WorkflowLinter +from databricks.labs.ucx.source_code.notebooks.loaders import ( + NotebookResolver, + NotebookLoader, +) from databricks.labs.ucx.source_code.files import FileLoader, LocalFileResolver from databricks.labs.ucx.source_code.path_lookup import PathLookup from databricks.labs.ucx.source_code.graph import DependencyResolver, DependencyGraphBuilder from databricks.labs.ucx.source_code.whitelist import WhitelistResolver, Whitelist -from databricks.labs.ucx.source_code.site_packages import SitePackagesResolver, SitePackages +from databricks.labs.ucx.source_code.site_packages import SitePackageResolver, SitePackages from databricks.labs.ucx.source_code.languages import Languages from databricks.labs.ucx.workspace_access import generic, redash from databricks.labs.ucx.workspace_access.groups import GroupManager @@ -346,7 +351,7 @@ def verify_has_metastore(self): @cached_property def notebook_loader(self) -> NotebookLoader: - return WorkspaceNotebookLoader(self.workspace_client) + return NotebookLoader() @cached_property def 
notebook_resolver(self): @@ -364,11 +369,11 @@ def path_lookup(self): @cached_property def file_loader(self): - return FileLoader(self.path_lookup) + return FileLoader() @cached_property def site_packages_resolver(self): - return SitePackagesResolver(self.site_packages, self.file_loader, self.path_lookup) + return SitePackageResolver(self.site_packages, self.file_loader, self.path_lookup) @cached_property def whitelist(self): @@ -385,11 +390,10 @@ def file_resolver(self): @cached_property def dependency_resolver(self): + # TODO: link back self.site_packages_resolver and self.whitelist_resolver, return DependencyResolver( [ self.notebook_resolver, - self.site_packages_resolver, - self.whitelist_resolver, self.file_resolver, ] ) @@ -398,6 +402,15 @@ def dependency_resolver(self): def dependency_graph_builder(self): return DependencyGraphBuilder(self.dependency_resolver, self.path_lookup) + @cached_property + def workflow_linter(self): + return WorkflowLinter( + self.workspace_client, + self.dependency_resolver, + self.path_lookup, + MigrationIndex([]), # TODO: bring back self.tables_migrator.index() + ) + class CliContext(GlobalContext, abc.ABC): @cached_property diff --git a/src/databricks/labs/ucx/contexts/workspace_cli.py b/src/databricks/labs/ucx/contexts/workspace_cli.py index da257f239d..994d53b1f1 100644 --- a/src/databricks/labs/ucx/contexts/workspace_cli.py +++ b/src/databricks/labs/ucx/contexts/workspace_cli.py @@ -14,7 +14,6 @@ from databricks.labs.ucx.azure.resources import AzureAPIClient, AzureResources from databricks.labs.ucx.contexts.application import CliContext from databricks.labs.ucx.source_code.files import LocalFileMigrator -from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader, LocalNotebookLoader from databricks.labs.ucx.workspace_access.clusters import ClusterAccess @@ -163,7 +162,3 @@ def iam_role_creation(self): self.workspace_client, self.aws_resource_permissions, ) - - @cached_property - def notebook_loader(self) -> 
NotebookLoader: - return LocalNotebookLoader(self.path_lookup) diff --git a/src/databricks/labs/ucx/install.py b/src/databricks/labs/ucx/install.py index f0b0bef5fb..b389cb6a25 100644 --- a/src/databricks/labs/ucx/install.py +++ b/src/databricks/labs/ucx/install.py @@ -62,6 +62,7 @@ from databricks.labs.ucx.installer.policy import ClusterPolicyInstaller from databricks.labs.ucx.installer.workflows import WorkflowsDeployment from databricks.labs.ucx.runtime import Workflows +from databricks.labs.ucx.source_code.jobs import JobProblem from databricks.labs.ucx.workspace_access.base import Permissions from databricks.labs.ucx.workspace_access.generic import WorkspaceObjectInfo from databricks.labs.ucx.workspace_access.groups import ConfigureGroups, MigratedGroup @@ -101,6 +102,7 @@ def deploy_schema(sql_backend: SqlBackend, inventory_schema: str): functools.partial(table, "submit_runs", SubmitRunInfo), functools.partial(table, "policies", PolicyInfo), functools.partial(table, "migration_status", MigrationStatus), + functools.partial(table, "workflow_problems", JobProblem), functools.partial(table, "udfs", Udf), functools.partial(table, "logs", LogRecord), ], diff --git a/src/databricks/labs/ucx/mixins/fixtures.py b/src/databricks/labs/ucx/mixins/fixtures.py index a9548a87b3..e925fc4f6a 100644 --- a/src/databricks/labs/ucx/mixins/fixtures.py +++ b/src/databricks/labs/ucx/mixins/fixtures.py @@ -558,7 +558,7 @@ def create(*, scope: str, principal: str, permission: workspace.AclPermission): def make_notebook(ws, make_random): def create( *, - path: str | None = None, + path: str | Path | None = None, content: BinaryIO | None = None, language: Language = Language.PYTHON, format: ImportFormat = ImportFormat.SOURCE, # pylint: disable=redefined-builtin @@ -566,6 +566,8 @@ def create( ) -> str: if path is None: path = f"/Users/{ws.current_user.me().user_name}/sdk-{make_random(4)}" + elif isinstance(path, pathlib.Path): + path = str(path) if content is None: content = 
io.BytesIO(b"print(1)") path = str(path) @@ -754,43 +756,33 @@ def create(*, instance_pool_name=None, node_type_id=None, **kwargs): @pytest.fixture def make_job(ws, make_random, make_notebook): - def create(**kwargs): + def create(notebook_path: str | Path | None = None, **kwargs): task_spark_conf = None if "name" not in kwargs: kwargs["name"] = f"sdk-{make_random(4)}" if "spark_conf" in kwargs: task_spark_conf = kwargs["spark_conf"] kwargs.pop("spark_conf") + if isinstance(notebook_path, pathlib.Path): + notebook_path = str(notebook_path) + if not notebook_path: + notebook_path = make_notebook() + assert notebook_path is not None if "tasks" not in kwargs: - if task_spark_conf: - kwargs["tasks"] = [ - jobs.Task( - task_key=make_random(4), - description=make_random(4), - new_cluster=compute.ClusterSpec( - num_workers=1, - node_type_id=ws.clusters.select_node_type(local_disk=True, min_memory_gb=16), - spark_version=ws.clusters.select_spark_version(latest=True), - spark_conf=task_spark_conf, - ), - notebook_task=jobs.NotebookTask(notebook_path=make_notebook()), - timeout_seconds=0, - ) - ] - else: - kwargs["tasks"] = [ - jobs.Task( - task_key=make_random(4), - description=make_random(4), - new_cluster=compute.ClusterSpec( - num_workers=1, - node_type_id=ws.clusters.select_node_type(local_disk=True, min_memory_gb=16), - spark_version=ws.clusters.select_spark_version(latest=True), - ), - notebook_task=jobs.NotebookTask(notebook_path=make_notebook()), - timeout_seconds=0, - ) - ] + kwargs["tasks"] = [ + jobs.Task( + task_key=make_random(4), + description=make_random(4), + new_cluster=compute.ClusterSpec( + num_workers=1, + node_type_id=ws.clusters.select_node_type(local_disk=True, min_memory_gb=16), + spark_version=ws.clusters.select_spark_version(latest=True), + spark_conf=task_spark_conf, + ), + notebook_task=jobs.NotebookTask(notebook_path=str(notebook_path)), + timeout_seconds=0, + ) + ] job = ws.jobs.create(**kwargs) logger.info(f"Job: 
{ws.config.host}#job/{job.job_id}") return job diff --git a/src/databricks/labs/ucx/mixins/wspath.py b/src/databricks/labs/ucx/mixins/wspath.py index 3b92492b83..b561d04aa4 100644 --- a/src/databricks/labs/ucx/mixins/wspath.py +++ b/src/databricks/labs/ucx/mixins/wspath.py @@ -6,12 +6,12 @@ from functools import cached_property # pylint: disable-next=import-private-name -from pathlib import Path, _PosixFlavour, _Accessor # type: ignore +from pathlib import Path, _PosixFlavour # type: ignore from urllib.parse import quote_from_bytes as urlquote_from_bytes from io import BytesIO, StringIO from databricks.sdk import WorkspaceClient -from databricks.sdk.errors import NotFound +from databricks.sdk.errors import NotFound, DatabricksError from databricks.sdk.service.workspace import ObjectInfo, ObjectType, ExportFormat, ImportFormat, Language logger = logging.getLogger(__name__) @@ -74,7 +74,7 @@ def __exit__(self, *args): pass -class _DatabricksAccessor(_Accessor): +class _DatabricksAccessor: chmod = _na('accessor.chmod') getcwd = _na('accessor.getcwd') group = _na('accessor.group') @@ -286,8 +286,11 @@ def suffix(self): if not self.is_notebook(): return "" for sfx, lang in self._SUFFIXES.items(): - if self._object_info.language == lang: - return sfx + try: + if self._object_info.language == lang: + return sfx + except DatabricksError: + return "" return "" def __lt__(self, other: pathlib.PurePath): @@ -313,13 +316,22 @@ def _return_false(self) -> bool: is_junction = _return_false def is_dir(self): - return self._object_info.object_type == ObjectType.DIRECTORY + try: + return self._object_info.object_type == ObjectType.DIRECTORY + except DatabricksError: + return False def is_file(self): - return self._object_info.object_type == ObjectType.FILE + try: + return self._object_info.object_type == ObjectType.FILE + except DatabricksError: + return False def is_notebook(self): - return self._object_info.object_type == ObjectType.NOTEBOOK + try: + return 
self._object_info.object_type == ObjectType.NOTEBOOK + except DatabricksError: + return False def __eq__(self, other): return isinstance(other, Path) and self.as_posix() == other.as_posix() diff --git a/src/databricks/labs/ucx/runtime.py b/src/databricks/labs/ucx/runtime.py index a99d44f05c..3e232ceeab 100644 --- a/src/databricks/labs/ucx/runtime.py +++ b/src/databricks/labs/ucx/runtime.py @@ -15,6 +15,7 @@ MigrateHiveSerdeTablesInPlace, MigrateExternalTablesCTAS, ) +from databricks.labs.ucx.source_code.workflows import ExperimentalWorkflowLinter from databricks.labs.ucx.workspace_access.workflows import ( GroupMigration, PermissionsMigrationAPI, @@ -50,6 +51,7 @@ def all(cls): RemoveWorkspaceLocalGroups(), MigrateTablesInMounts(), PermissionsMigrationAPI(), + ExperimentalWorkflowLinter(), Failing(), ] ) diff --git a/src/databricks/labs/ucx/source_code/dbfs.py b/src/databricks/labs/ucx/source_code/dbfs.py index f044c2debf..374cb4952a 100644 --- a/src/databricks/labs/ucx/source_code/dbfs.py +++ b/src/databricks/labs/ucx/source_code/dbfs.py @@ -79,7 +79,7 @@ def lint(self, code: str) -> Iterable[Advice]: tree = ast.parse(code) visitor = DetectDbfsVisitor() visitor.visit(tree) - return visitor.get_advices() + yield from visitor.get_advices() class FromDbfsFolder(Linter): diff --git a/src/databricks/labs/ucx/source_code/files.py b/src/databricks/labs/ucx/source_code/files.py index f5ee19ceea..88b8955f91 100644 --- a/src/databricks/labs/ucx/source_code/files.py +++ b/src/databricks/labs/ucx/source_code/files.py @@ -1,17 +1,13 @@ from __future__ import annotations # for type hints -import ast import logging from pathlib import Path -from collections.abc import Callable from databricks.labs.ucx.source_code.path_lookup import PathLookup from databricks.sdk.service.workspace import Language from databricks.labs.ucx.source_code.languages import Languages from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage -from 
databricks.labs.ucx.source_code.notebooks.base import NOTEBOOK_HEADER -from databricks.labs.ucx.source_code.python_linter import PythonLinter, ASTLinter from databricks.labs.ucx.source_code.graph import ( DependencyGraph, SourceContainer, @@ -19,6 +15,7 @@ DependencyLoader, Dependency, BaseDependencyResolver, + MaybeDependency, ) logger = logging.getLogger(__name__) @@ -32,68 +29,22 @@ def __init__(self, path: Path, source: str, language: Language): # using CellLanguage so we can reuse the facilities it provides self._language = CellLanguage.of_language(language) + def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: + if self._language is not CellLanguage.PYTHON: + logger.warning(f"Unsupported language: {self._language.language}") + return [] + maybe = parent.build_graph_from_python_source(self._original_code) + problems = [] + for problem in maybe.problems: + problems.append(problem.replace(source_path=self._path)) + return problems + @property - def path(self): + def path(self) -> Path: return self._path - def build_dependency_graph(self, parent: DependencyGraph, path_lookup: PathLookup) -> None: - if self._language is not CellLanguage.PYTHON: - logger.warning(f"Unsupported language: {self._language.language}") - return - path_lookup.push_cwd(self.path.parent) - self._build_dependency_graph(parent) - path_lookup.pop_cwd() - - def _build_dependency_graph(self, parent: DependencyGraph) -> None: - # TODO replace the below with parent.build_graph_from_python_source - # can only be done after https://github.com/databrickslabs/ucx/issues/1287 - linter = ASTLinter.parse(self._original_code) - run_notebook_calls = PythonLinter.list_dbutils_notebook_run_calls(linter) - for call in run_notebook_calls: - call_problems: list[DependencyProblem] = [] - notebook_path_arg = PythonLinter.get_dbutils_notebook_run_path_arg(call) - if isinstance(notebook_path_arg, ast.Constant): - notebook_path = notebook_path_arg.value - 
parent.register_notebook(Path(notebook_path), call_problems.append) - call_problems = [ - problem.replace( - source_path=self._path, - start_line=call.lineno, - start_col=call.col_offset, - end_line=call.end_lineno or 0, - end_col=call.end_col_offset or 0, - ) - for problem in call_problems - ] - parent.add_problems(call_problems) - continue - problem = DependencyProblem( - code='dependency-not-constant', - message="Can't check dependency not provided as a constant", - source_path=self._path, - start_line=call.lineno, - start_col=call.col_offset, - end_line=call.end_lineno or 0, - end_col=call.end_col_offset or 0, - ) - parent.add_problems([problem]) - # TODO https://github.com/databrickslabs/ucx/issues/1287 - for pair in PythonLinter.list_import_sources(linter): - import_name = pair[0] - import_problems: list[DependencyProblem] = [] - parent.register_import(import_name, import_problems.append) - node = pair[1] - import_problems = [ - problem.replace( - source_path=self._path, - start_line=node.lineno, - start_col=node.col_offset, - end_line=node.end_lineno or 0, - end_col=node.end_col_offset or 0, - ) - for problem in import_problems - ] - parent.add_problems(import_problems) + def __repr__(self): + return f"" class LocalFileMigrator: @@ -148,34 +99,17 @@ def _apply_file_fix(self, path): class FileLoader(DependencyLoader): - - def __init__(self, path_lookup: PathLookup): - self._path_lookup = path_lookup - - def load_dependency(self, dependency: Dependency) -> SourceContainer | None: - fullpath = self.full_path(dependency.path) - assert fullpath is not None - return LocalFile(fullpath, fullpath.read_text("utf-8"), Language.PYTHON) + def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> SourceContainer | None: + absolute_path = path_lookup.resolve(dependency.path) + if not absolute_path: + return None + return LocalFile(absolute_path, absolute_path.read_text("utf-8"), Language.PYTHON) def exists(self, path: Path) -> bool: - return 
self.full_path(path) is not None - - def full_path(self, path: Path) -> Path | None: - if path.is_file(): - return path - for parent in self._path_lookup.paths: - child = Path(parent, path) - if child.is_file(): - return child - return None - - def is_notebook(self, path: Path) -> bool: - fullpath = self.full_path(path) - if fullpath is None: - return False - with fullpath.open(mode="r", encoding="utf-8") as stream: - line = stream.readline() - return NOTEBOOK_HEADER in line + return path.exists() + + def __repr__(self): + return "FileLoader()" class LocalFileResolver(BaseDependencyResolver): @@ -187,16 +121,34 @@ def __init__(self, file_loader: FileLoader, next_resolver: BaseDependencyResolve def with_next_resolver(self, resolver: BaseDependencyResolver) -> BaseDependencyResolver: return LocalFileResolver(self._file_loader, resolver) - # TODO problem_collector is tactical, pending https://github.com/databrickslabs/ucx/issues/1559 - def resolve_local_file( - self, path: Path, problem_collector: Callable[[DependencyProblem], None] - ) -> Dependency | None: - if self._file_loader.exists(path) and not self._file_loader.is_notebook(path): - return Dependency(self._file_loader, path) - return super().resolve_local_file(path, problem_collector) - - def resolve_import(self, name: str, problem_collector: Callable[[DependencyProblem], None]) -> Dependency | None: - fullpath = self._file_loader.full_path(Path(f"{name}.py")) - if fullpath is not None: - return Dependency(self._file_loader, fullpath) - return super().resolve_import(name, problem_collector) + def resolve_local_file(self, path_lookup, path: Path) -> MaybeDependency: + absolute_path = path_lookup.resolve(path) + if absolute_path: + return MaybeDependency(Dependency(self._file_loader, absolute_path), []) + return super().resolve_local_file(path_lookup, path) + + def resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency: + parts = [] + # Relative imports use leading dots. 
A single leading dot indicates a relative import, starting with + # the current package. Two or more leading dots indicate a relative import to the parent(s) of the current + # package, one level per dot after the first. + # see https://docs.python.org/3/reference/import.html#package-relative-imports + for i, rune in enumerate(name): + if not i and rune == '.': # leading single dot + parts.append(path_lookup.cwd.as_posix()) + continue + if rune != '.': + parts.append(name[i:].replace('.', '/')) + break + parts.append("..") + for candidate in (f'{"/".join(parts)}.py', f'{"/".join(parts)}/__init__.py'): + relative_path = Path(candidate) + absolute_path = path_lookup.resolve(relative_path) + if not absolute_path: + continue + dependency = Dependency(self._file_loader, absolute_path) + return MaybeDependency(dependency, []) + return super().resolve_import(path_lookup, name) + + def __repr__(self): + return "LocalFileResolver()" diff --git a/src/databricks/labs/ucx/source_code/graph.py b/src/databricks/labs/ucx/source_code/graph.py index b7a74f8203..ca271e2cf9 100644 --- a/src/databricks/labs/ucx/source_code/graph.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -2,10 +2,12 @@ import abc import ast +import sys from dataclasses import dataclass from pathlib import Path -from collections.abc import Callable, Iterable +from collections.abc import Callable +from databricks.labs.ucx.source_code.base import Advisory from databricks.labs.ucx.source_code.python_linter import ASTLinter, PythonLinter from databricks.labs.ucx.source_code.path_lookup import PathLookup @@ -22,9 +24,13 @@ def __init__( self._dependency = dependency self._parent = parent self._resolver = resolver - self._path_lookup = path_lookup + self._path_lookup = path_lookup.change_directory(dependency.path.parent) self._dependencies: dict[Dependency, DependencyGraph] = {} + @property + def path_lookup(self): + return self._path_lookup + @property def dependency(self): return self._dependency @@ -33,47 
+39,44 @@ def dependency(self): def path(self): return self._dependency.path - def add_problems(self, problems: list[DependencyProblem]): - problems = [problem.replace(source_path=self.dependency.path) for problem in problems] - self._resolver.add_problems(problems) - - # TODO problem_collector is tactical, pending https://github.com/databrickslabs/ucx/issues/1559 - def register_notebook( - self, path: Path, problem_collector: Callable[[DependencyProblem], None] - ) -> DependencyGraph | None: - resolved = self._resolver.resolve_notebook(path, problem_collector) - if resolved is None: - return None - return self.register_dependency(resolved) - - # TODO problem_collector is tactical, pending https://github.com/databrickslabs/ucx/issues/1559 - def register_import( - self, name: str, problem_collector: Callable[[DependencyProblem], None] - ) -> DependencyGraph | None: - resolved = self._resolver.resolve_import(name, problem_collector) - if resolved is None: - return None - return self.register_dependency(resolved) - - def register_dependency(self, dependency: Dependency): - # already registered ? 
- child_graph = self._locate_dependency(dependency) - if child_graph is not None: - self._dependencies[dependency] = child_graph - return child_graph + def register_library(self, name: str) -> MaybeGraph: + # TODO: use DistInfoResolver to load wheel/egg/pypi dependencies + # TODO: https://github.com/databrickslabs/ucx/issues/1642 + # TODO: https://github.com/databrickslabs/ucx/issues/1643 + # TODO: https://github.com/databrickslabs/ucx/issues/1640 + return MaybeGraph(None, [DependencyProblem('not-yet-implemented', f'Library dependency: {name}')]) + + def register_notebook(self, path: Path) -> MaybeGraph: + maybe = self._resolver.resolve_notebook(self.path_lookup, path) + if not maybe.dependency: + return MaybeGraph(None, maybe.problems) + return self.register_dependency(maybe.dependency) + + def register_import(self, name: str) -> MaybeGraph: + maybe = self._resolver.resolve_import(self.path_lookup, name) + if not maybe.dependency: + return MaybeGraph(None, maybe.problems) + return self.register_dependency(maybe.dependency) + + def register_dependency(self, dependency: Dependency) -> MaybeGraph: + # TODO: this has to be a private method, because we don't want to allow free-form dependencies. 
+ # the only case we have for this method to be used outside of this class is for SitePackages (or DistInfo) + # See databricks.labs.ucx.source_code.site_packages.SitePackageContainer.build_dependency_graph for reference + maybe = self.locate_dependency(dependency.path) + if maybe.graph is not None: + self._dependencies[dependency] = maybe.graph + return maybe # nay, create the child graph and populate it child_graph = DependencyGraph(dependency, self, self._resolver, self._path_lookup) self._dependencies[dependency] = child_graph - container = dependency.load() + container = dependency.load(self.path_lookup) if not container: - return None - container.build_dependency_graph(child_graph, self._path_lookup) - return child_graph + problem = DependencyProblem('dependency-register-failed', 'Failed to register dependency', dependency.path) + return MaybeGraph(child_graph, [problem]) + problems = container.build_dependency_graph(child_graph) + return MaybeGraph(child_graph, problems) - def _locate_dependency(self, dependency: Dependency) -> DependencyGraph | None: - return self.locate_dependency(dependency.path) - - def locate_dependency(self, path: Path) -> DependencyGraph | None: + def locate_dependency(self, path: Path) -> MaybeGraph: # need a list since unlike JS, Python won't let you assign closure variables found: list[DependencyGraph] = [] # TODO https://github.com/databrickslabs/ucx/issues/1287 @@ -91,7 +94,9 @@ def check_registered_dependency(graph): return False self.root.visit(check_registered_dependency, set()) - return found[0] if len(found) > 0 else None + if not found: + return MaybeGraph(None, [DependencyProblem('dependency-not-found', 'Dependency not found')]) + return MaybeGraph(found[0], []) @property def root(self): @@ -116,10 +121,17 @@ def local_dependencies(self) -> set[Dependency]: @property def all_paths(self) -> set[Path]: + # TODO: remove this public method, as it'll throw false positives + # for package imports, like certifi. 
a WorkflowTask is also a dependency, + # but it does not exist on a filesyste return {d.path for d in self.all_dependencies} + def all_relative_names(self) -> set[str]: + """This method is intented to simplify testing""" + return {d.path.relative_to(self._path_lookup.cwd).as_posix() for d in self.all_dependencies} + # when visit_node returns True it interrupts the visit - def visit(self, visit_node: Callable[[DependencyGraph], bool | None], visited: set[Path]) -> bool | None: + def visit(self, visit_node: Callable[[DependencyGraph], bool | None], visited: set[Path]) -> bool: if self.path in visited: return False visited.add(self.path) @@ -130,46 +142,54 @@ def visit(self, visit_node: Callable[[DependencyGraph], bool | None], visited: s return True return False - def build_graph_from_python_source(self, python_code: str, problem_collector: Callable[[DependencyProblem], None]): + def _traverse_notebook_run(self, call: ast.Call): + notebook_path_arg = PythonLinter.get_dbutils_notebook_run_path_arg(call) + if not isinstance(notebook_path_arg, ast.Constant): + yield DependencyProblem( + code='dependency-not-constant', + message="Can't check dependency not provided as a constant", + start_line=call.lineno, + start_col=call.col_offset, + end_line=call.end_lineno or 0, + end_col=call.end_col_offset or 0, + ) + return + notebook_path = notebook_path_arg.value + maybe = self.register_notebook(Path(notebook_path)) + for problem in maybe.problems: + yield problem.replace( + start_line=call.lineno, + start_col=call.col_offset, + end_line=call.end_lineno or 0, + end_col=call.end_col_offset or 0, + ) + + def build_graph_from_python_source(self, python_code: str) -> MaybeGraph: linter = ASTLinter.parse(python_code) - calls = linter.locate(ast.Call, [("run", ast.Attribute), ("notebook", ast.Attribute), ("dbutils", ast.Name)]) - for call in calls: + problems: list[DependencyProblem] = [] + run_notebook_calls = PythonLinter.list_dbutils_notebook_run_calls(linter) + for call in 
run_notebook_calls: assert isinstance(call, ast.Call) - path = PythonLinter.get_dbutils_notebook_run_path_arg(call) - if isinstance(path, ast.Constant): - path = path.value.strip().strip("'").strip('"') - call_problems: list[DependencyProblem] = [] - self.register_notebook(Path(path), call_problems.append) - for problem in call_problems: - problem = problem.replace( - start_line=call.lineno, - start_col=call.col_offset, - end_line=call.end_lineno or 0, - end_col=call.end_col_offset or 0, - ) - problem_collector(problem) - else: - problem = DependencyProblem( - code='dependency-not-constant', - message="Can't check dependency not provided as a constant", - start_line=call.lineno, - start_col=call.col_offset, - end_line=call.end_lineno or 0, - end_col=call.end_col_offset or 0, - ) - problem_collector(problem) - for pair in PythonLinter.list_import_sources(linter): - import_problems: list[DependencyProblem] = [] - self.register_import(pair[0], import_problems.append) - node = pair[1] - for problem in import_problems: - problem = problem.replace( + for problem in self._traverse_notebook_run(call): + problems.append(problem) + for import_name, node in PythonLinter.list_import_sources(linter): + if import_name.split('.')[0] in {'databricks'} | sys.stdlib_module_names: + # TODO: redundant: see databricks.labs.ucx.source_code.whitelist.Whitelist.__init__ + # we don't need to register stdlib or databricks.* imports + continue + maybe = self.register_import(import_name) + for problem in maybe.problems: + with_lines = problem.replace( start_line=node.lineno, start_col=node.col_offset, end_line=node.end_lineno or 0, end_col=node.end_col_offset or 0, ) - problem_collector(problem) + problems.append(with_lines) + return MaybeGraph(self, problems) + + def __repr__(self): + return f"" class Dependency(abc.ABC): @@ -188,25 +208,23 @@ def __hash__(self): def __eq__(self, other): return isinstance(other, type(self)) and self.path == other.path - def load(self) -> SourceContainer | 
None: - return self._loader.load_dependency(self) + def load(self, path_lookup: PathLookup) -> SourceContainer | None: + return self._loader.load_dependency(path_lookup, self) + + def __repr__(self): + return f"Dependency<{self.path}>" class SourceContainer(abc.ABC): @abc.abstractmethod - def build_dependency_graph(self, parent: DependencyGraph, path_lookup: PathLookup) -> None: + def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: raise NotImplementedError() class DependencyLoader(abc.ABC): - @abc.abstractmethod - def load_dependency(self, dependency: Dependency) -> SourceContainer | None: - raise NotImplementedError() - - @abc.abstractmethod - def is_notebook(self, path: Path) -> bool: + def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> SourceContainer | None: raise NotImplementedError() @@ -215,47 +233,39 @@ class WrappingLoader(DependencyLoader): def __init__(self, source_container: SourceContainer): self._source_container = source_container - def is_notebook(self, path: Path) -> bool: - raise NotImplementedError() # should never happen - - def load_dependency(self, dependency: Dependency) -> SourceContainer | None: + def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> SourceContainer | None: return self._source_container + def __repr__(self): + return f"" + class BaseDependencyResolver(abc.ABC): def __init__(self, next_resolver: BaseDependencyResolver | None): self._next_resolver = next_resolver - self._problems: list[DependencyProblem] = [] @abc.abstractmethod def with_next_resolver(self, resolver: BaseDependencyResolver) -> BaseDependencyResolver: raise NotImplementedError() - @property - def problems(self): - return self._problems - - def add_problems(self, problems: list[DependencyProblem]): - self._problems.extend(problems) - @property def next_resolver(self): return self._next_resolver - def resolve_notebook(self, path: Path, problem_collector: 
Callable[[DependencyProblem], None]) -> Dependency | None: + def resolve_notebook(self, path_lookup: PathLookup, path: Path) -> MaybeDependency: + # TODO: remove StubResolver and return MaybeDependency(None, [...]) assert self._next_resolver is not None - return self._next_resolver.resolve_notebook(path, problem_collector) + return self._next_resolver.resolve_notebook(path_lookup, path) - def resolve_local_file( - self, path: Path, problem_collector: Callable[[DependencyProblem], None] - ) -> Dependency | None: + def resolve_local_file(self, path_lookup, path: Path) -> MaybeDependency: assert self._next_resolver is not None - return self._next_resolver.resolve_local_file(path, problem_collector) + return self._next_resolver.resolve_local_file(path_lookup, path) - def resolve_import(self, name: str, problem_collector: Callable[[DependencyProblem], None]) -> Dependency | None: + def resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency: + # TODO: remove StubResolver and return MaybeDependency(None, [...]) assert self._next_resolver is not None - return self._next_resolver.resolve_import(name, problem_collector) + return self._next_resolver.resolve_import(path_lookup, name) class StubResolver(BaseDependencyResolver): @@ -266,16 +276,24 @@ def __init__(self): def with_next_resolver(self, resolver: BaseDependencyResolver) -> BaseDependencyResolver: raise NotImplementedError("Should never happen!") - def resolve_notebook(self, path: Path, problem_collector: Callable[[DependencyProblem], None]) -> Dependency | None: - return None + def resolve_notebook(self, path_lookup: PathLookup, path: Path) -> MaybeDependency: + return self._fail('notebook-not-found', f"Notebook not found: {path.as_posix()}") + + def resolve_local_file(self, path_lookup: PathLookup, path: Path) -> MaybeDependency: + return self._fail('file-not-found', f"File not found: {path.as_posix()}") - def resolve_local_file( - self, path: Path, problem_collector: 
Callable[[DependencyProblem], None] - ) -> Dependency | None: - return None + def resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency: + return self._fail('import-not-found', f"Could not locate import: {name}") - def resolve_import(self, name: str, problem_collector: Callable[[DependencyProblem], None]) -> Dependency | None: - return None + @staticmethod + def _fail(code: str, message: str): + return MaybeDependency(None, [DependencyProblem(code, message)]) + + +@dataclass +class MaybeDependency: + dependency: Dependency | None + problems: list[DependencyProblem] class DependencyResolver: @@ -286,60 +304,17 @@ def __init__(self, resolvers: list[BaseDependencyResolver]): previous = resolver self._resolver: BaseDependencyResolver = previous - def resolve_notebook( - self, path: Path, problem_collector: Callable[[DependencyProblem], None] | None = None - ) -> Dependency | None: - problems: list[DependencyProblem] = [] - dependency = self._resolver.resolve_notebook(path, problems.append) - if dependency is None: - problem = DependencyProblem('notebook-not-found', f"Notebook not found: {path.as_posix()}") - problems.append(problem) - if problem_collector: - for problem in problems: - problem_collector(problem) - else: - self.add_problems(problems) - return dependency - - def resolve_local_file( - self, path: Path, problem_collector: Callable[[DependencyProblem], None] | None = None - ) -> Dependency | None: - problems: list[DependencyProblem] = [] - dependency = self._resolver.resolve_local_file(path, problems.append) - if dependency is None: - problem = DependencyProblem('file-not-found', f"File not found: {path.as_posix()}") - problems.append(problem) - if problem_collector: - for problem in problems: - problem_collector(problem) - else: - self.add_problems(problems) - return dependency - - def resolve_import( - self, name: str, problem_collector: Callable[[DependencyProblem], None] | None = None - ) -> Dependency | None: - problems: 
list[DependencyProblem] = [] - dependency = self._resolver.resolve_import(name, problems.append) - if dependency is None: - problem = DependencyProblem('import-not-found', f"Could not locate import: {name}") - problems.append(problem) - if problem_collector: - for problem in problems: - problem_collector(problem) - else: - self.add_problems(problems) - return dependency + def resolve_notebook(self, path_lookup: PathLookup, path: Path) -> MaybeDependency: + return self._resolver.resolve_notebook(path_lookup, path) - @property - def problems(self) -> Iterable[DependencyProblem]: - resolver = self._resolver - while resolver is not None: - yield from resolver.problems - resolver = resolver.next_resolver + def resolve_local_file(self, path_lookup: PathLookup, path: Path) -> MaybeDependency: + return self._resolver.resolve_local_file(path_lookup, path) - def add_problems(self, problems: list[DependencyProblem]): - self._resolver.add_problems(problems) + def resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency: + return self._resolver.resolve_import(path_lookup, name) + + def __repr__(self): + return f"" MISSING_SOURCE_PATH = "" @@ -355,6 +330,9 @@ class DependencyProblem: end_line: int = -1 end_col: int = -1 + def is_path_missing(self): + return self.source_path == Path(MISSING_SOURCE_PATH) + def replace( self, code: str | None = None, @@ -375,6 +353,26 @@ def replace( end_col if end_col is not None else self.end_col, ) + def as_advisory(self) -> 'Advisory': + return Advisory( + code=self.code, + message=self.message, + start_line=self.start_line, + start_col=self.start_col, + end_line=self.end_line, + end_col=self.end_col, + ) + + +@dataclass +class MaybeGraph: + graph: DependencyGraph | None + problems: list[DependencyProblem] + + @property + def failed(self): + return len(self.problems) > 0 + class DependencyGraphBuilder: @@ -382,26 +380,29 @@ def __init__(self, resolver: DependencyResolver, path_lookup: PathLookup): self._resolver = resolver 
self._path_lookup = path_lookup - @property - def problems(self): - return self._resolver.problems - - def build_local_file_dependency_graph(self, path: Path) -> DependencyGraph | None: - dependency = self._resolver.resolve_local_file(path) - if dependency is None: - return None - graph = DependencyGraph(dependency, None, self._resolver, self._path_lookup) - container = dependency.load() + def build_local_file_dependency_graph(self, path: Path) -> MaybeGraph: + maybe = self._resolver.resolve_local_file(self._path_lookup, path) + if not maybe.dependency: + return MaybeGraph(None, maybe.problems) + graph = DependencyGraph(maybe.dependency, None, self._resolver, self._path_lookup) + container = maybe.dependency.load(graph.path_lookup) if container is not None: - container.build_dependency_graph(graph, self._path_lookup) - return graph - - def build_notebook_dependency_graph(self, path: Path) -> DependencyGraph | None: - dependency = self._resolver.resolve_notebook(path) - if dependency is None: - return None - graph = DependencyGraph(dependency, None, self._resolver, self._path_lookup) - container = dependency.load() + problems = container.build_dependency_graph(graph) + if problems: + return MaybeGraph(None, problems) + return MaybeGraph(graph, []) + + def build_notebook_dependency_graph(self, path: Path) -> MaybeGraph: + maybe = self._resolver.resolve_notebook(self._path_lookup, path) + if not maybe.dependency: + return MaybeGraph(None, maybe.problems) + graph = DependencyGraph(maybe.dependency, None, self._resolver, self._path_lookup) + container = maybe.dependency.load(graph.path_lookup) if container is not None: - container.build_dependency_graph(graph, self._path_lookup) - return graph + problems = container.build_dependency_graph(graph) + if problems: + return MaybeGraph(None, problems) + return MaybeGraph(graph, []) + + def __repr__(self): + return f"" diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py new 
file mode 100644 index 0000000000..1d6cb06ee7 --- /dev/null +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -0,0 +1,250 @@ +import functools +import logging +from collections.abc import Iterable +from dataclasses import dataclass +from pathlib import Path + +from databricks.labs.blueprint.parallel import Threads +from databricks.labs.lsql.backends import SqlBackend +from databricks.sdk import WorkspaceClient +from databricks.sdk.errors import NotFound +from databricks.sdk.service import compute, jobs + +from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex +from databricks.labs.ucx.mixins.wspath import WorkspacePath +from databricks.labs.ucx.source_code.base import CurrentSessionState +from databricks.labs.ucx.source_code.files import LocalFile +from databricks.labs.ucx.source_code.graph import ( + DependencyGraph, + SourceContainer, + DependencyProblem, + Dependency, + WrappingLoader, + DependencyResolver, +) +from databricks.labs.ucx.source_code.languages import Languages +from databricks.labs.ucx.source_code.notebooks.sources import Notebook, NotebookLinter, FileLinter +from databricks.labs.ucx.source_code.path_lookup import PathLookup + +logger = logging.getLogger(__name__) + + +@dataclass +class JobProblem: + job_id: int + job_name: str + task_key: str + path: str + code: str + message: str + start_line: int + start_col: int + end_line: int + end_col: int + + +class WorkflowTask(Dependency): + def __init__(self, ws: WorkspaceClient, task: jobs.Task, job: jobs.Job): + loader = WrappingLoader(WorkflowTaskContainer(ws, task)) + super().__init__(loader, Path(f'/jobs/{task.task_key}')) + self._task = task + self._job = job + + def load(self, path_lookup: PathLookup) -> SourceContainer | None: + return self._loader.load_dependency(path_lookup, self) + + def __repr__(self): + return f'WorkflowTask<{self._task.task_key} of {self._job.settings.name}>' + + +class WorkflowTaskContainer(SourceContainer): + def __init__(self, ws: 
WorkspaceClient, task: jobs.Task): + self._task = task + self._ws = ws + + def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: + return list(self._register_task_dependencies(parent)) + + def _register_task_dependencies(self, graph: DependencyGraph) -> Iterable[DependencyProblem]: + yield from self._register_libraries(graph) + yield from self._register_notebook(graph) + yield from self._register_spark_python_task(graph) + yield from self._register_python_wheel_task(graph) + yield from self._register_spark_jar_task(graph) + yield from self._register_run_job_task(graph) + yield from self._register_pipeline_task(graph) + yield from self._register_existing_cluster_id(graph) + yield from self._register_spark_submit_task(graph) + + def _register_libraries(self, graph: DependencyGraph): + if not self._task.libraries: + return + for library in self._task.libraries: + yield from self._lint_library(library, graph) + + def _lint_library(self, library: compute.Library, graph: DependencyGraph) -> Iterable[DependencyProblem]: + if library.pypi: + # TODO: https://github.com/databrickslabs/ucx/issues/1642 + maybe = graph.register_library(library.pypi.package) + if maybe.problems: + yield from maybe.problems + if library.jar: + # TODO: https://github.com/databrickslabs/ucx/issues/1641 + yield DependencyProblem('not-yet-implemented', 'Jar library is not yet implemented') + if library.egg: + # TODO: https://github.com/databrickslabs/ucx/issues/1643 + maybe = graph.register_library(library.egg) + if maybe.problems: + yield from maybe.problems + if library.whl: + # TODO: download the wheel somewhere local and add it to "virtual sys.path" via graph.path_lookup.push_path + # TODO: https://github.com/databrickslabs/ucx/issues/1640 + maybe = graph.register_library(library.whl) + if maybe.problems: + yield from maybe.problems + if library.requirements: + # TODO: download and add every entry via graph.register_library + # TODO: 
https://github.com/databrickslabs/ucx/issues/1644 + yield DependencyProblem('not-yet-implemented', 'Requirements library is not yet implemented') + + def _register_notebook(self, graph: DependencyGraph) -> Iterable[DependencyProblem]: + if not self._task.notebook_task: + return + notebook_path = self._task.notebook_task.notebook_path + logger.info(f'Discovering {self._task.task_key} entrypoint: {notebook_path}') + path = WorkspacePath(self._ws, notebook_path) + maybe = graph.register_notebook(path) + if maybe.problems: + yield from maybe.problems + + def _register_spark_python_task(self, graph: DependencyGraph): # pylint: disable=unused-argument + if not self._task.spark_python_task: + return + # TODO: https://github.com/databrickslabs/ucx/issues/1639 + yield DependencyProblem('not-yet-implemented', 'Spark Python task is not yet implemented') + + def _register_python_wheel_task(self, graph: DependencyGraph): # pylint: disable=unused-argument + if not self._task.python_wheel_task: + return + # TODO: https://github.com/databrickslabs/ucx/issues/1640 + yield DependencyProblem('not-yet-implemented', 'Python wheel task is not yet implemented') + + def _register_spark_jar_task(self, graph: DependencyGraph): # pylint: disable=unused-argument + if not self._task.spark_jar_task: + return + # TODO: https://github.com/databrickslabs/ucx/issues/1641 + yield DependencyProblem('not-yet-implemented', 'Spark Jar task is not yet implemented') + + def _register_run_job_task(self, graph: DependencyGraph): # pylint: disable=unused-argument + if not self._task.run_job_task: + return + # TODO: it's not clear how to terminate the graph + yield DependencyProblem('not-yet-implemented', 'Run job task is not yet implemented') + + def _register_pipeline_task(self, graph: DependencyGraph): # pylint: disable=unused-argument + if not self._task.pipeline_task: + return + # TODO: https://github.com/databrickslabs/ucx/issues/1638 + yield DependencyProblem('not-yet-implemented', 'Pipeline task is not 
yet implemented') + + def _register_existing_cluster_id(self, graph: DependencyGraph): # pylint: disable=unused-argument + if not self._task.existing_cluster_id: + return + # TODO: https://github.com/databrickslabs/ucx/issues/1637 + # load libraries installed on the referred cluster + yield DependencyProblem('not-yet-implemented', 'Existing cluster id is not yet implemented') + + def _register_spark_submit_task(self, graph: DependencyGraph): # pylint: disable=unused-argument + if not self._task.spark_submit_task: + return + yield DependencyProblem('not-yet-implemented', 'Spark submit task is not yet implemented') + + +class WorkflowLinter: + def __init__( + self, + ws: WorkspaceClient, + resolver: DependencyResolver, + path_lookup: PathLookup, + migration_index: MigrationIndex, + ): + self._ws = ws + self._resolver = resolver + self._path_lookup = path_lookup + self._migration_index = migration_index + + def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): + tasks = [] + for job in self._ws.jobs.list(): + tasks.append(functools.partial(self.lint_job, job.job_id)) + problems: list[JobProblem] = [] + for batch in Threads.strict('linting workflows', tasks): + problems.extend(batch) + sql_backend.save_table(f'{inventory_database}.workflow_problems', problems, JobProblem, mode='overwrite') + + def lint_job(self, job_id: int) -> list[JobProblem]: + try: + job = self._ws.jobs.get(job_id) + return list(self._lint_job(job)) + except NotFound: + logger.warning(f'Could not find job: {job_id}') + return [] + + _UNKNOWN = Path('') + + def _lint_job(self, job: jobs.Job) -> list[JobProblem]: + problems: list[JobProblem] = [] + assert job.job_id is not None + assert job.settings is not None + assert job.settings.name is not None + assert job.settings.tasks is not None + for task in job.settings.tasks: + for path, advice in self._lint_task(task, job): + absolute_path = path.absolute().as_posix() if path != self._UNKNOWN else 'UNKNOWN' + job_problem = 
JobProblem( + job_id=job.job_id, + job_name=job.settings.name, + task_key=task.task_key, + path=absolute_path, + code=advice.code, + message=advice.message, + start_line=advice.start_line, + start_col=advice.start_col, + end_line=advice.end_line, + end_col=advice.end_col, + ) + problems.append(job_problem) + return problems + + def _lint_task(self, task: jobs.Task, job: jobs.Job): + dependency: Dependency = WorkflowTask(self._ws, task, job) + graph = DependencyGraph(dependency, None, self._resolver, self._path_lookup) + container = dependency.load(self._path_lookup) + assert container is not None # because we just created it + problems = container.build_dependency_graph(graph) + if problems: + for problem in problems: + source_path = self._UNKNOWN if problem.is_path_missing() else problem.source_path + yield source_path, problem + return + session_state = CurrentSessionState() + state = Languages(self._migration_index, session_state) + for dependency in graph.all_dependencies: + logger.info(f'Linting {task.task_key} dependency: {dependency}') + container = dependency.load(graph.path_lookup) + if not container: + continue + if isinstance(container, Notebook): + yield from self._lint_notebook(container, state) + if isinstance(container, LocalFile): + yield from self._lint_file(container, state) + + def _lint_file(self, file: LocalFile, state: Languages): + linter = FileLinter(state, file.path) + for advice in linter.lint(): + yield file.path, advice + + def _lint_notebook(self, notebook: Notebook, state: Languages): + linter = NotebookLinter(state, notebook) + for advice in linter.lint(): + yield notebook.path, advice diff --git a/src/databricks/labs/ucx/source_code/languages.py b/src/databricks/labs/ucx/source_code/languages.py index 7413bb4307..2ffc062d11 100644 --- a/src/databricks/labs/ucx/source_code/languages.py +++ b/src/databricks/labs/ucx/source_code/languages.py @@ -9,9 +9,9 @@ class Languages: - def __init__(self, index: MigrationIndex): + def 
__init__(self, index: MigrationIndex, session_state: CurrentSessionState | None = None): self._index = index - session_state = CurrentSessionState() + session_state = CurrentSessionState() if not session_state else session_state from_table = FromTable(index, session_state=session_state) dbfs_from_folder = FromDbfsFolder() self._linters = { diff --git a/src/databricks/labs/ucx/source_code/notebooks/base.py b/src/databricks/labs/ucx/source_code/notebooks/base.py deleted file mode 100644 index e312d24b28..0000000000 --- a/src/databricks/labs/ucx/source_code/notebooks/base.py +++ /dev/null @@ -1 +0,0 @@ -NOTEBOOK_HEADER = "Databricks notebook source" diff --git a/src/databricks/labs/ucx/source_code/notebooks/cells.py b/src/databricks/labs/ucx/source_code/notebooks/cells.py index 7f8fc35130..cbfd2015dc 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/cells.py +++ b/src/databricks/labs/ucx/source_code/notebooks/cells.py @@ -5,17 +5,15 @@ from ast import parse as parse_python from enum import Enum from pathlib import Path -from collections.abc import Callable from sqlglot import parse as parse_sql, ParseError as SQLParseError -from databricks.labs.ucx.source_code.notebooks.base import NOTEBOOK_HEADER from databricks.sdk.service.workspace import Language -from databricks.labs.ucx.source_code.graph import DependencyGraph, DependencyProblem - +from databricks.labs.ucx.source_code.graph import DependencyGraph, DependencyProblem, MaybeGraph # use a specific logger for sqlglot warnings so we can disable them selectively sqlglot_logger = logging.getLogger(f"{__name__}.sqlglot") +NOTEBOOK_HEADER = "Databricks notebook source" CELL_SEPARATOR = "COMMAND ----------" MAGIC_PREFIX = 'MAGIC' LANGUAGE_PREFIX = '%' @@ -56,9 +54,12 @@ def is_runnable(self) -> bool: raise NotImplementedError() @abstractmethod - def build_dependency_graph(self, parent: DependencyGraph, problem_collector: Callable[[DependencyProblem], None]): + def build_dependency_graph(self, parent: 
DependencyGraph) -> MaybeGraph: raise NotImplementedError() + def __repr__(self): + return f"{self.language.name}: {self._original_code[:20]}" + class PythonCell(Cell): @@ -73,8 +74,8 @@ def is_runnable(self) -> bool: except SyntaxError: return True - def build_dependency_graph(self, parent: DependencyGraph, problem_collector: Callable[[DependencyProblem], None]): - parent.build_graph_from_python_source(self._original_code, problem_collector) + def build_dependency_graph(self, parent: DependencyGraph) -> MaybeGraph: + return parent.build_graph_from_python_source(self._original_code) class RCell(Cell): @@ -86,8 +87,8 @@ def language(self): def is_runnable(self) -> bool: return True # TODO - def build_dependency_graph(self, parent: DependencyGraph, problem_collector: Callable[[DependencyProblem], None]): - pass # not in scope + def build_dependency_graph(self, parent: DependencyGraph) -> MaybeGraph: + return MaybeGraph(None, []) class ScalaCell(Cell): @@ -99,8 +100,8 @@ def language(self): def is_runnable(self) -> bool: return True # TODO - def build_dependency_graph(self, parent: DependencyGraph, problem_collector: Callable[[DependencyProblem], None]): - pass # TODO + def build_dependency_graph(self, parent: DependencyGraph) -> MaybeGraph: + return MaybeGraph(None, []) class SQLCell(Cell): @@ -117,8 +118,8 @@ def is_runnable(self) -> bool: sqlglot_logger.warning(f"Failed to parse SQL using 'sqlglot': {self._original_code}", exc_info=e) return True - def build_dependency_graph(self, parent: DependencyGraph, problem_collector: Callable[[DependencyProblem], None]): - pass # not in scope + def build_dependency_graph(self, parent: DependencyGraph) -> MaybeGraph: + return MaybeGraph(None, []) class MarkdownCell(Cell): @@ -130,8 +131,8 @@ def language(self): def is_runnable(self) -> bool: return True # TODO - def build_dependency_graph(self, parent: DependencyGraph, problem_collector: Callable[[DependencyProblem], None]): - pass # not in scope + def 
build_dependency_graph(self, parent: DependencyGraph) -> MaybeGraph: + return MaybeGraph(None, []) class RunCell(Cell): @@ -143,7 +144,7 @@ def language(self): def is_runnable(self) -> bool: return True # TODO - def build_dependency_graph(self, parent: DependencyGraph, problem_collector: Callable[[DependencyProblem], None]): + def build_dependency_graph(self, parent: DependencyGraph) -> MaybeGraph: command = f'{LANGUAGE_PREFIX}{self.language.magic_name}' lines = self._original_code.split('\n') for idx, line in enumerate(lines): @@ -153,15 +154,15 @@ def build_dependency_graph(self, parent: DependencyGraph, problem_collector: Cal path = path.strip().strip("'").strip('"') if len(path) == 0: continue - problems: list[DependencyProblem] = [] - parent.register_notebook(Path(path), problems.append) + notebook_path = Path(path) + maybe = parent.register_notebook(notebook_path) + if not maybe.problems: + return maybe start_line = self._original_offset + idx + 1 - for problem in problems: - problem = problem.replace( - start_line=start_line, start_col=0, end_line=start_line, end_col=len(line) - ) - problem_collector(problem) - return + problems = self._adjust_problem_source_paths_and_line_numbers(parent, start_line, line, maybe.problems) + if problems: + return MaybeGraph(None, problems) + return MaybeGraph(maybe.graph, []) start_line = self._original_offset + 1 problem = DependencyProblem( 'invalid-run-cell', @@ -171,7 +172,27 @@ def build_dependency_graph(self, parent: DependencyGraph, problem_collector: Cal end_line=start_line, end_col=len(self._original_code), ) - problem_collector(problem) + return MaybeGraph(None, [problem]) + + @staticmethod + def _adjust_problem_source_paths_and_line_numbers( + parent: DependencyGraph, + line_number: int, + source_code_line: str, + raw_problems: list[DependencyProblem], + ): + problems: list[DependencyProblem] = [] + for problem in raw_problems: + if problem.is_path_missing(): + problem = 
problem.replace(source_path=parent.dependency.path.absolute()) + problem = problem.replace( + start_line=line_number, + start_col=0, + end_line=line_number, + end_col=len(source_code_line), + ) + problems.append(problem) + return problems def migrate_notebook_path(self): pass @@ -186,11 +207,8 @@ def language(self): def is_runnable(self) -> bool: return True # TODO - def build_dependency_graph(self, parent: DependencyGraph, problem_collector: Callable[[DependencyProblem], None]): - pass # nothing to do - - def migrate_notebook_path(self): - pass # nothing to do + def build_dependency_graph(self, parent: DependencyGraph) -> MaybeGraph: + return MaybeGraph(None, []) class PipCell(Cell): @@ -202,11 +220,8 @@ def language(self): def is_runnable(self) -> bool: return True # TODO - def build_dependency_graph(self, parent: DependencyGraph, problem_collector: Callable[[DependencyProblem], None]): - pass # nothing to do - - def migrate_notebook_path(self): - pass + def build_dependency_graph(self, parent: DependencyGraph) -> MaybeGraph: + return MaybeGraph(None, []) class CellLanguage(Enum): @@ -230,6 +245,10 @@ def __init__(self, *args): self._requires_isolated_pi = args[3] self._new_cell = args[4] + @property + def file_magic_header(self): + return f"{self._comment_prefix} {NOTEBOOK_HEADER}" + @property def language(self) -> Language: return self._language @@ -277,8 +296,7 @@ def new_cell(self, source: str, original_offset: int) -> Cell: def extract_cells(self, source: str) -> list[Cell] | None: lines = source.split('\n') - header = f"{self.comment_prefix} {NOTEBOOK_HEADER}" - if not lines[0].startswith(header): + if not lines[0].startswith(self.file_magic_header): raise ValueError("Not a Databricks notebook source!") def make_cell(cell_lines: list[str], start: int): diff --git a/src/databricks/labs/ucx/source_code/notebooks/loaders.py b/src/databricks/labs/ucx/source_code/notebooks/loaders.py index 56dce132a9..2ac21547f7 100644 --- 
a/src/databricks/labs/ucx/source_code/notebooks/loaders.py +++ b/src/databricks/labs/ucx/source_code/notebooks/loaders.py @@ -1,21 +1,23 @@ from __future__ import annotations import abc +import logging from pathlib import Path -from collections.abc import Callable -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.workspace import ObjectType, ObjectInfo, ExportFormat, Language +from databricks.sdk.errors import NotFound -from databricks.labs.ucx.source_code.files import FileLoader from databricks.labs.ucx.source_code.graph import ( BaseDependencyResolver, - DependencyProblem, Dependency, DependencyLoader, SourceContainer, + MaybeDependency, ) +from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage from databricks.labs.ucx.source_code.notebooks.sources import Notebook +from databricks.labs.ucx.source_code.path_lookup import PathLookup + +logger = logging.getLogger(__name__) class NotebookResolver(BaseDependencyResolver): @@ -27,55 +29,54 @@ def __init__(self, notebook_loader: NotebookLoader, next_resolver: BaseDependenc def with_next_resolver(self, resolver: BaseDependencyResolver) -> BaseDependencyResolver: return NotebookResolver(self._notebook_loader, resolver) - def resolve_notebook(self, path: Path, problem_collector: Callable[[DependencyProblem], None]) -> Dependency | None: - if self._notebook_loader.is_notebook(path): - return Dependency(self._notebook_loader, path) - return super().resolve_notebook(path, problem_collector) + def resolve_notebook(self, path_lookup: PathLookup, path: Path) -> MaybeDependency: + absolute_path = self._notebook_loader.resolve(path_lookup, path) + if not absolute_path: + return super().resolve_notebook(path_lookup, path) + dependency = Dependency(self._notebook_loader, absolute_path) + return MaybeDependency(dependency, []) class NotebookLoader(DependencyLoader, abc.ABC): - pass - - -class WorkspaceNotebookLoader(NotebookLoader): - - def __init__(self, ws: WorkspaceClient): - self._ws = ws 
- - def is_notebook(self, path: Path): - object_info = self._ws.workspace.get_status(str(path)) - # TODO check error conditions, see https://github.com/databrickslabs/ucx/issues/1361 - return object_info is not None and object_info.object_type is ObjectType.NOTEBOOK - - def load_dependency(self, dependency: Dependency) -> SourceContainer | None: - object_info = self._ws.workspace.get_status(str(dependency.path)) - # TODO check error conditions, see https://github.com/databrickslabs/ucx/issues/1361 - return self._load_notebook(object_info) - - def _load_notebook(self, object_info: ObjectInfo) -> SourceContainer: - assert object_info.path is not None - assert object_info.language is not None - source = self._load_source(object_info) - return Notebook.parse(Path(object_info.path), source, object_info.language) + def resolve(self, path_lookup: PathLookup, path: Path) -> Path | None: + """When exported through Git, notebooks are saved with a .py extension. If the path is a notebook, return the + path to the notebook. If the path is a Python file, return the path to the Python file. 
If the path is neither, + return None.""" + for candidate in (self._adjust_path(path), path): + absolute_path = path_lookup.resolve(candidate) + if not absolute_path: + continue + return absolute_path + return None + + def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> SourceContainer | None: + # TODO: catch exceptions, and create MaybeContainer to follow the pattern - OSError and NotFound are common + absolute_path = self.resolve(path_lookup, dependency.path) + if not absolute_path: + return None + try: + content = absolute_path.read_text("utf-8") + except NotFound: + logger.warning(f"Could not read notebook from workspace: {absolute_path}") + return None + language = self._detect_language(content) + if not language: + logger.warning(f"Could not detect language for {absolute_path}") + return None + return Notebook.parse(absolute_path, content, language) - def _load_source(self, object_info: ObjectInfo) -> str: - assert object_info.path is not None - with self._ws.workspace.download(object_info.path, format=ExportFormat.SOURCE) as f: - return f.read().decode("utf-8") - - -class LocalNotebookLoader(NotebookLoader, FileLoader): - - def load_dependency(self, dependency: Dependency) -> SourceContainer | None: - fullpath = self.full_path(self._adjust_path(dependency.path)) - assert fullpath is not None - return Notebook.parse(fullpath, fullpath.read_text("utf-8"), Language.PYTHON) - - def is_notebook(self, path: Path) -> bool: - return super().is_notebook(self._adjust_path(path)) + @staticmethod + def _detect_language(content: str): + for language in CellLanguage: + if content.startswith(language.file_magic_header): + return language.language + return None @staticmethod def _adjust_path(path: Path): if path.suffix == ".py": return path return Path(path.as_posix() + ".py") + + def __repr__(self): + return "NotebookLoader()" diff --git a/src/databricks/labs/ucx/source_code/notebooks/migrator.py 
b/src/databricks/labs/ucx/source_code/notebooks/migrator.py index 74d6837348..4064965300 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/migrator.py +++ b/src/databricks/labs/ucx/source_code/notebooks/migrator.py @@ -2,38 +2,33 @@ from pathlib import Path -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.workspace import ObjectInfo, ExportFormat, ObjectType from databricks.labs.ucx.source_code.graph import Dependency from databricks.labs.ucx.source_code.languages import Languages from databricks.labs.ucx.source_code.notebooks.cells import RunCell -from databricks.labs.ucx.source_code.notebooks.loaders import WorkspaceNotebookLoader +from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader from databricks.labs.ucx.source_code.notebooks.sources import Notebook +from databricks.labs.ucx.source_code.path_lookup import PathLookup class NotebookMigrator: - def __init__( - self, - ws: WorkspaceClient, - languages: Languages, - ): - self._ws = ws + def __init__(self, languages: Languages): + # TODO: move languages to `apply` self._languages = languages - def revert(self, object_info: ObjectInfo): - if not object_info.path: + def revert(self, path: Path): + backup_path = path.with_suffix(".bak") + if not backup_path.exists(): return False - with self._ws.workspace.download(object_info.path + ".bak", format=ExportFormat.SOURCE) as f: - code = f.read().decode("utf-8") - self._ws.workspace.upload(object_info.path, code.encode("utf-8")) - return True + return path.write_text(backup_path.read_text()) > 0 - def apply(self, object_info: ObjectInfo) -> bool: - if not object_info.path or not object_info.language or object_info.object_type is not ObjectType.NOTEBOOK: + def apply(self, path: Path) -> bool: + if not path.exists(): return False - dependency = Dependency(WorkspaceNotebookLoader(self._ws), Path(object_info.path)) - container = dependency.load() + dependency = Dependency(NotebookLoader(), path) + # TODO: the interface 
for this method has to be changed + lookup = PathLookup.from_sys_path(Path.cwd()) + container = dependency.load(lookup) assert isinstance(container, Notebook) return self._apply(container) @@ -53,7 +48,7 @@ def _apply(self, notebook: Notebook) -> bool: cell.migrated_code = migrated_code changed = True if changed: - self._ws.workspace.upload(notebook.path.as_posix() + ".bak", notebook.original_code.encode("utf-8")) - self._ws.workspace.upload(notebook.path.as_posix(), notebook.to_migrated_code().encode("utf-8")) # TODO https://github.com/databrickslabs/ucx/issues/1327 store 'migrated' status + notebook.path.replace(notebook.path.with_suffix(".bak")) + notebook.path.write_text(notebook.to_migrated_code()) return changed diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 4bb79e1936..caff152c49 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -1,18 +1,17 @@ from __future__ import annotations from collections.abc import Iterable +from functools import cached_property from pathlib import Path -from databricks.labs.ucx.source_code.path_lookup import PathLookup from databricks.sdk.service.workspace import Language from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex -from databricks.labs.ucx.source_code.base import Advice +from databricks.labs.ucx.source_code.base import Advice, Failure from databricks.labs.ucx.source_code.graph import SourceContainer, DependencyGraph, DependencyProblem from databricks.labs.ucx.source_code.languages import Languages -from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage, Cell, CELL_SEPARATOR -from databricks.labs.ucx.source_code.notebooks.base import NOTEBOOK_HEADER +from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage, Cell, CELL_SEPARATOR, NOTEBOOK_HEADER class Notebook(SourceContainer): @@ -61,13 +60,19 
@@ def to_migrated_code(self): sources.append('') # following join will append lf return '\n'.join(sources) - def build_dependency_graph(self, parent: DependencyGraph, path_lookup: PathLookup) -> None: - path_lookup.push_cwd(self.path.parent) + def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: problems: list[DependencyProblem] = [] for cell in self._cells: - cell.build_dependency_graph(parent, problems.append) - parent.add_problems(problems) - path_lookup.pop_cwd() + # create + maybe = cell.build_dependency_graph(parent) + for problem in maybe.problems: + if problem.is_path_missing(): + problem = problem.replace(source_path=parent.path) + problems.append(problem) + return problems + + def __repr__(self): + return f"" class NotebookLinter: @@ -101,3 +106,49 @@ def lint(self) -> Iterable[Advice]: @staticmethod def name() -> str: return "notebook-linter" + + +class FileLinter: + _EXT = { + '.py': Language.PYTHON, + '.sql': Language.SQL, + } + + def __init__(self, langs: Languages, path: Path): + self._languages: Languages = langs + self._path: Path = path + + @cached_property + def _content(self) -> str: + return self._path.read_text() + + def _file_language(self): + return self._EXT.get(self._path.suffix) + + def _is_notebook(self): + language = self._file_language() + if not language: + return False + cell_language = CellLanguage.of_language(language) + return self._content.startswith(cell_language.file_magic_header) + + def lint(self) -> Iterable[Advice]: + if self._is_notebook(): + yield from self._lint_notebook() + else: + yield from self._lint_file() + + def _lint_file(self): + language = self._file_language() + if not language: + yield Failure("unsupported-language", f"Cannot detect language for {self._path}", 0, 0, 1, 1) + try: + linter = self._languages.linter(language) + yield from linter.lint(self._content) + except ValueError as err: + yield Failure("unsupported-language", str(err), 0, 0, 1, 1) + + def 
_lint_notebook(self): + notebook = Notebook.parse(self._path, self._content, self._file_language()) + notebook_linter = NotebookLinter(self._languages, notebook) + yield from notebook_linter.lint() diff --git a/src/databricks/labs/ucx/source_code/path_lookup.py b/src/databricks/labs/ucx/source_code/path_lookup.py index 04375df6b8..90ba996816 100644 --- a/src/databricks/labs/ucx/source_code/path_lookup.py +++ b/src/databricks/labs/ucx/source_code/path_lookup.py @@ -1,9 +1,11 @@ from __future__ import annotations +import logging import sys -from collections.abc import Iterable from pathlib import Path +logger = logging.getLogger(__name__) + class PathLookup: @@ -17,8 +19,23 @@ def from_sys_path(cls, cwd: Path): return PathLookup(cwd, [Path(path) for path in sys.path]) def __init__(self, cwd: Path, sys_paths: list[Path]): + self._cwd = cwd self._sys_paths = sys_paths - self._cwds = [cwd] + + def change_directory(self, new_working_directory: Path) -> PathLookup: + return PathLookup(new_working_directory, self._sys_paths) + + def resolve(self, path: Path) -> Path | None: + if path.is_absolute() and path.exists(): + return path + for parent in self.paths: + absolute_path = parent / path + try: + if absolute_path.exists(): + return absolute_path + except PermissionError: + logger.warning(f"Permission denied to access {absolute_path}") + return None def push_path(self, path: Path): self._sys_paths.insert(0, path) @@ -35,20 +52,12 @@ def pop_path(self) -> Path: return result @property - def paths(self) -> Iterable[Path]: - yield self.cwd - yield from self._sys_paths - - def push_cwd(self, path: Path): - self._cwds.append(path) - - def pop_cwd(self): - result = self._cwds[0] - del self._cwds[0] - return result + def paths(self) -> list[Path]: + return [self.cwd] + self._sys_paths @property def cwd(self): - # the below might fail but that's better than returning an incorrect cwd - assert len(self._cwds) > 0 - return self._cwds[-1] + return self._cwd + + def __repr__(self): + 
return f"PathLookup(cwd={self._cwd}, sys_paths={self._sys_paths})" diff --git a/src/databricks/labs/ucx/source_code/pyspark.py b/src/databricks/labs/ucx/source_code/pyspark.py index 9cf51294e9..1ce97b542c 100644 --- a/src/databricks/labs/ucx/source_code/pyspark.py +++ b/src/databricks/labs/ucx/source_code/pyspark.py @@ -251,6 +251,7 @@ def __init__(self): TableNameMatcher("createTable", 1, 1000, 0, "tableName"), TableNameMatcher("createExternalTable", 1, 1000, 0, "tableName"), TableNameMatcher("getTable", 1, 1, 0), + TableNameMatcher("table", 1, 1, 0), TableNameMatcher("isCached", 1, 1, 0), TableNameMatcher("listColumns", 1, 2, 0, "tableName"), TableNameMatcher("tableExists", 1, 2, 0, "tableName"), diff --git a/src/databricks/labs/ucx/source_code/python_linter.py b/src/databricks/labs/ucx/source_code/python_linter.py index 7425a282ba..bddca8b58c 100644 --- a/src/databricks/labs/ucx/source_code/python_linter.py +++ b/src/databricks/labs/ucx/source_code/python_linter.py @@ -216,6 +216,13 @@ def is_none(self) -> bool: return False return self._root.value is None + def __repr__(self): + truncate_after = 32 + code = ast.unparse(self._root) + if len(code) > truncate_after: + code = code[0:truncate_after] + "..." + return f"" + class PythonLinter(Linter): @@ -261,8 +268,10 @@ def list_dbutils_notebook_run_calls(linter: ASTLinter) -> list[ast.Call]: def list_import_sources(linter: ASTLinter) -> list[tuple[str, ast.AST]]: nodes = linter.locate(ast.Import, []) tuples = [(alias.name, node) for node in nodes for alias in node.names] - nodes = linter.locate(ast.ImportFrom, []) - tuples.extend((node.module, node) for node in nodes) + for node in linter.locate(ast.ImportFrom, []): + leading_dots = "." 
* node.level + import_name = f"{leading_dots}{node.module}" if node.module else leading_dots + tuples.append((import_name, node)) nodes = linter.locate(ast.Call, [("import_module", ast.Attribute), ("importlib", ast.Name)]) tuples.extend((node.args[0].value, node) for node in nodes) nodes = linter.locate(ast.Call, [("__import__", ast.Attribute), ("importlib", ast.Name)]) diff --git a/src/databricks/labs/ucx/source_code/site_packages.py b/src/databricks/labs/ucx/source_code/site_packages.py index 3302704490..4fde9a2307 100644 --- a/src/databricks/labs/ucx/source_code/site_packages.py +++ b/src/databricks/labs/ucx/source_code/site_packages.py @@ -1,9 +1,7 @@ from __future__ import annotations import os -from dataclasses import dataclass from pathlib import Path -from collections.abc import Callable from databricks.labs.ucx.source_code.files import FileLoader from databricks.labs.ucx.source_code.path_lookup import PathLookup from databricks.labs.ucx.source_code.graph import ( @@ -13,10 +11,12 @@ DependencyGraph, BaseDependencyResolver, DependencyProblem, + MaybeDependency, ) -class SitePackagesResolver(BaseDependencyResolver): +class SitePackageResolver(BaseDependencyResolver): + # TODO: this is incorrect logic, remove this resolver def __init__( self, @@ -31,14 +31,17 @@ def __init__( self._path_lookup = path_lookup def with_next_resolver(self, resolver: BaseDependencyResolver) -> BaseDependencyResolver: - return SitePackagesResolver(self._site_packages, self._file_loader, self._path_lookup, resolver) + return SitePackageResolver(self._site_packages, self._file_loader, self._path_lookup, resolver) - def resolve_import(self, name: str, problem_collector: Callable[[DependencyProblem], None]) -> Dependency | None: + def resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency: + # TODO: `resovle_import` is irrelevant for dist-info containers + # databricks-labs-ucx vs databricks.labs.ucx site_package = self._site_packages[name] if site_package is not 
None: container = SitePackageContainer(self._file_loader, site_package) - return Dependency(WrappingLoader(container), Path(name)) - return super().resolve_import(name, problem_collector) + dependency = Dependency(WrappingLoader(container), Path(name)) + return MaybeDependency(dependency, []) + return super().resolve_import(path_lookup, name) class SitePackageContainer(SourceContainer): @@ -47,9 +50,20 @@ def __init__(self, file_loader: FileLoader, site_package: SitePackage): self._file_loader = file_loader self._site_package = site_package - def build_dependency_graph(self, parent: DependencyGraph, path_lookup: PathLookup) -> None: + def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: + problems: list[DependencyProblem] = [] for module_path in self._site_package.module_paths: - parent.register_dependency(Dependency(self._file_loader, module_path)) + maybe = parent.register_dependency(Dependency(self._file_loader, module_path)) + if maybe.problems: + problems.extend(maybe.problems) + return problems + + @property + def paths(self): + return self._site_package.module_paths + + def __repr__(self): + return f"" class SitePackages: @@ -70,7 +84,6 @@ def __getitem__(self, item: str) -> SitePackage | None: return self._packages.get(item, None) -@dataclass class SitePackage: @staticmethod @@ -104,3 +117,6 @@ def top_levels(self) -> list[str]: @property def module_paths(self) -> list[Path]: return [Path(self._dist_info_path.parent, path) for path in self._module_paths] + + def __repr__(self): + return f"" diff --git a/src/databricks/labs/ucx/source_code/whitelist.py b/src/databricks/labs/ucx/source_code/whitelist.py index d0e7d0311d..080605d8c7 100644 --- a/src/databricks/labs/ucx/source_code/whitelist.py +++ b/src/databricks/labs/ucx/source_code/whitelist.py @@ -5,7 +5,7 @@ from dataclasses import dataclass, field from enum import Enum from pathlib import Path -from collections.abc import Callable, Iterable +from collections.abc import 
Iterable from yaml import load_all as load_yaml, Loader from databricks.labs.ucx.source_code.path_lookup import PathLookup @@ -17,6 +17,7 @@ DependencyGraph, BaseDependencyResolver, DependencyProblem, + MaybeDependency, ) @@ -30,36 +31,37 @@ def with_next_resolver(self, resolver: BaseDependencyResolver) -> BaseDependency return WhitelistResolver(self._whitelist, resolver) # TODO problem_collector is tactical, pending https://github.com/databrickslabs/ucx/issues/1559 - def resolve_import(self, name: str, problem_collector: Callable[[DependencyProblem], None]) -> Dependency | None: - if self._is_whitelisted(name): + def resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency: + problem, is_known = self._is_whitelisted(name) + if problem is not None: + return MaybeDependency(None, [problem]) + if is_known: container = StubContainer() - return Dependency(WrappingLoader(container), Path(name)) - return super().resolve_import(name, problem_collector) + dependency = Dependency(WrappingLoader(container), Path(name)) + return MaybeDependency(dependency, []) + return super().resolve_import(path_lookup, name) - def _is_whitelisted(self, name: str) -> bool: + def _is_whitelisted(self, name: str) -> tuple[DependencyProblem | None, bool]: compatibility = self._whitelist.compatibility(name) # TODO attach compatibility to dependency, see https://github.com/databrickslabs/ucx/issues/1382 if compatibility is None: - return False + return None, False if compatibility == UCCompatibility.NONE: # TODO move to linter, see https://github.com/databrickslabs/ucx/issues/1527 - self._problems.append( + return ( DependencyProblem( code="dependency-check", message=f"Use of dependency {name} is deprecated", - start_line=0, - start_col=0, - end_line=0, - end_col=0, - ) + ), + True, ) - return True + return None, True class StubContainer(SourceContainer): - def build_dependency_graph(self, parent: DependencyGraph, path_lookup: PathLookup) -> None: - pass + def 
build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: + return [] class UCCompatibility(Enum): diff --git a/src/databricks/labs/ucx/source_code/workflows.py b/src/databricks/labs/ucx/source_code/workflows.py new file mode 100644 index 0000000000..c3775dfb1c --- /dev/null +++ b/src/databricks/labs/ucx/source_code/workflows.py @@ -0,0 +1,13 @@ +from databricks.labs.ucx.contexts.workflow_task import RuntimeContext +from databricks.labs.ucx.framework.tasks import Workflow, job_task + + +class ExperimentalWorkflowLinter(Workflow): + def __init__(self): + super().__init__('experimental-workflow-linter') + + @job_task(job_cluster="table_migration") + def lint_all_workflows(self, ctx: RuntimeContext): + """[EXPERIMENTAL] Analyses all jobs for source code compatibility problems. This is an experimental feature, + that is not yet fully supported.""" + ctx.workflow_linter.refresh_report(ctx.sql_backend, ctx.inventory_database) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 3f8b0425b8..132740ec76 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -549,6 +549,8 @@ def aws_cli_ctx(ws, env_or_skip, make_schema, sql_backend): class TestInstallationContext(TestRuntimeContext): + __test__ = False + def __init__( # pylint: disable=too-many-arguments self, make_table_fixture, diff --git a/tests/integration/source_code/__init__.py b/tests/integration/source_code/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py new file mode 100644 index 0000000000..554db28ad9 --- /dev/null +++ b/tests/integration/source_code/test_jobs.py @@ -0,0 +1,74 @@ +from pathlib import Path + +import pytest + +from databricks.labs.ucx.mixins.wspath import WorkspacePath + + +def test_running_real_workflow_linter_job(installation_ctx): + ctx = installation_ctx + ctx.workspace_installation.run() + 
ctx.deployed_workflows.run_workflow("experimental-workflow-linter") + ctx.deployed_workflows.validate_step("experimental-workflow-linter") + cursor = ctx.sql_backend.fetch(f"SELECT COUNT(*) AS count FROM {ctx.inventory_database}.workflow_problems") + result = next(cursor) + assert result['count'] > 0 + + +@pytest.fixture +def simple_ctx(installation_ctx, sql_backend, ws): + return installation_ctx.replace( + sql_backend=sql_backend, + workspace_client=ws, + connect=ws.config, + ) + + +def test_linter_from_context(simple_ctx): + # This code is essentially the same as in test_running_real_workflow_linter_job, + # but it's executed on the caller side and is easier to debug. + simple_ctx.workflow_linter.refresh_report(simple_ctx.sql_backend, simple_ctx.inventory_database) + + cursor = simple_ctx.sql_backend.fetch( + f"SELECT COUNT(*) AS count FROM {simple_ctx.inventory_database}.workflow_problems" + ) + result = next(cursor) + assert result['count'] > 0 + + +def test_job_linter_no_problems(simple_ctx, ws, make_job): + j = make_job() + + problems = simple_ctx.workflow_linter.lint_job(j.job_id) + + assert len(problems) == 0 + + +def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, make_notebook, make_random): + entrypoint = WorkspacePath(ws, f"~/linter-{make_random(4)}").expanduser() + entrypoint.mkdir() + + main_notebook = entrypoint / 'main' + make_notebook(path=main_notebook, content=b'%run ./second_notebook') + j = make_job(notebook_path=main_notebook) + + make_notebook( + path=entrypoint / 'second_notebook', + content=b"""import some_file +print('hello world') +display(spark.read.parquet("/mnt/something")) +""", + ) + + some_file = entrypoint / 'some_file.py' + some_file.write_text('display(spark.read.parquet("/mnt/foo/bar"))') + + problems = simple_ctx.workflow_linter.lint_job(j.job_id) + + messages = {f'{Path(p.path).relative_to(entrypoint)}:{p.start_line} [{p.code}] {p.message}' for p in problems} + assert messages == { + 
'second_notebook:4 [direct-filesystem-access] The use of default dbfs: references is deprecated: /mnt/something', + 'some_file.py:1 [direct-filesystem-access] The use of default dbfs: references is deprecated: /mnt/foo/bar', + 'some_file.py:1 [dbfs-usage] Deprecated file system path in call to: /mnt/foo/bar', + 'second_notebook:4 [dbfs-usage] Deprecated file system path in call to: /mnt/something', + } diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py index 838d481fb2..6504deae65 100644 --- a/tests/unit/__init__.py +++ b/tests/unit/__init__.py @@ -4,26 +4,22 @@ import logging import os import pathlib +from pathlib import Path from unittest.mock import create_autospec -from typing import BinaryIO from databricks.labs.blueprint.installation import MockInstallation from databricks.labs.lsql.backends import MockBackend -from databricks.labs.ucx.source_code.notebooks.loaders import LocalNotebookLoader from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound from databricks.sdk.service.compute import ClusterDetails, Policy from databricks.sdk.service.jobs import BaseJob, BaseRun from databricks.sdk.service.pipelines import GetPipelineResponse, PipelineStateInfo from databricks.sdk.service.sql import EndpointConfPair -from databricks.sdk.service.workspace import ExportResponse, GetSecretResponse, Language +from databricks.sdk.service.workspace import ExportResponse, GetSecretResponse from databricks.labs.ucx.hive_metastore.mapping import TableMapping, TableToMigrate -from databricks.labs.ucx.source_code.graph import SourceContainer, Dependency -from databricks.labs.ucx.source_code.files import LocalFile, FileLoader +from databricks.labs.ucx.source_code.graph import SourceContainer from databricks.labs.ucx.source_code.path_lookup import PathLookup -from databricks.labs.ucx.source_code.notebooks.sources import Notebook -from databricks.labs.ucx.source_code.notebooks.base import NOTEBOOK_HEADER from 
databricks.labs.ucx.source_code.whitelist import Whitelist logging.getLogger("tests").setLevel("DEBUG") @@ -127,6 +123,7 @@ def _id_list(cls: type, ids=None): def _load_sources(cls: type, *filenames: str): + # TODO: remove the usage of it in favor of MockPathLookup if not filenames: return [] installation = MockInstallation(DEFAULT_CONFIG | {_: _load_source(f'{_FOLDERS[cls]}/{_}') for _ in filenames}) @@ -156,144 +153,29 @@ def _secret_not_found(secret_scope, _): raise NotFound(msg) -# can't remove **kwargs because it receives format=xxx -# pylint: disable=unused-argument -def _download_side_effect(sources: dict[str, str], visited: dict[str, bool], *args, **kwargs): - filename = args[0] - if filename.startswith('./'): - filename = filename[2:] - visited[filename] = True - source = sources.get(filename, None) - if filename.find(".py") < 0: - filename = filename + ".py" - if filename.find(".txt") < 0: - filename = filename + ".txt" - result = create_autospec(BinaryIO) - if source is None: - source = sources.get(filename) - assert source is not None - result.__enter__.return_value.read.return_value = source.encode("utf-8") - return result - - -def _load_dependency_side_effect(sources: dict[str, str], visited: dict[str, bool], *args): - dependency = args[0] - filename = str(dependency.path) - is_package_file = os.path.isfile(dependency.path) - if is_package_file: - with dependency.path.open("r") as f: - source = f.read() - else: - if filename.startswith('./'): - filename = filename[2:] - visited[filename] = True - source = sources.get(filename, None) - if filename.find(".py") < 0: - filename = filename + ".py" - if filename.find(".txt") < 0: - filename = filename + ".txt" - if source is None: - source = sources.get(filename) - assert source is not None - if NOTEBOOK_HEADER in source: - return Notebook.parse(pathlib.Path(filename), source, Language.PYTHON) - return LocalFile(pathlib.Path(filename), source, Language.PYTHON) - - -def _is_notebook_side_effect(sources: 
dict[str, str], *args): - dependency = args[0] - filename = dependency.path.as_posix() - if filename.startswith('./'): - filename = filename[2:] - source = sources.get(filename, None) - if filename.find(".py") < 0: - filename = filename + ".py" - if filename.find(".txt") < 0: - filename = filename + ".txt" - if source is None: - source = sources.get(filename) - assert source is not None - return NOTEBOOK_HEADER in source - - -def _full_path_side_effect(sources: dict[str, str], *args): - path = args[0] - filename = path.as_posix() - if filename.startswith('./'): - filename = filename[2:] - if filename in sources: - return pathlib.Path(filename) - if filename.find(".py") < 0: - filename = filename + ".py" - if filename.find(".txt") < 0: - filename = filename + ".txt" - if filename in sources: - return pathlib.Path(filename) - return None - - -def _is_file_side_effect(sources: dict[str, str], *args): - return _full_path_side_effect(sources, *args) is not None - - -def _local_loader_with_side_effects(cls: type, sources: dict[str, str], visited: dict[str, bool]): - file_loader = create_autospec(cls) - file_loader.exists.side_effect = lambda *args, **kwargs: _is_file_side_effect(sources, *args) - file_loader.is_notebook.return_value = False - file_loader.full_path.side_effect = lambda *args: _full_path_side_effect(sources, *args) - file_loader.load_dependency.side_effect = lambda *args, **kwargs: _load_dependency_side_effect( - sources, visited, *args - ) - return file_loader - - -class TestFileLoader(FileLoader): - __test__ = False - - def __init__(self, path_lookup: PathLookup, sources: dict[str, str]): - super().__init__(path_lookup) - self._sources = sources - - def exists(self, path: pathlib.Path): - if super().exists(path): - return True +class MockPathLookup(PathLookup): + def __init__(self, cwd='source_code/samples', sys_paths: list[Path] | None = None): + super().__init__(pathlib.Path(__file__).parent / cwd, sys_paths or []) + + def change_directory(self, 
new_working_directory: Path) -> 'MockPathLookup': + return MockPathLookup(new_working_directory, self._sys_paths) + + def resolve(self, path: pathlib.Path) -> pathlib.Path | None: + if path.is_absolute() and path.exists(): + return path filename = path.as_posix() - if filename.startswith('./'): - filename = filename[2:] - if filename in self._sources: - return True - if filename.find(".py") < 0: - filename = filename + ".py" - if filename.find(".txt") < 0: - filename = filename + ".txt" - return filename in self._sources - - -class VisitingFileLoader(FileLoader): - __test__ = False - - def __init__(self, path_lookup: PathLookup, visited: dict[str, bool]): - super().__init__(path_lookup) - self._visited = visited - - def load_dependency(self, dependency: Dependency) -> SourceContainer | None: - container = super().load_dependency(dependency) - if isinstance(container, LocalFile): - self._visited[container.path.as_posix()] = True - return container - - -class VisitingNotebookLoader(LocalNotebookLoader): - - def __init__(self, path_lookup: PathLookup, visited: dict[str, bool]): - super().__init__(path_lookup) - self._visited = visited - - def load_dependency(self, dependency: Dependency) -> SourceContainer | None: - container = super().load_dependency(dependency) - if isinstance(container, Notebook): - self._visited[container.path.as_posix()] = True - return container + candidates = [filename] + if not filename.endswith('.txt'): + candidates.append(f'{filename}.txt') + for candidate in candidates: + some_file = self._cwd / candidate + if not some_file.exists(): + continue + return some_file + return None + + def __repr__(self): + return f"<MockPathLookup {self._cwd}>" def workspace_client_mock( diff --git a/tests/unit/source_code/samples/root1-no-leaf.run.py.txt b/tests/unit/source_code/samples/root1-no-leaf.run.py.txt new file mode 100644 index 0000000000..022b6f5bb8 --- /dev/null +++ b/tests/unit/source_code/samples/root1-no-leaf.run.py.txt @@ -0,0 +1,20 @@ +# Databricks notebook source +# 
time horizon +days = 300 + +# volatility +sigma = 0.04 + +# drift (average growth rate) +mu = 0.05 + +# initial starting price +start_price = 10 + +# COMMAND ---------- + +# MAGIC %run "./leaf1.py.txt" + +# COMMAND ---------- + +# MAGIC %run "./__NOT_FOUND__" diff --git a/tests/unit/source_code/samples/root4-no-leaf.py.txt b/tests/unit/source_code/samples/root4-no-leaf.py.txt new file mode 100644 index 0000000000..97c425c05c --- /dev/null +++ b/tests/unit/source_code/samples/root4-no-leaf.py.txt @@ -0,0 +1,2 @@ +# Databricks notebook source +dbutils.notebook.run("./__NO_LEAF__") diff --git a/tests/unit/source_code/samples/simulate-sys-path/child-grand-parent/sibling_notebook.py b/tests/unit/source_code/samples/simulate-sys-path/child-grand-parent/sibling_notebook.py index aa2e848b2d..491230d9a4 100644 --- a/tests/unit/source_code/samples/simulate-sys-path/child-grand-parent/sibling_notebook.py +++ b/tests/unit/source_code/samples/simulate-sys-path/child-grand-parent/sibling_notebook.py @@ -1,6 +1,2 @@ # Databricks notebook source whatever = 12 - -# COMMAND ---------- - -# MAGIC %run ./sibling2_notebook diff --git a/tests/unit/source_code/samples/simulate-sys-path/child-parent/sibling_notebook.py b/tests/unit/source_code/samples/simulate-sys-path/child-parent/sibling_notebook.py index aa2e848b2d..491230d9a4 100644 --- a/tests/unit/source_code/samples/simulate-sys-path/child-parent/sibling_notebook.py +++ b/tests/unit/source_code/samples/simulate-sys-path/child-parent/sibling_notebook.py @@ -1,6 +1,2 @@ # Databricks notebook source whatever = 12 - -# COMMAND ---------- - -# MAGIC %run ./sibling2_notebook diff --git a/tests/unit/source_code/samples/simulate-sys-path/parent-child/child-folder/sibling_notebook.py b/tests/unit/source_code/samples/simulate-sys-path/parent-child/child-folder/sibling_notebook.py index aa2e848b2d..491230d9a4 100644 --- a/tests/unit/source_code/samples/simulate-sys-path/parent-child/child-folder/sibling_notebook.py +++ 
b/tests/unit/source_code/samples/simulate-sys-path/parent-child/child-folder/sibling_notebook.py @@ -1,6 +1,2 @@ # Databricks notebook source whatever = 12 - -# COMMAND ---------- - -# MAGIC %run ./sibling2_notebook diff --git a/tests/unit/source_code/samples/simulate-sys-path/parent-grand-child/child-folder/child-folder/sibling_notebook.py b/tests/unit/source_code/samples/simulate-sys-path/parent-grand-child/child-folder/child-folder/sibling_notebook.py index aa2e848b2d..491230d9a4 100644 --- a/tests/unit/source_code/samples/simulate-sys-path/parent-grand-child/child-folder/child-folder/sibling_notebook.py +++ b/tests/unit/source_code/samples/simulate-sys-path/parent-grand-child/child-folder/child-folder/sibling_notebook.py @@ -1,6 +1,2 @@ # Databricks notebook source whatever = 12 - -# COMMAND ---------- - -# MAGIC %run ./sibling2_notebook diff --git a/tests/unit/source_code/test_dependencies.py b/tests/unit/source_code/test_dependencies.py index 3d259c1084..71697661ea 100644 --- a/tests/unit/source_code/test_dependencies.py +++ b/tests/unit/source_code/test_dependencies.py @@ -1,8 +1,5 @@ from pathlib import Path -from unittest.mock import create_autospec -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.workspace import ObjectInfo, Language, ObjectType from databricks.labs.ucx.source_code.graph import ( SourceContainer, @@ -12,311 +9,168 @@ ) from databricks.labs.ucx.source_code.notebooks.loaders import ( NotebookResolver, - WorkspaceNotebookLoader, - LocalNotebookLoader, + NotebookLoader, ) from databricks.labs.ucx.source_code.files import FileLoader, LocalFileResolver from databricks.labs.ucx.source_code.path_lookup import PathLookup -from databricks.labs.ucx.source_code.whitelist import WhitelistResolver, Whitelist -from databricks.labs.ucx.source_code.site_packages import SitePackagesResolver, SitePackages +from databricks.labs.ucx.source_code.whitelist import WhitelistResolver +from databricks.labs.ucx.source_code.site_packages 
import SitePackageResolver, SitePackages from tests.unit import ( - _load_sources, - _download_side_effect, whitelist_mock, - _load_dependency_side_effect, locate_site_packages, - TestFileLoader, _samples_path, - _local_loader_with_side_effects, + MockPathLookup, ) def test_dependency_graph_builder_visits_workspace_notebook_dependencies(): - paths = ["root3.run.py.txt", "root1.run.py.txt", "leaf1.py.txt", "leaf2.py.txt"] - sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) - visited: dict[str, bool] = {} - - def get_status_side_effect(*args): - path = args[0] - return ObjectInfo(path=path, language=Language.PYTHON, object_type=ObjectType.NOTEBOOK) - - ws = create_autospec(WorkspaceClient) - ws.workspace.download.side_effect = lambda *args, **kwargs: _download_side_effect(sources, visited, *args, **kwargs) - ws.workspace.get_status.side_effect = get_status_side_effect - workspace_notebook_loader = WorkspaceNotebookLoader(ws) - dependency_resolver = DependencyResolver( - [ - NotebookResolver(workspace_notebook_loader), - ] - ) - provider = PathLookup.from_pathlike_string(Path.cwd(), "") - builder = DependencyGraphBuilder(dependency_resolver, provider) - builder.build_notebook_dependency_graph(Path("root3.run.py.txt")) - assert len(visited) == len(paths) + lookup = MockPathLookup() + notebook_loader = NotebookLoader() + dependency_resolver = DependencyResolver([NotebookResolver(notebook_loader)]) + builder = DependencyGraphBuilder(dependency_resolver, lookup) + maybe = builder.build_notebook_dependency_graph(Path("root3.run.py")) + assert not maybe.failed + assert maybe.graph.all_relative_names() == {"root3.run.py.txt", "root1.run.py.txt", "leaf1.py.txt", "leaf2.py.txt"} def test_dependency_graph_builder_visits_local_notebook_dependencies(): - paths = ["root4.py.txt", "leaf3.py.txt"] - sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) - visited: dict[str, bool] = {} - whi = whitelist_mock() - file_loader 
= create_autospec(FileLoader) - file_loader.load_dependency.side_effect = lambda *args, **kwargs: _load_dependency_side_effect( - sources, visited, *args - ) - file_loader.is_notebook.return_value = True - file_loader.exists.return_value = True - notebook_loader = create_autospec(LocalNotebookLoader) - notebook_loader.load_dependency.side_effect = lambda *args, **kwargs: _load_dependency_side_effect( - sources, visited, *args - ) - notebook_loader.is_notebook.return_value = True - notebook_loader.exists.return_value = True - site_packages = SitePackages.parse(locate_site_packages()) - provider = PathLookup.from_pathlike_string(Path.cwd(), "") - dependency_resolver = DependencyResolver( - [ - NotebookResolver(notebook_loader), - SitePackagesResolver(site_packages, file_loader, provider), - WhitelistResolver(whi), - LocalFileResolver(file_loader), - ] - ) - builder = DependencyGraphBuilder(dependency_resolver, provider) - builder.build_notebook_dependency_graph(Path("root4.py.txt")) - assert len(visited) == len(paths) + lookup = MockPathLookup() + notebook_loader = NotebookLoader() + dependency_resolver = DependencyResolver([NotebookResolver(notebook_loader)]) + builder = DependencyGraphBuilder(dependency_resolver, lookup) + maybe = builder.build_notebook_dependency_graph(Path("root4.py.txt")) + assert not maybe.failed + assert maybe.graph.all_relative_names() == {"root4.py.txt", "leaf3.py.txt"} def test_dependency_graph_builder_visits_workspace_file_dependencies(): - paths = ["root8.py.txt", "leaf1.py.txt", "leaf2.py.txt"] - sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) - visited: dict[str, bool] = {} - file_loader = _local_loader_with_side_effects(FileLoader, sources, visited) whi = whitelist_mock() site_packages = SitePackages.parse(locate_site_packages()) - provider = PathLookup.from_pathlike_string(Path.cwd(), "") - notebook_loader = LocalNotebookLoader(provider) + file_loader = FileLoader() + lookup = MockPathLookup() + 
notebook_loader = NotebookLoader() dependency_resolver = DependencyResolver( [ NotebookResolver(notebook_loader), - SitePackagesResolver(site_packages, file_loader, provider), + SitePackageResolver(site_packages, file_loader, lookup), WhitelistResolver(whi), LocalFileResolver(file_loader), ] ) - builder = DependencyGraphBuilder(dependency_resolver, provider) - builder.build_local_file_dependency_graph(Path(paths[0])) - assert len(visited) == len(paths) + builder = DependencyGraphBuilder(dependency_resolver, lookup) + maybe = builder.build_local_file_dependency_graph(Path('./root8.py')) + assert not maybe.failed + assert maybe.graph.all_relative_names() == {'leaf1.py.txt', 'leaf2.py.txt', 'root8.py.txt'} def test_dependency_graph_builder_raises_problem_with_unfound_workspace_notebook_dependency(): - paths = ["root1.run.py.txt", "leaf1.py.txt"] - sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) - - def get_status_side_effect(*args): - path = args[0] - if "leaf2" in path: - return None - return ObjectInfo(object_type=ObjectType.NOTEBOOK, language=Language.PYTHON, path=path) - - ws = create_autospec(WorkspaceClient) - ws.workspace.is_notebook.return_value = True - ws.workspace.download.side_effect = lambda *args, **kwargs: _download_side_effect(sources, {}, *args, **kwargs) - ws.workspace.get_status.side_effect = get_status_side_effect whi = whitelist_mock() - file_loader = create_autospec(FileLoader) - file_loader.is_notebook.return_value = False site_packages = SitePackages.parse(locate_site_packages()) - provider = PathLookup.from_pathlike_string(Path.cwd(), "") - notebook_loader = WorkspaceNotebookLoader(ws) + file_loader = FileLoader() + lookup = MockPathLookup() + notebook_loader = NotebookLoader() dependency_resolver = DependencyResolver( [ NotebookResolver(notebook_loader), - SitePackagesResolver(site_packages, file_loader, provider), + SitePackageResolver(site_packages, file_loader, lookup), WhitelistResolver(whi), 
LocalFileResolver(file_loader), ] ) - builder = DependencyGraphBuilder(dependency_resolver, provider) - builder.build_notebook_dependency_graph(Path("root1.run.py.txt")) - assert list(builder.problems) == [ + builder = DependencyGraphBuilder(dependency_resolver, lookup) + maybe = builder.build_notebook_dependency_graph(Path("./root1-no-leaf.run.py")) + assert list(maybe.problems) == [ DependencyProblem( - 'notebook-not-found', 'Notebook not found: leaf2.py.txt', Path("root1.run.py.txt"), 19, 0, 19, 21 + 'notebook-not-found', + 'Notebook not found: __NOT_FOUND__', + lookup.cwd / 'root1-no-leaf.run.py.txt', + 19, + 0, + 19, + 22, ) ] def test_dependency_graph_builder_raises_problem_with_unfound_local_notebook_dependency(): - paths = ["root4.py.txt"] - sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) - whi = whitelist_mock() - - def is_file_side_effect(*args): - path = args[0] - return path.as_posix() in paths - - file_loader = create_autospec(FileLoader) - file_loader.exists.side_effect = is_file_side_effect - file_loader.is_notebook.side_effect = is_file_side_effect - file_loader.load_dependency.side_effect = lambda *args: _load_dependency_side_effect(sources, {}, *args) - notebook_loader = create_autospec(LocalNotebookLoader) - notebook_loader.exists.side_effect = is_file_side_effect - notebook_loader.is_notebook.side_effect = is_file_side_effect - notebook_loader.load_dependency.side_effect = lambda *args: _load_dependency_side_effect(sources, {}, *args) - site_packages = SitePackages.parse(locate_site_packages()) - provider = PathLookup.from_pathlike_string(Path.cwd(), "") - dependency_resolver = DependencyResolver( - [ - NotebookResolver(notebook_loader), - SitePackagesResolver(site_packages, file_loader, provider), - WhitelistResolver(whi), - LocalFileResolver(file_loader), - ] - ) - builder = DependencyGraphBuilder(dependency_resolver, provider) - builder.build_notebook_dependency_graph(Path(paths[0])) - assert 
list(builder.problems) == [ - DependencyProblem('notebook-not-found', 'Notebook not found: leaf3.py.txt', Path(paths[0]), 1, 0, 1, 38) + lookup = MockPathLookup() + notebook_loader = NotebookLoader() + dependency_resolver = DependencyResolver([NotebookResolver(notebook_loader)]) + builder = DependencyGraphBuilder(dependency_resolver, lookup) + maybe = builder.build_notebook_dependency_graph(Path("./root4-no-leaf.py")) + assert list(maybe.problems) == [ + DependencyProblem( + 'notebook-not-found', 'Notebook not found: __NO_LEAF__', lookup.cwd / 'root4-no-leaf.py.txt', 1, 0, 1, 37 + ) ] def test_dependency_graph_builder_raises_problem_with_non_constant_local_notebook_dependency(): - paths = ["root10.py.txt"] - sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) - whi = whitelist_mock() - - def is_file_side_effect(*args): - path = args[0] - return path.as_posix() in paths - - file_loader = create_autospec(FileLoader) - file_loader.exists.side_effect = is_file_side_effect - file_loader.is_notebook.side_effect = is_file_side_effect - file_loader.load_dependency.side_effect = lambda *args: _load_dependency_side_effect(sources, {}, *args) - notebook_loader = create_autospec(LocalNotebookLoader) - notebook_loader.exists.side_effect = is_file_side_effect - notebook_loader.is_notebook.side_effect = is_file_side_effect - notebook_loader.load_dependency.side_effect = lambda *args: _load_dependency_side_effect(sources, {}, *args) - site_packages = SitePackages.parse(locate_site_packages()) - provider = PathLookup.from_pathlike_string(Path.cwd(), "") - dependency_resolver = DependencyResolver( - [ - NotebookResolver(notebook_loader), - SitePackagesResolver(site_packages, file_loader, provider), - WhitelistResolver(whi), - LocalFileResolver(file_loader), - ] - ) - builder = DependencyGraphBuilder(dependency_resolver, provider) - builder.build_notebook_dependency_graph(Path(paths[0])) - assert list(builder.problems) == [ + lookup = MockPathLookup() + 
notebook_loader = NotebookLoader() + dependency_resolver = DependencyResolver([NotebookResolver(notebook_loader)]) + builder = DependencyGraphBuilder(dependency_resolver, lookup) + maybe = builder.build_notebook_dependency_graph(Path('./root10.py.txt')) + assert list(maybe.problems) == [ DependencyProblem( - 'dependency-not-constant', "Can't check dependency not provided as a constant", Path(paths[0]), 2, 0, 2, 35 + 'dependency-not-constant', + "Can't check dependency not provided as a constant", + lookup.cwd / 'root10.py.txt', + 2, + 0, + 2, + 35, ) ] def test_dependency_graph_builder_raises_problem_with_invalid_run_cell(): - paths = ["leaf6.py.txt"] - sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) - ws = create_autospec(WorkspaceClient) - ws.workspace.download.side_effect = lambda *args, **kwargs: _download_side_effect(sources, {}, *args, **kwargs) - ws.workspace.get_status.return_value = ObjectInfo( - object_type=ObjectType.NOTEBOOK, language=Language.PYTHON, path=paths[0] - ) - whi = whitelist_mock() - file_loader = create_autospec(FileLoader) - file_loader.is_notebook.return_value = False - site_packages = SitePackages.parse(locate_site_packages()) - provider = PathLookup.from_pathlike_string(Path.cwd(), "") - notebook_loader = WorkspaceNotebookLoader(ws) - dependency_resolver = DependencyResolver( - [ - NotebookResolver(notebook_loader), - SitePackagesResolver(site_packages, file_loader, provider), - WhitelistResolver(whi), - LocalFileResolver(file_loader), - ] - ) - builder = DependencyGraphBuilder(dependency_resolver, provider) - builder.build_notebook_dependency_graph(Path(paths[0])) - assert list(builder.problems) == [ - DependencyProblem('invalid-run-cell', 'Missing notebook path in %run command', Path("leaf6.py.txt"), 5, 0, 5, 4) + lookup = MockPathLookup() + notebook_loader = NotebookLoader() + dependency_resolver = DependencyResolver([NotebookResolver(notebook_loader)]) + builder = 
DependencyGraphBuilder(dependency_resolver, lookup) + maybe = builder.build_notebook_dependency_graph(Path('leaf6.py.txt')) + assert list(maybe.problems) == [ + DependencyProblem( + 'invalid-run-cell', 'Missing notebook path in %run command', lookup.cwd / 'leaf6.py.txt', 5, 0, 5, 4 + ) ] def test_dependency_graph_builder_visits_recursive_file_dependencies(): - paths = ["root6.py.txt", "root5.py.txt", "leaf4.py.txt"] - sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) - visited: dict[str, bool] = {} - whi = whitelist_mock() - file_loader = _local_loader_with_side_effects(FileLoader, sources, visited) - site_packages = SitePackages.parse(locate_site_packages()) - provider = PathLookup.from_pathlike_string(Path.cwd(), "") - notebook_loader = LocalNotebookLoader(provider) - dependency_resolver = DependencyResolver( - [ - NotebookResolver(notebook_loader), - SitePackagesResolver(site_packages, file_loader, provider), - WhitelistResolver(whi), - LocalFileResolver(file_loader), - ] - ) - builder = DependencyGraphBuilder(dependency_resolver, provider) - builder.build_local_file_dependency_graph(Path("root6.py.txt")) - assert len(visited) == len(paths) + lookup = MockPathLookup() + notebook_loader = NotebookLoader() + dependency_resolver = DependencyResolver([NotebookResolver(notebook_loader), LocalFileResolver(FileLoader())]) + builder = DependencyGraphBuilder(dependency_resolver, lookup) + maybe = builder.build_local_file_dependency_graph(Path("root6.py.txt")) + assert not maybe.failed + assert maybe.graph.all_relative_names() == {"root6.py.txt", "root5.py.txt", "leaf4.py.txt"} def test_dependency_graph_builder_raises_problem_with_unresolved_import(): - paths = ["root7.py.txt"] - sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) - visited: dict[str, bool] = {} - whi = whitelist_mock() - file_loader = _local_loader_with_side_effects(FileLoader, sources, visited) - site_packages = 
SitePackages.parse(locate_site_packages()) - provider = PathLookup.from_pathlike_string(Path.cwd(), "") - notebook_loader = LocalNotebookLoader(provider) - dependency_resolver = DependencyResolver( - [ - NotebookResolver(notebook_loader), - SitePackagesResolver(site_packages, file_loader, provider), - WhitelistResolver(whi), - LocalFileResolver(file_loader), - ] - ) - builder = DependencyGraphBuilder(dependency_resolver, provider) - builder.build_local_file_dependency_graph(Path(paths[0])) - assert list(builder.problems) == [ + lookup = MockPathLookup() + notebook_loader = NotebookLoader() + dependency_resolver = DependencyResolver([NotebookResolver(notebook_loader), LocalFileResolver(FileLoader())]) + builder = DependencyGraphBuilder(dependency_resolver, lookup) + maybe = builder.build_local_file_dependency_graph(Path('root7.py.txt')) + assert list(maybe.problems) == [ DependencyProblem( - 'import-not-found', 'Could not locate import: some_library', Path("root7.py.txt"), 1, 0, 1, 19 + 'import-not-found', 'Could not locate import: some_library', lookup.cwd / "root7.py.txt", 1, 0, 1, 19 ) ] def test_dependency_graph_builder_raises_problem_with_non_constant_notebook_argument(): - paths = ["run_notebooks.py.txt"] - sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) - visited: dict[str, bool] = {} - whi = Whitelist() - file_loader = _local_loader_with_side_effects(FileLoader, sources, visited) - site_packages = SitePackages.parse(locate_site_packages()) - provider = PathLookup.from_pathlike_string(Path.cwd(), "") - notebook_loader = LocalNotebookLoader(provider) - dependency_resolver = DependencyResolver( - [ - NotebookResolver(notebook_loader), - SitePackagesResolver(site_packages, file_loader, provider), - WhitelistResolver(whi), - LocalFileResolver(file_loader), - ] - ) - builder = DependencyGraphBuilder(dependency_resolver, provider) - builder.build_local_file_dependency_graph(Path(paths[0])) - assert list(builder.problems) == [ + 
lookup = MockPathLookup() + notebook_loader = NotebookLoader() + dependency_resolver = DependencyResolver([NotebookResolver(notebook_loader), LocalFileResolver(FileLoader())]) + builder = DependencyGraphBuilder(dependency_resolver, lookup) + maybe = builder.build_local_file_dependency_graph(Path("run_notebooks.py.txt")) + assert list(maybe.problems) == [ DependencyProblem( 'dependency-not-constant', "Can't check dependency not provided as a constant", - Path(paths[0]), + lookup.cwd / "run_notebooks.py.txt", 14, 13, 14, @@ -326,140 +180,75 @@ def test_dependency_graph_builder_raises_problem_with_non_constant_notebook_argu def test_dependency_graph_builder_visits_file_dependencies(): - paths = ["root5.py.txt", "leaf4.py.txt"] - sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) - visited: dict[str, bool] = {} - whi = whitelist_mock() - file_loader = _local_loader_with_side_effects(FileLoader, sources, visited) - site_packages = SitePackages.parse(locate_site_packages()) - provider = PathLookup.from_pathlike_string(Path.cwd(), "") - notebook_loader = LocalNotebookLoader(provider) - dependency_resolver = DependencyResolver( - [ - NotebookResolver(notebook_loader), - SitePackagesResolver(site_packages, file_loader, provider), - WhitelistResolver(whi), - LocalFileResolver(file_loader), - ] - ) - builder = DependencyGraphBuilder(dependency_resolver, provider) - builder.build_local_file_dependency_graph(Path("root5.py.txt")) - assert len(visited) == len(paths) + lookup = MockPathLookup() + notebook_loader = NotebookLoader() + dependency_resolver = DependencyResolver([NotebookResolver(notebook_loader), LocalFileResolver(FileLoader())]) + builder = DependencyGraphBuilder(dependency_resolver, lookup) + maybe = builder.build_local_file_dependency_graph(Path("root5.py.txt")) + assert not maybe.failed + assert maybe.graph.all_relative_names() == {"root5.py.txt", "leaf4.py.txt"} def test_dependency_graph_builder_skips_builtin_dependencies(): - 
paths = ["python_builtins.py.txt"] - sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) - whi = Whitelist() - file_loader = _local_loader_with_side_effects(FileLoader, sources, {}) - site_packages = SitePackages.parse(locate_site_packages()) - provider = PathLookup.from_pathlike_string(Path.cwd(), "") - notebook_loader = LocalNotebookLoader(provider) - dependency_resolver = DependencyResolver( - [ - NotebookResolver(notebook_loader), - SitePackagesResolver(site_packages, file_loader, provider), - WhitelistResolver(whi), - LocalFileResolver(file_loader), - ] - ) - builder = DependencyGraphBuilder(dependency_resolver, provider) - graph = builder.build_local_file_dependency_graph(Path("python_builtins.py.txt")) - child = graph.locate_dependency(Path("os")) - assert child - assert len(child.local_dependencies) == 0 - child = graph.locate_dependency(Path("pathlib")) - assert child - assert len(child.local_dependencies) == 0 + lookup = MockPathLookup() + notebook_loader = NotebookLoader() + dependency_resolver = DependencyResolver([NotebookResolver(notebook_loader), LocalFileResolver(FileLoader())]) + builder = DependencyGraphBuilder(dependency_resolver, lookup) + maybe = builder.build_local_file_dependency_graph(Path("python_builtins.py.txt")) + assert not maybe.failed + graph = maybe.graph + maybe = graph.locate_dependency(Path("os")) + assert maybe.failed + maybe = graph.locate_dependency(Path("pathlib")) + assert maybe.failed def test_dependency_graph_builder_ignores_known_dependencies(): - paths = ["python_builtins.py.txt"] - sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) - datas = _load_sources(SourceContainer, "sample-python-compatibility-catalog.yml") - whitelist = Whitelist.parse(datas[0]) - file_loader = _local_loader_with_side_effects(FileLoader, sources, {}) - site_packages = SitePackages.parse(locate_site_packages()) - provider = PathLookup.from_pathlike_string(Path.cwd(), "") - 
notebook_loader = LocalNotebookLoader(provider) - dependency_resolver = DependencyResolver( - [ - NotebookResolver(notebook_loader), - SitePackagesResolver(site_packages, file_loader, provider), - WhitelistResolver(whitelist), - LocalFileResolver(file_loader), - ] - ) - builder = DependencyGraphBuilder(dependency_resolver, provider) - graph = builder.build_local_file_dependency_graph(Path("python_builtins.py.txt")) - assert not graph.locate_dependency(Path("databricks")) + lookup = MockPathLookup() + notebook_loader = NotebookLoader() + dependency_resolver = DependencyResolver([NotebookResolver(notebook_loader), LocalFileResolver(FileLoader())]) + builder = DependencyGraphBuilder(dependency_resolver, lookup) + maybe = builder.build_local_file_dependency_graph(Path("python_builtins.py.txt")) + assert maybe.graph + graph = maybe.graph + maybe_graph = graph.locate_dependency(Path("databricks")) + assert not maybe_graph.graph def test_dependency_graph_builder_visits_site_packages(empty_index): - datas = _load_sources(SourceContainer, "sample-python-compatibility-catalog.yml") - whitelist = Whitelist.parse(datas[0]) - paths = ["import-site-package.py.txt"] - sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) provider = PathLookup.from_pathlike_string(Path.cwd(), _samples_path(SourceContainer)) - file_loader = TestFileLoader(provider, sources) + file_loader = FileLoader() site_packages_path = locate_site_packages() site_packages = SitePackages.parse(site_packages_path) - notebook_loader = LocalNotebookLoader(provider) - dependency_resolver = DependencyResolver( - [ - NotebookResolver(notebook_loader), - SitePackagesResolver(site_packages, file_loader, provider), - WhitelistResolver(whitelist), - LocalFileResolver(file_loader), - ] - ) + notebook_loader = NotebookLoader() + resolvers = [ + NotebookResolver(notebook_loader), + SitePackageResolver(site_packages, file_loader, provider), + LocalFileResolver(file_loader), + ] + 
dependency_resolver = DependencyResolver(resolvers) builder = DependencyGraphBuilder(dependency_resolver, provider) - graph = builder.build_local_file_dependency_graph(Path("import-site-package.py.txt")) - assert graph.locate_dependency(Path(site_packages_path, "certifi/core.py")) - assert not graph.locate_dependency(Path("core.py")) - assert not graph.locate_dependency(Path("core")) + maybe = builder.build_local_file_dependency_graph(Path("import-site-package.py.txt")) + assert not maybe.failed + graph = maybe.graph + maybe = graph.locate_dependency(Path(site_packages_path, "certifi/core.py")) + assert not maybe.failed + maybe = graph.locate_dependency(Path("core.py")) + assert maybe.failed def test_dependency_graph_builder_raises_problem_with_unfound_root_file(empty_index): - site_packages = SitePackages.parse(locate_site_packages()) - whi = whitelist_mock() - file_loader = create_autospec(FileLoader) - provider = PathLookup.from_pathlike_string(Path.cwd(), "") - notebook_loader = LocalNotebookLoader(provider) - dependency_resolver = DependencyResolver( - [ - NotebookResolver(notebook_loader), - SitePackagesResolver(site_packages, file_loader, provider), - WhitelistResolver(whi), - LocalFileResolver(file_loader), - ] - ) - builder = DependencyGraphBuilder(dependency_resolver, provider) - builder.build_local_file_dependency_graph(Path("root1.run.py.txt")) - assert list(builder.problems) == [DependencyProblem('file-not-found', 'File not found: root1.run.py.txt')] - file_loader.load_dependency.assert_not_called() + lookup = MockPathLookup() + dependency_resolver = DependencyResolver([LocalFileResolver(FileLoader())]) + builder = DependencyGraphBuilder(dependency_resolver, lookup) + maybe = builder.build_local_file_dependency_graph(Path("non-existing.py.txt")) + assert list(maybe.problems) == [DependencyProblem('file-not-found', 'File not found: non-existing.py.txt')] def test_dependency_graph_builder_raises_problem_with_unfound_root_notebook(empty_index): - sources: 
dict[str, str] = {} - ws = create_autospec(WorkspaceClient) - ws.workspace.download.side_effect = lambda *args, **kwargs: _download_side_effect(sources, {}, *args, **kwargs) - ws.workspace.get_status.return_value = None - site_packages = SitePackages.parse(locate_site_packages()) - whi = whitelist_mock() - file_loader = create_autospec(FileLoader) - notebook_loader = create_autospec(LocalNotebookLoader) - notebook_loader.is_notebook.return_value = False - provider = PathLookup.from_pathlike_string(Path.cwd(), "") - dependency_resolver = DependencyResolver( - [ - NotebookResolver(notebook_loader), - SitePackagesResolver(site_packages, file_loader, provider), - WhitelistResolver(whi), - LocalFileResolver(file_loader), - ] - ) - builder = DependencyGraphBuilder(dependency_resolver, provider) - builder.build_notebook_dependency_graph(Path("root2.run.py.txt")) - assert list(builder.problems) == [DependencyProblem('notebook-not-found', 'Notebook not found: root2.run.py.txt')] - file_loader.load_dependency.assert_not_called() + lookup = MockPathLookup() + notebook_loader = NotebookLoader() + dependency_resolver = DependencyResolver([NotebookResolver(notebook_loader)]) + builder = DependencyGraphBuilder(dependency_resolver, lookup) + maybe = builder.build_notebook_dependency_graph(Path("unknown_notebook")) + assert list(maybe.problems) == [DependencyProblem('notebook-not-found', 'Notebook not found: unknown_notebook')] diff --git a/tests/unit/source_code/test_files.py b/tests/unit/source_code/test_files.py index cb1c9546b8..2c4aedfdc0 100644 --- a/tests/unit/source_code/test_files.py +++ b/tests/unit/source_code/test_files.py @@ -5,8 +5,9 @@ from databricks.sdk.service.workspace import Language from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex -from databricks.labs.ucx.source_code.files import LocalFileMigrator +from databricks.labs.ucx.source_code.files import LocalFileMigrator, LocalFileResolver, FileLoader from 
databricks.labs.ucx.source_code.languages import Languages +from databricks.labs.ucx.source_code.path_lookup import PathLookup def test_files_fix_ignores_unsupported_extensions(): @@ -71,3 +72,29 @@ def test_files_walks_directory(): files.apply(path) languages.fixer.assert_called_with(Language.PYTHON, 'some-code') assert languages.fixer.call_count > 1 + + +def test_triple_dot_import(): + file_loader = FileLoader() + file_resolver = LocalFileResolver(file_loader) + path_lookup = create_autospec(PathLookup) + path_lookup.cwd.as_posix.return_value = '/some/path/to/folder' + path_lookup.resolve.return_value = Path('/some/path/foo.py') + + maybe = file_resolver.resolve_import(path_lookup, "...foo") + assert not maybe.problems + assert maybe.dependency.path == Path('/some/path/foo.py') + path_lookup.resolve.assert_called_once_with(Path('/some/path/to/folder/../../foo.py')) + + +def test_single_dot_import(): + file_loader = FileLoader() + file_resolver = LocalFileResolver(file_loader) + path_lookup = create_autospec(PathLookup) + path_lookup.cwd.as_posix.return_value = '/some/path/to/folder' + path_lookup.resolve.return_value = Path('/some/path/to/folder/foo.py') + + maybe = file_resolver.resolve_import(path_lookup, ".foo") + assert not maybe.problems + assert maybe.dependency.path == Path('/some/path/to/folder/foo.py') + path_lookup.resolve.assert_called_once_with(Path('/some/path/to/folder/foo.py')) diff --git a/tests/unit/source_code/test_notebook.py b/tests/unit/source_code/test_notebook.py index 904304b178..b03f7c2427 100644 --- a/tests/unit/source_code/test_notebook.py +++ b/tests/unit/source_code/test_notebook.py @@ -1,19 +1,18 @@ from pathlib import Path -from unittest.mock import create_autospec import re import pytest -from databricks.labs.ucx.source_code.path_lookup import PathLookup from databricks.sdk.service.workspace import Language, ObjectType, ObjectInfo -from databricks.sdk import WorkspaceClient from databricks.labs.ucx.source_code.base import Advisory 
from databricks.labs.ucx.source_code.graph import DependencyGraph, SourceContainer, DependencyResolver from databricks.labs.ucx.source_code.notebooks.sources import Notebook -from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, WorkspaceNotebookLoader +from databricks.labs.ucx.source_code.notebooks.loaders import ( + NotebookResolver, + NotebookLoader, +) from databricks.labs.ucx.source_code.python_linter import PythonLinter -from tests.unit import _load_sources, _download_side_effect - +from tests.unit import _load_sources, MockPathLookup # fmt: off # the following samples are real samples from https://github.com/databricks-industry-solutions @@ -127,25 +126,15 @@ def test_notebook_generates_runnable_cells(source: tuple[str, Language, list[str def test_notebook_builds_leaf_dependency_graph(): - paths = ["leaf1.py.txt"] - sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) - ws = create_autospec(WorkspaceClient) - ws.workspace.download.side_effect = lambda *args, **kwargs: _download_side_effect(sources, {}, *args, **kwargs) - ws.workspace.get_status.return_value = ObjectInfo( - object_type=ObjectType.NOTEBOOK, path="leaf1.py.txt", language=Language.PYTHON - ) - notebook_loader = WorkspaceNotebookLoader(ws) - dependency_resolver = DependencyResolver( - [ - NotebookResolver(notebook_loader), - ] - ) - dependency = dependency_resolver.resolve_notebook(Path(paths[0])) - provider = PathLookup.from_sys_path(Path.cwd()) - graph = DependencyGraph(dependency, None, dependency_resolver, provider) - container = dependency.load() - container.build_dependency_graph(graph, provider) - assert {str(path) for path in graph.all_paths} == {"leaf1.py.txt"} + lookup = MockPathLookup() + notebook_loader = NotebookLoader() + dependency_resolver = DependencyResolver([NotebookResolver(notebook_loader)]) + maybe = dependency_resolver.resolve_notebook(lookup, Path("leaf1.py.txt")) + graph = DependencyGraph(maybe.dependency, None, 
dependency_resolver, lookup) + container = maybe.dependency.load(lookup) + problems = container.build_dependency_graph(graph) + assert not problems + assert graph.all_paths == {lookup.cwd / "leaf1.py.txt"} def get_status_side_effect(*args): @@ -155,109 +144,68 @@ def get_status_side_effect(*args): def test_notebook_builds_depth1_dependency_graph(): paths = ["root1.run.py.txt", "leaf1.py.txt", "leaf2.py.txt"] - sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) - ws = create_autospec(WorkspaceClient) - ws.workspace.download.side_effect = lambda *args, **kwargs: _download_side_effect(sources, {}, *args, **kwargs) - ws.workspace.get_status.side_effect = get_status_side_effect - notebook_loader = WorkspaceNotebookLoader(ws) - dependency_resolver = DependencyResolver( - [ - NotebookResolver(notebook_loader), - ] - ) - dependency = dependency_resolver.resolve_notebook(Path(paths[0])) - provider = PathLookup.from_sys_path(Path.cwd()) - graph = DependencyGraph(dependency, None, dependency_resolver, provider) - container = dependency.load() - container.build_dependency_graph(graph, provider) - actual = {path[2:] if path.startswith('./') else path for path in (str(path) for path in graph.all_paths)} - assert actual == set(paths) + lookup = MockPathLookup() + notebook_loader = NotebookLoader() + dependency_resolver = DependencyResolver([NotebookResolver(notebook_loader)]) + maybe = dependency_resolver.resolve_notebook(lookup, Path(paths[0])) + graph = DependencyGraph(maybe.dependency, None, dependency_resolver, lookup) + container = maybe.dependency.load(lookup) + problems = container.build_dependency_graph(graph) + assert not problems + assert graph.all_paths == {lookup.cwd / path for path in paths} def test_notebook_builds_depth2_dependency_graph(): paths = ["root2.run.py.txt", "root1.run.py.txt", "leaf1.py.txt", "leaf2.py.txt"] - sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) - ws = 
create_autospec(WorkspaceClient) - ws.workspace.download.side_effect = lambda *args, **kwargs: _download_side_effect(sources, {}, *args, **kwargs) - ws.workspace.get_status.side_effect = get_status_side_effect - notebook_loader = WorkspaceNotebookLoader(ws) - dependency_resolver = DependencyResolver( - [ - NotebookResolver(notebook_loader), - ] - ) - dependency = dependency_resolver.resolve_notebook(Path(paths[0])) - provider = PathLookup.from_sys_path(Path.cwd()) - graph = DependencyGraph(dependency, None, dependency_resolver, provider) - container = dependency.load() - container.build_dependency_graph(graph, provider) - actual = {path[2:] if path.startswith('./') else path for path in (str(path) for path in graph.all_paths)} - assert actual == set(paths) + lookup = MockPathLookup() + notebook_loader = NotebookLoader() + dependency_resolver = DependencyResolver([NotebookResolver(notebook_loader)]) + maybe = dependency_resolver.resolve_notebook(lookup, Path(paths[0])) + graph = DependencyGraph(maybe.dependency, None, dependency_resolver, lookup) + container = maybe.dependency.load(lookup) + problems = container.build_dependency_graph(graph) + assert not problems + assert graph.all_paths == {lookup.cwd / path for path in paths} def test_notebook_builds_dependency_graph_avoiding_duplicates(): paths = ["root3.run.py.txt", "root1.run.py.txt", "leaf1.py.txt", "leaf2.py.txt"] - sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) - visited = {} - ws = create_autospec(WorkspaceClient) - ws.workspace.download.side_effect = lambda *args, **kwargs: _download_side_effect(sources, visited, *args, **kwargs) - ws.workspace.get_status.side_effect = get_status_side_effect - notebook_loader = WorkspaceNotebookLoader(ws) - dependency_resolver = DependencyResolver( - [ - NotebookResolver(notebook_loader), - ] - ) - dependency = dependency_resolver.resolve_notebook(Path(paths[0])) - provider = PathLookup.from_sys_path(Path.cwd()) - graph = 
DependencyGraph(dependency, None, dependency_resolver, provider) - container = dependency.load() - container.build_dependency_graph(graph, provider) + lookup = MockPathLookup() + notebook_loader = NotebookLoader() + dependency_resolver = DependencyResolver([NotebookResolver(notebook_loader)]) + maybe = dependency_resolver.resolve_notebook(lookup, Path(paths[0])) + graph = DependencyGraph(maybe.dependency, None, dependency_resolver, lookup) + container = maybe.dependency.load(lookup) + problems = container.build_dependency_graph(graph) + assert not problems # if visited once only, set and list will have same len - assert len(set(visited)) == len(visited) + assert graph.all_paths == {lookup.cwd / path for path in paths} def test_notebook_builds_cyclical_dependency_graph(): paths = ["cyclical1.run.py.txt", "cyclical2.run.py.txt"] - sources: list[str] = _load_sources(SourceContainer, *paths) - sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) - ws = create_autospec(WorkspaceClient) - ws.workspace.download.side_effect = lambda *args, **kwargs: _download_side_effect(sources, {}, *args, **kwargs) - ws.workspace.get_status.side_effect = get_status_side_effect - notebook_loader = WorkspaceNotebookLoader(ws) - dependency_resolver = DependencyResolver( - [ - NotebookResolver(notebook_loader), - ] - ) - dependency = dependency_resolver.resolve_notebook(Path(paths[0])) - provider = PathLookup.from_sys_path(Path.cwd()) - graph = DependencyGraph(dependency, None, dependency_resolver, provider) - container = dependency.load() - container.build_dependency_graph(graph, provider) - actual = {path[2:] if path.startswith('./') else path for path in (str(path) for path in graph.all_paths)} - assert actual == set(paths) + lookup = MockPathLookup() + notebook_loader = NotebookLoader() + dependency_resolver = DependencyResolver([NotebookResolver(notebook_loader)]) + maybe = dependency_resolver.resolve_notebook(lookup, Path(paths[0])) + graph = 
DependencyGraph(maybe.dependency, None, dependency_resolver, lookup) + container = maybe.dependency.load(lookup) + problems = container.build_dependency_graph(graph) + assert not problems + assert graph.all_paths == {lookup.cwd / path for path in paths} def test_notebook_builds_python_dependency_graph(): paths = ["root4.py.txt", "leaf3.py.txt"] - sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) - ws = create_autospec(WorkspaceClient) - ws.workspace.download.side_effect = lambda *args, **kwargs: _download_side_effect(sources, {}, *args, **kwargs) - ws.workspace.get_status.side_effect = get_status_side_effect - notebook_loader = WorkspaceNotebookLoader(ws) - dependency_resolver = DependencyResolver( - [ - NotebookResolver(notebook_loader), - ] - ) - dependency = dependency_resolver.resolve_notebook(Path(paths[0])) - provider = PathLookup.from_sys_path(Path.cwd()) - graph = DependencyGraph(dependency, None, dependency_resolver, provider) - container = dependency.load() - container.build_dependency_graph(graph, provider) - actual = {path[2:] if path.startswith('./') else path for path in (str(path) for path in graph.all_paths)} - assert actual == set(paths) + lookup = MockPathLookup() + notebook_loader = NotebookLoader() + dependency_resolver = DependencyResolver([NotebookResolver(notebook_loader)]) + maybe = dependency_resolver.resolve_notebook(lookup, Path(paths[0])) + graph = DependencyGraph(maybe.dependency, None, dependency_resolver, lookup) + container = maybe.dependency.load(lookup) + problems = container.build_dependency_graph(graph) + assert not problems + assert graph.all_paths == {lookup.cwd / path for path in paths} def test_detects_manual_migration_in_dbutils_notebook_run_in_python_code_(): diff --git a/tests/unit/source_code/test_notebook_migrator.py b/tests/unit/source_code/test_notebook_migrator.py deleted file mode 100644 index 4b357c9593..0000000000 --- a/tests/unit/source_code/test_notebook_migrator.py +++ /dev/null 
@@ -1,105 +0,0 @@ -from unittest.mock import create_autospec - -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.workspace import ExportFormat, Language, ObjectInfo, ObjectType - -from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex -from databricks.labs.ucx.source_code.graph import Dependency -from databricks.labs.ucx.source_code.languages import Languages -from databricks.labs.ucx.source_code.notebooks.sources import Notebook -from databricks.labs.ucx.source_code.notebooks.migrator import NotebookMigrator - - -def test_apply_invalid_object_fails(): - ws = create_autospec(WorkspaceClient) - languages = Languages(MigrationIndex([])) - migrator = NotebookMigrator(ws, languages) - object_info = ObjectInfo(language=Language.PYTHON) - assert not migrator.apply(object_info) - ws.workspace.download.assert_not_called() - ws.workspace.upload.assert_not_called() - - -def test_revert_invalid_object_fails(): - ws = create_autospec(WorkspaceClient) - languages = Languages(MigrationIndex([])) - migrator = NotebookMigrator(ws, languages) - object_info = ObjectInfo(language=Language.PYTHON) - assert not migrator.revert(object_info) - ws.workspace.download.assert_not_called() - ws.workspace.upload.assert_not_called() - - -def test_revert_restores_original_code(): - ws = create_autospec(WorkspaceClient) - ws.workspace.download.return_value.__enter__.return_value.read.return_value = b'original_code' - languages = Languages(MigrationIndex([])) - migrator = NotebookMigrator(ws, languages) - object_info = ObjectInfo(path='path', language=Language.PYTHON) - migrator.revert(object_info) - ws.workspace.download.assert_called_with('path.bak', format=ExportFormat.SOURCE) - ws.workspace.upload.assert_called_with('path', b'original_code') - - -def test_apply_returns_false_when_language_not_supported(): - notebook_code = """# Databricks notebook source -# MAGIC %r -# // original code -""" - ws = create_autospec(WorkspaceClient) - 
ws.workspace.download.return_value.__enter__.return_value.read.return_value = notebook_code.encode("utf-8") - ws.workspace.get_status.return_value = ObjectInfo( - object_type=ObjectType.NOTEBOOK, language=Language.PYTHON, path="path" - ) - languages = create_autospec(Languages) - languages.is_supported.return_value = False - dependency = create_autospec(Dependency) - dependency.load.return_value = Notebook.parse('path', notebook_code, Language.R) - migrator = NotebookMigrator(ws, languages) - object_info = ObjectInfo(path='path', language=Language.R, object_type=ObjectType.NOTEBOOK) - result = migrator.apply(object_info) - assert not result - - -def test_apply_returns_false_when_no_fixes_applied(): - notebook_code = """# Databricks notebook source -# original code -""" - ws = create_autospec(WorkspaceClient) - ws.workspace.download.return_value.__enter__.return_value.read.return_value = notebook_code.encode("utf-8") - ws.workspace.get_status.return_value = ObjectInfo( - object_type=ObjectType.NOTEBOOK, language=Language.PYTHON, path="path" - ) - languages = create_autospec(Languages) - languages.is_supported.return_value = True - languages.apply_fixes.return_value = "# original code" # cell code - dependency = create_autospec(Dependency) - dependency.load.return_value = Notebook.parse('path', notebook_code, Language.R) - migrator = NotebookMigrator(ws, languages) - object_info = ObjectInfo(path='path', language=Language.PYTHON, object_type=ObjectType.NOTEBOOK) - assert not migrator.apply(object_info) - - -def test_apply_returns_true_and_changes_code_when_fixes_applied(): - original_code = """# Databricks notebook source -# original code -""" - migrated_cell_code = '# migrated code' - migrated_code = """# Databricks notebook source -# migrated code -""" - ws = create_autospec(WorkspaceClient) - ws.workspace.download.return_value.__enter__.return_value.read.return_value = original_code.encode("utf-8") - ws.workspace.get_status.return_value = ObjectInfo( - 
object_type=ObjectType.NOTEBOOK, language=Language.PYTHON, path="path" - ) - languages = create_autospec(Languages) - languages.is_supported.return_value = True - languages.apply_fixes.return_value = migrated_cell_code - dependency = create_autospec(Dependency) - dependency.load.return_value = Notebook.parse('path', original_code, Language.R) - migrator = NotebookMigrator(ws, languages) - object_info = ObjectInfo(path='path', language=Language.PYTHON, object_type=ObjectType.NOTEBOOK) - assert migrator.apply(object_info) - ws.workspace.upload.assert_any_call('path.bak', original_code.encode("utf-8")) - ws.workspace.upload.assert_any_call('path', migrated_code.encode("utf-8")) diff --git a/tests/unit/source_code/test_path_lookup.py b/tests/unit/source_code/test_path_lookup.py index f55d170be0..3116860edb 100644 --- a/tests/unit/source_code/test_path_lookup.py +++ b/tests/unit/source_code/test_path_lookup.py @@ -46,10 +46,3 @@ def test_lookup_pops_path(): assert popped.as_posix() == "what" paths = list(provider.paths)[1:] assert [path.as_posix() for path in paths] == ["on", "earth"] - - -def test_lookup_pushes_cwd(): - provider = PathLookup.from_sys_path(Path.cwd()) - location = Path("some-location") - provider.push_cwd(location) - assert provider.cwd == location diff --git a/tests/unit/source_code/test_path_lookup_simulation.py b/tests/unit/source_code/test_path_lookup_simulation.py index 03e64a993e..745a943026 100644 --- a/tests/unit/source_code/test_path_lookup_simulation.py +++ b/tests/unit/source_code/test_path_lookup_simulation.py @@ -1,13 +1,18 @@ from pathlib import Path import pytest -from databricks.labs.ucx.source_code.files import LocalFileResolver +from databricks.labs.ucx.source_code.files import LocalFileResolver, FileLoader from databricks.labs.ucx.source_code.path_lookup import PathLookup from databricks.labs.ucx.source_code.graph import SourceContainer, DependencyGraphBuilder, DependencyResolver -from databricks.labs.ucx.source_code.notebooks.loaders 
import NotebookResolver -from databricks.labs.ucx.source_code.site_packages import SitePackages, SitePackagesResolver +from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader +from databricks.labs.ucx.source_code.site_packages import SitePackages, SitePackageResolver from databricks.labs.ucx.source_code.whitelist import WhitelistResolver -from tests.unit import _samples_path, whitelist_mock, VisitingFileLoader, VisitingNotebookLoader, locate_site_packages +from tests.unit import ( + _samples_path, + whitelist_mock, + locate_site_packages, + MockPathLookup, +) @pytest.mark.parametrize( @@ -30,43 +35,44 @@ ], ) def test_locates_notebooks(source: list[str], expected: int): - visited: dict[str, bool] = {} elems = [_samples_path(SourceContainer)] elems.extend(source) notebook_path = Path(*elems) - whitelist = whitelist_mock() - provider = PathLookup.from_sys_path(Path.cwd()) - file_loader = VisitingFileLoader(provider, visited) - notebook_loader = VisitingNotebookLoader(provider, visited) + lookup = MockPathLookup() + file_loader = FileLoader() + notebook_loader = NotebookLoader() site_packages = SitePackages.parse(locate_site_packages()) resolvers = [ NotebookResolver(notebook_loader), - SitePackagesResolver(site_packages, file_loader, provider), - WhitelistResolver(whitelist), + SitePackageResolver(site_packages, file_loader, lookup), LocalFileResolver(file_loader), ] - builder = DependencyGraphBuilder(DependencyResolver(resolvers), provider) - builder.build_notebook_dependency_graph(notebook_path) - assert len(visited) == expected + dependency_resolver = DependencyResolver(resolvers) + builder = DependencyGraphBuilder(dependency_resolver, lookup) + maybe = builder.build_notebook_dependency_graph(notebook_path) + assert not maybe.problems + assert maybe.graph is not None + assert len(maybe.graph.all_paths) == expected @pytest.mark.parametrize("source, expected", [(["simulate-sys-path", "siblings", "sibling1_file.py"], 2)]) def 
test_locates_files(source: list[str], expected: int): - visited: dict[str, bool] = {} elems = [_samples_path(SourceContainer)] elems.extend(source) file_path = Path(*elems) whitelist = whitelist_mock() provider = PathLookup.from_sys_path(Path.cwd()) - file_loader = VisitingFileLoader(provider, visited) - notebook_loader = VisitingNotebookLoader(provider, visited) + file_loader = FileLoader() + notebook_loader = NotebookLoader() site_packages = SitePackages.parse(locate_site_packages()) resolvers = [ NotebookResolver(notebook_loader), - SitePackagesResolver(site_packages, file_loader, provider), + SitePackageResolver(site_packages, file_loader, provider), WhitelistResolver(whitelist), LocalFileResolver(file_loader), ] builder = DependencyGraphBuilder(DependencyResolver(resolvers), provider) - builder.build_local_file_dependency_graph(file_path) - assert len(visited) == expected + maybe = builder.build_local_file_dependency_graph(file_path) + assert not maybe.problems + assert maybe.graph is not None + assert len(maybe.graph.all_dependencies) == expected diff --git a/tests/unit/source_code/test_s3fs.py b/tests/unit/source_code/test_s3fs.py index c0999d8f33..325b6e99a9 100644 --- a/tests/unit/source_code/test_s3fs.py +++ b/tests/unit/source_code/test_s3fs.py @@ -3,17 +3,15 @@ import pytest from databricks.labs.ucx.source_code.graph import ( - SourceContainer, DependencyResolver, DependencyProblem, DependencyGraphBuilder, ) from databricks.labs.ucx.source_code.files import FileLoader, LocalFileResolver -from databricks.labs.ucx.source_code.path_lookup import PathLookup +from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader, NotebookResolver from databricks.labs.ucx.source_code.whitelist import WhitelistResolver, Whitelist from tests.unit import ( - _load_sources, - _local_loader_with_side_effects, + MockPathLookup, ) @@ -29,10 +27,11 @@ DependencyProblem( code='dependency-check', message=S3FS_DEPRECATION_MESSAGE, - start_line=0, + 
source_path=Path('path.py.txt'), + start_line=1, start_col=0, - end_line=0, - end_col=0, + end_line=1, + end_col=11, ) ], ), @@ -42,10 +41,11 @@ DependencyProblem( code='dependency-check', message=S3FS_DEPRECATION_MESSAGE, - start_line=0, + source_path=Path('path.py.txt'), + start_line=1, start_col=0, - end_line=0, - end_col=0, + end_line=1, + end_col=26, ) ], ), @@ -57,10 +57,11 @@ DependencyProblem( code='dependency-check', message=S3FS_DEPRECATION_MESSAGE, - start_line=0, + source_path=Path('path.py.txt'), + start_line=1, start_col=0, - end_line=0, - end_col=0, + end_line=1, + end_col=18, ) ], ), @@ -71,10 +72,11 @@ DependencyProblem( code='dependency-check', message=S3FS_DEPRECATION_MESSAGE, - start_line=0, - start_col=0, - end_line=0, - end_col=0, + source_path=Path('path.py.txt'), + start_line=2, + start_col=4, + end_line=2, + end_col=15, ) ], ), @@ -84,10 +86,11 @@ DependencyProblem( code='dependency-check', message=S3FS_DEPRECATION_MESSAGE, - start_line=0, + source_path=Path('path.py.txt'), + start_line=1, start_col=0, - end_line=0, - end_col=0, + end_line=1, + end_col=16, ) ], ), @@ -97,32 +100,30 @@ DependencyProblem( code='dependency-check', message='Use of dependency s3fs.subpackage is deprecated', - start_line=0, + source_path=Path('path.py.txt'), + start_line=1, start_col=0, - end_line=0, - end_col=0, + end_line=1, + end_col=37, ) ], ), ("", []), ], ) -def test_detect_s3fs_import(empty_index, source: str, expected: list[DependencyProblem]): - datas = _load_sources(SourceContainer, "s3fs-python-compatibility-catalog.yml") - whitelist = Whitelist.parse(datas[0]) - sources = {"path": source} - file_loader = _local_loader_with_side_effects(FileLoader, sources, {}) - dependency_resolver = DependencyResolver( - [ - WhitelistResolver(whitelist), - LocalFileResolver(file_loader), - ] - ) - provider = PathLookup.from_sys_path(Path.cwd()) - builder = DependencyGraphBuilder(dependency_resolver, provider) - builder.build_local_file_dependency_graph(Path("path")) 
- problems: list[DependencyProblem] = list(dependency_resolver.problems) - assert problems == expected +def test_detect_s3fs_import(empty_index, source: str, expected: list[DependencyProblem], tmp_path): + sample = tmp_path / "test_detect_s3fs_import.py" + sample.write_text(source) + lookup = MockPathLookup(sys_paths=[tmp_path]) + yml = lookup.cwd / "s3fs-python-compatibility-catalog.yml" + whitelist = Whitelist.parse(yml.read_text()) + notebook_loader = NotebookLoader() + file_loader = FileLoader() + resolvers = [NotebookResolver(notebook_loader), LocalFileResolver(file_loader), WhitelistResolver(whitelist)] + dependency_resolver = DependencyResolver(resolvers) + builder = DependencyGraphBuilder(dependency_resolver, lookup) + maybe = builder.build_local_file_dependency_graph(sample) + assert maybe.problems == [_.replace(source_path=sample) for _ in expected] @pytest.mark.parametrize( @@ -132,28 +133,23 @@ def test_detect_s3fs_import(empty_index, source: str, expected: list[DependencyP DependencyProblem( code='dependency-check', message='Use of dependency s3fs is deprecated', - start_line=0, + source_path=Path('root9.py.txt'), + start_line=1, start_col=0, - end_line=0, - end_col=0, + end_line=1, + end_col=12, ), ], ), ) def test_detect_s3fs_import_in_dependencies(empty_index, expected: list[DependencyProblem]): - paths = ["root9.py.txt", "leaf9.py.txt"] - sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) - datas = _load_sources(SourceContainer, "s3fs-python-compatibility-catalog.yml") - whitelist = Whitelist.parse(datas[0]) - file_loader = _local_loader_with_side_effects(FileLoader, sources, {}) - dependency_resolver = DependencyResolver( - [ - WhitelistResolver(whitelist), - LocalFileResolver(file_loader), - ] - ) - provider = PathLookup.from_sys_path(Path.cwd()) - builder = DependencyGraphBuilder(dependency_resolver, provider) - builder.build_local_file_dependency_graph(Path("root9.py.txt")) - problems: list[DependencyProblem] = 
list(dependency_resolver.problems) - assert problems == expected + lookup = MockPathLookup() + yml = lookup.cwd / "s3fs-python-compatibility-catalog.yml" + file_loader = FileLoader() + whitelist = Whitelist.parse(yml.read_text()) + resolvers = [LocalFileResolver(file_loader), WhitelistResolver(whitelist)] + dependency_resolver = DependencyResolver(resolvers) + builder = DependencyGraphBuilder(dependency_resolver, lookup) + sample = lookup.cwd / "root9.py.txt" + maybe = builder.build_local_file_dependency_graph(sample) + assert maybe.problems == [_.replace(source_path=sample) for _ in expected]