diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 239a5f38ed..d1ce0f7d8f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -55,6 +55,10 @@ unit test coverage suite and the clear difference between _unit tests_ and _inte See https://mypy.readthedocs.io/en/stable/cheat_sheet_py3.html for more details +### `if typing.TYPE_CHECKING:` usage + +We don't use `if typing.TYPE_CHECKING:` in our codebase, because PyCharm and `mypy` can handle it without it and `Refactoring -> Move` may produce incorrect results. + ### ..., expression has type "None", variable has type "str" * Add `assert ... is not None` if it's a body of a method. Example: diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 42bd0cf770..6bf15af62e 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -31,6 +31,11 @@ from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler from databricks.labs.ucx.hive_metastore.verification import VerifyHasMetastore from databricks.labs.ucx.installer.workflows import DeployedWorkflows +from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader, WorkspaceNotebookLoader +from databricks.labs.ucx.source_code.files import FileLoader, LocalFileResolver, SysPathProvider +from databricks.labs.ucx.source_code.graph import DependencyResolver, DependencyGraphBuilder +from databricks.labs.ucx.source_code.whitelist import WhitelistResolver, Whitelist +from databricks.labs.ucx.source_code.site_packages import SitePackagesResolver, SitePackages from databricks.labs.ucx.source_code.languages import Languages from databricks.labs.ucx.workspace_access import generic, redash from databricks.labs.ucx.workspace_access.groups import GroupManager @@ -334,3 +339,56 @@ def workspace_info(self): @cached_property def verify_has_metastore(self): return VerifyHasMetastore(self.workspace_client) + + @cached_property + def notebook_loader(self) -> NotebookLoader: + return WorkspaceNotebookLoader(self.workspace_client) + + @cached_property + def notebook_resolver(self): + return NotebookResolver(self.notebook_loader) + + @cached_property + def site_packages(self): + # TODO: actually load the site packages + return SitePackages([]) + + @cached_property + def syspath_provider(self): + return SysPathProvider.from_sys_path() + + @cached_property + def file_loader(self): + return FileLoader(self.syspath_provider) + + @cached_property + def site_packages_resolver(self): + return SitePackagesResolver(self.site_packages, self.file_loader, self.syspath_provider) + + @cached_property + def whitelist(self): + # TODO: fill in the whitelist + return Whitelist() + + @cached_property + def whitelist_resolver(self): + return WhitelistResolver(self.whitelist) + + @cached_property + def file_resolver(self): + return LocalFileResolver(self.file_loader) + + @cached_property + def dependency_resolver(self): + return DependencyResolver( + [ + self.notebook_resolver, + self.site_packages_resolver, + self.whitelist_resolver, + self.file_resolver, + ] + ) + + @cached_property + def dependency_graph_builder(self): + return DependencyGraphBuilder(self.dependency_resolver) diff --git a/src/databricks/labs/ucx/contexts/cli_command.py b/src/databricks/labs/ucx/contexts/cli_command.py index 0a27b8b55a..9f8abb2603 100644 --- a/src/databricks/labs/ucx/contexts/cli_command.py +++ b/src/databricks/labs/ucx/contexts/cli_command.py @@ -18,6 +18,7 @@ from databricks.labs.ucx.azure.locations import ExternalLocationsMigration from databricks.labs.ucx.azure.resources import AzureAPIClient, AzureResources from databricks.labs.ucx.contexts.application import GlobalContext +from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader, LocalNotebookLoader from databricks.labs.ucx.source_code.files import LocalFileMigrator from databricks.labs.ucx.workspace_access.clusters import ClusterAccess @@ -168,6 +169,10 @@ def iam_role_migration(self): self.iam_credential_manager, ) + @cached_property + def notebook_loader(self) -> NotebookLoader: + return LocalNotebookLoader(self.syspath_provider) + class AccountContext(CliContext): def __init__(self, ac: AccountClient, named_parameters: dict[str, str] | None = None): diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index c7cefa218a..63878c8c5a 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from abc import abstractmethod from collections.abc import Iterable from dataclasses import dataclass diff --git a/src/databricks/labs/ucx/source_code/dependency_loaders.py b/src/databricks/labs/ucx/source_code/dependency_loaders.py deleted file mode 100644 index f6ceae66ae..0000000000 --- a/src/databricks/labs/ucx/source_code/dependency_loaders.py +++ /dev/null @@ -1,112 +0,0 @@ -from __future__ import annotations - -import abc -import typing -from pathlib import Path - -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.workspace import ObjectType, ObjectInfo, ExportFormat - -from databricks.labs.ucx.source_code.dependencies import Dependency - - -if typing.TYPE_CHECKING: - from databricks.labs.ucx.source_code.site_packages import SitePackage - from databricks.labs.ucx.source_code.dependencies import DependencyGraph - - -class SourceContainer(abc.ABC): - - @abc.abstractmethod - def build_dependency_graph(self, parent: DependencyGraph) -> None: - raise NotImplementedError() - - -class DependencyLoader(abc.ABC): - - @abc.abstractmethod - def load_dependency(self, dependency: Dependency) -> SourceContainer | None: - raise NotImplementedError() - - @abc.abstractmethod - def is_notebook(self, path: Path) -> bool: - raise NotImplementedError() - - -# a DependencyLoader that simply wraps a pre-existing SourceContainer -class WrappingLoader(DependencyLoader): - - def __init__(self, source_container: SourceContainer): - self._source_container = source_container - - def is_file(self, path: Path) -> bool: - raise NotImplementedError() # should never happen - - def is_notebook(self, path: Path) -> bool: - raise NotImplementedError() # should never happen - - def load_dependency(self, dependency: Dependency) -> SourceContainer | None: - return self._source_container - - -class LocalFileLoader(DependencyLoader): - # see https://github.com/databrickslabs/ucx/issues/1499 - def load_dependency(self, dependency: Dependency) -> SourceContainer | None: - raise NotImplementedError() - - def is_file(self, path: Path) -> bool: - raise NotImplementedError() - - def is_notebook(self, path: Path) -> bool: - raise NotImplementedError() - - -class NotebookLoader(DependencyLoader, abc.ABC): - pass - - -class LocalNotebookLoader(NotebookLoader, LocalFileLoader): - # see https://github.com/databrickslabs/ucx/issues/1499 - pass - - -class SitePackageContainer(SourceContainer): - - def __init__(self, file_loader: LocalFileLoader, site_package: SitePackage): - self._file_loader = file_loader - self._site_package = site_package - - def build_dependency_graph(self, parent: DependencyGraph) -> None: - for module_path in self._site_package.module_paths: - parent.register_dependency(Dependency(self._file_loader, module_path)) - - -class WorkspaceNotebookLoader(NotebookLoader): - - def __init__(self, ws: WorkspaceClient): - self._ws = ws - - def is_notebook(self, path: Path): - object_info = self._ws.workspace.get_status(str(path)) - # TODO check error conditions, see https://github.com/databrickslabs/ucx/issues/1361 - return object_info is not None and object_info.object_type is ObjectType.NOTEBOOK - - def load_dependency(self, dependency: Dependency) -> SourceContainer | None: - object_info = self._ws.workspace.get_status(str(dependency.path)) - # TODO check error conditions, see https://github.com/databrickslabs/ucx/issues/1361 - return self._load_notebook(object_info) - - def _load_notebook(self, object_info: ObjectInfo) -> SourceContainer: - # local import to avoid cyclic dependency - # pylint: disable=import-outside-toplevel, cyclic-import - from databricks.labs.ucx.source_code.notebook import Notebook - - assert object_info.path is not None - assert object_info.language is not None - source = self._load_source(object_info) - return Notebook.parse(object_info.path, source, object_info.language) - - def _load_source(self, object_info: ObjectInfo) -> str: - assert object_info.path is not None - with self._ws.workspace.download(object_info.path, format=ExportFormat.SOURCE) as f: - return f.read().decode("utf-8") diff --git a/src/databricks/labs/ucx/source_code/dependency_resolvers.py b/src/databricks/labs/ucx/source_code/dependency_resolvers.py deleted file mode 100644 index efe98a4d4a..0000000000 --- a/src/databricks/labs/ucx/source_code/dependency_resolvers.py +++ /dev/null @@ -1,95 +0,0 @@ -from __future__ import annotations - -import typing -from collections.abc import Callable - -from pathlib import Path - -from databricks.labs.ucx.source_code.dependencies import Dependency, DependencyProblem -from databricks.labs.ucx.source_code.dependency_loaders import SitePackageContainer, WrappingLoader -from databricks.labs.ucx.source_code.site_packages import SitePackages -from databricks.labs.ucx.source_code.whitelist import Whitelist, UCCompatibility - -if typing.TYPE_CHECKING: - from databricks.labs.ucx.source_code.dependency_loaders import LocalFileLoader, NotebookLoader - - -class DependencyResolver: - def __init__( - self, - whitelist: Whitelist, - site_packages: SitePackages, - file_loader: LocalFileLoader, - notebook_loader: NotebookLoader, - ): - self._whitelist = whitelist - self._site_packages = site_packages - self._file_loader = file_loader - self._notebook_loader = notebook_loader - self._problems: list[DependencyProblem] = [] - - @property - def problems(self): - return self._problems - - def add_problems(self, problems: list[DependencyProblem]): - self._problems.extend(problems) - - # TODO problem_collector is tactical, pending https://github.com/databrickslabs/ucx/issues/1421 - def resolve_notebook( - self, path: Path, problem_collector: Callable[[DependencyProblem], None] | None = None - ) -> Dependency | None: - if self._notebook_loader.is_notebook(path): - return Dependency(self._notebook_loader, path) - problem = DependencyProblem('dependency-check', f"Notebook not found: {path.as_posix()}") - if problem_collector: - problem_collector(problem) - else: - self._problems.append(problem) - return None - - # TODO problem_collector is tactical, pending https://github.com/databrickslabs/ucx/issues/1421 - def resolve_local_file( - self, path: Path, problem_collector: Callable[[DependencyProblem], None] | None = None - ) -> Dependency | None: - if self._file_loader.is_file(path) and not self._file_loader.is_notebook(path): - return Dependency(self._file_loader, path) - problem = DependencyProblem('dependency-check', f"File not found: {path.as_posix()}") - if problem_collector: - problem_collector(problem) - else: - self._problems.append(problem) - return None - - # TODO problem_collector is tactical, pending https://github.com/databrickslabs/ucx/issues/1421 - def resolve_import(self, name: str, problem_collector: Callable[[DependencyProblem], None]) -> Dependency | None: - if self._is_whitelisted(name): - return None - if self._file_loader.is_file(Path(name)): - return Dependency(self._file_loader, Path(name)) - site_package = self._site_packages[name] - if site_package is None: - problem = DependencyProblem(code='dependency-check', message=f"Could not locate import: {name}") - problem_collector(problem) - return None - container = SitePackageContainer(self._file_loader, site_package) - return Dependency(WrappingLoader(container), Path(name)) - - def _is_whitelisted(self, name: str) -> bool: - compatibility = self._whitelist.compatibility(name) - # TODO attach compatibility to dependency, see https://github.com/databrickslabs/ucx/issues/1382 - if compatibility is None: - return False - if compatibility == UCCompatibility.NONE: - # TODO move to linter, see https://github.com/databrickslabs/ucx/issues/1527 - self._problems.append( - DependencyProblem( - code="dependency-check", - message=f"Use of dependency {name} is deprecated", - start_line=0, - start_col=0, - end_line=0, - end_col=0, - ) - ) - return True diff --git a/src/databricks/labs/ucx/source_code/files.py b/src/databricks/labs/ucx/source_code/files.py index be509b4959..f254cd214f 100644 --- a/src/databricks/labs/ucx/source_code/files.py +++ b/src/databricks/labs/ucx/source_code/files.py @@ -4,18 +4,21 @@ import logging import sys from pathlib import Path +from collections.abc import Callable, Iterable from databricks.sdk.service.workspace import Language -from databricks.labs.ucx.source_code.dependencies import ( +from databricks.labs.ucx.source_code.languages import Languages +from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage, NOTEBOOK_HEADER +from databricks.labs.ucx.source_code.python_linter import PythonLinter, ASTLinter +from databricks.labs.ucx.source_code.graph import ( DependencyGraph, + SourceContainer, DependencyProblem, + DependencyLoader, + Dependency, + BaseDependencyResolver, ) -from databricks.labs.ucx.source_code.dependency_loaders import SourceContainer -from databricks.labs.ucx.source_code.languages import Languages -from databricks.labs.ucx.source_code.notebook import CellLanguage -from databricks.labs.ucx.source_code.python_linter import PythonLinter, ASTLinter - logger = logging.getLogger(__name__) @@ -55,7 +58,7 @@ def build_dependency_graph(self, parent: DependencyGraph) -> None: parent.add_problems(call_problems) continue problem = DependencyProblem( - code='dependency-check', + code='dependency-not-constant', message="Can't check dependency not provided as a constant", source_path=self._path, start_line=call.lineno, @@ -65,14 +68,8 @@ def build_dependency_graph(self, parent: DependencyGraph) -> None: ) parent.add_problems([problem]) # TODO https://github.com/databrickslabs/ucx/issues/1287 - in_site_packages = "site-packages" in parent.dependency.path.as_posix() - sys_module_keys = sys.modules.keys() for pair in PythonLinter.list_import_sources(linter): import_name = pair[0] - # TODO remove HORRIBLE hack until we implement https://github.com/databrickslabs/ucx/issues/1421 - # if it's a site-package, provide full path until we implement 1421 - if in_site_packages and import_name not in sys_module_keys: - import_name = Path(parent.dependency.path.parent, import_name + ".py").as_posix() import_problems: list[DependencyProblem] = [] parent.register_import(import_name, import_problems.append) node = pair[1] @@ -138,3 +135,90 @@ def _apply_file_fix(self, path): logger.info(f"Overwriting {path}") f.write(code) return True + + +class FileLoader(DependencyLoader): + + def __init__(self, syspath_provider: SysPathProvider): + self._syspath_provider = syspath_provider + + def load_dependency(self, dependency: Dependency) -> SourceContainer | None: + fullpath = self.full_path(dependency.path) + assert fullpath is not None + return LocalFile(fullpath, fullpath.read_text("utf-8"), Language.PYTHON) + + def exists(self, path: Path) -> bool: + return self.full_path(path) is not None + + def full_path(self, path: Path) -> Path | None: + if path.is_file(): + return path + for parent in self._syspath_provider.paths: + child = Path(parent, path) + if child.is_file(): + return child + return None + + def is_notebook(self, path: Path) -> bool: + fullpath = self.full_path(path) + assert fullpath is not None + with fullpath.open(mode="r", encoding="utf-8") as stream: + line = stream.readline() + return NOTEBOOK_HEADER in line + + +class LocalFileResolver(BaseDependencyResolver): + + def __init__(self, file_loader: FileLoader, next_resolver: BaseDependencyResolver | None = None): + super().__init__(next_resolver) + self._file_loader = file_loader + + def with_next_resolver(self, resolver: BaseDependencyResolver) -> BaseDependencyResolver: + return LocalFileResolver(self._file_loader, resolver) + + # TODO problem_collector is tactical, pending https://github.com/databrickslabs/ucx/issues/1559 + def resolve_local_file( + self, path: Path, problem_collector: Callable[[DependencyProblem], None] + ) -> Dependency | None: + if self._file_loader.exists(path) and not self._file_loader.is_notebook(path): + return Dependency(self._file_loader, path) + return super().resolve_local_file(path, problem_collector) + + def resolve_import(self, name: str, problem_collector: Callable[[DependencyProblem], None]) -> Dependency | None: + fullpath = self._file_loader.full_path(Path(f"{name}.py")) + if fullpath is not None: + return Dependency(self._file_loader, fullpath) + return super().resolve_import(name, problem_collector) + + +class SysPathProvider: + + @classmethod + def from_pathlike_string(cls, syspath: str): + paths = syspath.split(':') + return SysPathProvider([Path(path) for path in paths]) + + @classmethod + def from_sys_path(cls): + return SysPathProvider([Path(path) for path in sys.path]) + + def __init__(self, paths: list[Path]): + self._paths = paths + + def push(self, path: Path): + self._paths.insert(0, path) + + def insert(self, index: int, path: Path): + self._paths.insert(index, path) + + def remove(self, index: int): + del self._paths[index] + + def pop(self) -> Path: + result = self._paths[0] + del self._paths[0] + return result + + @property + def paths(self) -> Iterable[Path]: + yield from self._paths diff --git a/src/databricks/labs/ucx/source_code/dependencies.py b/src/databricks/labs/ucx/source_code/graph.py similarity index 59% rename from src/databricks/labs/ucx/source_code/dependencies.py rename to src/databricks/labs/ucx/source_code/graph.py index eedecd1114..f3fe281077 100644 --- a/src/databricks/labs/ucx/source_code/dependencies.py +++ b/src/databricks/labs/ucx/source_code/graph.py @@ -2,72 +2,12 @@ import abc import ast -import typing from dataclasses import dataclass -from collections.abc import Callable from pathlib import Path - +from collections.abc import Callable, Iterable from databricks.labs.ucx.source_code.python_linter import ASTLinter, PythonLinter -if typing.TYPE_CHECKING: - from databricks.labs.ucx.source_code.dependency_resolvers import DependencyResolver - from databricks.labs.ucx.source_code.dependency_loaders import SourceContainer, DependencyLoader - - -MISSING_SOURCE_PATH = "" - - -@dataclass -class DependencyProblem: - code: str - message: str - source_path: Path = Path(MISSING_SOURCE_PATH) - start_line: int = -1 - start_col: int = -1 - end_line: int = -1 - end_col: int = -1 - - def replace( - self, - code: str | None = None, - message: str | None = None, - source_path: Path | None = None, - start_line: int | None = None, - start_col: int | None = None, - end_line: int | None = None, - end_col: int | None = None, - ) -> DependencyProblem: - return DependencyProblem( - code if code is not None else self.code, - message if message is not None else self.message, - source_path if source_path is not None else self.source_path, - start_line if start_line is not None else self.start_line, - start_col if start_col is not None else self.start_col, - end_line if end_line is not None else self.end_line, - end_col if end_col is not None else self.end_col, - ) - - -class Dependency(abc.ABC): - - def __init__(self, loader: DependencyLoader, path: Path): - self._loader = loader - self._path = path - - @property - def path(self) -> Path: - return self._path - - def __hash__(self): - return hash(self.path) - - def __eq__(self, other): - return isinstance(other, type(self)) and self.path == other.path - - def load(self) -> SourceContainer | None: - return self._loader.load_dependency(self) - class DependencyGraph: @@ -94,7 +34,7 @@ def add_problems(self, problems: list[DependencyProblem]): problems = [problem.replace(source_path=self.dependency.path) for problem in problems] self._resolver.add_problems(problems) - # TODO problem_collector is tactical, pending https://github.com/databrickslabs/ucx/issues/1421 + # TODO problem_collector is tactical, pending https://github.com/databrickslabs/ucx/issues/1559 def register_notebook( self, path: Path, problem_collector: Callable[[DependencyProblem], None] ) -> DependencyGraph | None: @@ -103,7 +43,7 @@ def register_notebook( return None return self.register_dependency(resolved) - # TODO problem_collector is tactical, pending https://github.com/databrickslabs/ucx/issues/1421 + # TODO problem_collector is tactical, pending https://github.com/databrickslabs/ucx/issues/1559 def register_import( self, name: str, problem_collector: Callable[[DependencyProblem], None] ) -> DependencyGraph | None: @@ -145,10 +85,6 @@ def check_registered_dependency(graph): if graph_posix_path == posix_path: found.append(graph) return True - # TODO remove HORRIBLE hack until we implement https://github.com/databrickslabs/ucx/issues/1421 - if "site-packages" in graph_posix_path and graph_posix_path.endswith(posix_path): - found.append(graph) - return True return False self.root.visit(check_registered_dependency, set()) @@ -159,7 +95,7 @@ def root(self): return self if self._parent is None else self._parent.root @property - def dependencies(self) -> set[Dependency]: + def all_dependencies(self) -> set[Dependency]: dependencies: set[Dependency] = set() def add_to_dependencies(graph: DependencyGraph) -> bool: @@ -172,8 +108,12 @@ def add_to_dependencies(graph: DependencyGraph) -> bool: return dependencies @property - def paths(self) -> set[Path]: - return {d.path for d in self.dependencies} + def local_dependencies(self) -> set[Dependency]: + return {child.dependency for child in self._dependencies.values()} + + @property + def all_paths(self) -> set[Path]: + return {d.path for d in self.all_dependencies} # when visit_node returns True it interrupts the visit def visit(self, visit_node: Callable[[DependencyGraph], bool | None], visited: set[Path]) -> bool | None: @@ -207,7 +147,7 @@ def build_graph_from_python_source(self, python_code: str, problem_collector: Ca problem_collector(problem) else: problem = DependencyProblem( - code='dependency-check', + code='dependency-not-constant', message="Can't check dependency not provided as a constant", start_line=call.lineno, start_col=call.col_offset, @@ -229,6 +169,210 @@ def build_graph_from_python_source(self, python_code: str, problem_collector: Ca problem_collector(problem) +class Dependency(abc.ABC): + + def __init__(self, loader: DependencyLoader, path: Path): + self._loader = loader + self._path = path + + @property + def path(self) -> Path: + return self._path + + def __hash__(self): + return hash(self.path) + + def __eq__(self, other): + return isinstance(other, type(self)) and self.path == other.path + + def load(self) -> SourceContainer | None: + return self._loader.load_dependency(self) + + +class SourceContainer(abc.ABC): + + @abc.abstractmethod + def build_dependency_graph(self, parent: DependencyGraph) -> None: + raise NotImplementedError() + + +class DependencyLoader(abc.ABC): + + @abc.abstractmethod + def load_dependency(self, dependency: Dependency) -> SourceContainer | None: + raise NotImplementedError() + + @abc.abstractmethod + def is_notebook(self, path: Path) -> bool: + raise NotImplementedError() + + +class WrappingLoader(DependencyLoader): + + def __init__(self, source_container: SourceContainer): + self._source_container = source_container + + def is_notebook(self, path: Path) -> bool: + raise NotImplementedError() # should never happen + + def load_dependency(self, dependency: Dependency) -> SourceContainer | None: + return self._source_container + + +class BaseDependencyResolver(abc.ABC): + + def __init__(self, next_resolver: BaseDependencyResolver | None): + self._next_resolver = next_resolver + self._problems: list[DependencyProblem] = [] + + @abc.abstractmethod + def with_next_resolver(self, resolver: BaseDependencyResolver) -> BaseDependencyResolver: + raise NotImplementedError() + + @property + def problems(self): + return self._problems + + def add_problems(self, problems: list[DependencyProblem]): + self._problems.extend(problems) + + @property + def next_resolver(self): + return self._next_resolver + + def resolve_notebook(self, path: Path, problem_collector: Callable[[DependencyProblem], None]) -> Dependency | None: + assert self._next_resolver is not None + return self._next_resolver.resolve_notebook(path, problem_collector) + + def resolve_local_file( + self, path: Path, problem_collector: Callable[[DependencyProblem], None] + ) -> Dependency | None: + assert self._next_resolver is not None + return self._next_resolver.resolve_local_file(path, problem_collector) + + def resolve_import(self, name: str, problem_collector: Callable[[DependencyProblem], None]) -> Dependency | None: + assert self._next_resolver is not None + return self._next_resolver.resolve_import(name, problem_collector) + + +class StubResolver(BaseDependencyResolver): + + def __init__(self): + super().__init__(None) + + def with_next_resolver(self, resolver: BaseDependencyResolver) -> BaseDependencyResolver: + raise NotImplementedError("Should never happen!") + + def resolve_notebook(self, path: Path, problem_collector: Callable[[DependencyProblem], None]) -> Dependency | None: + return None + + def resolve_local_file( + self, path: Path, problem_collector: Callable[[DependencyProblem], None] + ) -> Dependency | None: + return None + + def resolve_import(self, name: str, problem_collector: Callable[[DependencyProblem], None]) -> Dependency | None: + return None + + +class DependencyResolver: + def __init__(self, resolvers: list[BaseDependencyResolver]): + previous: BaseDependencyResolver = StubResolver() + for resolver in resolvers: + resolver = resolver.with_next_resolver(previous) + previous = resolver + self._resolver: BaseDependencyResolver = previous + + def resolve_notebook( + self, path: Path, problem_collector: Callable[[DependencyProblem], None] | None = None + ) -> Dependency | None: + problems: list[DependencyProblem] = [] + dependency = self._resolver.resolve_notebook(path, problems.append) + if dependency is None: + problem = DependencyProblem('notebook-not-found', f"Notebook not found: {path.as_posix()}") + problems.append(problem) + if problem_collector: + for problem in problems: + problem_collector(problem) + else: + self.add_problems(problems) + return dependency + + def resolve_local_file( + self, path: Path, problem_collector: Callable[[DependencyProblem], None] | None = None + ) -> Dependency | None: + problems: list[DependencyProblem] = [] + dependency = self._resolver.resolve_local_file(path, problems.append) + if dependency is None: + problem = DependencyProblem('file-not-found', f"File not found: {path.as_posix()}") + problems.append(problem) + if problem_collector: + for problem in problems: + problem_collector(problem) + else: + self.add_problems(problems) + return dependency + + def resolve_import( + self, name: str, problem_collector: Callable[[DependencyProblem], None] | None = None + ) -> Dependency | None: + problems: list[DependencyProblem] = [] + dependency = self._resolver.resolve_import(name, problems.append) + if dependency is None: + problem = DependencyProblem('import-not-found', f"Could not locate import: {name}") + problems.append(problem) + if problem_collector: + for problem in problems: + problem_collector(problem) + else: + self.add_problems(problems) + return dependency + + @property + def problems(self) -> Iterable[DependencyProblem]: + resolver = self._resolver + while resolver is not None: + yield from resolver.problems + resolver = resolver.next_resolver + + def add_problems(self, problems: list[DependencyProblem]): + self._resolver.add_problems(problems) + + +MISSING_SOURCE_PATH = "" + + +@dataclass +class DependencyProblem: + code: str + message: str + source_path: Path = Path(MISSING_SOURCE_PATH) + start_line: int = -1 + start_col: int = -1 + end_line: int = -1 + end_col: int = -1 + + def replace( + self, + code: str | None = None, + message: str | None = None, + source_path: Path | None = None, + start_line: int | None = None, + start_col: int | None = None, + end_line: int | None = None, + end_col: int | None = None, + ) -> DependencyProblem: + return DependencyProblem( + code if code is not None else self.code, + message if message is not None else self.message, + source_path if source_path is not None else self.source_path, + start_line if start_line is not None else self.start_line, + start_col if start_col is not None else self.start_col, + end_line if end_line is not None else self.end_line, + end_col if end_col is not None else self.end_col, + ) + + class DependencyGraphBuilder: def __init__(self, resolver: DependencyResolver): diff --git a/src/databricks/labs/ucx/source_code/notebook_linter.py b/src/databricks/labs/ucx/source_code/notebook_linter.py deleted file mode 100644 index 16f09061e9..0000000000 --- a/src/databricks/labs/ucx/source_code/notebook_linter.py +++ /dev/null @@ -1,38 +0,0 @@ -from collections.abc import Iterable - -from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex -from databricks.labs.ucx.source_code.base import Advice -from databricks.labs.ucx.source_code.notebook import Notebook -from databricks.labs.ucx.source_code.languages import Languages, Language - - -class NotebookLinter: - """ - Parses a Databricks notebook and then applies available linters - to the code cells according to the language of the cell. - """ - - def __init__(self, langs: Languages, notebook: Notebook): - self._languages: Languages = langs - self._notebook: Notebook = notebook - - @classmethod - def from_source(cls, index: MigrationIndex, source: str, default_language: Language) -> 'NotebookLinter': - langs = Languages(index) - notebook = Notebook.parse("", source, default_language) - assert notebook is not None - return cls(langs, notebook) - - def lint(self) -> Iterable[Advice]: - for cell in self._notebook.cells: - if not self._languages.is_supported(cell.language.language): - continue - linter = self._languages.linter(cell.language.language) - for advice in linter.lint(cell.original_code): - yield advice.replace( - start_line=advice.start_line + cell.original_offset, end_line=advice.end_line + cell.original_offset - ) - - @staticmethod - def name() -> str: - return "notebook-linter" diff --git a/src/databricks/labs/ucx/source_code/notebooks/__init__.py b/src/databricks/labs/ucx/source_code/notebooks/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/databricks/labs/ucx/source_code/notebook.py b/src/databricks/labs/ucx/source_code/notebooks/cells.py similarity index 83% rename from src/databricks/labs/ucx/source_code/notebook.py rename to src/databricks/labs/ucx/source_code/notebooks/cells.py index 4d40fe62f7..49d4f82d5b 100644 --- a/src/databricks/labs/ucx/source_code/notebook.py +++ b/src/databricks/labs/ucx/source_code/notebooks/cells.py @@ -1,24 +1,16 @@ -from __future__ import annotations # for type hints +from __future__ import annotations import logging from abc import ABC, abstractmethod from ast import parse as parse_python -from collections.abc import Callable from enum import Enum from pathlib import Path - -from sqlglot import ParseError as SQLParseError -from sqlglot import parse as parse_sql +from collections.abc import Callable +from sqlglot import parse as parse_sql, ParseError as SQLParseError from databricks.sdk.service.workspace import Language - -from databricks.labs.ucx.source_code.dependencies import ( - DependencyGraph, - DependencyProblem, -) -from databricks.labs.ucx.source_code.dependency_loaders import SourceContainer +from databricks.labs.ucx.source_code.graph import DependencyGraph, DependencyProblem -logger = logging.getLogger(__name__) # use a specific logger for sqlglot warnings so we can disable them selectively sqlglot_logger = logging.getLogger(f"{__name__}.sqlglot") @@ -171,7 +163,7 @@ def build_dependency_graph(self, parent: DependencyGraph, problem_collector: Cal return start_line = self._original_offset + 1 problem = DependencyProblem( - 'dependency-check', + 'invalid-run-cell', "Missing notebook path in %run command", start_line=start_line, start_col=0, @@ -367,56 +359,3 @@ def wrap_with_magic(self, code: str, cell_language: CellLanguage) -> str: if code.endswith('./'): lines.append('\n') return "\n".join(lines) - - -class Notebook(SourceContainer): - - @staticmethod - def parse(path: str, source: str, default_language: Language) -> Notebook: - default_cell_language = CellLanguage.of_language(default_language) - cells = default_cell_language.extract_cells(source) - if cells is None: - raise ValueError(f"Could not parse Notebook: {path}") - return Notebook(path, source, default_language, cells, source.endswith('\n')) - - def __init__(self, path: str, source: str, language: Language, cells: list[Cell], ends_with_lf): - self._path = path - self._source = source - self._language = language - self._cells = cells - self._ends_with_lf = ends_with_lf - - @property - def path(self) -> str: - return self._path - - @property - def cells(self) -> list[Cell]: - return self._cells - - @property - def original_code(self) -> str: - return self._source - - def to_migrated_code(self): - default_language = CellLanguage.of_language(self._language) - header = f"{default_language.comment_prefix} {NOTEBOOK_HEADER}" - sources = [header] - for i, cell in enumerate(self._cells): - migrated_code = cell.migrated_code - if cell.language is not default_language: - migrated_code = default_language.wrap_with_magic(migrated_code, cell.language) - sources.append(migrated_code) - if i < len(self._cells) - 1: - sources.append('') - sources.append(f'{default_language.comment_prefix} {CELL_SEPARATOR}') - sources.append('') - if self._ends_with_lf: - sources.append('') # following join will append lf - return '\n'.join(sources) - - def build_dependency_graph(self, parent: DependencyGraph) -> None: - problems: list[DependencyProblem] = [] - for cell in self._cells: - cell.build_dependency_graph(parent, problems.append) - parent.add_problems(problems) diff --git a/src/databricks/labs/ucx/source_code/notebooks/loaders.py b/src/databricks/labs/ucx/source_code/notebooks/loaders.py new file mode 100644 index 0000000000..6ae700f6cd --- /dev/null +++ b/src/databricks/labs/ucx/source_code/notebooks/loaders.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +import abc +from pathlib import Path +from collections.abc import Callable + +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.workspace import ObjectType, ObjectInfo, ExportFormat + +from databricks.labs.ucx.source_code.files import FileLoader +from databricks.labs.ucx.source_code.graph import ( + BaseDependencyResolver, + DependencyProblem, + Dependency, + DependencyLoader, + SourceContainer, +) +from databricks.labs.ucx.source_code.notebooks.sources import Notebook + + +class NotebookResolver(BaseDependencyResolver): + + def __init__(self, notebook_loader: NotebookLoader, next_resolver: BaseDependencyResolver | None = None): + super().__init__(next_resolver) + self._notebook_loader = notebook_loader + + def with_next_resolver(self, resolver: BaseDependencyResolver) -> BaseDependencyResolver: + return NotebookResolver(self._notebook_loader, resolver) + + def resolve_notebook(self, path: Path, problem_collector: Callable[[DependencyProblem], None]) -> Dependency | None: + if self._notebook_loader.is_notebook(path): + return Dependency(self._notebook_loader, path) + return super().resolve_notebook(path, problem_collector) + + +class NotebookLoader(DependencyLoader, abc.ABC): + pass + + +class WorkspaceNotebookLoader(NotebookLoader): + + def __init__(self, ws: WorkspaceClient): + self._ws = ws + + def is_notebook(self, path: Path): + object_info = self._ws.workspace.get_status(str(path)) + # TODO check error conditions, see https://github.com/databrickslabs/ucx/issues/1361 + return object_info is not None and object_info.object_type is ObjectType.NOTEBOOK + + def load_dependency(self, dependency: Dependency) -> SourceContainer | None: + object_info = self._ws.workspace.get_status(str(dependency.path)) + # TODO check error conditions, see https://github.com/databrickslabs/ucx/issues/1361 + return self._load_notebook(object_info) + + def _load_notebook(self, object_info: ObjectInfo) -> SourceContainer: + assert object_info.path is not None + assert object_info.language is not None + source = self._load_source(object_info) + return Notebook.parse(object_info.path, source, object_info.language) + + def _load_source(self, object_info: ObjectInfo) -> str: + assert object_info.path is not None + with self._ws.workspace.download(object_info.path, format=ExportFormat.SOURCE) as f: + return f.read().decode("utf-8") + + +class LocalNotebookLoader(NotebookLoader, FileLoader): + # see https://github.com/databrickslabs/ucx/issues/1499 + pass diff --git a/src/databricks/labs/ucx/source_code/notebook_migrator.py b/src/databricks/labs/ucx/source_code/notebooks/migrator.py similarity index 84% rename from src/databricks/labs/ucx/source_code/notebook_migrator.py rename to src/databricks/labs/ucx/source_code/notebooks/migrator.py index 8eac50f37d..4201e6a3f0 100644 --- a/src/databricks/labs/ucx/source_code/notebook_migrator.py +++ b/src/databricks/labs/ucx/source_code/notebooks/migrator.py @@ -1,14 +1,15 @@ +from __future__ import annotations + from pathlib import Path from databricks.sdk import WorkspaceClient -from databricks.sdk.service.workspace import ExportFormat, ObjectInfo, ObjectType +from databricks.sdk.service.workspace import ObjectInfo, ExportFormat, ObjectType +from databricks.labs.ucx.source_code.graph import Dependency from databricks.labs.ucx.source_code.languages import Languages -from databricks.labs.ucx.source_code.notebook import Notebook, RunCell -from databricks.labs.ucx.source_code.dependencies import ( - Dependency, -) -from databricks.labs.ucx.source_code.dependency_loaders import WorkspaceNotebookLoader +from databricks.labs.ucx.source_code.notebooks.cells import RunCell +from databricks.labs.ucx.source_code.notebooks.loaders import WorkspaceNotebookLoader +from databricks.labs.ucx.source_code.notebooks.sources import Notebook class NotebookMigrator: diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py new file mode 100644 index 0000000000..90e4d4fe07 --- /dev/null +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +from collections.abc import Iterable + +from databricks.sdk.service.workspace import Language + +from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex +from databricks.labs.ucx.source_code.base import Advice + +from databricks.labs.ucx.source_code.graph import SourceContainer, DependencyGraph, DependencyProblem +from databricks.labs.ucx.source_code.languages import Languages +from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage, Cell, NOTEBOOK_HEADER, CELL_SEPARATOR + + +class Notebook(SourceContainer): + + @staticmethod + def parse(path: str, source: str, default_language: Language) -> Notebook: + default_cell_language = CellLanguage.of_language(default_language) + cells = default_cell_language.extract_cells(source) + if cells is None: + raise ValueError(f"Could not parse Notebook: {path}") + return Notebook(path, source, default_language, cells, source.endswith('\n')) + + def __init__(self, path: str, source: str, language: Language, cells: list[Cell], ends_with_lf): + self._path = path + self._source = source + self._language = language + self._cells = cells + self._ends_with_lf = ends_with_lf + + @property + def path(self) -> str: + return self._path + + @property + def cells(self) -> list[Cell]: + return self._cells + + @property + def original_code(self) -> str: + return self._source + + def to_migrated_code(self): + default_language = CellLanguage.of_language(self._language) + header = f"{default_language.comment_prefix} {NOTEBOOK_HEADER}" + sources = [header] + for i, cell in enumerate(self._cells): + migrated_code = cell.migrated_code + if cell.language is not default_language: + migrated_code = default_language.wrap_with_magic(migrated_code, cell.language) + sources.append(migrated_code) + if i < len(self._cells) - 1: + sources.append('') + sources.append(f'{default_language.comment_prefix} {CELL_SEPARATOR}') + sources.append('') + if self._ends_with_lf: + sources.append('') # following join will append lf + return '\n'.join(sources) + + def build_dependency_graph(self, parent: DependencyGraph) -> None: + problems: list[DependencyProblem] = [] + for cell in self._cells: + cell.build_dependency_graph(parent, problems.append) + parent.add_problems(problems) + + +class NotebookLinter: + """ + Parses a Databricks notebook and then applies available linters + to the code cells according to the language of the cell. + """ + + def __init__(self, langs: Languages, notebook: Notebook): + self._languages: Languages = langs + self._notebook: Notebook = notebook + + @classmethod + def from_source(cls, index: MigrationIndex, source: str, default_language: Language) -> 'NotebookLinter': + langs = Languages(index) + notebook = Notebook.parse("", source, default_language) + assert notebook is not None + return cls(langs, notebook) + + def lint(self) -> Iterable[Advice]: + for cell in self._notebook.cells: + if not self._languages.is_supported(cell.language.language): + continue + linter = self._languages.linter(cell.language.language) + for advice in linter.lint(cell.original_code): + yield advice.replace( + start_line=advice.start_line + cell.original_offset, + end_line=advice.end_line + cell.original_offset, + ) + + @staticmethod + def name() -> str: + return "notebook-linter" diff --git a/src/databricks/labs/ucx/source_code/site_packages.py b/src/databricks/labs/ucx/source_code/site_packages.py index 918e93e5e6..9dd857150b 100644 --- a/src/databricks/labs/ucx/source_code/site_packages.py +++ b/src/databricks/labs/ucx/source_code/site_packages.py @@ -1,6 +1,72 @@ +from __future__ import annotations + import os from dataclasses import dataclass from pathlib import Path +from collections.abc import Callable +from databricks.labs.ucx.source_code.files import FileLoader, SysPathProvider +from databricks.labs.ucx.source_code.graph import ( + Dependency, + WrappingLoader, + SourceContainer, + DependencyGraph, + BaseDependencyResolver, + DependencyProblem, +) + + +class SitePackagesResolver(BaseDependencyResolver): + + def __init__( + self, + site_packages: SitePackages, + file_loader: FileLoader, + syspath_provider: SysPathProvider, + next_resolver: BaseDependencyResolver | None = None, + ): + super().__init__(next_resolver) + self._site_packages = site_packages + self._file_loader = file_loader + self._syspath_provider = syspath_provider + + def with_next_resolver(self, resolver: BaseDependencyResolver) -> BaseDependencyResolver: + return SitePackagesResolver(self._site_packages, self._file_loader, self._syspath_provider, resolver) + + def resolve_import(self, name: str, problem_collector: Callable[[DependencyProblem], None]) -> Dependency | None: + site_package = self._site_packages[name] + if site_package is not None: + container = SitePackageContainer(self._file_loader, site_package) + return Dependency(WrappingLoader(container), Path(name)) + return super().resolve_import(name, problem_collector) + + +class SitePackageContainer(SourceContainer): + + def __init__(self, file_loader: FileLoader, site_package: SitePackage): + self._file_loader = file_loader + self._site_package = site_package + + def build_dependency_graph(self, parent: DependencyGraph) -> None: + for module_path in self._site_package.module_paths: + parent.register_dependency(Dependency(self._file_loader, module_path)) + + +class SitePackages: + + @staticmethod + def parse(site_packages_path: Path): + dist_info_dirs = [dir for dir in os.listdir(site_packages_path) if dir.endswith(".dist-info")] + packages = [SitePackage.parse(Path(site_packages_path, dist_info_dir)) for dist_info_dir in dist_info_dirs] + return SitePackages(packages) + + def __init__(self, packages: list[SitePackage]): + self._packages: dict[str, SitePackage] = {} + for package in packages: + for top_level in package.top_levels: + self._packages[top_level] = package + + def __getitem__(self, item: str) -> SitePackage | None: + return self._packages.get(item, None) @dataclass @@ -37,21 +103,3 @@ def top_levels(self) -> list[str]: @property def module_paths(self) -> list[Path]: return [Path(self._dist_info_path.parent, path) for path in self._module_paths] - - -class SitePackages: - - @staticmethod - def parse(site_packages_path: Path): - dist_info_dirs = [dir for dir in os.listdir(site_packages_path) if dir.endswith(".dist-info")] - packages = [SitePackage.parse(Path(site_packages_path, dist_info_dir)) for dist_info_dir in dist_info_dirs] - return SitePackages(packages) - - def __init__(self, packages: list[SitePackage]): - self._packages: dict[str, SitePackage] = {} - for package in packages: - for top_level in package.top_levels: - self._packages[top_level] = package - - def __getitem__(self, item: str) -> SitePackage | None: - return self._packages.get(item, None) diff --git a/src/databricks/labs/ucx/source_code/whitelist.py b/src/databricks/labs/ucx/source_code/whitelist.py index 075084cc3c..e34ad38edd 100644 --- a/src/databricks/labs/ucx/source_code/whitelist.py +++ b/src/databricks/labs/ucx/source_code/whitelist.py @@ -1,10 +1,64 @@ +from __future__ import annotations + import abc import sys -from collections.abc import Iterable from dataclasses import dataclass, field from enum import Enum +from pathlib import Path +from collections.abc import Callable, Iterable from yaml import load_all as load_yaml, Loader +from databricks.labs.ucx.source_code.graph import ( + Dependency, + WrappingLoader, + SourceContainer, + DependencyGraph, + BaseDependencyResolver, + DependencyProblem, +) + + +class WhitelistResolver(BaseDependencyResolver): + + def __init__(self, whitelist: Whitelist, next_resolver: BaseDependencyResolver | None = None): + super().__init__(next_resolver) + self._whitelist = whitelist + + def with_next_resolver(self, resolver: BaseDependencyResolver) -> BaseDependencyResolver: + return WhitelistResolver(self._whitelist, resolver) + + # TODO problem_collector is tactical, pending https://github.com/databrickslabs/ucx/issues/1559 + def resolve_import(self, name: str, problem_collector: Callable[[DependencyProblem], None]) -> Dependency | None: + if self._is_whitelisted(name): + container = StubContainer() + return Dependency(WrappingLoader(container), Path(name)) + return super().resolve_import(name, problem_collector) + + def _is_whitelisted(self, name: str) -> bool: + compatibility = self._whitelist.compatibility(name) + # TODO attach compatibility to dependency, see https://github.com/databrickslabs/ucx/issues/1382 + if compatibility is None: + return False + if compatibility == UCCompatibility.NONE: + # TODO move to linter, see https://github.com/databrickslabs/ucx/issues/1527 + self._problems.append( + DependencyProblem( + code="dependency-check", + message=f"Use of dependency {name} is deprecated", + start_line=0, + start_col=0, + end_line=0, + end_col=0, + ) + ) + return True + + +class StubContainer(SourceContainer): + + def build_dependency_graph(self, parent: DependencyGraph) -> None: + pass + class UCCompatibility(Enum): NONE = "none" diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py index 92ab0735f3..3e4986ca22 100644 --- a/tests/unit/__init__.py +++ b/tests/unit/__init__.py @@ -18,9 +18,10 @@ from databricks.sdk.service.workspace import ExportResponse, GetSecretResponse, Language from databricks.labs.ucx.hive_metastore.mapping import TableMapping, TableToMigrate -from databricks.labs.ucx.source_code.dependency_loaders import SourceContainer -from databricks.labs.ucx.source_code.files import LocalFile -from databricks.labs.ucx.source_code.notebook import Notebook, NOTEBOOK_HEADER +from databricks.labs.ucx.source_code.graph import SourceContainer +from databricks.labs.ucx.source_code.files import LocalFile, FileLoader, SysPathProvider +from databricks.labs.ucx.source_code.notebooks.sources import Notebook +from databricks.labs.ucx.source_code.notebooks.cells import NOTEBOOK_HEADER from databricks.labs.ucx.source_code.whitelist import Whitelist logging.getLogger("tests").setLevel("DEBUG") @@ -132,6 +133,10 @@ def _load_sources(cls: type, *filenames: str): return [installation.load(cls, filename=_) for _ in filenames] +def _samples_path(cls: type) -> str: + return (__dir / _FOLDERS[cls]).as_posix() + + def _cluster_policy(policy_id: str): fixture = _load_fixture(f"{_FOLDERS[Policy]}/{policy_id}.json") definition = json.dumps(fixture["definition"]) @@ -195,7 +200,7 @@ def _load_dependency_side_effect(sources: dict[str, str], visited: dict[str, boo def _is_notebook_side_effect(sources: dict[str, str], *args): dependency = args[0] - filename = str(dependency.path) + filename = dependency.path.as_posix() if filename.startswith('./'): filename = filename[2:] source = sources.get(filename, None) @@ -209,6 +214,59 @@ def _is_notebook_side_effect(sources: dict[str, str], *args): return NOTEBOOK_HEADER in source +def _full_path_side_effect(sources: dict[str, str], *args): + path = args[0] + filename = path.as_posix() + if filename.startswith('./'): + filename = filename[2:] + if filename in sources: + return pathlib.Path(filename) + if filename.find(".py") < 0: + filename = filename + ".py" + if filename.find(".txt") < 0: + filename = filename + ".txt" + if filename in sources: + return pathlib.Path(filename) + return None + + +def _is_file_side_effect(sources: dict[str, str], *args): + return _full_path_side_effect(sources, *args) is not None + + +def _local_loader_with_side_effects(cls: type, sources: dict[str, str], visited: dict[str, bool]): + file_loader = create_autospec(cls) + file_loader.exists.side_effect = lambda *args, **kwargs: _is_file_side_effect(sources, *args) + file_loader.is_notebook.return_value = False + file_loader.full_path.side_effect = lambda *args: _full_path_side_effect(sources, *args) + file_loader.load_dependency.side_effect = lambda *args, **kwargs: _load_dependency_side_effect( + sources, visited, *args + ) + return file_loader + + +class TestFileLoader(FileLoader): + __test__ = False + + def __init__(self, syspath_provider: SysPathProvider, sources: dict[str, str]): + super().__init__(syspath_provider) + self._sources = sources + + def exists(self, path: pathlib.Path): + if super().exists(path): + return True + filename = path.as_posix() + if filename.startswith('./'): + filename = filename[2:] + if filename in self._sources: + return True + if filename.find(".py") < 0: + filename = filename + ".py" + if filename.find(".txt") < 0: + filename = filename + ".txt" + return filename in self._sources + + def workspace_client_mock( cluster_ids: list[str] | None = None, pipeline_ids: list[str] | None = None, diff --git a/tests/unit/source_code/test_dependencies.py b/tests/unit/source_code/test_dependencies.py index 803ef91a23..12c4b04f51 100644 --- a/tests/unit/source_code/test_dependencies.py +++ b/tests/unit/source_code/test_dependencies.py @@ -1,29 +1,32 @@ -import os.path from pathlib import Path from unittest.mock import create_autospec from databricks.sdk import WorkspaceClient from databricks.sdk.service.workspace import ObjectInfo, Language, ObjectType -from databricks.labs.ucx.source_code.dependencies import ( - DependencyGraphBuilder, +from databricks.labs.ucx.source_code.graph import ( + SourceContainer, + DependencyResolver, DependencyProblem, + DependencyGraphBuilder, ) -from databricks.labs.ucx.source_code.dependency_loaders import ( - SourceContainer, - LocalFileLoader, - LocalNotebookLoader, +from databricks.labs.ucx.source_code.notebooks.loaders import ( + NotebookResolver, WorkspaceNotebookLoader, + LocalNotebookLoader, ) -from databricks.labs.ucx.source_code.dependency_resolvers import DependencyResolver -from databricks.labs.ucx.source_code.site_packages import SitePackages -from databricks.labs.ucx.source_code.whitelist import Whitelist +from databricks.labs.ucx.source_code.files import FileLoader, LocalFileResolver, SysPathProvider +from databricks.labs.ucx.source_code.whitelist import WhitelistResolver, Whitelist +from databricks.labs.ucx.source_code.site_packages import SitePackagesResolver, SitePackages from tests.unit import ( _load_sources, _download_side_effect, whitelist_mock, _load_dependency_side_effect, locate_site_packages, + TestFileLoader, + _samples_path, + _local_loader_with_side_effects, ) @@ -39,11 +42,13 @@ def get_status_side_effect(*args): ws = create_autospec(WorkspaceClient) ws.workspace.download.side_effect = lambda *args, **kwargs: _download_side_effect(sources, visited, *args, **kwargs) ws.workspace.get_status.side_effect = get_status_side_effect - whi = whitelist_mock() - file_loader = create_autospec(LocalFileLoader) - file_loader.is_notebook.return_value = False - site_packages = SitePackages.parse(locate_site_packages()) - builder = DependencyGraphBuilder(DependencyResolver(whi, site_packages, file_loader, WorkspaceNotebookLoader(ws))) + workspace_notebook_loader = WorkspaceNotebookLoader(ws) + dependency_resolver = DependencyResolver( + [ + NotebookResolver(workspace_notebook_loader), + ] + ) + builder = DependencyGraphBuilder(dependency_resolver) builder.build_notebook_dependency_graph(Path("root3.run.py.txt")) assert len(visited) == len(paths) @@ -53,20 +58,29 @@ def test_dependency_graph_builder_visits_local_notebook_dependencies(): sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) visited: dict[str, bool] = {} whi = whitelist_mock() - file_loader = create_autospec(LocalFileLoader) + file_loader = create_autospec(FileLoader) file_loader.load_dependency.side_effect = lambda *args, **kwargs: _load_dependency_side_effect( sources, visited, *args ) file_loader.is_notebook.return_value = True - file_loader.is_file.return_value = True + file_loader.exists.return_value = True notebook_loader = create_autospec(LocalNotebookLoader) notebook_loader.load_dependency.side_effect = lambda *args, **kwargs: _load_dependency_side_effect( sources, visited, *args ) notebook_loader.is_notebook.return_value = True - notebook_loader.is_file.return_value = True + notebook_loader.exists.return_value = True site_packages = SitePackages.parse(locate_site_packages()) - builder = DependencyGraphBuilder(DependencyResolver(whi, site_packages, file_loader, notebook_loader)) + provider = SysPathProvider.from_pathlike_string("") + dependency_resolver = DependencyResolver( + [ + NotebookResolver(notebook_loader), + SitePackagesResolver(site_packages, file_loader, provider), + WhitelistResolver(whi), + LocalFileResolver(file_loader), + ] + ) + builder = DependencyGraphBuilder(dependency_resolver) builder.build_notebook_dependency_graph(Path("root4.py.txt")) assert len(visited) == len(paths) @@ -75,27 +89,21 @@ def test_dependency_graph_builder_visits_workspace_file_dependencies(): paths = ["root8.py.txt", "leaf1.py.txt", "leaf2.py.txt"] sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) visited: dict[str, bool] = {} - - def get_status_side_effect(*args): - path = args[0] - return ( - ObjectInfo(path=path, object_type=ObjectType.FILE) - if path.startswith("leaf") - else ObjectInfo(path=path, language=Language.PYTHON, object_type=ObjectType.NOTEBOOK) - ) - - ws = create_autospec(WorkspaceClient) - ws.workspace.download.side_effect = lambda *args, **kwargs: _download_side_effect(sources, visited, *args, **kwargs) - ws.workspace.get_status.side_effect = get_status_side_effect + file_loader = _local_loader_with_side_effects(FileLoader, sources, visited) whi = whitelist_mock() - file_loader = create_autospec(LocalFileLoader) - file_loader.is_notebook.return_value = False - file_loader.load_dependency.side_effect = lambda *args, **kwargs: _load_dependency_side_effect( - sources, visited, *args - ) site_packages = SitePackages.parse(locate_site_packages()) - builder = DependencyGraphBuilder(DependencyResolver(whi, site_packages, file_loader, WorkspaceNotebookLoader(ws))) - builder.build_local_file_dependency_graph(Path("root8.py.txt")) + provider = SysPathProvider.from_pathlike_string("") + notebook_loader = LocalNotebookLoader(provider) + dependency_resolver = DependencyResolver( + [ + NotebookResolver(notebook_loader), + SitePackagesResolver(site_packages, file_loader, provider), + WhitelistResolver(whi), + LocalFileResolver(file_loader), + ] + ) + builder = DependencyGraphBuilder(dependency_resolver) + builder.build_local_file_dependency_graph(Path(paths[0])) assert len(visited) == len(paths) @@ -110,17 +118,28 @@ def get_status_side_effect(*args): return ObjectInfo(object_type=ObjectType.NOTEBOOK, language=Language.PYTHON, path=path) ws = create_autospec(WorkspaceClient) + ws.workspace.is_notebook.return_value = True ws.workspace.download.side_effect = lambda *args, **kwargs: _download_side_effect(sources, {}, *args, **kwargs) ws.workspace.get_status.side_effect = get_status_side_effect whi = whitelist_mock() - file_loader = create_autospec(LocalFileLoader) + file_loader = create_autospec(FileLoader) file_loader.is_notebook.return_value = False site_packages = SitePackages.parse(locate_site_packages()) - builder = DependencyGraphBuilder(DependencyResolver(whi, site_packages, file_loader, WorkspaceNotebookLoader(ws))) + provider = SysPathProvider.from_pathlike_string("") + notebook_loader = WorkspaceNotebookLoader(ws) + dependency_resolver = DependencyResolver( + [ + NotebookResolver(notebook_loader), + SitePackagesResolver(site_packages, file_loader, provider), + WhitelistResolver(whi), + LocalFileResolver(file_loader), + ] + ) + builder = DependencyGraphBuilder(dependency_resolver) builder.build_notebook_dependency_graph(Path("root1.run.py.txt")) - assert builder.problems == [ + assert list(builder.problems) == [ DependencyProblem( - 'dependency-check', 'Notebook not found: leaf2.py.txt', Path("root1.run.py.txt"), 19, 0, 19, 21 + 'notebook-not-found', 'Notebook not found: leaf2.py.txt', Path("root1.run.py.txt"), 19, 0, 19, 21 ) ] @@ -134,19 +153,28 @@ def is_file_side_effect(*args): path = args[0] return path.as_posix() in paths - file_loader = create_autospec(LocalFileLoader) - file_loader.is_file.side_effect = is_file_side_effect + file_loader = create_autospec(FileLoader) + file_loader.exists.side_effect = is_file_side_effect file_loader.is_notebook.side_effect = is_file_side_effect file_loader.load_dependency.side_effect = lambda *args: _load_dependency_side_effect(sources, {}, *args) notebook_loader = create_autospec(LocalNotebookLoader) - notebook_loader.is_file.side_effect = is_file_side_effect + notebook_loader.exists.side_effect = is_file_side_effect notebook_loader.is_notebook.side_effect = is_file_side_effect notebook_loader.load_dependency.side_effect = lambda *args: _load_dependency_side_effect(sources, {}, *args) site_packages = SitePackages.parse(locate_site_packages()) - builder = DependencyGraphBuilder(DependencyResolver(whi, site_packages, file_loader, notebook_loader)) + provider = SysPathProvider.from_pathlike_string("") + dependency_resolver = DependencyResolver( + [ + NotebookResolver(notebook_loader), + SitePackagesResolver(site_packages, file_loader, provider), + WhitelistResolver(whi), + LocalFileResolver(file_loader), + ] + ) + builder = DependencyGraphBuilder(dependency_resolver) builder.build_notebook_dependency_graph(Path(paths[0])) - assert builder.problems == [ - DependencyProblem('dependency-check', 'Notebook not found: leaf3.py.txt', Path(paths[0]), 1, 0, 1, 38) + assert list(builder.problems) == [ + DependencyProblem('notebook-not-found', 'Notebook not found: leaf3.py.txt', Path(paths[0]), 1, 0, 1, 38) ] @@ -159,20 +187,29 @@ def is_file_side_effect(*args): path = args[0] return path.as_posix() in paths - file_loader = create_autospec(LocalFileLoader) - file_loader.is_file.side_effect = is_file_side_effect + file_loader = create_autospec(FileLoader) + file_loader.exists.side_effect = is_file_side_effect file_loader.is_notebook.side_effect = is_file_side_effect file_loader.load_dependency.side_effect = lambda *args: _load_dependency_side_effect(sources, {}, *args) notebook_loader = create_autospec(LocalNotebookLoader) - notebook_loader.is_file.side_effect = is_file_side_effect + notebook_loader.exists.side_effect = is_file_side_effect notebook_loader.is_notebook.side_effect = is_file_side_effect notebook_loader.load_dependency.side_effect = lambda *args: _load_dependency_side_effect(sources, {}, *args) site_packages = SitePackages.parse(locate_site_packages()) - builder = DependencyGraphBuilder(DependencyResolver(whi, site_packages, file_loader, notebook_loader)) + provider = SysPathProvider.from_pathlike_string("") + dependency_resolver = DependencyResolver( + [ + NotebookResolver(notebook_loader), + SitePackagesResolver(site_packages, file_loader, provider), + WhitelistResolver(whi), + LocalFileResolver(file_loader), + ] + ) + builder = DependencyGraphBuilder(dependency_resolver) builder.build_notebook_dependency_graph(Path(paths[0])) - assert builder.problems == [ + assert list(builder.problems) == [ DependencyProblem( - 'dependency-check', "Can't check dependency not provided as a constant", Path(paths[0]), 2, 0, 2, 35 + 'dependency-not-constant', "Can't check dependency not provided as a constant", Path(paths[0]), 2, 0, 2, 35 ) ] @@ -186,38 +223,70 @@ def test_dependency_graph_builder_raises_problem_with_invalid_run_cell(): object_type=ObjectType.NOTEBOOK, language=Language.PYTHON, path=paths[0] ) whi = whitelist_mock() - file_loader = create_autospec(LocalFileLoader) + file_loader = create_autospec(FileLoader) file_loader.is_notebook.return_value = False site_packages = SitePackages.parse(locate_site_packages()) - builder = DependencyGraphBuilder(DependencyResolver(whi, site_packages, file_loader, WorkspaceNotebookLoader(ws))) + provider = SysPathProvider.from_pathlike_string("") + notebook_loader = WorkspaceNotebookLoader(ws) + dependency_resolver = DependencyResolver( + [ + NotebookResolver(notebook_loader), + SitePackagesResolver(site_packages, file_loader, provider), + WhitelistResolver(whi), + LocalFileResolver(file_loader), + ] + ) + builder = DependencyGraphBuilder(dependency_resolver) builder.build_notebook_dependency_graph(Path(paths[0])) - assert builder.problems == [ - DependencyProblem('dependency-check', 'Missing notebook path in %run command', Path("leaf6.py.txt"), 5, 0, 5, 4) + assert list(builder.problems) == [ + DependencyProblem('invalid-run-cell', 'Missing notebook path in %run command', Path("leaf6.py.txt"), 5, 0, 5, 4) ] -def test_dependency_graph_builder_raises_problem_with_unresolved_import(): - paths = ["root7.py.txt"] +def test_dependency_graph_builder_visits_recursive_file_dependencies(): + paths = ["root6.py.txt", "root5.py.txt", "leaf4.py.txt"] sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) visited: dict[str, bool] = {} whi = whitelist_mock() + file_loader = _local_loader_with_side_effects(FileLoader, sources, visited) + site_packages = SitePackages.parse(locate_site_packages()) + provider = SysPathProvider.from_pathlike_string("") + notebook_loader = LocalNotebookLoader(provider) + dependency_resolver = DependencyResolver( + [ + NotebookResolver(notebook_loader), + SitePackagesResolver(site_packages, file_loader, provider), + WhitelistResolver(whi), + LocalFileResolver(file_loader), + ] + ) + builder = DependencyGraphBuilder(dependency_resolver) + builder.build_local_file_dependency_graph(Path("root6.py.txt")) + assert len(visited) == len(paths) - def is_file_side_effect(*args): - path = args[0] - return str(path) in paths - file_loader = create_autospec(LocalFileLoader) - file_loader.load_dependency.side_effect = lambda *args, **kwargs: _load_dependency_side_effect( - sources, visited, *args - ) - file_loader.is_file.side_effect = is_file_side_effect - file_loader.is_notebook.return_value = False +def test_dependency_graph_builder_raises_problem_with_unresolved_import(): + paths = ["root7.py.txt"] + sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) + visited: dict[str, bool] = {} + whi = whitelist_mock() + file_loader = _local_loader_with_side_effects(FileLoader, sources, visited) site_packages = SitePackages.parse(locate_site_packages()) - builder = DependencyGraphBuilder(DependencyResolver(whi, site_packages, file_loader, LocalNotebookLoader())) + provider = SysPathProvider.from_pathlike_string("") + notebook_loader = LocalNotebookLoader(provider) + dependency_resolver = DependencyResolver( + [ + NotebookResolver(notebook_loader), + SitePackagesResolver(site_packages, file_loader, provider), + WhitelistResolver(whi), + LocalFileResolver(file_loader), + ] + ) + builder = DependencyGraphBuilder(dependency_resolver) builder.build_local_file_dependency_graph(Path(paths[0])) - assert builder.problems == [ + assert list(builder.problems) == [ DependencyProblem( - 'dependency-check', 'Could not locate import: some_library', Path("root7.py.txt"), 1, 0, 1, 19 + 'import-not-found', 'Could not locate import: some_library', Path("root7.py.txt"), 1, 0, 1, 19 ) ] @@ -227,23 +296,29 @@ def test_dependency_graph_builder_raises_problem_with_non_constant_notebook_argu sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) visited: dict[str, bool] = {} whi = Whitelist() - - def is_file_side_effect(*args): - path = args[0] - return str(path) in paths - - file_loader = create_autospec(LocalFileLoader) - file_loader.load_dependency.side_effect = lambda *args, **kwargs: _load_dependency_side_effect( - sources, visited, *args - ) - file_loader.is_file.side_effect = is_file_side_effect - file_loader.is_notebook.return_value = False + file_loader = _local_loader_with_side_effects(FileLoader, sources, visited) site_packages = SitePackages.parse(locate_site_packages()) - builder = DependencyGraphBuilder(DependencyResolver(whi, site_packages, file_loader, LocalNotebookLoader())) + provider = SysPathProvider.from_pathlike_string("") + notebook_loader = LocalNotebookLoader(provider) + dependency_resolver = DependencyResolver( + [ + NotebookResolver(notebook_loader), + SitePackagesResolver(site_packages, file_loader, provider), + WhitelistResolver(whi), + LocalFileResolver(file_loader), + ] + ) + builder = DependencyGraphBuilder(dependency_resolver) builder.build_local_file_dependency_graph(Path(paths[0])) - assert builder.problems == [ + assert list(builder.problems) == [ DependencyProblem( - 'dependency-check', "Can't check dependency not provided as a constant", Path(paths[0]), 14, 13, 14, 50 + 'dependency-not-constant', + "Can't check dependency not provided as a constant", + Path(paths[0]), + 14, + 13, + 14, + 50, ) ] @@ -253,53 +328,47 @@ def test_dependency_graph_builder_visits_file_dependencies(): sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) visited: dict[str, bool] = {} whi = whitelist_mock() - file_loader = create_autospec(LocalFileLoader) - file_loader.load_dependency.side_effect = lambda *args, **kwargs: _load_dependency_side_effect( - sources, visited, *args - ) - file_loader.is_file.return_value = True - file_loader.is_notebook.return_value = False + file_loader = _local_loader_with_side_effects(FileLoader, sources, visited) site_packages = SitePackages.parse(locate_site_packages()) - builder = DependencyGraphBuilder(DependencyResolver(whi, site_packages, file_loader, LocalNotebookLoader())) - builder.build_local_file_dependency_graph(Path("root5.py.txt")) - assert len(visited) == len(paths) - - -def test_dependency_graph_builder_visits_recursive_file_dependencies(): - paths = ["root6.py.txt", "root5.py.txt", "leaf4.py.txt"] - sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) - visited: dict[str, bool] = {} - whi = whitelist_mock() - file_loader = create_autospec(LocalFileLoader) - file_loader.load_dependency.side_effect = lambda *args, **kwargs: _load_dependency_side_effect( - sources, visited, *args + provider = SysPathProvider.from_pathlike_string("") + notebook_loader = LocalNotebookLoader(provider) + dependency_resolver = DependencyResolver( + [ + NotebookResolver(notebook_loader), + SitePackagesResolver(site_packages, file_loader, provider), + WhitelistResolver(whi), + LocalFileResolver(file_loader), + ] ) - file_loader.is_file.return_value = True - file_loader.is_notebook.return_value = False - site_packages = SitePackages.parse(locate_site_packages()) - builder = DependencyGraphBuilder(DependencyResolver(whi, site_packages, file_loader, LocalNotebookLoader())) - builder.build_local_file_dependency_graph(Path("root6.py.txt")) + builder = DependencyGraphBuilder(dependency_resolver) + builder.build_local_file_dependency_graph(Path("root5.py.txt")) assert len(visited) == len(paths) -def test_dependency_graph_builder_ignores_builtin_dependencies(): +def test_dependency_graph_builder_skips_builtin_dependencies(): paths = ["python_builtins.py.txt"] sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) whi = Whitelist() - - def is_file_side_effect(*args): - path = args[0] - return str(path) in paths - - file_loader = create_autospec(LocalFileLoader) - file_loader.is_file.side_effect = is_file_side_effect - file_loader.is_notebook.return_value = False - file_loader.load_dependency.side_effect = lambda *args, **kwargs: _load_dependency_side_effect(sources, {}, *args) + file_loader = _local_loader_with_side_effects(FileLoader, sources, {}) site_packages = SitePackages.parse(locate_site_packages()) - builder = DependencyGraphBuilder(DependencyResolver(whi, site_packages, file_loader, LocalNotebookLoader())) + provider = SysPathProvider.from_pathlike_string("") + notebook_loader = LocalNotebookLoader(provider) + dependency_resolver = DependencyResolver( + [ + NotebookResolver(notebook_loader), + SitePackagesResolver(site_packages, file_loader, provider), + WhitelistResolver(whi), + LocalFileResolver(file_loader), + ] + ) + builder = DependencyGraphBuilder(dependency_resolver) graph = builder.build_local_file_dependency_graph(Path("python_builtins.py.txt")) - assert not graph.locate_dependency(Path("os")) - assert not graph.locate_dependency(Path("path")) + child = graph.locate_dependency(Path("os")) + assert child + assert len(child.local_dependencies) == 0 + child = graph.locate_dependency(Path("pathlib")) + assert child + assert len(child.local_dependencies) == 0 def test_dependency_graph_builder_ignores_known_dependencies(): @@ -307,17 +376,19 @@ def test_dependency_graph_builder_ignores_known_dependencies(): sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) datas = _load_sources(SourceContainer, "sample-python-compatibility-catalog.yml") whitelist = Whitelist.parse(datas[0]) - - def is_file_side_effect(*args): - path = args[0] - return str(path) in paths - - file_loader = create_autospec(LocalFileLoader) - file_loader.is_file.side_effect = is_file_side_effect - file_loader.is_notebook.return_value = False - file_loader.load_dependency.side_effect = lambda *args, **kwargs: _load_dependency_side_effect(sources, {}, *args) + file_loader = _local_loader_with_side_effects(FileLoader, sources, {}) site_packages = SitePackages.parse(locate_site_packages()) - builder = DependencyGraphBuilder(DependencyResolver(whitelist, site_packages, file_loader, LocalNotebookLoader())) + provider = SysPathProvider.from_pathlike_string("") + notebook_loader = LocalNotebookLoader(provider) + dependency_resolver = DependencyResolver( + [ + NotebookResolver(notebook_loader), + SitePackagesResolver(site_packages, file_loader, provider), + WhitelistResolver(whitelist), + LocalFileResolver(file_loader), + ] + ) + builder = DependencyGraphBuilder(dependency_resolver) graph = builder.build_local_file_dependency_graph(Path("python_builtins.py.txt")) assert not graph.locate_dependency(Path("databricks")) @@ -327,33 +398,43 @@ def test_dependency_graph_builder_visits_site_packages(empty_index): whitelist = Whitelist.parse(datas[0]) paths = ["import-site-package.py.txt"] sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) - file_loader = create_autospec(LocalFileLoader) - - def is_file_side_effect(*args): - path = args[0] - filename = path.as_posix() - return filename in paths or os.path.isfile(filename) - - file_loader.is_file.side_effect = is_file_side_effect - file_loader.is_notebook.return_value = False - file_loader.load_dependency.side_effect = lambda *args, **kwargs: _load_dependency_side_effect(sources, {}, *args) - site_packages = SitePackages.parse(locate_site_packages()) - builder = DependencyGraphBuilder(DependencyResolver(whitelist, site_packages, file_loader, LocalNotebookLoader())) + provider = SysPathProvider.from_pathlike_string(_samples_path(SourceContainer)) + file_loader = TestFileLoader(provider, sources) + site_packages_path = locate_site_packages() + site_packages = SitePackages.parse(site_packages_path) + notebook_loader = LocalNotebookLoader(provider) + dependency_resolver = DependencyResolver( + [ + NotebookResolver(notebook_loader), + SitePackagesResolver(site_packages, file_loader, provider), + WhitelistResolver(whitelist), + LocalFileResolver(file_loader), + ] + ) + builder = DependencyGraphBuilder(dependency_resolver) graph = builder.build_local_file_dependency_graph(Path("import-site-package.py.txt")) - assert graph.locate_dependency(Path("certifi/core.py")) + assert graph.locate_dependency(Path(site_packages_path, "certifi/core.py")) + assert not graph.locate_dependency(Path("core.py")) + assert not graph.locate_dependency(Path("core")) def test_dependency_graph_builder_raises_problem_with_unfound_root_file(empty_index): - sources: dict[str, str] = {} - ws = create_autospec(WorkspaceClient) - ws.workspace.download.side_effect = lambda *args, **kwargs: _download_side_effect(sources, {}, *args, **kwargs) - ws.workspace.get_status.return_value = None site_packages = SitePackages.parse(locate_site_packages()) whi = whitelist_mock() - file_loader = create_autospec(LocalFileLoader) - builder = DependencyGraphBuilder(DependencyResolver(whi, site_packages, file_loader, LocalNotebookLoader())) + file_loader = create_autospec(FileLoader) + provider = SysPathProvider.from_pathlike_string("") + notebook_loader = LocalNotebookLoader(provider) + dependency_resolver = DependencyResolver( + [ + NotebookResolver(notebook_loader), + SitePackagesResolver(site_packages, file_loader, provider), + WhitelistResolver(whi), + LocalFileResolver(file_loader), + ] + ) + builder = DependencyGraphBuilder(dependency_resolver) builder.build_local_file_dependency_graph(Path("root1.run.py.txt")) - assert builder.problems == [DependencyProblem('dependency-check', 'File not found: root1.run.py.txt')] + assert list(builder.problems) == [DependencyProblem('file-not-found', 'File not found: root1.run.py.txt')] file_loader.load_dependency.assert_not_called() @@ -364,10 +445,19 @@ def test_dependency_graph_builder_raises_problem_with_unfound_root_notebook(empt ws.workspace.get_status.return_value = None site_packages = SitePackages.parse(locate_site_packages()) whi = whitelist_mock() - file_loader = create_autospec(LocalFileLoader) + file_loader = create_autospec(FileLoader) notebook_loader = create_autospec(LocalNotebookLoader) notebook_loader.is_notebook.return_value = False - builder = DependencyGraphBuilder(DependencyResolver(whi, site_packages, file_loader, notebook_loader)) + provider = SysPathProvider.from_pathlike_string("") + dependency_resolver = DependencyResolver( + [ + NotebookResolver(notebook_loader), + SitePackagesResolver(site_packages, file_loader, provider), + WhitelistResolver(whi), + LocalFileResolver(file_loader), + ] + ) + builder = DependencyGraphBuilder(dependency_resolver) builder.build_notebook_dependency_graph(Path("root2.run.py.txt")) - assert builder.problems == [DependencyProblem('dependency-check', 'Notebook not found: root2.run.py.txt')] + assert list(builder.problems) == [DependencyProblem('notebook-not-found', 'Notebook not found: root2.run.py.txt')] file_loader.load_dependency.assert_not_called() diff --git a/tests/unit/source_code/test_notebook.py b/tests/unit/source_code/test_notebook.py index 7b4ca0ab94..ac6c0868b0 100644 --- a/tests/unit/source_code/test_notebook.py +++ b/tests/unit/source_code/test_notebook.py @@ -7,15 +7,11 @@ from databricks.sdk import WorkspaceClient from databricks.labs.ucx.source_code.base import Advisory -from databricks.labs.ucx.source_code.dependencies import ( - DependencyGraph, -) -from databricks.labs.ucx.source_code.dependency_loaders import SourceContainer, LocalFileLoader, WorkspaceNotebookLoader -from databricks.labs.ucx.source_code.dependency_resolvers import DependencyResolver -from databricks.labs.ucx.source_code.notebook import Notebook +from databricks.labs.ucx.source_code.graph import DependencyGraph, SourceContainer, DependencyResolver +from databricks.labs.ucx.source_code.notebooks.sources import Notebook +from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, WorkspaceNotebookLoader from databricks.labs.ucx.source_code.python_linter import PythonLinter -from databricks.labs.ucx.source_code.site_packages import SitePackages -from tests.unit import _load_sources, _download_side_effect, whitelist_mock, locate_site_packages +from tests.unit import _load_sources, _download_side_effect # fmt: off @@ -137,15 +133,17 @@ def test_notebook_builds_leaf_dependency_graph(): ws.workspace.get_status.return_value = ObjectInfo( object_type=ObjectType.NOTEBOOK, path="leaf1.py.txt", language=Language.PYTHON ) - loader = create_autospec(LocalFileLoader) - loader.is_notebook.return_value = True - site_packages = SitePackages.parse(locate_site_packages()) - resolver = DependencyResolver(whitelist_mock(), site_packages, loader, WorkspaceNotebookLoader(ws)) - dependency = resolver.resolve_notebook(Path(paths[0])) - graph = DependencyGraph(dependency, None, resolver) + notebook_loader = WorkspaceNotebookLoader(ws) + dependency_resolver = DependencyResolver( + [ + NotebookResolver(notebook_loader), + ] + ) + dependency = dependency_resolver.resolve_notebook(Path(paths[0])) + graph = DependencyGraph(dependency, None, dependency_resolver) container = dependency.load() container.build_dependency_graph(graph) - assert {str(path) for path in graph.paths} == {"leaf1.py.txt"} + assert {str(path) for path in graph.all_paths} == {"leaf1.py.txt"} def get_status_side_effect(*args): @@ -159,15 +157,17 @@ def test_notebook_builds_depth1_dependency_graph(): ws = create_autospec(WorkspaceClient) ws.workspace.download.side_effect = lambda *args, **kwargs: _download_side_effect(sources, {}, *args, **kwargs) ws.workspace.get_status.side_effect = get_status_side_effect - loader = create_autospec(LocalFileLoader) - loader.is_notebook.return_value = False - site_packages = SitePackages.parse(locate_site_packages()) - resolver = DependencyResolver(whitelist_mock(), site_packages, loader, WorkspaceNotebookLoader(ws)) - dependency = resolver.resolve_notebook(Path(paths[0])) - graph = DependencyGraph(dependency, None, resolver) + notebook_loader = WorkspaceNotebookLoader(ws) + dependency_resolver = DependencyResolver( + [ + NotebookResolver(notebook_loader), + ] + ) + dependency = dependency_resolver.resolve_notebook(Path(paths[0])) + graph = DependencyGraph(dependency, None, dependency_resolver) container = dependency.load() container.build_dependency_graph(graph) - actual = {path[2:] if path.startswith('./') else path for path in (str(path) for path in graph.paths)} + actual = {path[2:] if path.startswith('./') else path for path in (str(path) for path in graph.all_paths)} assert actual == set(paths) @@ -177,15 +177,17 @@ def test_notebook_builds_depth2_dependency_graph(): ws = create_autospec(WorkspaceClient) ws.workspace.download.side_effect = lambda *args, **kwargs: _download_side_effect(sources, {}, *args, **kwargs) ws.workspace.get_status.side_effect = get_status_side_effect - loader = create_autospec(LocalFileLoader) - loader.is_notebook.return_value = False - site_packages = SitePackages.parse(locate_site_packages()) - resolver = DependencyResolver(whitelist_mock(), site_packages, loader, WorkspaceNotebookLoader(ws)) - dependency = resolver.resolve_notebook(Path(paths[0])) - graph = DependencyGraph(dependency, None, resolver) + notebook_loader = WorkspaceNotebookLoader(ws) + dependency_resolver = DependencyResolver( + [ + NotebookResolver(notebook_loader), + ] + ) + dependency = dependency_resolver.resolve_notebook(Path(paths[0])) + graph = DependencyGraph(dependency, None, dependency_resolver) container = dependency.load() container.build_dependency_graph(graph) - actual = {path[2:] if path.startswith('./') else path for path in (str(path) for path in graph.paths)} + actual = {path[2:] if path.startswith('./') else path for path in (str(path) for path in graph.all_paths)} assert actual == set(paths) @@ -196,12 +198,14 @@ def test_notebook_builds_dependency_graph_avoiding_duplicates(): ws = create_autospec(WorkspaceClient) ws.workspace.download.side_effect = lambda *args, **kwargs: _download_side_effect(sources, visited, *args, **kwargs) ws.workspace.get_status.side_effect = get_status_side_effect - loader = create_autospec(LocalFileLoader) - loader.is_notebook.return_value = True - site_packages = SitePackages.parse(locate_site_packages()) - resolver = DependencyResolver(whitelist_mock(), site_packages, loader, WorkspaceNotebookLoader(ws)) - dependency = resolver.resolve_notebook(Path(paths[0])) - graph = DependencyGraph(dependency, None, resolver) + notebook_loader = WorkspaceNotebookLoader(ws) + dependency_resolver = DependencyResolver( + [ + NotebookResolver(notebook_loader), + ] + ) + dependency = dependency_resolver.resolve_notebook(Path(paths[0])) + graph = DependencyGraph(dependency, None, dependency_resolver) container = dependency.load() container.build_dependency_graph(graph) # if visited once only, set and list will have same len @@ -215,15 +219,17 @@ def test_notebook_builds_cyclical_dependency_graph(): ws = create_autospec(WorkspaceClient) ws.workspace.download.side_effect = lambda *args, **kwargs: _download_side_effect(sources, {}, *args, **kwargs) ws.workspace.get_status.side_effect = get_status_side_effect - loader = create_autospec(LocalFileLoader) - loader.is_notebook.return_value = False - site_packages = SitePackages.parse(locate_site_packages()) - resolver = DependencyResolver(whitelist_mock(), site_packages, loader, WorkspaceNotebookLoader(ws)) - dependency = resolver.resolve_notebook(Path(paths[0])) - graph = DependencyGraph(dependency, None, resolver) + notebook_loader = WorkspaceNotebookLoader(ws) + dependency_resolver = DependencyResolver( + [ + NotebookResolver(notebook_loader), + ] + ) + dependency = dependency_resolver.resolve_notebook(Path(paths[0])) + graph = DependencyGraph(dependency, None, dependency_resolver) container = dependency.load() container.build_dependency_graph(graph) - actual = {path[2:] if path.startswith('./') else path for path in (str(path) for path in graph.paths)} + actual = {path[2:] if path.startswith('./') else path for path in (str(path) for path in graph.all_paths)} assert actual == set(paths) @@ -233,15 +239,17 @@ def test_notebook_builds_python_dependency_graph(): ws = create_autospec(WorkspaceClient) ws.workspace.download.side_effect = lambda *args, **kwargs: _download_side_effect(sources, {}, *args, **kwargs) ws.workspace.get_status.side_effect = get_status_side_effect - loader = create_autospec(LocalFileLoader) - loader.is_notebook.return_value = False - site_packages = SitePackages.parse(locate_site_packages()) - resolver = DependencyResolver(whitelist_mock(), site_packages, loader, WorkspaceNotebookLoader(ws)) - dependency = resolver.resolve_notebook(Path(paths[0])) - graph = DependencyGraph(dependency, None, resolver) + notebook_loader = WorkspaceNotebookLoader(ws) + dependency_resolver = DependencyResolver( + [ + NotebookResolver(notebook_loader), + ] + ) + dependency = dependency_resolver.resolve_notebook(Path(paths[0])) + graph = DependencyGraph(dependency, None, dependency_resolver) container = dependency.load() container.build_dependency_graph(graph) - actual = {path[2:] if path.startswith('./') else path for path in (str(path) for path in graph.paths)} + actual = {path[2:] if path.startswith('./') else path for path in (str(path) for path in graph.all_paths)} assert actual == set(paths) diff --git a/tests/unit/source_code/test_notebook_linter.py b/tests/unit/source_code/test_notebook_linter.py index e95359f2b8..b2d1a568df 100644 --- a/tests/unit/source_code/test_notebook_linter.py +++ b/tests/unit/source_code/test_notebook_linter.py @@ -3,7 +3,7 @@ from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex from databricks.labs.ucx.source_code import languages from databricks.labs.ucx.source_code.base import Deprecation, Advisory, Advice -from databricks.labs.ucx.source_code.notebook_linter import NotebookLinter +from databricks.labs.ucx.source_code.notebooks.sources import NotebookLinter index = MigrationIndex([]) diff --git a/tests/unit/source_code/test_notebook_migrator.py b/tests/unit/source_code/test_notebook_migrator.py index 1914aacd29..4b357c9593 100644 --- a/tests/unit/source_code/test_notebook_migrator.py +++ b/tests/unit/source_code/test_notebook_migrator.py @@ -4,12 +4,10 @@ from databricks.sdk.service.workspace import ExportFormat, Language, ObjectInfo, ObjectType from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex -from databricks.labs.ucx.source_code.dependencies import ( - Dependency, -) +from databricks.labs.ucx.source_code.graph import Dependency from databricks.labs.ucx.source_code.languages import Languages -from databricks.labs.ucx.source_code.notebook import Notebook -from databricks.labs.ucx.source_code.notebook_migrator import NotebookMigrator +from databricks.labs.ucx.source_code.notebooks.sources import Notebook +from databricks.labs.ucx.source_code.notebooks.migrator import NotebookMigrator def test_apply_invalid_object_fails(): diff --git a/tests/unit/source_code/test_s3fs.py b/tests/unit/source_code/test_s3fs.py index 6c7d0a8107..e101675e0c 100644 --- a/tests/unit/source_code/test_s3fs.py +++ b/tests/unit/source_code/test_s3fs.py @@ -1,17 +1,19 @@ from pathlib import Path -from unittest.mock import create_autospec import pytest -from databricks.labs.ucx.source_code.dependencies import ( - DependencyGraphBuilder, +from databricks.labs.ucx.source_code.graph import ( + SourceContainer, + DependencyResolver, DependencyProblem, + DependencyGraphBuilder, +) +from databricks.labs.ucx.source_code.files import FileLoader, LocalFileResolver +from databricks.labs.ucx.source_code.whitelist import WhitelistResolver, Whitelist +from tests.unit import ( + _load_sources, + _local_loader_with_side_effects, ) -from databricks.labs.ucx.source_code.dependency_loaders import SourceContainer, LocalFileLoader, LocalNotebookLoader -from databricks.labs.ucx.source_code.dependency_resolvers import DependencyResolver -from databricks.labs.ucx.source_code.site_packages import SitePackages -from databricks.labs.ucx.source_code.whitelist import Whitelist -from tests.unit import _load_sources, _load_dependency_side_effect, locate_site_packages S3FS_DEPRECATION_MESSAGE = "Use of dependency s3fs is deprecated" @@ -108,20 +110,21 @@ def test_detect_s3fs_import(empty_index, source: str, expected: list[DependencyP datas = _load_sources(SourceContainer, "s3fs-python-compatibility-catalog.yml") whitelist = Whitelist.parse(datas[0]) sources = {"path": source} - file_loader = create_autospec(LocalFileLoader) - file_loader.load_dependency.side_effect = lambda *args, **kwargs: _load_dependency_side_effect(sources, {}, *args) - file_loader.is_file.return_value = True - file_loader.is_notebook.return_value = False - site_packages = SitePackages.parse(locate_site_packages()) - resolver = DependencyResolver(whitelist, site_packages, file_loader, LocalNotebookLoader()) - builder = DependencyGraphBuilder(resolver) + file_loader = _local_loader_with_side_effects(FileLoader, sources, {}) + dependency_resolver = DependencyResolver( + [ + WhitelistResolver(whitelist), + LocalFileResolver(file_loader), + ] + ) + builder = DependencyGraphBuilder(dependency_resolver) builder.build_local_file_dependency_graph(Path("path")) - problems: list[DependencyProblem] = resolver.problems + problems: list[DependencyProblem] = list(dependency_resolver.problems) assert problems == expected @pytest.mark.parametrize( - " expected", + "expected", ( [ DependencyProblem( @@ -140,13 +143,14 @@ def test_detect_s3fs_import_in_dependencies(empty_index, expected: list[Dependen sources: dict[str, str] = dict(zip(paths, _load_sources(SourceContainer, *paths))) datas = _load_sources(SourceContainer, "s3fs-python-compatibility-catalog.yml") whitelist = Whitelist.parse(datas[0]) - file_loader = create_autospec(LocalFileLoader) - file_loader.load_dependency.side_effect = lambda *args, **kwargs: _load_dependency_side_effect(sources, {}, *args) - file_loader.is_file.return_value = True - file_loader.is_notebook.return_value = False - site_packages = SitePackages.parse(locate_site_packages()) - resolver = DependencyResolver(whitelist, site_packages, file_loader, LocalNotebookLoader()) - builder = DependencyGraphBuilder(resolver) + file_loader = _local_loader_with_side_effects(FileLoader, sources, {}) + dependency_resolver = DependencyResolver( + [ + WhitelistResolver(whitelist), + LocalFileResolver(file_loader), + ] + ) + builder = DependencyGraphBuilder(dependency_resolver) builder.build_local_file_dependency_graph(Path("root9.py.txt")) - problems: list[DependencyProblem] = resolver.problems + problems: list[DependencyProblem] = list(dependency_resolver.problems) assert problems == expected diff --git a/tests/unit/source_code/test_syspath_provider.py b/tests/unit/source_code/test_syspath_provider.py new file mode 100644 index 0000000000..7780cb7316 --- /dev/null +++ b/tests/unit/source_code/test_syspath_provider.py @@ -0,0 +1,48 @@ +from pathlib import Path + +from databricks.labs.ucx.source_code.files import SysPathProvider + + +def test_provider_is_initialized_with_syspath(): + provider = SysPathProvider.from_sys_path() + assert provider is not None + paths = list(provider.paths) + filtered = list(filter(lambda path: "ucx" in path.as_posix(), paths)) + assert len(filtered) > 0 + + +def test_provider_is_initialized_with_handmade_string(): + provider = SysPathProvider.from_pathlike_string("what:on:earth") + assert provider is not None + paths = list(provider.paths) + assert ["what", "on", "earth"] == [path.as_posix() for path in paths] + + +def test_provider_pushes(): + provider = SysPathProvider.from_pathlike_string("what:on:earth") + provider.push(Path("is")) + provider.push(Path("this")) + paths = list(provider.paths) + assert [path.as_posix() for path in paths] == ["this", "is", "what", "on", "earth"] + + +def test_provider_inserts(): + provider = SysPathProvider.from_pathlike_string("what:on:earth") + provider.insert(1, Path("is")) + paths = list(provider.paths) + assert [path.as_posix() for path in paths] == ["what", "is", "on", "earth"] + + +def test_provider_removes(): + provider = SysPathProvider.from_pathlike_string("what:is:on:earth") + provider.remove(1) + paths = list(provider.paths) + assert [path.as_posix() for path in paths] == ["what", "on", "earth"] + + +def test_provider_pops(): + provider = SysPathProvider.from_pathlike_string("what:on:earth") + popped = provider.pop() + assert popped.as_posix() == "what" + paths = list(provider.paths) + assert [path.as_posix() for path in paths] == ["on", "earth"] diff --git a/tests/unit/source_code/test_whitelist.py b/tests/unit/source_code/test_whitelist.py index 4222203fbb..7cb151c987 100644 --- a/tests/unit/source_code/test_whitelist.py +++ b/tests/unit/source_code/test_whitelist.py @@ -1,5 +1,5 @@ -from databricks.labs.ucx.source_code.dependency_loaders import SourceContainer -from databricks.labs.ucx.source_code.whitelist import Whitelist, UCCompatibility +from databricks.labs.ucx.source_code.graph import SourceContainer +from databricks.labs.ucx.source_code.whitelist import UCCompatibility, Whitelist from tests.unit import _load_sources