Skip to content

Commit

Permalink
Allow-listing language update (#2101)
Browse files Browse the repository at this point in the history
## Changes

This PR updates the language we use for allow-listing functionality.
Some public APIs are affected, but I think the compatibility issues here
are acceptable.
  • Loading branch information
asnare authored Jul 9, 2024
1 parent a3f70b9 commit 2f02467
Show file tree
Hide file tree
Showing 17 changed files with 105 additions and 105 deletions.
4 changes: 2 additions & 2 deletions docs/assessment.md
Original file line number Diff line number Diff line change
Expand Up @@ -531,13 +531,13 @@ The Databricks ML Runtime is not supported on Shared Compute mode clusters. Reco

### AF301.1 - spark.catalog.x

The `spark.catalog.` pattern was found. Commonly used functions in spark.catalog, such as tableExists, listTables, setDefault catalog are not allowed/whitelisted on shared clusters due to security reasons. `spark.sql("<sql command>)` may be a better alternative. DBR 14.1 and above have made these commands available. Upgrade your DBR version.
The `spark.catalog.` pattern was found. Commonly used functions in spark.catalog, such as tableExists, listTables, setDefault catalog are not allowed on shared clusters due to security reasons. `spark.sql("<sql command>)` may be a better alternative. DBR 14.1 and above have made these commands available. Upgrade your DBR version.

[[back to top](#migration-assessment-report)]

### AF301.2 - spark.catalog.x (spark._jsparkSession.catalog)

The `spark._jsparkSession.catalog` pattern was found. Commonly used functions in spark.catalog, such as tableExists, listTables, setDefault catalog are not allowed/whitelisted on shared clusters due to security reasons. `spark.sql("<sql command>)` may be a better alternative. The corresponding `spark.catalog.x` methods may work on DBR 14.1 and above.
The `spark._jsparkSession.catalog` pattern was found. Commonly used functions in spark.catalog, such as tableExists, listTables, setDefault catalog are not allowed on shared clusters due to security reasons. `spark.sql("<sql command>)` may be a better alternative. The corresponding `spark.catalog.x` methods may work on DBR 14.1 and above.

[[back to top](#migration-assessment-report)]

Expand Down
10 changes: 5 additions & 5 deletions src/databricks/labs/ucx/contexts/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
from databricks.labs.ucx.source_code.linters.files import FileLoader, FolderLoader, ImportFileResolver
from databricks.labs.ucx.source_code.path_lookup import PathLookup
from databricks.labs.ucx.source_code.graph import DependencyResolver
from databricks.labs.ucx.source_code.known import Whitelist
from databricks.labs.ucx.source_code.known import KnownList
from databricks.labs.ucx.source_code.redash import Redash
from databricks.labs.ucx.workspace_access import generic, redash
from databricks.labs.ucx.workspace_access.groups import GroupManager
Expand Down Expand Up @@ -355,7 +355,7 @@ def verify_has_metastore(self):

@cached_property
def pip_resolver(self):
return PythonLibraryResolver(self.whitelist)
return PythonLibraryResolver(self.allow_list)

@cached_property
def notebook_loader(self) -> NotebookLoader:
Expand Down Expand Up @@ -384,12 +384,12 @@ def folder_loader(self):
return FolderLoader(self.file_loader)

@cached_property
def whitelist(self):
return Whitelist()
def allow_list(self):
return KnownList()

@cached_property
def file_resolver(self):
return ImportFileResolver(self.file_loader, self.whitelist)
return ImportFileResolver(self.file_loader, self.allow_list)

@cached_property
def dependency_resolver(self):
Expand Down
4 changes: 2 additions & 2 deletions src/databricks/labs/ucx/source_code/known.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def as_dict(self):
_DEFAULT_ENCODING = sys.getdefaultencoding()


class Whitelist:
class KnownList:
def __init__(self):
self._module_problems = collections.OrderedDict()
self._library_problems = collections.defaultdict(list)
Expand Down Expand Up @@ -237,4 +237,4 @@ def __repr__(self):

if __name__ == "__main__":
logger = get_logger(__file__) # this only works for __main__
Whitelist.rebuild(Path.cwd())
KnownList.rebuild(Path.cwd())
12 changes: 6 additions & 6 deletions src/databricks/labs/ucx/source_code/linters/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from databricks.labs.ucx.source_code.base import LocatedAdvice, CurrentSessionState
from databricks.labs.ucx.source_code.notebooks.sources import FileLinter, SUPPORTED_EXTENSION_LANGUAGES
from databricks.labs.ucx.source_code.path_lookup import PathLookup
from databricks.labs.ucx.source_code.known import Whitelist
from databricks.labs.ucx.source_code.known import KnownList
from databricks.sdk.service.workspace import Language
from databricks.labs.blueprint.tui import Prompts

Expand Down Expand Up @@ -246,9 +246,9 @@ def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> So

class ImportFileResolver(BaseImportResolver, BaseFileResolver):

def __init__(self, file_loader: FileLoader, whitelist: Whitelist):
def __init__(self, file_loader: FileLoader, allow_list: KnownList):
super().__init__()
self._whitelist = whitelist
self._allow_list = allow_list
self._file_loader = file_loader

def resolve_local_file(self, path_lookup, path: Path) -> MaybeDependency:
Expand All @@ -259,16 +259,16 @@ def resolve_local_file(self, path_lookup, path: Path) -> MaybeDependency:
return MaybeDependency(None, [problem])

def resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency:
maybe = self._resolve_whitelist(name)
maybe = self._resolve_allow_list(name)
if maybe is not None:
return maybe
maybe = self._resolve_import(path_lookup, name)
if maybe is not None:
return maybe
return self._fail('import-not-found', f"Could not locate import: {name}")

def _resolve_whitelist(self, name: str) -> MaybeDependency | None:
compatibility = self._whitelist.module_compatibility(name)
def _resolve_allow_list(self, name: str) -> MaybeDependency | None:
compatibility = self._allow_list.module_compatibility(name)
if not compatibility.known:
logger.debug(f"Resolving unknown import: {name}")
return None
Expand Down
12 changes: 6 additions & 6 deletions src/databricks/labs/ucx/source_code/python_libraries.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,26 +17,26 @@
DependencyProblem,
)
from databricks.labs.ucx.source_code.path_lookup import PathLookup
from databricks.labs.ucx.source_code.known import Whitelist
from databricks.labs.ucx.source_code.known import KnownList

logger = logging.getLogger(__name__)


class PythonLibraryResolver(LibraryResolver):
# TODO: https://github.com/databrickslabs/ucx/issues/1640

def __init__(self, whitelist: Whitelist, runner: Callable[[str], tuple[int, str, str]] = run_command) -> None:
self._whitelist = whitelist
def __init__(self, allow_list: KnownList, runner: Callable[[str], tuple[int, str, str]] = run_command) -> None:
self._allow_list = allow_list
self._runner = runner

def register_library(self, path_lookup: PathLookup, *libraries: str) -> list[DependencyProblem]:
"""We delegate to pip to install the library and augment the path look-up to resolve the library at import.
This gives us the flexibility to install any library that is not in the whitelist, and we don't have to
This gives us the flexibility to install any library that is not in the allow-list, and we don't have to
bother about parsing cross-version dependencies in our code."""
if len(libraries) == 0:
return []
if len(libraries) == 1: # Multiple libraries might be installation flags
compatibility = self._whitelist.distribution_compatibility(libraries[0])
compatibility = self._allow_list.distribution_compatibility(libraries[0])
if compatibility.known:
return compatibility.problems
return self._install_library(path_lookup, *libraries)
Expand Down Expand Up @@ -133,4 +133,4 @@ def _get_setup() -> Callable | None:
return setup

def __str__(self) -> str:
return f"PythonLibraryResolver(whitelist={self._whitelist})"
return f"PythonLibraryResolver(allow_list={self._allow_list})"
22 changes: 11 additions & 11 deletions tests/integration/source_code/test_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex
from databricks.labs.ucx.source_code.base import CurrentSessionState
from databricks.labs.ucx.source_code.known import UNKNOWN, Whitelist
from databricks.labs.ucx.source_code.known import UNKNOWN, KnownList
from databricks.labs.ucx.source_code.linters.files import LocalCodeLinter
from databricks.labs.ucx.source_code.linters.context import LinterContext
from databricks.labs.ucx.source_code.path_lookup import PathLookup
Expand Down Expand Up @@ -260,11 +260,11 @@ def test_workflow_linter_lints_job_with_missing_library(
make_directory,
):
expected_problem_message = "Could not locate import: databricks.labs.ucx"
whitelist = create_autospec(Whitelist) # databricks is in default list
whitelist.module_compatibility.return_value = UNKNOWN
allow_list = create_autospec(KnownList) # databricks is in default list
allow_list.module_compatibility.return_value = UNKNOWN

simple_ctx = simple_ctx.replace(
whitelist=whitelist,
allow_list=allow_list,
path_lookup=PathLookup(Path("/non/existing/path"), []), # Avoid finding current project
)

Expand All @@ -274,7 +274,7 @@ def test_workflow_linter_lints_job_with_missing_library(
problems = simple_ctx.workflow_linter.lint_job(job_without_ucx_library.job_id)

assert len([problem for problem in problems if problem.message == expected_problem_message]) > 0
whitelist.module_compatibility.assert_called_once_with("databricks.labs.ucx")
allow_list.module_compatibility.assert_called_once_with("databricks.labs.ucx")


def test_workflow_linter_lints_job_with_wheel_dependency(
Expand All @@ -288,7 +288,7 @@ def test_workflow_linter_lints_job_with_wheel_dependency(
expected_problem_message = "Could not locate import: databricks.labs.ucx"

simple_ctx = simple_ctx.replace(
whitelist=Whitelist(), # databricks is in default list
allow_list=KnownList(), # databricks is in default list
path_lookup=PathLookup(Path("/non/existing/path"), []), # Avoid finding current project
)

Expand Down Expand Up @@ -348,12 +348,12 @@ def test_job_spark_python_task_linter_unhappy_path(


def test_workflow_linter_lints_python_wheel_task(simple_ctx, ws, make_job, make_random):
whitelist = create_autospec(Whitelist) # databricks is in default list
whitelist.module_compatibility.return_value = UNKNOWN
whitelist.distribution_compatibility.return_value = UNKNOWN
allow_list = create_autospec(KnownList) # databricks is in default list
allow_list.module_compatibility.return_value = UNKNOWN
allow_list.distribution_compatibility.return_value = UNKNOWN

simple_ctx = simple_ctx.replace(
whitelist=whitelist,
allow_list=allow_list,
path_lookup=PathLookup(Path("/non/existing/path"), []), # Avoid finding current project
)

Expand All @@ -379,7 +379,7 @@ def test_workflow_linter_lints_python_wheel_task(simple_ctx, ws, make_job, make_

assert len([problem for problem in problems if problem.code == "library-dist-info-not-found"]) == 0
assert len([problem for problem in problems if problem.code == "library-entrypoint-not-found"]) == 0
whitelist.distribution_compatibility.assert_called_once()
allow_list.distribution_compatibility.assert_called_once()


def test_job_dlt_task_linter_unhappy_path(
Expand Down
8 changes: 4 additions & 4 deletions tests/unit/source_code/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
)
from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex
from databricks.labs.ucx.source_code.graph import DependencyResolver
from databricks.labs.ucx.source_code.known import Whitelist
from databricks.labs.ucx.source_code.known import KnownList
from databricks.labs.ucx.source_code.linters.files import ImportFileResolver, FileLoader
from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader, NotebookResolver
from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver
Expand Down Expand Up @@ -49,8 +49,8 @@ def extended_test_index():

@pytest.fixture
def simple_dependency_resolver(mock_path_lookup):
whitelist = Whitelist()
library_resolver = PythonLibraryResolver(whitelist)
allow_list = KnownList()
library_resolver = PythonLibraryResolver(allow_list)
notebook_resolver = NotebookResolver(NotebookLoader())
import_resolver = ImportFileResolver(FileLoader(), whitelist)
import_resolver = ImportFileResolver(FileLoader(), allow_list)
return DependencyResolver(library_resolver, notebook_resolver, import_resolver, mock_path_lookup)
18 changes: 9 additions & 9 deletions tests/unit/source_code/linters/test_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader
from databricks.labs.ucx.source_code.notebooks.migrator import NotebookMigrator
from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver
from databricks.labs.ucx.source_code.known import Whitelist
from databricks.labs.ucx.source_code.known import KnownList

from databricks.sdk.service.workspace import Language

Expand Down Expand Up @@ -110,13 +110,13 @@ def test_linter_walks_directory(mock_path_lookup, migration_index):
mock_path_lookup.append_path(Path(_samples_path(SourceContainer)))
file_loader = FileLoader()
folder_loader = FolderLoader(file_loader)
whitelist = Whitelist()
pip_resolver = PythonLibraryResolver(whitelist)
allow_list = KnownList()
pip_resolver = PythonLibraryResolver(allow_list)
session_state = CurrentSessionState()
resolver = DependencyResolver(
pip_resolver,
NotebookResolver(NotebookLoader()),
ImportFileResolver(file_loader, whitelist),
ImportFileResolver(file_loader, allow_list),
mock_path_lookup,
)
path = Path(Path(__file__).parent, "../samples", "simulate-sys-path")
Expand All @@ -129,7 +129,7 @@ def test_linter_walks_directory(mock_path_lookup, migration_index):


def test_triple_dot_import():
file_resolver = ImportFileResolver(FileLoader(), Whitelist())
file_resolver = ImportFileResolver(FileLoader(), KnownList())
path_lookup = create_autospec(PathLookup)
path_lookup.cwd.as_posix.return_value = '/some/path/to/folder'
path_lookup.resolve.return_value = Path('/some/path/foo.py')
Expand All @@ -141,7 +141,7 @@ def test_triple_dot_import():


def test_single_dot_import():
file_resolver = ImportFileResolver(FileLoader(), Whitelist())
file_resolver = ImportFileResolver(FileLoader(), KnownList())
path_lookup = create_autospec(PathLookup)
path_lookup.cwd.as_posix.return_value = '/some/path/to/folder'
path_lookup.resolve.return_value = Path('/some/path/to/folder/foo.py')
Expand All @@ -168,10 +168,10 @@ def test_known_issues(path: Path, migration_index):
folder_loader = FolderLoader(file_loader)
path_lookup = PathLookup.from_sys_path(Path.cwd())
session_state = CurrentSessionState()
whitelist = Whitelist()
allow_list = KnownList()
notebook_resolver = NotebookResolver(NotebookLoader())
import_resolver = ImportFileResolver(file_loader, whitelist)
pip_resolver = PythonLibraryResolver(whitelist)
import_resolver = ImportFileResolver(file_loader, allow_list)
pip_resolver = PythonLibraryResolver(allow_list)
resolver = DependencyResolver(pip_resolver, notebook_resolver, import_resolver, path_lookup)
linter = LocalCodeLinter(
file_loader,
Expand Down
10 changes: 5 additions & 5 deletions tests/unit/source_code/notebooks/test_cells.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
NotebookLoader,
)
from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver
from databricks.labs.ucx.source_code.known import Whitelist
from databricks.labs.ucx.source_code.known import KnownList


def test_basic_cell_extraction() -> None:
Expand Down Expand Up @@ -119,7 +119,7 @@ def test_pip_cell_build_dependency_graph_reports_unknown_library(mock_path_looku
dependency = Dependency(FileLoader(), Path("test"))
notebook_loader = NotebookLoader()
notebook_resolver = NotebookResolver(notebook_loader)
pip_resolver = PythonLibraryResolver(Whitelist())
pip_resolver = PythonLibraryResolver(KnownList())
dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, [], mock_path_lookup)
graph = DependencyGraph(dependency, None, dependency_resolver, mock_path_lookup, CurrentSessionState())

Expand All @@ -137,10 +137,10 @@ def test_pip_cell_build_dependency_graph_resolves_installed_library(mock_path_lo
dependency = Dependency(FileLoader(), Path("test"))
notebook_loader = NotebookLoader()
notebook_resolver = NotebookResolver(notebook_loader)
whitelist = Whitelist()
allow_list = KnownList()
file_loader = FileLoader()
pip_resolver = PythonLibraryResolver(whitelist)
import_resolver = ImportFileResolver(file_loader, whitelist)
pip_resolver = PythonLibraryResolver(allow_list)
import_resolver = ImportFileResolver(file_loader, allow_list)
dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, import_resolver, mock_path_lookup)
graph = DependencyGraph(dependency, None, dependency_resolver, mock_path_lookup, CurrentSessionState())

Expand Down
16 changes: 8 additions & 8 deletions tests/unit/source_code/test_dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
NotebookLoader,
)
from databricks.labs.ucx.source_code.path_lookup import PathLookup
from databricks.labs.ucx.source_code.known import Whitelist
from databricks.labs.ucx.source_code.known import KnownList
from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver
from tests.unit import (
locate_site_packages,
Expand Down Expand Up @@ -125,8 +125,8 @@ def test_dependency_resolver_terminates_at_known_libraries(empty_index, mock_not
site_packages_path = locate_site_packages()
lookup.append_path(site_packages_path)
file_loader = FileLoader()
import_resolver = ImportFileResolver(file_loader, Whitelist())
library_resolver = PythonLibraryResolver(Whitelist())
import_resolver = ImportFileResolver(file_loader, KnownList())
library_resolver = PythonLibraryResolver(KnownList())
resolver = DependencyResolver(library_resolver, mock_notebook_resolver, import_resolver, lookup)
maybe = resolver.build_local_file_dependency_graph(Path("import-site-package.py"), CurrentSessionState())
assert not maybe.failed
Expand Down Expand Up @@ -157,9 +157,9 @@ def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> So
return None

file_loader = FailingFileLoader()
whitelist = Whitelist()
import_resolver = ImportFileResolver(file_loader, whitelist)
pip_resolver = PythonLibraryResolver(whitelist)
allow_list = KnownList()
import_resolver = ImportFileResolver(file_loader, allow_list)
pip_resolver = PythonLibraryResolver(allow_list)
resolver = DependencyResolver(pip_resolver, mock_notebook_resolver, import_resolver, mock_path_lookup)
maybe = resolver.build_local_file_dependency_graph(Path("import-sub-site-package.py"), CurrentSessionState())
assert list(maybe.problems) == [
Expand All @@ -177,7 +177,7 @@ def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> So

notebook_loader = FailingNotebookLoader()
notebook_resolver = NotebookResolver(notebook_loader)
pip_resolver = PythonLibraryResolver(Whitelist())
pip_resolver = PythonLibraryResolver(KnownList())
resolver = DependencyResolver(pip_resolver, notebook_resolver, [], mock_path_lookup)
maybe = resolver.build_notebook_dependency_graph(Path("root5.py"), CurrentSessionState())
assert list(maybe.problems) == [
Expand All @@ -186,7 +186,7 @@ def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> So


def test_dependency_resolver_raises_problem_with_missing_file_loader(mock_notebook_resolver, mock_path_lookup):
library_resolver = PythonLibraryResolver(Whitelist())
library_resolver = PythonLibraryResolver(KnownList())
import_resolver = create_autospec(BaseImportResolver)
import_resolver.resolve_import.return_value = None
resolver = DependencyResolver(library_resolver, mock_notebook_resolver, import_resolver, mock_path_lookup)
Expand Down
Loading

0 comments on commit 2f02467

Please sign in to comment.