Skip to content

Commit

Permalink
Yield DependencyProblem if job on runtime DBR14+ and using .egg dependency (#2020)
Browse files Browse the repository at this point in the history

## Changes
Create DependencyProblem if job on runtime DBR14+ and using .egg
dependency

### Linked issues

Fix #1793

---------

Co-authored-by: Serge Smertin <[email protected]>
Co-authored-by: Serge Smertin <[email protected]>
  • Loading branch information
3 people authored Jul 17, 2024
1 parent beaf251 commit 1ae0050
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 7 deletions.
22 changes: 15 additions & 7 deletions src/databricks/labs/ucx/source_code/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProb
return list(self._register_task_dependencies(parent))

def _register_task_dependencies(self, graph: DependencyGraph) -> Iterable[DependencyProblem]:
yield from self._register_cluster_info()
yield from self._register_libraries(graph)
yield from self._register_existing_cluster_id(graph)
yield from self._register_notebook(graph)
Expand All @@ -110,7 +111,6 @@ def _register_task_dependencies(self, graph: DependencyGraph) -> Iterable[Depend
yield from self._register_run_job_task(graph)
yield from self._register_pipeline_task(graph)
yield from self._register_spark_submit_task(graph)
yield from self._register_cluster_info()

def _register_libraries(self, graph: DependencyGraph) -> Iterable[DependencyProblem]:
if not self._task.libraries:
Expand All @@ -124,12 +124,7 @@ def _register_library(self, graph: DependencyGraph, library: compute.Library) ->
if problems:
yield from problems
if library.egg:
logger.info(f"Registering library from {library.egg}")
with self._ws.workspace.download(library.egg, format=ExportFormat.AUTO) as remote_file:
with tempfile.TemporaryDirectory() as directory:
local_file = Path(directory) / Path(library.egg).name
local_file.write_bytes(remote_file.read())
yield from graph.register_library(local_file.as_posix())
yield from self._register_egg(graph, library)
if library.whl:
with self._ws.workspace.download(library.whl, format=ExportFormat.AUTO) as remote_file:
with tempfile.TemporaryDirectory() as directory:
Expand All @@ -153,6 +148,19 @@ def _register_library(self, graph: DependencyGraph, library: compute.Library) ->
# TODO: https://github.com/databrickslabs/ucx/issues/1641
yield DependencyProblem('not-yet-implemented', 'Jar library is not yet implemented')

def _register_egg(self, graph: DependencyGraph, library: compute.Library) -> Iterable[DependencyProblem]:
    """Register an egg library with the dependency graph so its code is linted.

    Yields a DependencyProblem when the task runs on a runtime where installing
    eggs is no longer supported; the egg is still downloaded and registered so
    its contents get linted regardless.
    """
    # The message says "14.0 or higher", so the comparison must include
    # (14, 0) itself; assumes runtime_version is a (major, minor) tuple,
    # consistent with the literal it is compared against.
    if self.runtime_version >= (14, 0):
        yield DependencyProblem(
            code='not-supported',
            message='Installing eggs is no longer supported on Databricks 14.0 or higher',
        )
    logger.info(f"Registering library from {library.egg}")
    # register_library() needs a local path, so stage the workspace file in a
    # temporary directory that is cleaned up as soon as registration is done.
    with self._ws.workspace.download(library.egg, format=ExportFormat.AUTO) as remote_file:
        with tempfile.TemporaryDirectory() as directory:
            local_file = Path(directory) / Path(library.egg).name
            local_file.write_bytes(remote_file.read())
            yield from graph.register_library(local_file.as_posix())

def _register_notebook(self, graph: DependencyGraph) -> Iterable[DependencyProblem]:
if not self._task.notebook_task:
return []
Expand Down
39 changes: 39 additions & 0 deletions tests/integration/source_code/test_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,3 +440,42 @@ def test_job_dlt_task_linter_happy_path(

problems = simple_ctx.workflow_linter.lint_job(j.job_id)
assert len([problem for problem in problems if problem.message == "Could not locate import: greenlet"]) == 0


def test_job_dependency_problem_egg_dbr14plus(
    make_cluster, make_job, make_directory, make_notebook, make_random, simple_ctx, ws
):
    """Linting a job that installs an .egg library on a current DBR must
    surface exactly one 'not supported' dependency problem."""
    egg_file = Path(__file__).parent / "../../unit/source_code/samples/distribution/dist/thingy-0.0.1-py3.10.egg"
    workspace_folder = make_directory()
    notebook_path = make_notebook()
    # Stage the sample egg in the workspace so the task's library can reference it.
    remote_egg_file = f"{workspace_folder}/{egg_file.name}"
    with egg_file.open("rb") as local_egg:
        ws.workspace.upload(remote_egg_file, local_egg.read(), format=ImportFormat.AUTO)
    task = jobs.Task(
        task_key=make_random(4),
        description=make_random(4),
        new_cluster=compute.ClusterSpec(
            num_workers=1,
            node_type_id=ws.clusters.select_node_type(local_disk=True, min_memory_gb=16),
            spark_version=ws.clusters.select_spark_version(latest=True),
            spark_conf=None,
        ),
        libraries=[compute.Library(egg=remote_egg_file)],
        notebook_task=jobs.NotebookTask(notebook_path=str(notebook_path)),
    )
    job = make_job(tasks=[task])

    problems = simple_ctx.workflow_linter.lint_job(job.job_id)
    expected = "Installing eggs is no longer supported on Databricks 14.0 or higher"
    assert sum(1 for problem in problems if problem.message == expected) == 1

0 comments on commit 1ae0050

Please sign in to comment.