diff --git a/src/databricks/labs/ucx/assessment/crawlers.py b/src/databricks/labs/ucx/assessment/crawlers.py
index 6594373c55..0d1a324a88 100644
--- a/src/databricks/labs/ucx/assessment/crawlers.py
+++ b/src/databricks/labs/ucx/assessment/crawlers.py
@@ -31,6 +31,7 @@
 _AZURE_SP_CONF_FAILURE_MSG = "Uses azure service principal credentials config in"
 _SECRET_LIST_LENGTH = 3
 _CLIENT_ENDPOINT_LENGTH = 6
+_INIT_SCRIPT_DBFS_PATH = 2
 
 
 @dataclass
@@ -86,13 +87,14 @@ class GlobalInitScriptInfo:
 
 def _get_init_script_data(w, init_script_info):
     if init_script_info.dbfs:
-        file_api_format_destination = init_script_info.dbfs.destination.split(":")[1]
-        if file_api_format_destination:
-            try:
-                data = w.dbfs.read(file_api_format_destination).data
-                return base64.b64decode(data).decode("utf-8")
-            except Exception:
-                return None
+        if len(init_script_info.dbfs.destination.split(":")) == _INIT_SCRIPT_DBFS_PATH:
+            file_api_format_destination = init_script_info.dbfs.destination.split(":")[1]
+            if file_api_format_destination:
+                try:
+                    data = w.dbfs.read(file_api_format_destination).data
+                    return base64.b64decode(data).decode("utf-8")
+                except Exception:
+                    return None
     if init_script_info.workspace:
         workspace_file_destination = init_script_info.workspace.destination
         if workspace_file_destination:
diff --git a/tests/unit/assessment/test_assessment.py b/tests/unit/assessment/test_assessment.py
index 7a882fe4fc..c94b47f648 100644
--- a/tests/unit/assessment/test_assessment.py
+++ b/tests/unit/assessment/test_assessment.py
@@ -2318,3 +2318,160 @@ def test_list_all_pipeline_with_conf_spn_secret_unavlbl(mocker):
     result_set = AzureServicePrincipalCrawler(ws, MockBackend(), "ucx")._list_all_pipeline_with_spn_in_spark_conf()
 
     assert len(result_set) == 0
+
+
+def test_cluster_init_script_check_dbfs(mocker):
+    sample_clusters = [
+        ClusterDetails(
+            autoscale=AutoScale(min_workers=1, max_workers=6),
+            cluster_source=ClusterSource.UI,
+            spark_context_id=5134472582179565315,
+            spark_env_vars=None,
+            spark_version="12.3.x-cpu-ml-scala2.12",
+            cluster_id="0810-225833-atlanta69",
+            cluster_name="Tech Summit FY24 Cluster-1",
+            init_scripts=[
+                InitScriptInfo(
+                    dbfs=DbfsStorageInfo(destination="dbfs:"),
+                    s3=None,
+                    volumes=None,
+                    workspace=None,
+                ),
+                InitScriptInfo(
+                    dbfs=DbfsStorageInfo(destination="dbfs"),
+                    s3=None,
+                    volumes=None,
+                    workspace=None,
+                ),
+                InitScriptInfo(
+                    dbfs=DbfsStorageInfo(destination=":/users/test@test.com/init_scripts/test.sh"),
+                    s3=None,
+                    volumes=None,
+                    workspace=None,
+                ),
+                InitScriptInfo(
+                    dbfs=None,
+                    s3=None,
+                    volumes=None,
+                    workspace=WorkspaceStorageInfo(
+                        destination="/Users/dipankar.kushari@databricks.com/init_script_1.sh"
+                    ),
+                ),
+            ],
+        )
+    ]
+    ws = mocker.Mock()
+    ws.clusters.list.return_value = sample_clusters
+    ws.dbfs.read().data = "JXNoCmVjaG8gIj0="
+    init_crawler = ClustersCrawler(ws, MockBackend(), "ucx").snapshot()
+    assert len(init_crawler) == 1
+
+
+def test_job_cluster_init_script_check_dbfs():
+    sample_jobs = [
+        BaseJob(
+            created_time=1694536604319,
+            creator_user_name="anonymous@databricks.com",
+            job_id=536591785949415,
+            settings=JobSettings(
+                compute=None,
+                continuous=None,
+                tasks=[
+                    Task(
+                        task_key="Ingest",
+                        existing_cluster_id="0807-225846-avon493",
+                        notebook_task=NotebookTask(
+                            notebook_path="/Users/foo.bar@databricks.com/Customers/Example/Test/Load"
+                        ),
+                        timeout_seconds=0,
+                    )
+                ],
+                timeout_seconds=0,
+            ),
+        ),
+        BaseJob(
+            created_time=1694536604321,
+            creator_user_name="anonymous@databricks.com",
+            job_id=536591785949416,
+            settings=JobSettings(
+                compute=None,
+                continuous=None,
+                tasks=[
+                    Task(
+                        task_key="Ingest",
+                        existing_cluster_id="0810-229933-chicago99",
+                        notebook_task=NotebookTask(
+                            notebook_path="/Users/foo.bar@databricks.com/Customers/Example/Test/Load"
+                        ),
+                        timeout_seconds=0,
+                    )
+                ],
+                timeout_seconds=0,
+            ),
+        ),
+        BaseJob(
+            created_time=1694536604319,
+            creator_user_name="anonymous@databricks.com",
+            job_id=536591785949417,
+            settings=JobSettings(
+                compute=None,
+                continuous=None,
+                tasks=[
+                    Task(
+                        task_key="Ingest",
+                        existing_cluster_id="0811-929933-maine96",
+                        notebook_task=NotebookTask(
+                            notebook_path="/Users/foo.bar@databricks.com/Customers/Example/Test/Load"
+                        ),
+                        timeout_seconds=0,
+                    )
+                ],
+                timeout_seconds=0,
+            ),
+        ),
+    ]
+
+    sample_clusters = [
+        ClusterDetails(
+            init_scripts=[
+                InitScriptInfo(
+                    dbfs=DbfsStorageInfo(destination="dbfs"),
+                    s3=None,
+                    volumes=None,
+                    workspace=None,
+                ),
+                InitScriptInfo(
+                    dbfs=DbfsStorageInfo(destination="dbfs:"),
+                    s3=None,
+                    volumes=None,
+                    workspace=None,
+                ),
+                InitScriptInfo(
+                    dbfs=DbfsStorageInfo(destination=":/users/test@test.com/init_scripts/test.sh"),
+                    s3=None,
+                    volumes=None,
+                    workspace=None,
+                ),
+                InitScriptInfo(
+                    dbfs=None,
+                    s3=None,
+                    volumes=None,
+                    workspace=WorkspaceStorageInfo(
+                        destination="/Users/dipankar.kushari@databricks.com/init_script_1.sh"
+                    ),
+                ),
+            ],
+            autoscale=AutoScale(min_workers=1, max_workers=6),
+            spark_context_id=5134472582179566666,
+            spark_env_vars=None,
+            spark_version="13.3.x-cpu-ml-scala2.12",
+            cluster_id="0807-225846-avon493",
+            cluster_source=ClusterSource.JOB,
+        )
+    ]
+    ws = Mock()
+    ws.workspace.export().content = "JXNoCmVjaG8gIj0="
+    result_set = JobsCrawler(ws, MockBackend(), "ucx")._assess_jobs(
+        sample_jobs, {c.cluster_id: c for c in sample_clusters}
+    )
+    assert len(result_set) == 3
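
A minimal sketch of why the new length guard matters (illustrative only, not part of the patch; the loop and prints below are hypothetical, and the well-formed first destination is added for contrast with the malformed fixtures above):

```python
_INIT_SCRIPT_DBFS_PATH = 2  # "dbfs:/<path>" splits on ":" into exactly two parts

for destination in (
    "dbfs:/users/test@test.com/init_scripts/test.sh",  # well-formed: parts[1] is the file API path
    "dbfs:",  # splits into ["dbfs", ""] -> empty path, old code fell through to None
    "dbfs",   # splits into ["dbfs"] -> old unguarded [1] lookup raised IndexError
    ":/users/test@test.com/init_scripts/test.sh",  # ["", "/users/..."] -> still two parts
):
    parts = destination.split(":")
    if len(parts) == _INIT_SCRIPT_DBFS_PATH:  # mirrors the guard in _get_init_script_data
        print(f"{destination!r} -> file API path {parts[1]!r}")
    else:
        print(f"{destination!r} -> skipped (malformed DBFS destination)")
```

The tests exercise exactly these shapes: each crawler sees destinations with a trailing colon, no colon, and a leading colon, plus a workspace-file script, and the snapshot still succeeds instead of raising IndexError.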