diff --git a/.github/workflows/acceptance.yml b/.github/workflows/acceptance.yml index 407cdb2d6c..99991b2ddd 100644 --- a/.github/workflows/acceptance.yml +++ b/.github/workflows/acceptance.yml @@ -39,12 +39,16 @@ jobs: - name: Install hatch run: pip install hatch==1.9.4 + - name: Fetch relevant branches + run: | + git fetch origin $GITHUB_BASE_REF:$GITHUB_BASE_REF + git fetch origin $GITHUB_HEAD_REF:$GITHUB_HEAD_REF + - name: Run integration tests - # ... uses: databrickslabs/sandbox/acceptance@acceptance/v0.2.2 with: vault_uri: ${{ secrets.VAULT_URI }} - timeout: 45m + timeout: 55m env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} ARM_CLIENT_ID: ${{ secrets.ARM_CLIENT_ID }} diff --git a/README.md b/README.md index b7d1f5ba1d..c96ee92ede 100644 --- a/README.md +++ b/README.md @@ -618,6 +618,9 @@ in the Migration dashboard. The `experimental-workflow-linter` workflow lints accessible code belonging to all workflows/jobs present in the workspace. The linting emits problems indicating what to resolve for making the code Unity Catalog compatible. +Once the workflow completes, the output will be stored in `$inventory_database.workflow_problems` table, and displayed +in the Migration dashboard. + ![code compatibility problems](docs/code_compatibility_problems.png) [[back to top](#databricks-labs-ucx)] diff --git a/src/databricks/labs/ucx/config.py b/src/databricks/labs/ucx/config.py index 95da81fbf9..f516ef079b 100644 --- a/src/databricks/labs/ucx/config.py +++ b/src/databricks/labs/ucx/config.py @@ -41,7 +41,7 @@ class WorkspaceConfig: # pylint: disable=too-many-instance-attributes uber_spn_id: str | None = None uber_instance_profile: str | None = None - is_terraform_used: bool = False # Not used, keep for backwards compatability + is_terraform_used: bool = False # Not used, keep for backwards compatibility # Whether the assessment should capture a specific list of databases, if not specified, it will list all databases. 
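As a minimal usage sketch for the README addition above: once the `experimental-workflow-linter` workflow completes, the stored problems can be queried directly rather than only viewed in the Migration dashboard. The SQL warehouse id and the `ucx` inventory database name below are assumptions; substitute the values from your own installation.

```python
# Hypothetical usage sketch, assuming a running SQL warehouse and an
# inventory database named "ucx"; adjust both for your installation.
from databricks.sdk import WorkspaceClient
from databricks.labs.lsql.backends import StatementExecutionBackend

ws = WorkspaceClient()
backend = StatementExecutionBackend(ws, "<your-sql-warehouse-id>")

# The experimental-workflow-linter workflow persists its findings in this table.
for row in backend.fetch("SELECT * FROM ucx.workflow_problems LIMIT 20"):
    print(row)
```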
include_databases: list[str] | None = None @@ -52,6 +52,9 @@ class WorkspaceConfig: # pylint: disable=too-many-instance-attributes exclude_paths_in_mount: list[str] | None = None include_paths_in_mount: list[str] | None = None + # Used for limiting the number of jobs to be analysed + include_job_ids: list[int] | None = None + # Whether to trigger assessment job after installation trigger_job: bool = False diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 1daf712b84..fac2106163 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -402,6 +402,7 @@ def workflow_linter(self): self.dependency_resolver, self.path_lookup, MigrationIndex([]), # TODO: bring back self.tables_migrator.index() + self.config.include_job_ids, ) @cached_property diff --git a/src/databricks/labs/ucx/mixins/fixtures.py b/src/databricks/labs/ucx/mixins/fixtures.py index 64f6804b9a..194f0178fa 100644 --- a/src/databricks/labs/ucx/mixins/fixtures.py +++ b/src/databricks/labs/ucx/mixins/fixtures.py @@ -13,8 +13,6 @@ from typing import BinaryIO import pytest -from databricks.labs.blueprint.entrypoint import is_in_debug -from databricks.labs.blueprint.wheels import ProductInfo from databricks.labs.lsql.backends import StatementExecutionBackend from databricks.labs.blueprint.commands import CommandExecutor from databricks.sdk import AccountClient, WorkspaceClient @@ -53,7 +51,6 @@ ) from databricks.sdk.service.workspace import ImportFormat, Language -from databricks.labs.ucx.config import WorkspaceConfig from databricks.labs.ucx.workspace_access.groups import MigratedGroup # this file will get to databricks-labs-pytester project and be maintained/refactored there @@ -1400,33 +1397,3 @@ def get_test_purge_time() -> str: def get_purge_suffix() -> str: """HEX-encoded purge time suffix for test objects.""" return f'ra{int(get_test_purge_time()):x}' - - -@pytest.fixture -def modified_or_skip(): - product_info = ProductInfo.from_class(WorkspaceConfig) - checkout_root = product_info.checkout_root() - - def run_command(command: str) -> str: - with subprocess.Popen( - command.split(), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - cwd=checkout_root, - ) as process: - output, error = process.communicate() - if process.returncode != 0: - pytest.fail(f"Command failed: {command}\n{error.decode('utf-8')}", pytrace=False) - return output.decode("utf-8").strip() - - def inner(package: str): - if is_in_debug(): - return # not skipping, as we're debugging - if 'TEST_NIGHTLY' in os.environ: - return # or during nightly runs - current_branch = run_command("git branch --show-current") - changed_files = run_command(f"git diff origin/main..{current_branch} --name-only") - if package not in changed_files: - pytest.skip(f"Skipping long test as {package} was not modified in branch {current_branch}") - - return inner diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 2d00db8a76..45c99e5206 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -287,17 +287,22 @@ def __init__( resolver: DependencyResolver, path_lookup: PathLookup, migration_index: MigrationIndex, + include_job_ids: list[int] | None = None, ): self._ws = ws self._resolver = resolver self._path_lookup = path_lookup self._migration_index = migration_index + self._include_job_ids = include_job_ids def refresh_report(self, sql_backend: SqlBackend, 
inventory_database: str): tasks = [] all_jobs = list(self._ws.jobs.list()) logger.info(f"Preparing {len(all_jobs)} linting jobs...") for job in all_jobs: + if self._include_job_ids and job.job_id not in self._include_job_ids: + logger.info(f"Skipping job {job.job_id}...") + continue tasks.append(functools.partial(self.lint_job, job.job_id)) logger.info(f"Running {tasks} linting tasks in parallel...") job_problems, errors = Threads.gather('linting workflows', tasks) diff --git a/src/databricks/labs/ucx/source_code/known-readme.txt b/src/databricks/labs/ucx/source_code/known-readme.txt deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/databricks/labs/ucx/source_code/known.json b/src/databricks/labs/ucx/source_code/known.json index a9b763ecb3..94ee7dee37 100644 --- a/src/databricks/labs/ucx/source_code/known.json +++ b/src/databricks/labs/ucx/source_code/known.json @@ -969,6 +969,53 @@ "bioinfokit.help": [], "bioinfokit.visuz": [] }, + "bitsandbytes": { + "bitsandbytes": [], + "bitsandbytes.autograd": [], + "bitsandbytes.autograd._functions": [], + "bitsandbytes.cextension": [], + "bitsandbytes.cuda_setup": [], + "bitsandbytes.cuda_setup.env_vars": [], + "bitsandbytes.cuda_setup.main": [], + "bitsandbytes.functional": [], + "bitsandbytes.nn": [], + "bitsandbytes.nn.modules": [], + "bitsandbytes.nn.triton_based_modules": [], + "bitsandbytes.optim": [], + "bitsandbytes.optim.adagrad": [], + "bitsandbytes.optim.adam": [], + "bitsandbytes.optim.adamw": [], + "bitsandbytes.optim.lamb": [], + "bitsandbytes.optim.lars": [], + "bitsandbytes.optim.lion": [], + "bitsandbytes.optim.optimizer": [], + "bitsandbytes.optim.rmsprop": [], + "bitsandbytes.optim.sgd": [], + "bitsandbytes.research": [], + "bitsandbytes.research.autograd": [], + "bitsandbytes.research.autograd._functions": [], + "bitsandbytes.research.nn": [], + "bitsandbytes.research.nn.modules": [], + "bitsandbytes.triton": [], + "bitsandbytes.triton.dequantize_rowwise": [], + "bitsandbytes.triton.int8_matmul_mixed_dequantize": [], + "bitsandbytes.triton.int8_matmul_rowwise_dequantize": [], + "bitsandbytes.triton.quantize_columnwise_and_transpose": [], + "bitsandbytes.triton.quantize_global": [], + "bitsandbytes.triton.quantize_rowwise": [], + "bitsandbytes.triton.triton_utils": [], + "bitsandbytes.utils": [], + "tests": [], + "tests.test_autograd": [], + "tests.test_cuda_setup_evaluator": [], + "tests.test_functional": [], + "tests.test_generation": [], + "tests.test_linear4bit": [], + "tests.test_linear8bitlt": [], + "tests.test_modules": [], + "tests.test_optim": [], + "tests.test_triton": [] + }, "black": { "_black_version": [], "black": [], @@ -1692,6 +1739,44 @@ "dbldatagen.text_generators": [], "dbldatagen.utils": [] }, + "dbtunnel": { + "dbtunnel": [], + "dbtunnel.arize_phoenix_ui": [], + "dbtunnel.bokeh": [], + "dbtunnel.chainlit": [], + "dbtunnel.cli": [], + "dbtunnel.cli.cli": [], + "dbtunnel.code_server": [], + "dbtunnel.dash": [], + "dbtunnel.fastapi": [], + "dbtunnel.flask": [], + "dbtunnel.gradio": [], + "dbtunnel.ngrok": [], + "dbtunnel.nicegui": [], + "dbtunnel.ray": [], + "dbtunnel.relay": [], + "dbtunnel.shiny": [], + "dbtunnel.solara": [], + "dbtunnel.stable_diffusion_ui": [], + "dbtunnel.streamlit": [], + "dbtunnel.tunnels": [], + "dbtunnel.utils": [], + "dbtunnel.uvicorn": [], + "dbtunnel.vendor": [], + "dbtunnel.vendor.asgiproxy": [], + "dbtunnel.vendor.asgiproxy.base": [], + "dbtunnel.vendor.asgiproxy.config": [], + "dbtunnel.vendor.asgiproxy.context": [], + "dbtunnel.vendor.asgiproxy.frameworks": 
[], + "dbtunnel.vendor.asgiproxy.proxies": [], + "dbtunnel.vendor.asgiproxy.proxies.http": [], + "dbtunnel.vendor.asgiproxy.proxies.websocket": [], + "dbtunnel.vendor.asgiproxy.simple_proxy": [], + "dbtunnel.vendor.asgiproxy.templates": [], + "dbtunnel.vendor.asgiproxy.utils": [], + "dbtunnel.vendor.asgiproxy.utils.headers": [], + "dbtunnel.vendor.asgiproxy.utils.streams": [] + }, "dbx": { "dbx": [], "dbx.api": [], @@ -2886,6 +2971,26 @@ "emmv": { "emmv": [] }, + "einops": { + "einops": [], + "einops._backends": [], + "einops._torch_specific": [], + "einops.array_api": [], + "einops.einops": [], + "einops.experimental": [], + "einops.experimental.indexing": [], + "einops.layers": [], + "einops.layers._einmix": [], + "einops.layers.chainer": [], + "einops.layers.flax": [], + "einops.layers.keras": [], + "einops.layers.oneflow": [], + "einops.layers.paddle": [], + "einops.layers.tensorflow": [], + "einops.layers.torch": [], + "einops.packing": [], + "einops.parsing": [] + }, "entrypoints": { "entrypoints": [] }, @@ -3602,6 +3707,12 @@ "fastcluster": { "fastcluster": [] }, + "fasttext": { + "fasttext.FastText": [], + "fasttext": [], + "fasttext.util": [], + "fasttext.util.util": [] + }, "filelock": { "filelock": [], "filelock._api": [], @@ -3737,6 +3848,205 @@ "fsspec.transaction": [], "fsspec.utils": [] }, + "fugue": { + "fugue": [], + "fugue._utils": [], + "fugue._utils.display": [], + "fugue._utils.exception": [], + "fugue._utils.interfaceless": [], + "fugue._utils.io": [], + "fugue._utils.misc": [], + "fugue._utils.registry": [], + "fugue.api": [], + "fugue.bag": [], + "fugue.bag.array_bag": [], + "fugue.bag.bag": [], + "fugue.collections": [], + "fugue.collections.partition": [], + "fugue.collections.sql": [], + "fugue.collections.yielded": [], + "fugue.column": [], + "fugue.column.expressions": [], + "fugue.column.functions": [], + "fugue.column.sql": [], + "fugue.constants": [], + "fugue.dataframe": [], + "fugue.dataframe.api": [], + "fugue.dataframe.array_dataframe": [], + "fugue.dataframe.arrow_dataframe": [], + "fugue.dataframe.dataframe": [], + "fugue.dataframe.dataframe_iterable_dataframe": [], + "fugue.dataframe.dataframes": [], + "fugue.dataframe.function_wrapper": [], + "fugue.dataframe.iterable_dataframe": [], + "fugue.dataframe.pandas_dataframe": [], + "fugue.dataframe.utils": [], + "fugue.dataset": [], + "fugue.dataset.api": [], + "fugue.dataset.dataset": [], + "fugue.dev": [], + "fugue.exceptions": [], + "fugue.execution": [], + "fugue.execution.api": [], + "fugue.execution.execution_engine": [], + "fugue.execution.factory": [], + "fugue.execution.native_execution_engine": [], + "fugue.extensions": [], + "fugue.extensions._builtins": [], + "fugue.extensions._builtins.creators": [], + "fugue.extensions._builtins.outputters": [], + "fugue.extensions._builtins.processors": [ + { + "code": "legacy-context-in-shared-clusters", + "message": "sc is not supported on UC Shared Clusters. 
Rewrite it using spark" + } + ], + "fugue.extensions._utils": [], + "fugue.extensions.context": [], + "fugue.extensions.creator": [], + "fugue.extensions.creator.convert": [], + "fugue.extensions.creator.creator": [], + "fugue.extensions.outputter": [], + "fugue.extensions.outputter.convert": [], + "fugue.extensions.outputter.outputter": [], + "fugue.extensions.processor": [], + "fugue.extensions.processor.convert": [], + "fugue.extensions.processor.processor": [], + "fugue.extensions.transformer": [], + "fugue.extensions.transformer.constants": [], + "fugue.extensions.transformer.convert": [], + "fugue.extensions.transformer.transformer": [], + "fugue.plugins": [], + "fugue.registry": [], + "fugue.rpc": [], + "fugue.rpc.base": [], + "fugue.rpc.flask": [], + "fugue.sql": [], + "fugue.sql._utils": [], + "fugue.sql._visitors": [], + "fugue.sql.api": [], + "fugue.sql.workflow": [], + "fugue.test": [], + "fugue.test.pandas_tester": [], + "fugue.test.plugins": [], + "fugue.workflow": [], + "fugue.workflow._checkpoint": [], + "fugue.workflow._tasks": [], + "fugue.workflow._workflow_context": [], + "fugue.workflow.api": [], + "fugue.workflow.input": [], + "fugue.workflow.module": [], + "fugue.workflow.workflow": [], + "fugue_contrib": [], + "fugue_contrib.contrib": [], + "fugue_contrib.seaborn": [], + "fugue_contrib.viz": [], + "fugue_contrib.viz._ext": [], + "fugue_dask": [], + "fugue_dask._constants": [], + "fugue_dask._io": [], + "fugue_dask._utils": [], + "fugue_dask.dataframe": [], + "fugue_dask.execution_engine": [], + "fugue_dask.registry": [], + "fugue_dask.tester": [], + "fugue_duckdb": [], + "fugue_duckdb._io": [], + "fugue_duckdb._utils": [], + "fugue_duckdb.dask": [], + "fugue_duckdb.dataframe": [], + "fugue_duckdb.execution_engine": [ + { + "code": "table-migrate", + "message": "The default format changed in Databricks Runtime 8.0, from Parquet to Delta" + } + ], + "fugue_duckdb.registry": [], + "fugue_duckdb.tester": [], + "fugue_ibis": [], + "fugue_ibis._compat": [], + "fugue_ibis._utils": [], + "fugue_ibis.dataframe": [], + "fugue_ibis.execution_engine": [ + { + "code": "table-migrate", + "message": "The default format changed in Databricks Runtime 8.0, from Parquet to Delta" + } + ], + "fugue_notebook": [], + "fugue_notebook.env": [], + "fugue_notebook.nbextension": [], + "fugue_polars": [], + "fugue_polars._utils": [], + "fugue_polars.polars_dataframe": [], + "fugue_polars.registry": [], + "fugue_ray": [], + "fugue_ray._constants": [], + "fugue_ray._utils": [], + "fugue_ray._utils.cluster": [], + "fugue_ray._utils.dataframe": [], + "fugue_ray._utils.io": [], + "fugue_ray.dataframe": [], + "fugue_ray.execution_engine": [], + "fugue_ray.registry": [], + "fugue_ray.tester": [], + "fugue_spark": [], + "fugue_spark._constants": [], + "fugue_spark._utils": [], + "fugue_spark._utils.convert": [], + "fugue_spark._utils.io": [], + "fugue_spark._utils.misc": [], + "fugue_spark._utils.partition": [ + { + "code": "rdd-in-shared-clusters", + "message": "RDD APIs are not supported on UC Shared Clusters. Rewrite it using DataFrame API" + }, + { + "code": "rdd-in-shared-clusters", + "message": "RDD APIs are not supported on UC Shared Clusters. Use mapInArrow() or Pandas UDFs instead" + } + ], + "fugue_spark.dataframe": [ + { + "code": "rdd-in-shared-clusters", + "message": "RDD APIs are not supported on UC Shared Clusters. 
Rewrite it using DataFrame API" + } + ], + "fugue_spark.execution_engine": [ + { + "code": "legacy-context-in-shared-clusters", + "message": "sc is not supported on UC Shared Clusters. Rewrite it using spark" + }, + { + "code": "legacy-context-in-shared-clusters", + "message": "sparkContext and getConf are not supported on UC Shared Clusters. Rewrite it using spark.conf" + }, + { + "code": "legacy-context-in-shared-clusters", + "message": "sparkContext is not supported on UC Shared Clusters. Rewrite it using spark" + }, + { + "code": "rdd-in-shared-clusters", + "message": "RDD APIs are not supported on UC Shared Clusters. Rewrite it using DataFrame API" + } + ], + "fugue_spark.registry": [ + { + "code": "rdd-in-shared-clusters", + "message": "RDD APIs are not supported on UC Shared Clusters. Rewrite it using DataFrame API" + } + ], + "fugue_spark.tester": [], + "fugue_sql": [], + "fugue_sql.exceptions": [], + "fugue_test": [], + "fugue_test.bag_suite": [], + "fugue_test.builtin_suite": [], + "fugue_test.dataframe_suite": [], + "fugue_test.execution_suite": [], + "fugue_test.fixtures": [], + "fugue_version": [] + }, "gast": { "gast": [] }, @@ -3899,6 +4209,316 @@ "h5py": { "h5py": [] }, + "hail": { + "hail": [], + "hail.backend": [], + "hail.backend.backend": [], + "hail.backend.local_backend": [], + "hail.backend.py4j_backend": [ + { + "code": "jvm-access-in-shared-clusters", + "message": "Cannot access Spark Driver JVM on UC Shared Clusters" + } + ], + "hail.backend.service_backend": [], + "hail.backend.spark_backend": [ + { + "code": "jvm-access-in-shared-clusters", + "message": "Cannot access Spark Driver JVM on UC Shared Clusters" + }, + { + "code": "legacy-context-in-shared-clusters", + "message": "sc is not supported on UC Shared Clusters. Rewrite it using spark" + } + ], + "hail.builtin_references": [], + "hail.conftest": [], + "hail.context": [], + "hail.experimental": [], + "hail.experimental.datasets": [], + "hail.experimental.datasets_metadata": [], + "hail.experimental.db": [], + "hail.experimental.export_entries_by_col": [], + "hail.experimental.expressions": [], + "hail.experimental.filtering_allele_frequency": [], + "hail.experimental.full_outer_join_mt": [], + "hail.experimental.function": [], + "hail.experimental.haplotype_freq_em": [], + "hail.experimental.import_gtf": [], + "hail.experimental.interact": [], + "hail.experimental.ld_score_regression": [], + "hail.experimental.ldscore": [], + "hail.experimental.ldscsim": [], + "hail.experimental.lens": [], + "hail.experimental.loop": [], + "hail.experimental.pca": [], + "hail.experimental.phase_by_transmission": [], + "hail.experimental.plots": [], + "hail.experimental.sparse_mt": [], + "hail.experimental.sparse_mt.densify": [], + "hail.experimental.sparse_mt.sparse_split_multi": [], + "hail.experimental.table_ndarray_utils": [], + "hail.experimental.tidyr": [], + "hail.experimental.time": [], + "hail.experimental.write_multiple": [], + "hail.expr": [], + "hail.expr.aggregators": [], + "hail.expr.aggregators.aggregators": [], + "hail.expr.blockmatrix_type": [], + "hail.expr.builders": [], + "hail.expr.expressions": [], + "hail.expr.expressions.base_expression": [], + "hail.expr.expressions.expression_typecheck": [], + "hail.expr.expressions.expression_utils": [], + "hail.expr.expressions.indices": [], + "hail.expr.expressions.typed_expressions": [], + "hail.expr.functions": [], + "hail.expr.matrix_type": [], + "hail.expr.nat": [], + "hail.expr.table_type": [], + "hail.expr.type_parsing": [], + "hail.expr.types": [], + 
"hail.fs": [], + "hail.fs.hadoop_fs": [], + "hail.genetics": [], + "hail.genetics.allele_type": [], + "hail.genetics.call": [], + "hail.genetics.locus": [], + "hail.genetics.pedigree": [], + "hail.genetics.reference_genome": [], + "hail.ggplot": [], + "hail.ggplot.aes": [], + "hail.ggplot.coord_cartesian": [], + "hail.ggplot.facets": [], + "hail.ggplot.geoms": [], + "hail.ggplot.ggplot": [], + "hail.ggplot.labels": [], + "hail.ggplot.scale": [], + "hail.ggplot.stats": [], + "hail.ggplot.utils": [], + "hail.hail_logging": [], + "hail.ir": [], + "hail.ir.base_ir": [], + "hail.ir.blockmatrix_ir": [], + "hail.ir.blockmatrix_reader": [], + "hail.ir.blockmatrix_writer": [], + "hail.ir.export_type": [], + "hail.ir.ir": [], + "hail.ir.matrix_ir": [], + "hail.ir.matrix_reader": [], + "hail.ir.matrix_writer": [], + "hail.ir.register_aggregators": [], + "hail.ir.register_functions": [], + "hail.ir.renderer": [], + "hail.ir.table_ir": [], + "hail.ir.table_reader": [], + "hail.ir.table_writer": [], + "hail.ir.utils": [], + "hail.linalg": [], + "hail.linalg.blockmatrix": [], + "hail.linalg.utils": [], + "hail.linalg.utils.misc": [], + "hail.matrixtable": [], + "hail.methods": [], + "hail.methods.family_methods": [], + "hail.methods.impex": [ + { + "code": "jvm-access-in-shared-clusters", + "message": "Cannot access Spark Driver JVM on UC Shared Clusters" + } + ], + "hail.methods.import_lines_helpers": [], + "hail.methods.misc": [], + "hail.methods.pca": [], + "hail.methods.qc": [], + "hail.methods.relatedness": [], + "hail.methods.relatedness.identity_by_descent": [], + "hail.methods.relatedness.king": [], + "hail.methods.relatedness.mating_simulation": [], + "hail.methods.relatedness.pc_relate": [], + "hail.methods.statgen": [], + "hail.nd": [], + "hail.nd.nd": [], + "hail.plot": [], + "hail.plot.plots": [], + "hail.stats": [], + "hail.stats.linear_mixed_model": [], + "hail.table": [], + "hail.typecheck": [], + "hail.typecheck.check": [], + "hail.utils": [], + "hail.utils.byte_reader": [], + "hail.utils.deduplicate": [], + "hail.utils.frozendict": [], + "hail.utils.genomic_range_table": [], + "hail.utils.hadoop_utils": [], + "hail.utils.interval": [], + "hail.utils.java": [], + "hail.utils.jsonx": [], + "hail.utils.linkedlist": [], + "hail.utils.misc": [], + "hail.utils.placement_tree": [], + "hail.utils.struct": [], + "hail.utils.tutorial": [], + "hail.vds": [], + "hail.vds.combiner": [], + "hail.vds.combiner.combine": [], + "hail.vds.combiner.variant_dataset_combiner": [], + "hail.vds.functions": [], + "hail.vds.methods": [], + "hail.vds.sample_qc": [], + "hail.vds.variant_dataset": [], + "hailtop": [], + "hailtop.aiocloud": [], + "hailtop.aiocloud.aioaws": [], + "hailtop.aiocloud.aioaws.fs": [], + "hailtop.aiocloud.aioazure": [], + "hailtop.aiocloud.aioazure.client": [], + "hailtop.aiocloud.aioazure.client.arm_client": [], + "hailtop.aiocloud.aioazure.client.base_client": [], + "hailtop.aiocloud.aioazure.client.compute_client": [], + "hailtop.aiocloud.aioazure.client.graph_client": [], + "hailtop.aiocloud.aioazure.client.network_client": [], + "hailtop.aiocloud.aioazure.client.pricing_client": [], + "hailtop.aiocloud.aioazure.client.resources_client": [], + "hailtop.aiocloud.aioazure.credentials": [], + "hailtop.aiocloud.aioazure.fs": [], + "hailtop.aiocloud.aiogoogle": [], + "hailtop.aiocloud.aiogoogle.client": [], + "hailtop.aiocloud.aiogoogle.client.base_client": [], + "hailtop.aiocloud.aiogoogle.client.bigquery_client": [], + "hailtop.aiocloud.aiogoogle.client.billing_client": [], + 
"hailtop.aiocloud.aiogoogle.client.compute_client": [], + "hailtop.aiocloud.aiogoogle.client.container_client": [], + "hailtop.aiocloud.aiogoogle.client.iam_client": [], + "hailtop.aiocloud.aiogoogle.client.logging_client": [], + "hailtop.aiocloud.aiogoogle.client.metadata_server_client": [], + "hailtop.aiocloud.aiogoogle.client.storage_client": [], + "hailtop.aiocloud.aiogoogle.credentials": [], + "hailtop.aiocloud.aiogoogle.user_config": [], + "hailtop.aiocloud.aioterra": [], + "hailtop.aiocloud.aioterra.azure": [], + "hailtop.aiocloud.aioterra.azure.client": [], + "hailtop.aiocloud.aioterra.azure.client.terra_client": [], + "hailtop.aiocloud.aioterra.azure.fs": [], + "hailtop.aiocloud.common": [], + "hailtop.aiocloud.common.base_client": [], + "hailtop.aiocloud.common.credentials": [], + "hailtop.aiocloud.common.session": [], + "hailtop.aiogoogle": [], + "hailtop.aiotools": [], + "hailtop.aiotools.aio_contextlib": [], + "hailtop.aiotools.copy": [], + "hailtop.aiotools.delete": [], + "hailtop.aiotools.diff": [], + "hailtop.aiotools.fs": [], + "hailtop.aiotools.fs.copier": [], + "hailtop.aiotools.fs.exceptions": [], + "hailtop.aiotools.fs.fs": [], + "hailtop.aiotools.fs.stream": [], + "hailtop.aiotools.local_fs": [], + "hailtop.aiotools.router_fs": [], + "hailtop.aiotools.tasks": [], + "hailtop.aiotools.utils": [], + "hailtop.aiotools.validators": [], + "hailtop.aiotools.weighted_semaphore": [], + "hailtop.auth": [], + "hailtop.auth.auth": [], + "hailtop.auth.flow": [], + "hailtop.auth.sql_config": [], + "hailtop.auth.tokens": [], + "hailtop.batch": [], + "hailtop.batch.backend": [], + "hailtop.batch.batch": [], + "hailtop.batch.batch_pool_executor": [], + "hailtop.batch.conftest": [], + "hailtop.batch.docker": [], + "hailtop.batch.exceptions": [], + "hailtop.batch.globals": [], + "hailtop.batch.hail_genetics_images": [], + "hailtop.batch.job": [], + "hailtop.batch.resource": [], + "hailtop.batch.utils": [], + "hailtop.batch_client": [], + "hailtop.batch_client.aioclient": [], + "hailtop.batch_client.client": [], + "hailtop.batch_client.globals": [], + "hailtop.batch_client.parse": [], + "hailtop.batch_client.types": [], + "hailtop.cleanup_gcr": [], + "hailtop.config": [], + "hailtop.config.deploy_config": [], + "hailtop.config.user_config": [], + "hailtop.config.variables": [], + "hailtop.dictfix": [], + "hailtop.frozendict": [], + "hailtop.fs": [], + "hailtop.fs.fs": [], + "hailtop.fs.fs_utils": [], + "hailtop.fs.router_fs": [], + "hailtop.fs.stat_result": [], + "hailtop.hail_decorator": [], + "hailtop.hail_event_loop": [], + "hailtop.hail_frozenlist": [], + "hailtop.hail_logging": [], + "hailtop.hailctl": [], + "hailtop.hailctl.auth": [], + "hailtop.hailctl.auth.cli": [], + "hailtop.hailctl.auth.create_user": [], + "hailtop.hailctl.auth.delete_user": [], + "hailtop.hailctl.auth.login": [], + "hailtop.hailctl.batch": [], + "hailtop.hailctl.batch.batch_cli_utils": [], + "hailtop.hailctl.batch.billing": [], + "hailtop.hailctl.batch.billing.cli": [], + "hailtop.hailctl.batch.cli": [], + "hailtop.hailctl.batch.initialize": [], + "hailtop.hailctl.batch.list_batches": [], + "hailtop.hailctl.batch.submit": [], + "hailtop.hailctl.batch.utils": [], + "hailtop.hailctl.config": [], + "hailtop.hailctl.config.cli": [], + "hailtop.hailctl.config.config_variables": [], + "hailtop.hailctl.dataproc": [], + "hailtop.hailctl.dataproc.cli": [], + "hailtop.hailctl.dataproc.cluster_config": [], + "hailtop.hailctl.dataproc.connect": [], + "hailtop.hailctl.dataproc.deploy_metadata": [], + 
"hailtop.hailctl.dataproc.diagnose": [], + "hailtop.hailctl.dataproc.gcloud": [], + "hailtop.hailctl.dataproc.modify": [], + "hailtop.hailctl.dataproc.start": [], + "hailtop.hailctl.dataproc.submit": [], + "hailtop.hailctl.dataproc.utils": [], + "hailtop.hailctl.describe": [], + "hailtop.hailctl.dev": [], + "hailtop.hailctl.dev.ci_client": [], + "hailtop.hailctl.dev.cli": [], + "hailtop.hailctl.dev.config": [], + "hailtop.hailctl.hdinsight": [], + "hailtop.hailctl.hdinsight.cli": [], + "hailtop.hailctl.hdinsight.start": [], + "hailtop.hailctl.hdinsight.submit": [], + "hailtop.httpx": [], + "hailtop.humanizex": [], + "hailtop.test_utils": [], + "hailtop.timex": [], + "hailtop.tls": [], + "hailtop.utils": [], + "hailtop.utils.filesize": [], + "hailtop.utils.gcs_requester_pays": [], + "hailtop.utils.process": [], + "hailtop.utils.rate_limiter": [], + "hailtop.utils.rates": [], + "hailtop.utils.rich_progress_bar": [], + "hailtop.utils.serialization": [], + "hailtop.utils.time": [], + "hailtop.utils.utils": [], + "hailtop.utils.validate": [], + "hailtop.utils.validate.validate": [], + "hailtop.uvloopx": [], + "hailtop.yamlx": [] + }, "html5lib": { "html5lib": [] }, @@ -4373,6 +4993,13 @@ "jupyter_core": { "jupyter": [] }, + "kaleido": { + "kaleido": [], + "kaleido._version": [], + "kaleido.scopes": [], + "kaleido.scopes.base": [], + "kaleido.scopes.plotly": [] + }, "keras": { "keras": [] }, @@ -5965,7 +6592,16 @@ "lightgbm.plotting": [], "lightgbm.sklearn": [] }, - "lxml": { + "livereload": { + "livereload": [], + "livereload.cli": [], + "livereload.handlers": [], + "livereload.management.commands": [], + "livereload.management.commands.livereload": [], + "livereload.server": [], + "livereload.watcher": [] + }, + "lxml": { "lxml": [] }, "mako": { diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index b56bdc544d..e9d5ded9ba 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -28,7 +28,12 @@ class Matcher(ABC): session_state: CurrentSessionState | None = None def matches(self, node: NodeNG): - return isinstance(node, Call) and isinstance(node.func, Attribute) and self._get_table_arg(node) is not None + return ( + isinstance(node, Call) + and self._get_table_arg(node) is not None + and isinstance(node.func, Attribute) + and Tree(node.func.expr).is_from_module("spark") + ) @abstractmethod def lint( diff --git a/src/databricks/labs/ucx/source_code/linters/python_ast.py b/src/databricks/labs/ucx/source_code/linters/python_ast.py index 704a17293f..c676ced966 100644 --- a/src/databricks/labs/ucx/source_code/linters/python_ast.py +++ b/src/databricks/labs/ucx/source_code/linters/python_ast.py @@ -8,6 +8,7 @@ from astroid import ( # type: ignore Assign, + AssignName, Attribute, Call, Const, @@ -223,6 +224,27 @@ def append_statements(self, tree: Tree) -> Tree: # the following may seem strange but it's actually ok to use the original module as tree root return tree + def is_from_module(self, module_name: str): + # if his is the call's root node, check it against the required module + if isinstance(self._node, Name): + if self._node.name == module_name: + return True + root = self.root + if not isinstance(root, Module): + return False + for value in root.globals.get(self._node.name, []): + if not isinstance(value, AssignName) or not isinstance(value.parent, Assign): + continue + if Tree(value.parent.value).is_from_module(module_name): + return True + 
return False + # walk up intermediate calls such as spark.range(...) + if isinstance(self._node, Call): + return isinstance(self._node.func, Attribute) and Tree(self._node.func.expr).is_from_module(module_name) + if isinstance(self._node, Attribute): + return Tree(self._node.expr).is_from_module(module_name) + return False + class TreeVisitor: @@ -282,6 +304,8 @@ def visit_importfrom(self, node: ImportFrom): def _matches(self, node: NodeNG, depth: int): if depth >= len(self._match_nodes): return False + if isinstance(node, Call): + return self._matches(node.func, depth) name, match_node = self._match_nodes[depth] if not isinstance(node, match_node): return False diff --git a/src/databricks/labs/ucx/source_code/linters/table_creation.py b/src/databricks/labs/ucx/source_code/linters/table_creation.py index 147c72e30c..ad45ec0006 100644 --- a/src/databricks/labs/ucx/source_code/linters/table_creation.py +++ b/src/databricks/labs/ucx/source_code/linters/table_creation.py @@ -50,7 +50,11 @@ def get_advice_span(self, node: NodeNG) -> Range | None: if call_args_count < self.min_args or call_args_count > self.max_args: return None - # Check 3: check presence of the format specifier: + # Check 3: ensure this is a spark call + if not Tree(node.func.expr).is_from_module("spark"): + return None + + # Check 4: check presence of the format specifier: # Option A: format specifier may be given as a direct parameter to the table-creating call # >>> df.saveToTable("c.db.table", format="csv") format_arg = Tree.get_arg(node, self.format_arg_index, self.format_arg_name) diff --git a/tests/integration/README.md b/tests/integration/README.md new file mode 100644 index 0000000000..7a709895ac --- /dev/null +++ b/tests/integration/README.md @@ -0,0 +1,12 @@ +# Selective running of tests per pull request + +Only relevant integration tests are run on the pull request. The logic is as follows: + +1. We're running `git diff --name-only` and get a list of file names +2. For every child folder in `tests/integration` we check if any of the files in the folder are in the list of changed files +3. If any of the files in the folder are in the list of changed files, we run the tests in that folder +4. If none of the files in the folder are in the list of changed files, we skip the tests in that folder + +Filtering does not apply for: +1. Debugging +2. 
Nightly diff --git a/tests/integration/installer/__init__.py b/tests/integration/account/__init__.py similarity index 100% rename from tests/integration/installer/__init__.py rename to tests/integration/account/__init__.py diff --git a/tests/integration/test_account.py b/tests/integration/account/test_account.py similarity index 92% rename from tests/integration/test_account.py rename to tests/integration/account/test_account.py index f2a62de2c3..321f5043f6 100644 --- a/tests/integration/test_account.py +++ b/tests/integration/account/test_account.py @@ -25,7 +25,13 @@ def delete_ucx_created_groups(): def test_create_account_level_groups( - make_ucx_group, make_group, make_user, acc, ws, make_random, clean_account_level_groups + make_ucx_group, + make_group, + make_user, + acc, + ws, + make_random, + clean_account_level_groups, ): suffix = get_purge_suffix() make_ucx_group(f"test_ucx_migrate_invalid-{suffix}", f"test_ucx_migrate_invalid-{suffix}") diff --git a/tests/integration/assessment/test_ext_hms.py b/tests/integration/assessment/test_ext_hms.py new file mode 100644 index 0000000000..c366d2a09e --- /dev/null +++ b/tests/integration/assessment/test_ext_hms.py @@ -0,0 +1,51 @@ +import dataclasses +from datetime import timedelta + +from databricks.sdk.errors import ( + InvalidParameterValue, + NotFound, +) +from databricks.sdk.retries import retried +from databricks.sdk.service.iam import PermissionLevel + + +@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=5)) +def test_running_real_assessment_job_ext_hms( + ws, + installation_ctx, + env_or_skip, + make_cluster_policy, + make_cluster_policy_permissions, +): + ext_hms_ctx = installation_ctx.replace( + skip_dashboards=True, + config_transform=lambda wc: dataclasses.replace( + wc, + override_clusters=None, + ), + extend_prompts={ + r"Instance pool id to be set.*": env_or_skip("TEST_INSTANCE_POOL_ID"), + r".*Do you want to update the existing installation?.*": 'yes', + r".*connect to the external metastore?.*": "yes", + r"Choose a cluster policy": "0", + }, + ) + ws_group_a, _ = ext_hms_ctx.make_ucx_group(wait_for_provisioning=True) + + cluster_policy = make_cluster_policy() + make_cluster_policy_permissions( + object_id=cluster_policy.policy_id, + permission_level=PermissionLevel.CAN_USE, + group_name=ws_group_a.display_name, + ) + ext_hms_ctx.__dict__['include_object_permissions'] = [f"cluster-policies:{cluster_policy.policy_id}"] + ext_hms_ctx.workspace_installation.run() + + ext_hms_ctx.deployed_workflows.run_workflow("assessment") + + # assert the workflow is successful. 
the tasks on sql warehouse will fail so skip checking them + assert ext_hms_ctx.deployed_workflows.validate_step("assessment") + + after = ext_hms_ctx.generic_permissions_support.load_as_dict("cluster-policies", cluster_policy.policy_id) + assert ws_group_a.display_name in after, f"Group {ws_group_a.display_name} not found in cluster policy" + assert after[ws_group_a.display_name] == PermissionLevel.CAN_USE diff --git a/tests/integration/assessment/test_workflows.py b/tests/integration/assessment/test_workflows.py new file mode 100644 index 0000000000..7dd70800c2 --- /dev/null +++ b/tests/integration/assessment/test_workflows.py @@ -0,0 +1,26 @@ +from datetime import timedelta + +from databricks.sdk.errors import NotFound, InvalidParameterValue +from databricks.sdk.retries import retried +from databricks.sdk.service.iam import PermissionLevel + + +@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=8)) +def test_running_real_assessment_job(ws, installation_ctx, make_cluster_policy, make_cluster_policy_permissions): + + ctx = installation_ctx.replace(skip_dashboards=False) + ws_group_a, _ = ctx.make_ucx_group() + + cluster_policy = make_cluster_policy() + make_cluster_policy_permissions( + object_id=cluster_policy.policy_id, + permission_level=PermissionLevel.CAN_USE, + group_name=ws_group_a.display_name, + ) + ctx.__dict__['include_object_permissions'] = [f"cluster-policies:{cluster_policy.policy_id}"] + ctx.workspace_installation.run() + + ctx.deployed_workflows.run_workflow("assessment") + + after = ctx.generic_permissions_support.load_as_dict("cluster-policies", cluster_policy.policy_id) + assert after[ws_group_a.display_name] == PermissionLevel.CAN_USE diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index bb21e645b9..88ebce013f 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -7,8 +7,10 @@ from functools import partial, cached_property from datetime import timedelta import shutil +import subprocess import databricks.sdk.core import pytest # pylint: disable=wrong-import-order +from databricks.labs.blueprint.entrypoint import is_in_debug from databricks.labs.blueprint.installation import Installation, MockInstallation from databricks.labs.blueprint.parallel import Threads from databricks.labs.blueprint.tui import MockPrompts @@ -885,3 +887,81 @@ def prepared_principal_acl(runtime_ctx, env_or_skip, make_mounted_location, make f"{dst_catalog.name}.{dst_schema.name}", dst_catalog.name, ) + + +def modified_or_skip(package: str): + info = ProductInfo.from_class(WorkspaceConfig) + checkout_root = info.checkout_root() + + def _run(command: str) -> str: + with subprocess.Popen( + command.split(), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + cwd=checkout_root, + ) as process: + output, error = process.communicate() + if process.returncode != 0: + pytest.fail(f"Command failed: {command}\n{error.decode('utf-8')}", pytrace=False) + return output.decode("utf-8").strip() + + def check(): + if is_in_debug(): + return True # not skipping, as we're debugging + if 'TEST_NIGHTLY' in os.environ: + return True # or during nightly runs + current_branch = _run("git branch --show-current") + changed_files = _run(f"git diff origin/main..{current_branch} --name-only") + if package in changed_files: + return True + return False + + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if not check(): + pytest.skip(f"Skipping long test as {package} was not modified") + return func(*args, **kwargs) + + 
return wrapper + + return decorator + + +def pytest_ignore_collect(path): + if not os.path.isdir(path): + logger.debug(f"pytest_ignore_collect: not a dir: {path}") + return False + if is_in_debug(): + logger.debug(f"pytest_ignore_collect: in debug: {path}") + return False # not skipping, as we're debugging + if 'TEST_NIGHTLY' in os.environ: + logger.debug(f"pytest_ignore_collect: nightly: {path}") + return False # or during nightly runs + + checkout_root = ProductInfo.from_class(WorkspaceConfig).checkout_root() + + def _run(command: str) -> str: + with subprocess.Popen( + command.split(), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + cwd=checkout_root, + ) as process: + output, error = process.communicate() + if process.returncode != 0: + raise ValueError(f"Command failed: {command}\n{error.decode('utf-8')}") + return output.decode("utf-8").strip() + + try: # pylint: disable=too-many-try-statements + target_branch = os.environ.get('GITHUB_BASE_REF', 'main') + current_branch = os.environ.get('GITHUB_HEAD_REF', _run("git branch --show-current")) + changed_files = _run(f"git diff {target_branch}..{current_branch} --name-only") + if path.basename in changed_files: + logger.debug(f"pytest_ignore_collect: in changed files: {path} - {changed_files}") + return False + logger.debug(f"pytest_ignore_collect: skip: {path}") + return True + except ValueError as err: + logger.debug(f"pytest_ignore_collect: error: {err}") + return False diff --git a/tests/integration/hive_metastore/test_catalog_schema.py b/tests/integration/hive_metastore/test_catalog_schema.py index c06472c401..713bfba1f6 100644 --- a/tests/integration/hive_metastore/test_catalog_schema.py +++ b/tests/integration/hive_metastore/test_catalog_schema.py @@ -18,7 +18,11 @@ @retried(on=[NotFound], timeout=timedelta(minutes=2)) def test_create_catalog_schema_with_principal_acl_azure( - ws, make_user, prepared_principal_acl, make_cluster_permissions, make_cluster + ws, + make_user, + prepared_principal_acl, + make_cluster_permissions, + make_cluster, ): if not ws.config.is_azure: pytest.skip("only works in azure test env") diff --git a/tests/integration/hive_metastore/test_ext_hms.py b/tests/integration/hive_metastore/test_ext_hms.py new file mode 100644 index 0000000000..de7ea1c250 --- /dev/null +++ b/tests/integration/hive_metastore/test_ext_hms.py @@ -0,0 +1,60 @@ +import dataclasses +import logging +from datetime import timedelta + +import pytest +from databricks.labs.blueprint.installer import RawState +from databricks.labs.lsql.backends import CommandExecutionBackend, SqlBackend +from databricks.sdk.errors import NotFound, InvalidParameterValue +from databricks.sdk.retries import retried + +logger = logging.getLogger(__name__) + + +@pytest.fixture +def sql_backend(ws, env_or_skip) -> SqlBackend: + cluster_id = env_or_skip("TEST_EXT_HMS_CLUSTER_ID") + return CommandExecutionBackend(ws, cluster_id) + + +@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=5)) +@pytest.mark.parametrize('prepare_tables_for_migration', ['regular'], indirect=True) +def test_migration_job_ext_hms(ws, installation_ctx, prepare_tables_for_migration, env_or_skip): + ext_hms_cluster_id = env_or_skip("TEST_EXT_HMS_CLUSTER_ID") + tables, dst_schema = prepare_tables_for_migration + ext_hms_ctx = installation_ctx.replace( + config_transform=lambda wc: dataclasses.replace( + wc, + override_clusters={ + "main": ext_hms_cluster_id, + "table_migration": ext_hms_cluster_id, + }, + ), + extend_prompts={ + r"Parallelism for migrating.*": "1000", + 
r"Min workers for auto-scale.*": "2", + r"Max workers for auto-scale.*": "20", + r"Instance pool id to be set.*": env_or_skip("TEST_INSTANCE_POOL_ID"), + r".*Do you want to update the existing installation?.*": 'yes', + r".*connect to the external metastore?.*": "yes", + r"Choose a cluster policy": "0", + }, + ) + ext_hms_ctx.workspace_installation.run() + ext_hms_ctx.deployed_workflows.run_workflow("migrate-tables") + # assert the workflow is successful + assert ext_hms_ctx.deployed_workflows.validate_step("migrate-tables") + + # assert the tables are migrated + for table in tables.values(): + try: + assert ws.tables.get(f"{dst_schema.catalog_name}.{dst_schema.name}.{table.name}").name + except NotFound: + assert False, f"{table.name} not found in {dst_schema.catalog_name}.{dst_schema.name}" + # assert the cluster is configured correctly with ext hms + install_state = ext_hms_ctx.installation.load(RawState) + for job_cluster in ws.jobs.get(install_state.resources["jobs"]["migrate-tables"]).settings.job_clusters: + if ws.config.is_azure: + assert "spark.sql.hive.metastore.version" in job_cluster.new_cluster.spark_conf + if ws.config.is_aws: + assert "spark.databricks.hive.metastore.glueCatalog.enabled" in job_cluster.new_cluster.spark_conf diff --git a/tests/integration/hive_metastore/test_workflows.py b/tests/integration/hive_metastore/test_workflows.py index acac209ed8..6be34f0446 100644 --- a/tests/integration/hive_metastore/test_workflows.py +++ b/tests/integration/hive_metastore/test_workflows.py @@ -20,10 +20,8 @@ def test_table_migration_job_refreshes_migration_status( installation_ctx, prepare_tables_for_migration, workflow, - modified_or_skip, ): """The migration status should be refreshed after the migration job.""" - modified_or_skip("hive_metastore") tables, _ = prepare_tables_for_migration ctx = installation_ctx.replace( extend_prompts={ @@ -63,3 +61,45 @@ def test_table_migration_job_refreshes_migration_status( "\n".join(asserts) + " given migration statuses " + "\n".join([str(status) for status in migration_statuses]) ) assert len(asserts) == 0, assert_message + + +@retried(on=[NotFound], timeout=timedelta(minutes=8)) +@pytest.mark.parametrize('prepare_tables_for_migration', [('hiveserde')], indirect=True) +def test_hiveserde_table_in_place_migration_job(ws, installation_ctx, prepare_tables_for_migration): + tables, dst_schema = prepare_tables_for_migration + ctx = installation_ctx.replace( + extend_prompts={ + r".*Do you want to update the existing installation?.*": 'yes', + }, + ) + ctx.workspace_installation.run() + ctx.deployed_workflows.run_workflow("migrate-external-hiveserde-tables-in-place-experimental") + # assert the workflow is successful + assert ctx.deployed_workflows.validate_step("migrate-external-hiveserde-tables-in-place-experimental") + # assert the tables are migrated + for table in tables.values(): + try: + assert ws.tables.get(f"{dst_schema.catalog_name}.{dst_schema.name}.{table.name}").name + except NotFound: + assert False, f"{table.name} not found in {dst_schema.catalog_name}.{dst_schema.name}" + + +@retried(on=[NotFound], timeout=timedelta(minutes=8)) +@pytest.mark.parametrize('prepare_tables_for_migration', [('hiveserde')], indirect=True) +def test_hiveserde_table_ctas_migration_job(ws, installation_ctx, prepare_tables_for_migration): + tables, dst_schema = prepare_tables_for_migration + ctx = installation_ctx.replace( + extend_prompts={ + r".*Do you want to update the existing installation?.*": 'yes', + }, + ) + ctx.workspace_installation.run() + 
ctx.deployed_workflows.run_workflow("migrate-external-tables-ctas") + # assert the workflow is successful + assert ctx.deployed_workflows.validate_step("migrate-external-tables-ctas") + # assert the tables are migrated + for table in tables.values(): + try: + assert ws.tables.get(f"{dst_schema.catalog_name}.{dst_schema.name}.{table.name}").name + except NotFound: + assert False, f"{table.name} not found in {dst_schema.catalog_name}.{dst_schema.name}" diff --git a/tests/performance/__init__.py b/tests/integration/install/__init__.py similarity index 100% rename from tests/performance/__init__.py rename to tests/integration/install/__init__.py diff --git a/tests/integration/test_installation.py b/tests/integration/install/test_installation.py similarity index 97% rename from tests/integration/test_installation.py rename to tests/integration/install/test_installation.py index c771744f99..6529d571ab 100644 --- a/tests/integration/test_installation.py +++ b/tests/integration/install/test_installation.py @@ -91,12 +91,8 @@ def factory( @retried(on=[NotFound, ResourceConflict], timeout=timedelta(minutes=10)) def test_experimental_permissions_migration_for_group_with_same_name( - installation_ctx, - make_cluster_policy, - make_cluster_policy_permissions, - modified_or_skip, + installation_ctx, make_cluster_policy, make_cluster_policy_permissions ): - modified_or_skip("workspace_access") ws_group, acc_group = installation_ctx.make_ucx_group() migrated_group = MigratedGroup.partial_info(ws_group, acc_group) cluster_policy = make_cluster_policy() @@ -174,8 +170,7 @@ def test_job_cluster_policy(ws, installation_ctx): @retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=5)) -def test_running_real_remove_backup_groups_job(ws, installation_ctx, modified_or_skip): - modified_or_skip("workspace_access") +def test_running_real_remove_backup_groups_job(ws, installation_ctx): ws_group_a, _ = installation_ctx.make_ucx_group() installation_ctx.__dict__['include_group_names'] = [ws_group_a.display_name] @@ -363,14 +358,7 @@ def test_check_inventory_database_exists(ws, installation_ctx): @retried(on=[NotFound], timeout=timedelta(minutes=5)) @pytest.mark.parametrize('prepare_tables_for_migration', [('regular')], indirect=True) -def test_table_migration_job( - ws, - installation_ctx, - env_or_skip, - prepare_tables_for_migration, - modified_or_skip, -): - modified_or_skip("hive_metastore") +def test_table_migration_job(ws, installation_ctx, env_or_skip, prepare_tables_for_migration): ctx = installation_ctx.replace( config_transform=lambda wc: replace(wc, override_clusters=None), @@ -408,14 +396,8 @@ def test_table_migration_job( @retried(on=[NotFound], timeout=timedelta(minutes=8)) @pytest.mark.parametrize('prepare_tables_for_migration', [('regular')], indirect=True) -def test_table_migration_job_cluster_override( - ws, - installation_ctx, - prepare_tables_for_migration, - env_or_skip, - modified_or_skip, -): - modified_or_skip("hive_metastore") +def test_table_migration_job_cluster_override(ws, installation_ctx, prepare_tables_for_migration, env_or_skip): + tables, dst_schema = prepare_tables_for_migration ctx = installation_ctx.replace( extend_prompts={ diff --git a/tests/integration/installer/test_logs.py b/tests/integration/install/test_logs.py similarity index 100% rename from tests/integration/installer/test_logs.py rename to tests/integration/install/test_logs.py diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index 
b581346c3f..023d68e2f4 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -15,6 +15,7 @@ from databricks.sdk.service.workspace import ImportFormat from databricks.labs.ucx.hive_metastore.migration_status import MigrationIndex +from databricks.labs.ucx.mixins.fixtures import get_purge_suffix from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.known import UNKNOWN, KnownList from databricks.labs.ucx.source_code.linters.files import LocalCodeLinter @@ -24,14 +25,12 @@ @retried(on=[NotFound], timeout=timedelta(minutes=5)) -def test_running_real_workflow_linter_job(installation_ctx, make_notebook, make_directory, make_job, modified_or_skip): - modified_or_skip("source_code") +def test_running_real_workflow_linter_job(installation_ctx, make_notebook, make_directory, make_job): # Deprecated file system path in call to: /mnt/things/e/f/g lint_problem = b"display(spark.read.csv('/mnt/things/e/f/g'))" notebook = make_notebook(path=f"{make_directory()}/notebook.ipynb", content=lint_problem) - make_job(notebook_path=notebook) - - ctx = installation_ctx + job = make_job(notebook_path=notebook) + ctx = installation_ctx.replace(config_transform=lambda wc: replace(wc, include_job_ids=[job.job_id])) ctx.workspace_installation.run() ctx.deployed_workflows.run_workflow("experimental-workflow-linter") ctx.deployed_workflows.validate_step("experimental-workflow-linter") @@ -48,7 +47,8 @@ def test_linter_from_context(simple_ctx, make_job, make_notebook): # but it's executed on the caller side and is easier to debug. # ensure we have at least 1 job that fails notebook_path = make_notebook(content=io.BytesIO(b"import xyz")) - make_job(notebook_path=notebook_path) + job = make_job(notebook_path=notebook_path) + simple_ctx.config.include_job_ids = [job.job_id] simple_ctx.workflow_linter.refresh_report(simple_ctx.sql_backend, simple_ctx.inventory_database) cursor = simple_ctx.sql_backend.fetch( @@ -120,7 +120,7 @@ def test_job_linter_some_notebook_graph_with_problems(simple_ctx, ws, make_job, "second_notebook:3 [dbfs-usage] Deprecated file system path: /mnt/something", } - entrypoint = WorkspacePath(ws, f"~/linter-{make_random(4)}").expanduser() + entrypoint = WorkspacePath(ws, f"~/linter-{make_random(4)}-{get_purge_suffix()}").expanduser() entrypoint.mkdir() main_notebook = entrypoint / 'main' @@ -155,7 +155,7 @@ def test_workflow_linter_lints_job_with_import_pypi_library( make_notebook, make_random, ): - entrypoint = WorkspacePath(ws, f"~/linter-{make_random(4)}").expanduser() + entrypoint = WorkspacePath(ws, f"~/linter-{make_random(4)}-{get_purge_suffix()}").expanduser() entrypoint.mkdir() simple_ctx = simple_ctx.replace( @@ -306,7 +306,13 @@ def test_workflow_linter_lints_job_with_wheel_dependency( def test_job_spark_python_task_linter_happy_path( - simple_ctx, ws, make_job, make_random, make_cluster, make_notebook, make_directory + simple_ctx, + ws, + make_job, + make_random, + make_cluster, + make_notebook, + make_directory, ): entrypoint = make_directory() @@ -384,7 +390,14 @@ def test_workflow_linter_lints_python_wheel_task(simple_ctx, ws, make_job, make_ def test_job_dlt_task_linter_unhappy_path( - simple_ctx, ws, make_job, make_random, make_cluster, make_notebook, make_directory, make_pipeline + simple_ctx, + ws, + make_job, + make_random, + make_cluster, + make_notebook, + make_directory, + make_pipeline, ): entrypoint = make_directory() make_notebook(path=f"{entrypoint}/notebook.py", 
content=b"import greenlet") @@ -403,7 +416,14 @@ def test_job_dlt_task_linter_unhappy_path( def test_job_dlt_task_linter_happy_path( - simple_ctx, ws, make_job, make_random, make_cluster, make_notebook, make_directory, make_pipeline + simple_ctx, + ws, + make_job, + make_random, + make_cluster, + make_notebook, + make_directory, + make_pipeline, ): entrypoint = make_directory() make_notebook(path=f"{entrypoint}/notebook.py", content=b"import greenlet") diff --git a/tests/integration/test_ext_hms.py b/tests/integration/test_ext_hms.py deleted file mode 100644 index 2bd9657aeb..0000000000 --- a/tests/integration/test_ext_hms.py +++ /dev/null @@ -1,186 +0,0 @@ -import dataclasses -import logging -import os -import sys -from collections.abc import Iterator, Sequence -from datetime import timedelta -from types import UnionType -from typing import Any, ClassVar, Protocol, TypeVar - -import pytest -from databricks.labs.blueprint.commands import CommandExecutor -from databricks.labs.blueprint.installer import RawState -from databricks.labs.lsql.backends import SqlBackend -from databricks.labs.lsql.core import Row -from databricks.sdk import WorkspaceClient -from databricks.sdk.errors import ( - InvalidParameterValue, - NotFound, -) -from databricks.sdk.retries import retried -from databricks.sdk.service.compute import Language -from databricks.sdk.service.iam import PermissionLevel - -logger = logging.getLogger(__name__) - - -class DataclassInstance(Protocol): - __dataclass_fields__: ClassVar[dict] - - -Result = TypeVar("Result", bound=DataclassInstance) -Dataclass = type[DataclassInstance] - - -class CommandContextBackend(SqlBackend): - def __init__(self, ws: WorkspaceClient, cluster_id, *, max_records_per_batch: int = 1000): - self._sql = CommandExecutor(ws.clusters, ws.command_execution, lambda: cluster_id, language=Language.SQL) - self._max_records_per_batch = max_records_per_batch - debug_truncate_bytes = ws.config.debug_truncate_bytes - self._debug_truncate_bytes = debug_truncate_bytes if isinstance(debug_truncate_bytes, int) else 96 - - def execute(self, sql: str, *, catalog: str | None = None, schema: str | None = None) -> None: - _ = catalog, schema - logger.debug(f"[api][execute] {self._only_n_bytes(sql, self._debug_truncate_bytes)}") - self._sql.run(sql) - - def fetch(self, sql: str, *, catalog: str | None = None, schema: str | None = None) -> Iterator[Row]: - _ = catalog, schema - logger.debug(f"[api][fetch] {self._only_n_bytes(sql, self._debug_truncate_bytes)}") - return self._sql.run(sql, result_as_json=True) - - def save_table(self, full_name: str, rows: Sequence[DataclassInstance], klass: Dataclass, mode="append"): - rows = self._filter_none_rows(rows, klass) - self.create_table(full_name, klass) - if len(rows) == 0: - return - fields = dataclasses.fields(klass) - field_names = [f.name for f in fields] - if mode == "overwrite": - self.execute(f"TRUNCATE TABLE {full_name}") - for i in range(0, len(rows), self._max_records_per_batch): - batch = rows[i : i + self._max_records_per_batch] - vals = "), (".join(self._row_to_sql(r, fields) for r in batch) - sql = f'INSERT INTO {full_name} ({", ".join(field_names)}) VALUES ({vals})' - self.execute(sql) - - @staticmethod - def _row_to_sql(row: DataclassInstance, fields: tuple[dataclasses.Field[Any], ...]): - data = [] - for f in fields: - value = getattr(row, f.name) - field_type = f.type - if isinstance(field_type, UnionType): - field_type = field_type.__args__[0] - if value is None: - data.append("NULL") - elif field_type == bool: - 
data.append("TRUE" if value else "FALSE") - elif field_type == str: - value = str(value).replace("'", "''") - data.append(f"'{value}'") - elif field_type == int: - data.append(f"{value}") - else: - msg = f"unknown type: {field_type}" - raise ValueError(msg) - return ", ".join(data) - - -@pytest.fixture -def sql_backend(ws, env_or_skip) -> SqlBackend: - cluster_id = env_or_skip("TEST_EXT_HMS_CLUSTER_ID") - return CommandContextBackend(ws, cluster_id) - - -@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=5)) -@pytest.mark.parametrize('prepare_tables_for_migration', ['regular'], indirect=True) -def test_migration_job_ext_hms(ws, installation_ctx, prepare_tables_for_migration, env_or_skip, modified_or_skip): - modified_or_skip("hive_metastore") - # this test spins up clusters using ext hms cluster policy, which will have a startup time of ~ 7-10m - # skip this test if not in nightly test job or debug mode - if os.path.basename(sys.argv[0]) not in {"_jb_pytest_runner.py", "testlauncher.py"}: - env_or_skip("TEST_NIGHTLY") - - ext_hms_cluster_id = env_or_skip("TEST_EXT_HMS_CLUSTER_ID") - tables, dst_schema = prepare_tables_for_migration - ext_hms_ctx = installation_ctx.replace( - config_transform=lambda wc: dataclasses.replace( - wc, - override_clusters={ - "main": ext_hms_cluster_id, - "table_migration": ext_hms_cluster_id, - }, - ), - extend_prompts={ - r"Parallelism for migrating.*": "1000", - r"Min workers for auto-scale.*": "2", - r"Max workers for auto-scale.*": "20", - r"Instance pool id to be set.*": env_or_skip("TEST_INSTANCE_POOL_ID"), - r".*Do you want to update the existing installation?.*": 'yes', - r".*connect to the external metastore?.*": "yes", - r"Choose a cluster policy": "0", - }, - ) - ext_hms_ctx.workspace_installation.run() - ext_hms_ctx.deployed_workflows.run_workflow("migrate-tables") - # assert the workflow is successful - assert ext_hms_ctx.deployed_workflows.validate_step("migrate-tables") - - # assert the tables are migrated - for table in tables.values(): - try: - assert ws.tables.get(f"{dst_schema.catalog_name}.{dst_schema.name}.{table.name}").name - except NotFound: - assert False, f"{table.name} not found in {dst_schema.catalog_name}.{dst_schema.name}" - # assert the cluster is configured correctly with ext hms - install_state = ext_hms_ctx.installation.load(RawState) - for job_cluster in ws.jobs.get(install_state.resources["jobs"]["migrate-tables"]).settings.job_clusters: - if ws.config.is_azure: - assert "spark.sql.hive.metastore.version" in job_cluster.new_cluster.spark_conf - if ws.config.is_aws: - assert "spark.databricks.hive.metastore.glueCatalog.enabled" in job_cluster.new_cluster.spark_conf - - -@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=5)) -def test_running_real_assessment_job_ext_hms( - ws, - installation_ctx, - env_or_skip, - make_cluster_policy, - make_cluster_policy_permissions, - modified_or_skip, -): - modified_or_skip("assessment") - ext_hms_ctx = installation_ctx.replace( - skip_dashboards=True, - config_transform=lambda wc: dataclasses.replace( - wc, - override_clusters=None, - ), - extend_prompts={ - r"Instance pool id to be set.*": env_or_skip("TEST_INSTANCE_POOL_ID"), - r".*Do you want to update the existing installation?.*": 'yes', - r".*connect to the external metastore?.*": "yes", - r"Choose a cluster policy": "0", - }, - ) - ws_group_a, _ = ext_hms_ctx.make_ucx_group(wait_for_provisioning=True) - - cluster_policy = make_cluster_policy() - make_cluster_policy_permissions( - 
object_id=cluster_policy.policy_id, - permission_level=PermissionLevel.CAN_USE, - group_name=ws_group_a.display_name, - ) - ext_hms_ctx.__dict__['include_object_permissions'] = [f"cluster-policies:{cluster_policy.policy_id}"] - ext_hms_ctx.workspace_installation.run() - - ext_hms_ctx.deployed_workflows.run_workflow("assessment") - - # assert the workflow is successful. the tasks on sql warehouse will fail so skip checking them - assert ext_hms_ctx.deployed_workflows.validate_step("assessment") - - after = ext_hms_ctx.generic_permissions_support.load_as_dict("cluster-policies", cluster_policy.policy_id) - assert ws_group_a.display_name in after, f"Group {ws_group_a.display_name} not found in cluster policy" - assert after[ws_group_a.display_name] == PermissionLevel.CAN_USE diff --git a/tests/integration/test_runtime.py b/tests/integration/workspace_access/test_workflows.py similarity index 56% rename from tests/integration/test_runtime.py rename to tests/integration/workspace_access/test_workflows.py index 193c218738..aea31cca39 100644 --- a/tests/integration/test_runtime.py +++ b/tests/integration/workspace_access/test_workflows.py @@ -1,46 +1,14 @@ -import logging from datetime import timedelta -import pytest # pylint: disable=wrong-import-order +import pytest from databricks.labs.blueprint.parallel import ManyError -from databricks.sdk.errors import ( - InvalidParameterValue, - NotFound, -) + +from databricks.sdk.errors import NotFound, InvalidParameterValue from databricks.sdk.retries import retried from databricks.sdk.service import sql from databricks.sdk.service.iam import PermissionLevel from databricks.sdk.service.workspace import AclPermission -logger = logging.getLogger(__name__) - - -@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=8)) -def test_running_real_assessment_job( - ws, - installation_ctx, - make_cluster_policy, - make_cluster_policy_permissions, - modified_or_skip, -): - modified_or_skip("assessment") - ctx = installation_ctx.replace(skip_dashboards=False) - ws_group_a, _ = ctx.make_ucx_group() - - cluster_policy = make_cluster_policy() - make_cluster_policy_permissions( - object_id=cluster_policy.policy_id, - permission_level=PermissionLevel.CAN_USE, - group_name=ws_group_a.display_name, - ) - ctx.__dict__['include_object_permissions'] = [f"cluster-policies:{cluster_policy.policy_id}"] - ctx.workspace_installation.run() - - ctx.deployed_workflows.run_workflow("assessment") - - after = ctx.generic_permissions_support.load_as_dict("cluster-policies", cluster_policy.policy_id) - assert after[ws_group_a.display_name] == PermissionLevel.CAN_USE - @retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=8)) def test_running_real_migrate_groups_job( @@ -50,9 +18,7 @@ def test_running_real_migrate_groups_job( make_cluster_policy_permissions, make_secret_scope, make_secret_scope_acl, - modified_or_skip, ): - modified_or_skip("workspace_access") ws_group_a, acc_group_a = installation_ctx.make_ucx_group() cluster_policy = make_cluster_policy() @@ -94,9 +60,7 @@ def test_running_real_validate_groups_permissions_job( make_cluster_policy_permissions, make_secret_scope, make_secret_scope_acl, - modified_or_skip, ): - modified_or_skip("workspace_access") ws_group_a, _ = installation_ctx.make_ucx_group() query = make_query() @@ -135,13 +99,9 @@ def test_running_real_validate_groups_permissions_job( @retried(on=[NotFound], timeout=timedelta(minutes=8)) def test_running_real_validate_groups_permissions_job_fails( - ws, - installation_ctx, - 
make_cluster_policy, - make_cluster_policy_permissions, - modified_or_skip, + ws, installation_ctx, make_cluster_policy, make_cluster_policy_permissions ): - modified_or_skip("workspace_access") + ws_group_a, _ = installation_ctx.make_ucx_group() cluster_policy = make_cluster_policy() @@ -164,57 +124,3 @@ def test_running_real_validate_groups_permissions_job_fails( with pytest.raises(ManyError): installation_ctx.deployed_workflows.run_workflow("validate-groups-permissions") - - -@retried(on=[NotFound], timeout=timedelta(minutes=8)) -@pytest.mark.parametrize('prepare_tables_for_migration', [('hiveserde')], indirect=True) -def test_hiveserde_table_in_place_migration_job( - ws, - installation_ctx, - prepare_tables_for_migration, - modified_or_skip, -): - modified_or_skip("hive_metastore") - tables, dst_schema = prepare_tables_for_migration - ctx = installation_ctx.replace( - extend_prompts={ - r".*Do you want to update the existing installation?.*": 'yes', - }, - ) - ctx.workspace_installation.run() - ctx.deployed_workflows.run_workflow("migrate-external-hiveserde-tables-in-place-experimental") - # assert the workflow is successful - assert ctx.deployed_workflows.validate_step("migrate-external-hiveserde-tables-in-place-experimental") - # assert the tables are migrated - for table in tables.values(): - try: - assert ws.tables.get(f"{dst_schema.catalog_name}.{dst_schema.name}.{table.name}").name - except NotFound: - assert False, f"{table.name} not found in {dst_schema.catalog_name}.{dst_schema.name}" - - -@retried(on=[NotFound], timeout=timedelta(minutes=8)) -@pytest.mark.parametrize('prepare_tables_for_migration', [('hiveserde')], indirect=True) -def test_hiveserde_table_ctas_migration_job( - ws, - installation_ctx, - prepare_tables_for_migration, - modified_or_skip, -): - modified_or_skip("hive_metastore") - tables, dst_schema = prepare_tables_for_migration - ctx = installation_ctx.replace( - extend_prompts={ - r".*Do you want to update the existing installation?.*": 'yes', - }, - ) - ctx.workspace_installation.run() - ctx.deployed_workflows.run_workflow("migrate-external-tables-ctas") - # assert the workflow is successful - assert ctx.deployed_workflows.validate_step("migrate-external-tables-ctas") - # assert the tables are migrated - for table in tables.values(): - try: - assert ws.tables.get(f"{dst_schema.catalog_name}.{dst_schema.name}.{table.name}").name - except NotFound: - assert False, f"{table.name} not found in {dst_schema.catalog_name}.{dst_schema.name}" diff --git a/tests/performance/conftest.py b/tests/performance/conftest.py deleted file mode 100644 index c29924dc81..0000000000 --- a/tests/performance/conftest.py +++ /dev/null @@ -1,9 +0,0 @@ -import logging - -# pylint: disable-next=unused-wildcard-import,wildcard-import -from tests.integration.conftest import * # noqa: F403 - -logging.getLogger("tests").setLevel("DEBUG") -logging.getLogger("databricks.labs.ucx").setLevel("DEBUG") - -logger = logging.getLogger(__name__) diff --git a/tests/performance/test_performance.py b/tests/performance/test_performance.py deleted file mode 100644 index 57e4e9f52d..0000000000 --- a/tests/performance/test_performance.py +++ /dev/null @@ -1,139 +0,0 @@ -import json -import logging -from collections.abc import Callable -from dataclasses import dataclass -from functools import partial -from time import process_time - -import pytest -from databricks.labs.blueprint.parallel import Threads -from databricks.labs.lsql.backends import SqlBackend -from databricks.sdk import WorkspaceClient -from 
databricks.sdk.service import iam - -from databricks.labs.ucx.contexts.workspace_cli import WorkspaceContext -from databricks.labs.ucx.workspace_access.base import Permissions -from databricks.labs.ucx.workspace_access.groups import MigratedGroup, MigrationState - -logger = logging.getLogger(__name__) - -NB_OF_TEST_WS_OBJECTS = 100 - - -@dataclass -class WorkspaceObject: - fixture: Callable - permissions: list[iam.PermissionLevel] - id_attribute: str - type: str - - -@pytest.fixture -def migrated_group_experimental(acc, ws, make_group, make_acc_group): - """Create a pair of groups in workspace and account. Assign account group to workspace.""" - ws_group = make_group() - acc_group = make_acc_group() - acc.workspace_assignment.update(ws.get_workspace_id(), acc_group.id, [iam.WorkspacePermission.USER]) - return MigratedGroup.partial_info(ws_group, acc_group) - - -def test_apply_group_permissions_experimental_performance( - ws: WorkspaceClient, - sql_backend: SqlBackend, - inventory_schema, - migrated_group, - migrated_group_experimental, - make_experiment, - make_model, - make_cluster_policy, - env_or_skip, -): - # Making sure this test can only be launched from local - env_or_skip("IDE_PROJECT_ROOTS") - ws_objects = [ - WorkspaceObject(partial(make_experiment), [iam.PermissionLevel.CAN_MANAGE], "experiment_id", "experiments"), - WorkspaceObject( - partial(make_model), [iam.PermissionLevel.CAN_MANAGE_PRODUCTION_VERSIONS], "id", "registered-models" - ), - WorkspaceObject(partial(make_cluster_policy), [iam.PermissionLevel.CAN_USE], "policy_id", "cluster-policies"), - ] - for ws_object in ws_objects: - create_ws_objects_parallel( - ws, - sql_backend, - inventory_schema, - NB_OF_TEST_WS_OBJECTS, - ws_object.fixture, - ws_object.permissions, - ws_object.id_attribute, - ws_object.type, - [migrated_group.name_in_workspace, migrated_group_experimental.name_in_workspace], - ) - - start = process_time() - MigrationState([migrated_group_experimental]).apply_to_groups_with_different_names(ws) - logger.info(f"Migration using experimental API takes {process_time() - start}s") - - start = process_time() - ctx = WorkspaceContext(ws).replace(inventory_schema=inventory_schema, sql_backend=sql_backend) - ctx.permission_manager.apply_group_permissions(MigrationState([migrated_group])) - logger.info(f"Migration using normal approach takes {process_time() - start}s") - - -def set_permissions(ws: WorkspaceClient, ws_object, id_attribute, object_type, acls): - request_object_id = getattr(ws_object, id_attribute) - ws.permissions.update(object_type, request_object_id, access_control_list=acls) - - -def create_ws_objects_parallel( - ws: WorkspaceClient, - sql_backend: SqlBackend, - inventory_schema, - num_objects, - fixture, - all_permissions, - id_attribute, - object_type, - group_names, -): - fixture_tasks = [] - for _ in range(num_objects): - fixture_tasks.append(partial(fixture)) - - success, failure = Threads.gather(f"Creation of {object_type} ", fixture_tasks) # type: ignore[var-annotated] - logger.warning(f"Had {len(failure)} failures when creating objects") - - acls = [ - iam.AccessControlRequest(group_name=group_name, permission_level=permission) - for permission in all_permissions - for group_name in group_names - ] - ws_object_permissions_tasks = [] - to_persist = [] - for ws_object in success: - ws_object_permissions_tasks.append(partial(set_permissions, ws, ws_object, id_attribute, object_type, acls)) - for object_permission in _generate_object_permissions( - ws_object, id_attribute, object_type, 
all_permissions, group_names - ): - to_persist.append( - Permissions(getattr(ws_object, id_attribute), object_type, json.dumps(object_permission.as_dict())) - ) - sql_backend.save_table(f"{inventory_schema}.permissions", to_persist, Permissions) - success, failure = Threads.gather(f"{object_type} permissions", ws_object_permissions_tasks) - logger.warning(f"Had {len(failure)} failures when applying object permissions") - - -def _generate_object_permissions(ws_object, id_attribute, object_type, all_permissions, group_names): - return [ - iam.ObjectPermissions( - access_control_list=[ - iam.AccessControlResponse( - group_name=group_name, - all_permissions=[iam.Permission(permission_level=permission) for permission in all_permissions], - ) - ], - object_id=getattr(ws_object, id_attribute), - object_type=object_type, - ) - for group_name in group_names - ] diff --git a/tests/unit/source_code/linters/test_files.py b/tests/unit/source_code/linters/test_files.py index 86a2596b18..cade4e976f 100644 --- a/tests/unit/source_code/linters/test_files.py +++ b/tests/unit/source_code/linters/test_files.py @@ -162,7 +162,9 @@ def test_folder_has_repr(): @pytest.mark.skip("Manual testing for troubleshooting") -@pytest.mark.parametrize("path", [(Path(site_packages, "mypy", "build.py"))]) +@pytest.mark.parametrize( + "path", [Path("/Users/eric.vergnaud/development/ucx/.venv/lib/python3.10/site-packages/spacy/pipe_analysis.py")] +) def test_known_issues(path: Path, migration_index): file_loader = FileLoader() folder_loader = FolderLoader(file_loader) @@ -181,4 +183,6 @@ def test_known_issues(path: Path, migration_index): resolver, lambda: LinterContext(migration_index, session_state), ) - linter.lint(MockPrompts({}), path) + advices = linter.lint(MockPrompts({}), path) + for advice in advices: + print(repr(advice)) diff --git a/tests/unit/source_code/linters/test_python_ast.py b/tests/unit/source_code/linters/test_python_ast.py index 47569c7321..16bdf6d34c 100644 --- a/tests/unit/source_code/linters/test_python_ast.py +++ b/tests/unit/source_code/linters/test_python_ast.py @@ -1,5 +1,5 @@ import pytest -from astroid import Assign, AstroidSyntaxError, Attribute, Call, Const, Expr # type: ignore +from astroid import Assign, AstroidSyntaxError, Attribute, Call, Const, Expr, Name # type: ignore from databricks.labs.ucx.source_code.linters.python_ast import Tree from databricks.labs.ucx.source_code.linters.python_infer import InferredValue @@ -147,3 +147,15 @@ def test_appends_statements(): values = list(InferredValue.infer_from_node(tree.node)) strings = list(value.as_string() for value in values) assert strings == ["Hello John!"] + + +def test_is_from_module(): + source = """ +df = spark.read.csv("hi") +df.write.format("delta").saveAsTable("old.things") +""" + tree = Tree.normalize_and_parse(source) + save_call = tree.locate( + Call, [("saveAsTable", Attribute), ("format", Attribute), ("write", Attribute), ("df", Name)] + )[0] + assert Tree(save_call).is_from_module("spark") diff --git a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-cache-table.py b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-cache-table.py index 82c82cb613..5cf6d4912b 100644 --- a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-cache-table.py +++ b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-cache-table.py @@ -19,7 +19,5 @@ spark.catalog.cacheTable(f"boop{stuff}") ## Some trivial references to the method or table in unrelated contexts that 
should not trigger warnigns. -# FIXME: This is a false positive; any method named 'cacheTable' is triggering the warning. -# ucx[table-migrate:+1:0:+1:39] Table old.things is migrated to brand.new.stuff in Unity Catalog something_else.cacheTable("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-create-external-table.py b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-create-external-table.py index 04aef1841d..890982c8c4 100644 --- a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-create-external-table.py +++ b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-create-external-table.py @@ -35,7 +35,5 @@ do_stuff_with(df) ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: This is a false positive; any method named 'createExternalTable' is triggering the warning. - # ucx[table-migrate:+1:4:+1:52] Table old.things is migrated to brand.new.stuff in Unity Catalog something_else.createExternalTable("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-create-table.py b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-create-table.py index 695c9de5a8..07bb7e5972 100644 --- a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-create-table.py +++ b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-create-table.py @@ -35,7 +35,5 @@ do_stuff_with(df) ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: This is a false positive; any method named 'createTable' is triggering the warning. - # ucx[table-migrate:+1:4:+1:44] Table old.things is migrated to brand.new.stuff in Unity Catalog something_else.createTable("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-get-table.py b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-get-table.py index 8d42b21524..f4e7171651 100644 --- a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-get-table.py +++ b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-get-table.py @@ -24,7 +24,5 @@ do_stuff_with(table) ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: This is a false positive; any method named 'getTable' is triggering the warning. - # ucx[table-migrate:+1:4:+1:41] Table old.things is migrated to brand.new.stuff in Unity Catalog something_else.getTable("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-is-cached.py b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-is-cached.py index b4e5f19d9b..83eb627b12 100644 --- a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-is-cached.py +++ b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-is-cached.py @@ -19,7 +19,5 @@ cached_previously = spark.catalog.isCached(f"boop{stuff}") ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: This is a false positive; any method named 'isCached' is triggering the warning. 
- # ucx[table-migrate:+1:4:+1:41] Table old.things is migrated to brand.new.stuff in Unity Catalog something_else.isCached("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-list-columns.py b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-list-columns.py index e0f86e1e16..43d69c2f07 100644 --- a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-list-columns.py +++ b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-list-columns.py @@ -29,7 +29,5 @@ columns = spark.catalog.listColumns(f"boop{stuff}") ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: This is a false positive; any method named 'listColumns' is triggering the warning. - # ucx[table-migrate:+1:4:+1:44] Table old.things is migrated to brand.new.stuff in Unity Catalog something_else.listColumns("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-recover-partitions.py b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-recover-partitions.py index 8312fc974b..08cd982e5e 100644 --- a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-recover-partitions.py +++ b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-recover-partitions.py @@ -19,7 +19,5 @@ spark.catalog.recoverPartitions(f"boop{stuff}") ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: This is a false positive; any method named 'recoverPartitions' is triggering the warning. - # ucx[table-migrate:+1:4:+1:50] Table old.things is migrated to brand.new.stuff in Unity Catalog something_else.recoverPartitions("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-refresh-table.py b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-refresh-table.py index 7d1e043be9..8fabfedd01 100644 --- a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-refresh-table.py +++ b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-refresh-table.py @@ -19,7 +19,5 @@ spark.catalog.refreshTable(f"boop{stuff}") ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: This is a false positive; any method named 'refreshTable' is triggering the warning. - # ucx[table-migrate:+1:4:+1:45] Table old.things is migrated to brand.new.stuff in Unity Catalog something_else.refreshTable("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-table-exists.py b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-table-exists.py index 625f070822..6aee348d91 100644 --- a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-table-exists.py +++ b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-table-exists.py @@ -34,7 +34,5 @@ pass ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: This is a false positive; any method named 'tableExists' is triggering the warning. 
- # ucx[table-migrate:+1:4:+1:44] Table old.things is migrated to brand.new.stuff in Unity Catalog something_else.tableExists("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-uncache-table.py b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-uncache-table.py index 636bd8c942..63cf7a18b9 100644 --- a/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-uncache-table.py +++ b/tests/unit/source_code/samples/functional/pyspark/catalog/spark-catalog-uncache-table.py @@ -19,7 +19,5 @@ spark.catalog.uncacheTable(f"boop{stuff}") ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: This is a false positive; any method named 'uncacheTable' is triggering the warning. - # ucx[table-migrate:+1:4:+1:45] Table old.things is migrated to brand.new.stuff in Unity Catalog something_else.uncacheTable("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/dataframe-write-insert-into.py b/tests/unit/source_code/samples/functional/pyspark/dataframe-write-insert-into.py index c27b91d673..2b8abe77b9 100644 --- a/tests/unit/source_code/samples/functional/pyspark/dataframe-write-insert-into.py +++ b/tests/unit/source_code/samples/functional/pyspark/dataframe-write-insert-into.py @@ -23,7 +23,5 @@ df.write.insertInto(f"boop{stuff}") ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: This is a false positive; any method named 'insertInto' is triggering the warning. - # ucx[table-migrate:+1:4:+1:43] Table old.things is migrated to brand.new.stuff in Unity Catalog something_else.insertInto("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/dataframe-write-save-as-table.py b/tests/unit/source_code/samples/functional/pyspark/dataframe-write-save-as-table.py index 5b0017c5ef..0830629a96 100644 --- a/tests/unit/source_code/samples/functional/pyspark/dataframe-write-save-as-table.py +++ b/tests/unit/source_code/samples/functional/pyspark/dataframe-write-save-as-table.py @@ -23,8 +23,5 @@ df.write.format("delta").saveAsTable(f"boop{stuff}") ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: This are false positives; any method named 'saveAsTable' is triggering the warnings. - # ucx[table-migrate:+2:4:+2:44] Table old.things is migrated to brand.new.stuff in Unity Catalog - # ucx[table-migrate:+1:4:+1:44] The default format changed in Databricks Runtime 8.0, from Parquet to Delta something_else.saveAsTable("old.things") a_function("old.things") diff --git a/tests/unit/source_code/samples/functional/pyspark/spark-table.py b/tests/unit/source_code/samples/functional/pyspark/spark-table.py index 461569410e..d82cb728c2 100644 --- a/tests/unit/source_code/samples/functional/pyspark/spark-table.py +++ b/tests/unit/source_code/samples/functional/pyspark/spark-table.py @@ -31,9 +31,6 @@ df = spark.table(f"boop{stuff}") do_stuff_with(df) - ## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns. - # FIXME: These are false positives; any method named 'table' is triggering the warnings. 
- # ucx[table-migrate:+2:4:+2:38] Table old.things is migrated to brand.new.stuff in Unity Catalog - # ucx[table-migrate:+1:4:+1:38] The default format changed in Databricks Runtime 8.0, from Parquet to Delta + ## Some trivial references to the method or table in unrelated contexts that should not trigger warnings. something_else.table("old.things") a_function("old.things")
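
Note on the new `test_is_from_module` unit test added above: it exercises the UCX `Tree.is_from_module` helper (imported from `databricks.labs.ucx.source_code.linters.python_ast`), which decides whether a call chain such as `df.write.format("delta").saveAsTable(...)` ultimately originates from `spark`, even when the receiver is an intermediate variable. The sketch below is a simplified, hypothetical illustration using plain astroid, not the actual UCX implementation; the helper names `root_name`, `assigned_value`, and `comes_from` are invented for this example.

import astroid
from astroid import nodes

SOURCE = """
df = spark.read.csv("hi")
df.write.format("delta").saveAsTable("old.things")
something_else.saveAsTable("old.things")
"""


def root_name(node: nodes.NodeNG | None) -> str | None:
    """Walk a call chain (a.b.c(...).d(...)) down to its leftmost Name, if any."""
    while isinstance(node, (nodes.Call, nodes.Attribute)):
        node = node.func if isinstance(node, nodes.Call) else node.expr
    return node.name if isinstance(node, nodes.Name) else None


def assigned_value(module: nodes.Module, name: str) -> nodes.NodeNG | None:
    """Naively return the value of the last top-level assignment to `name`, if any."""
    value = None
    for assign in module.nodes_of_class(nodes.Assign):
        for target in assign.targets:
            if isinstance(target, nodes.AssignName) and target.name == name:
                value = assign.value
    return value


def comes_from(module: nodes.Module, call: nodes.Call, origin: str) -> bool:
    """True if the call's receiver chain can be traced back to `origin` (e.g. 'spark')."""
    name, seen = root_name(call), set()
    while name is not None and name not in seen:
        if name == origin:
            return True
        seen.add(name)
        name = root_name(assigned_value(module, name))
    return False


module = astroid.parse(SOURCE)
for call in module.nodes_of_class(nodes.Call):
    func = call.func
    if isinstance(func, nodes.Attribute) and func.attrname == "saveAsTable":
        # Prints True for the df.write... chain (df is assigned from spark.read.csv)
        # and False for something_else.saveAsTable(...).
        print(call.as_string(), "->", comes_from(module, call, "spark"))

In the real code path this kind of resolution is presumably what `Tree.locate` and `Tree.is_from_module` provide, allowing the linter to stop flagging unrelated objects that merely define methods named `saveAsTable`, `cacheTable`, and so on, which is what the FIXME comments removed from the functional sample files describe.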