Skip to content

Commit 9e6d6f9

Browse files
qziyuan and moore247
authored and committed
Add migrate-tables workflow (#1051)
1 parent e5c170c commit 9e6d6f9

File tree

8 files changed

+467
-57
lines changed

8 files changed

+467
-57
lines changed

src/databricks/labs/ucx/config.py

+2
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ class WorkspaceConfig: # pylint: disable=too-many-instance-attributes
3232
workspace_start_path: str = "/"
3333
instance_profile: str | None = None
3434
spark_conf: dict[str, str] | None = None
35+
min_workers: int | None = 1
36+
max_workers: int | None = 10
3537

3638
override_clusters: dict[str, str] | None = None
3739
policy_id: str | None = None

src/databricks/labs/ucx/framework/tasks.py

+29-14
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ class Task:
2828
workflow: str
2929
name: str
3030
doc: str
31-
fn: Callable[[WorkspaceConfig, WorkspaceClient, SqlBackend], None]
31+
fn: Callable[[WorkspaceConfig, WorkspaceClient, SqlBackend, Installation], None]
3232
depends_on: list[str] | None = None
3333
job_cluster: str = "main"
3434
notebook: str | None = None
@@ -214,33 +214,36 @@ def _create_lock(lockfile_name):
214214
return f
215215

216216

217-
def trigger(*argv):
217+
def parse_args(*argv) -> dict[str, str]:
218218
args = dict(a[2:].split("=") for a in argv if a[0:2] == "--")
219219
if "config" not in args:
220220
msg = "no --config specified"
221221
raise KeyError(msg)
222+
return args
223+
222224

225+
def run_task(
226+
args: dict[str, str],
227+
install_dir: Path,
228+
cfg: WorkspaceConfig,
229+
workspace_client: WorkspaceClient,
230+
sql_backend: RuntimeBackend,
231+
installation: Installation,
232+
):
223233
task_name = args.get("task", "not specified")
224-
# `{{parent_run_id}}` is the run of entire workflow, whereas `{{run_id}}` is the run of a task
225-
workflow_run_id = args.get("parent_run_id", "unknown_run_id")
226-
job_id = args.get("job_id")
227234
if task_name not in _TASKS:
228235
msg = f'task "{task_name}" not found. Valid tasks are: {", ".join(_TASKS.keys())}'
229236
raise KeyError(msg)
230-
231237
print(f"UCX v{__version__}")
232-
233238
current_task = _TASKS[task_name]
234239
print(current_task.doc)
235240

236-
config_path = Path(args["config"])
237-
238-
cfg = Installation.load_local(WorkspaceConfig, config_path)
239-
sql_backend = RuntimeBackend(debug_truncate_bytes=cfg.connect.debug_truncate_bytes)
240-
workspace_client = WorkspaceClient(config=cfg.connect, product='ucx', product_version=__version__)
241+
# `{{parent_run_id}}` is the run of entire workflow, whereas `{{run_id}}` is the run of a task
242+
workflow_run_id = args.get("parent_run_id", "unknown_run_id")
243+
job_id = args.get("job_id", "unknown_job_id")
241244

242245
with TaskLogger(
243-
config_path.parent,
246+
install_dir,
244247
workflow=current_task.workflow,
245248
workflow_id=job_id,
246249
task_name=task_name,
@@ -249,4 +252,16 @@ def trigger(*argv):
249252
) as task_logger:
250253
ucx_logger = logging.getLogger("databricks.labs.ucx")
251254
ucx_logger.info(f"UCX v{__version__} After job finishes, see debug logs at {task_logger}")
252-
current_task.fn(cfg, workspace_client, sql_backend)
255+
current_task.fn(cfg, workspace_client, sql_backend, installation)
256+
257+
258+
def trigger(*argv):
259+
args = parse_args(*argv)
260+
config_path = Path(args["config"])
261+
262+
cfg = Installation.load_local(WorkspaceConfig, config_path)
263+
sql_backend = RuntimeBackend(debug_truncate_bytes=cfg.connect.debug_truncate_bytes)
264+
workspace_client = WorkspaceClient(config=cfg.connect, product='ucx', product_version=__version__)
265+
installation = Installation.current(workspace_client, "ucx")
266+
267+
run_task(args, config_path.parent, cfg, workspace_client, sql_backend, installation)

src/databricks/labs/ucx/install.py

+61-18
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,26 @@ def _configure_new_installation(self) -> WorkspaceConfig:
278278

279279
policy_id, instance_profile, spark_conf_dict = self._policy_installer.create(inventory_database)
280280

281+
# Save configurable spark_conf for table migration cluster
282+
# parallelism will not be needed if backlog is fixed in https://databricks.atlassian.net/browse/ES-975874
283+
parallelism = self._prompts.question(
284+
"Parallelism for migrating dbfs root delta tables with deep clone", default="200", valid_number=True
285+
)
286+
if not spark_conf_dict:
287+
spark_conf_dict = {}
288+
spark_conf_dict.update({'spark.sql.sources.parallelPartitionDiscovery.parallelism': parallelism})
289+
# min/max workers for auto-scale migration job cluster
290+
min_workers = int(
291+
self._prompts.question(
292+
"Min workers for auto-scale job cluster for table migration", default="1", valid_number=True
293+
)
294+
)
295+
max_workers = int(
296+
self._prompts.question(
297+
"Max workers for auto-scale job cluster for table migration", default="10", valid_number=True
298+
)
299+
)
300+
281301
# Check if terraform is being used
282302
is_terraform_used = self._prompts.confirm("Do you use Terraform to deploy your infrastructure?")
283303

@@ -294,6 +314,8 @@ def _configure_new_installation(self) -> WorkspaceConfig:
294314
num_threads=num_threads,
295315
instance_profile=instance_profile,
296316
spark_conf=spark_conf_dict,
317+
min_workers=min_workers,
318+
max_workers=max_workers,
297319
policy_id=policy_id,
298320
is_terraform_used=is_terraform_used,
299321
include_databases=self._select_databases(),
@@ -798,38 +820,59 @@ def _job_wheel_task(self, jobs_task: jobs.Task, task: Task, remote_wheel: str) -
798820
),
799821
)
800822

823+
def _job_cluster_spark_conf(self, cluster_key: str):
824+
conf_from_installation = self._config.spark_conf if self._config.spark_conf else {}
825+
if cluster_key == "main":
826+
spark_conf = {
827+
"spark.databricks.cluster.profile": "singleNode",
828+
"spark.master": "local[*]",
829+
}
830+
return spark_conf | conf_from_installation
831+
if cluster_key == "tacl":
832+
return {"spark.databricks.acl.sqlOnly": "true"} | conf_from_installation
833+
if cluster_key == "table_migration":
834+
return {"spark.sql.sources.parallelPartitionDiscovery.parallelism": "200"} | conf_from_installation
835+
return conf_from_installation
836+
801837
def _job_clusters(self, names: set[str]):
802838
clusters = []
803-
spark_conf = {
804-
"spark.databricks.cluster.profile": "singleNode",
805-
"spark.master": "local[*]",
806-
}
807-
if self._config.spark_conf is not None:
808-
spark_conf = spark_conf | self._config.spark_conf
809-
spec = compute.ClusterSpec(
810-
data_security_mode=compute.DataSecurityMode.LEGACY_SINGLE_USER,
811-
spark_conf=spark_conf,
812-
custom_tags={"ResourceClass": "SingleNode"},
813-
num_workers=0,
814-
policy_id=self.config.policy_id,
815-
)
816839
if "main" in names:
817840
clusters.append(
818841
jobs.JobCluster(
819842
job_cluster_key="main",
820-
new_cluster=spec,
843+
new_cluster=compute.ClusterSpec(
844+
data_security_mode=compute.DataSecurityMode.LEGACY_SINGLE_USER,
845+
spark_conf=self._job_cluster_spark_conf("main"),
846+
custom_tags={"ResourceClass": "SingleNode"},
847+
num_workers=0,
848+
policy_id=self.config.policy_id,
849+
),
821850
)
822851
)
823852
if "tacl" in names:
824853
clusters.append(
825854
jobs.JobCluster(
826855
job_cluster_key="tacl",
827-
new_cluster=replace(
828-
spec,
856+
new_cluster=compute.ClusterSpec(
829857
data_security_mode=compute.DataSecurityMode.LEGACY_TABLE_ACL,
830-
spark_conf={"spark.databricks.acl.sqlOnly": "true"},
858+
spark_conf=self._job_cluster_spark_conf("tacl"),
831859
num_workers=1, # ShowPermissionsCommand needs a worker
832-
custom_tags={},
860+
policy_id=self.config.policy_id,
861+
),
862+
)
863+
)
864+
if "table_migration" in names:
865+
clusters.append(
866+
jobs.JobCluster(
867+
job_cluster_key="table_migration",
868+
new_cluster=compute.ClusterSpec(
869+
data_security_mode=compute.DataSecurityMode.SINGLE_USER,
870+
spark_conf=self._job_cluster_spark_conf("table_migration"),
871+
policy_id=self.config.policy_id,
872+
autoscale=compute.AutoScale(
873+
max_workers=self.config.max_workers,
874+
min_workers=self.config.min_workers,
875+
),
833876
),
834877
)
835878
)

0 commit comments

Comments
 (0)