
Commit 77d2564

qziyuanjincejames authored and committed
Add workflow for in-place migrating external Parquet, Orc, Avro hiveserde tables (databrickslabs#1412)
## Changes

1. Add a `MigrateHiveSerdeTablesInPlace` workflow to upgrade external Parquet, Orc, and Avro hiveserde tables in place.
2. Add functions in `tables.py` to describe the table and extract the hiveserde details, and to update the DDL from `show create table` by replacing the old table name with the migration target and the DBFS mount table location, if any. The new DDL is then used to create the new table in UC for the in-place migration (a minimal sketch of this rewrite follows below).
3. Add a `_migrate_external_table_hiveserde` function in `table_migrate.py`, and two new arguments, `mounts` and `hiveserde_in_place_migrate`, to the `TablesMigrator` class: `mounts` is used to replace the DBFS mount table location, if any, and `hiveserde_in_place_migrate` controls which hiveserde type is migrated in the current run, so multiple tasks can run in parallel, each migrating one type of hiveserde.

This PR also removes the majority of the code from PR databrickslabs#1432, because only a subset of table formats can be migrated in place to UC with DDL from `show create table`; simply creating a table with the updated DDL for all `What.EXTERNAL_NO_SYNC` tables would fail.

### Linked issues

Closes databrickslabs#889

### Functionality

- [ ] added relevant user documentation
- [ ] added new CLI command
- [ ] modified existing command: `databricks labs ucx ...`
- [ ] added a new workflow
- [ ] modified existing workflow: `...`
- [ ] added a new table
- [ ] modified existing table: `...`

### Tests

- [x] manually tested
- [x] added unit tests
- [x] added integration tests
- [ ] verified on staging environment (screenshot attached)
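The heart of change 2 is a sqlglot parse-and-replace over the `SHOW CREATE TABLE` output: swap the source table name for the UC target, and swap a `dbfs:/mnt/...` location for its resolved cloud path. A minimal standalone sketch of that rewrite (the table names, mount path, and resolved cloud path below are hypothetical, not taken from this PR):

```python
import sqlglot
from sqlglot import expressions
from sqlglot.expressions import LocationProperty

# hypothetical `SHOW CREATE TABLE` output for an external Parquet table on a DBFS mount
ddl = (
    "CREATE TABLE hive_metastore.sales.transactions (id INT, amount DOUBLE) "
    "USING PARQUET LOCATION 'dbfs:/mnt/landing/transactions'"
)

statement = sqlglot.parse(ddl, read='databricks')[0]

# replace the old table name with the migration target in UC
src_table = statement.find(expressions.Table)
src_table.replace(expressions.Table(catalog="main", db="sales", this="transactions"))

# replace the dbfs mount location with the resolved cloud storage path
mnt_loc = statement.find(LocationProperty)
mnt_loc.replace(LocationProperty(this="'abfss://landing@storage.dfs.core.windows.net/transactions'"))

# prints the rewritten DDL, now targeting main.sales.transactions at the abfss:// path
print(statement.sql('databricks'))
```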
1 parent 58e38b0 commit 77d2564

File tree

13 files changed: +701 / -115 lines


Diff for: src/databricks/labs/ucx/hive_metastore/table_migrate.py (+88 / -42)

```diff
@@ -4,15 +4,14 @@
 from collections.abc import Iterable
 from functools import partial
 
-import sqlglot
-from sqlglot import expressions
 from databricks.labs.blueprint.parallel import Threads
 from databricks.labs.lsql.backends import SqlBackend
 from databricks.sdk import WorkspaceClient
 
 from databricks.labs.ucx.framework.utils import escape_sql_identifier
-from databricks.labs.ucx.hive_metastore import TablesCrawler
+from databricks.labs.ucx.hive_metastore import TablesCrawler, Mounts
 from databricks.labs.ucx.hive_metastore.grants import Grant, GrantsCrawler, PrincipalACL
+from databricks.labs.ucx.hive_metastore.locations import Mount, ExternalLocations
 from databricks.labs.ucx.hive_metastore.mapping import (
     Rule,
     TableMapping,
@@ -24,6 +23,7 @@
     MigrationCount,
     Table,
     What,
+    HiveSerdeType,
 )
 from databricks.labs.ucx.hive_metastore.view_migrate import (
     ViewsMigrationSequencer,
@@ -60,22 +60,45 @@ def index(self):
         # TODO: remove this method
         return self._migration_status_refresher.index()
 
-    def migrate_tables(self, what: What, acl_strategy: list[AclMigrationWhat] | None = None):
+    def migrate_tables(
+        self,
+        what: What,
+        acl_strategy: list[AclMigrationWhat] | None = None,
+        mounts_crawler: Mounts | None = None,
+        hiveserde_in_place_migrate: bool = False,
+    ):
         if what in [What.DB_DATASET, What.UNKNOWN]:
             logger.error(f"Can't migrate tables with type {what.name}")
             return None
         all_grants_to_migrate = None if acl_strategy is None else self._gc.snapshot()
         all_migrated_groups = None if acl_strategy is None else self._group.snapshot()
         all_principal_grants = None if acl_strategy is None else self._principal_grants.get_interactive_cluster_grants()
         self._init_seen_tables()
+        # mounts will be used to replace the mnt-based table location in the DDL for hiveserde table in-place migration
+        mounts: list[Mount] = []
+        if mounts_crawler:
+            mounts = list(mounts_crawler.snapshot())
         if what == What.VIEW:
             return self._migrate_views(acl_strategy, all_grants_to_migrate, all_migrated_groups, all_principal_grants)
         return self._migrate_tables(
-            what, acl_strategy, all_grants_to_migrate, all_migrated_groups, all_principal_grants
+            what,
+            acl_strategy,
+            all_grants_to_migrate,
+            all_migrated_groups,
+            all_principal_grants,
+            mounts,
+            hiveserde_in_place_migrate,
         )
 
     def _migrate_tables(
-        self, what: What, acl_strategy, all_grants_to_migrate, all_migrated_groups, all_principal_grants
+        self,
+        what: What,
+        acl_strategy,
+        all_grants_to_migrate,
+        all_migrated_groups,
+        all_principal_grants,
+        mounts: list[Mount],
+        hiveserde_in_place_migrate: bool = False,
     ):
         tables_to_migrate = self._tm.get_tables_to_migrate(self._tc)
         tables_in_scope = filter(lambda t: t.src.what == what, tables_to_migrate)
@@ -84,14 +107,10 @@ def _migrate_tables(
             grants = self._compute_grants(
                 table.src, acl_strategy, all_grants_to_migrate, all_migrated_groups, all_principal_grants
             )
-            tasks.append(
-                partial(
-                    self._migrate_table,
-                    table,
-                    grants,
-                )
-            )
+            tasks.append(partial(self._migrate_table, table, grants, mounts, hiveserde_in_place_migrate))
         Threads.strict("migrate tables", tasks)
+        if not tasks:
+            logger.info(f"No tables found to migrate with type {what.name}")
         # the below is useful for testing
         return tasks
 
@@ -134,7 +153,9 @@ def _compute_grants(
     def _migrate_table(
         self,
         src_table: TableToMigrate,
-        grants: list[Grant] | None = None,
+        grants: list[Grant],
+        mounts: list[Mount],
+        hiveserde_in_place_migrate: bool = False,
     ):
         if self._table_already_migrated(src_table.rule.as_uc_table_key):
             logger.info(f"Table {src_table.src.key} already migrated to {src_table.rule.as_uc_table_key}")
@@ -145,8 +166,10 @@ def _migrate_table(
             return self._migrate_table_create_ctas(src_table.src, src_table.rule, grants)
         if src_table.src.what == What.EXTERNAL_SYNC:
             return self._migrate_external_table(src_table.src, src_table.rule, grants)
-        if src_table.src.what == What.EXTERNAL_NO_SYNC:
-            return self._migrate_non_sync_table(src_table.src, src_table.rule, grants)
+        if src_table.src.what == What.EXTERNAL_HIVESERDE:
+            return self._migrate_external_table_hiveserde(
+                src_table.src, src_table.rule, grants, mounts, hiveserde_in_place_migrate
+            )
         logger.info(f"Table {src_table.src.key} is not supported for migration")
         return True
 
@@ -201,18 +224,60 @@ def _migrate_external_table(self, src_table: Table, rule: Rule, grants: list[Gra
         self._backend.execute(src_table.sql_alter_from(rule.as_uc_table_key, self._ws.get_workspace_id()))
         return self._migrate_acl(src_table, rule, grants)
 
-    def _migrate_dbfs_root_table(self, src_table: Table, rule: Rule, grants: list[Grant] | None = None):
-        target_table_key = rule.as_uc_table_key
-        table_migrate_sql = src_table.sql_migrate_dbfs(target_table_key)
-        logger.debug(f"Migrating managed table {src_table.key} to using SQL query: {table_migrate_sql}")
+    def _migrate_external_table_hiveserde(
+        self,
+        src_table: Table,
+        rule: Rule,
+        grants: list[Grant],
+        mounts: list[Mount],
+        hiveserde_in_place_migrate: bool = False,
+    ):
+        # The hiveserde_in_place_migrate flag determines whether the current migration should use in-place migration or CTAS.
+        # We will provide two workflows for hiveserde table migration:
+        # 1. One will migrate all hiveserde tables using CTAS, which we officially support.
+        # 2. The other one will migrate certain types of hiveserde in place; this technically works, but the user
+        #    needs to accept the risk that the old files created by hiveserde may not be processed correctly by the
+        #    Spark datasource in corner cases.
+        # The user will need to decide which workflow to run first; it will migrate the hiveserde tables and mark the
+        # `upgraded_to` property, and hence those tables will be skipped in later migration workflow runs.
+        if not hiveserde_in_place_migrate:
+            # TODO: Add sql_migrate_external_hiveserde_ctas here
+            return False
+
+        # verify hive serde type
+        hiveserde_type = src_table.hiveserde_type(self._backend)
+        if hiveserde_type in [
+            HiveSerdeType.NOT_HIVESERDE,
+            HiveSerdeType.OTHER_HIVESERDE,
+            HiveSerdeType.INVALID_HIVESERDE_INFO,
+        ]:
+            logger.warning(f"{src_table.key} table can only be migrated using CTAS.")
+            return False
+
+        # if the src table location uses a mount, resolve the mount location so it can be used in the updated DDL
+        dst_table_location = None
+        if mounts and src_table.is_dbfs_mnt:
+            dst_table_location = ExternalLocations.resolve_mount(src_table.location, mounts)
+
+        table_migrate_sql = src_table.sql_migrate_external_hiveserde_in_place(
+            rule.catalog_name, rule.dst_schema, rule.dst_table, self._backend, hiveserde_type, dst_table_location
+        )
+        if not table_migrate_sql:
+            logger.error(
+                f"Failed to generate in-place migration DDL for {src_table.key}, skip the in-place migration. It can be migrated in CTAS workflow"
+            )
+            return False
+
+        logger.debug(f"Migrating external table {src_table.key} to using SQL query: {table_migrate_sql}")
         self._backend.execute(table_migrate_sql)
         self._backend.execute(src_table.sql_alter_to(rule.as_uc_table_key))
         self._backend.execute(src_table.sql_alter_from(rule.as_uc_table_key, self._ws.get_workspace_id()))
         return self._migrate_acl(src_table, rule, grants)
 
-    def _migrate_non_sync_table(self, src_table: Table, rule: Rule, grants: list[Grant] | None = None):
-        table_migrate_sql = self._get_create_in_place_sql(src_table, rule)
-        logger.debug(f"Migrating table (No Sync) {src_table.key} to using SQL query: {table_migrate_sql}")
+    def _migrate_dbfs_root_table(self, src_table: Table, rule: Rule, grants: list[Grant] | None = None):
+        target_table_key = rule.as_uc_table_key
+        table_migrate_sql = src_table.sql_migrate_dbfs(target_table_key)
+        logger.debug(f"Migrating managed table {src_table.key} to using SQL query: {table_migrate_sql}")
         self._backend.execute(table_migrate_sql)
         self._backend.execute(src_table.sql_alter_to(rule.as_uc_table_key))
         self._backend.execute(src_table.sql_alter_from(rule.as_uc_table_key, self._ws.get_workspace_id()))
@@ -226,25 +291,6 @@ def _migrate_table_create_ctas(self, src_table: Table, rule: Rule, grants: list[
         self._backend.execute(src_table.sql_alter_from(rule.as_uc_table_key, self._ws.get_workspace_id()))
         return self._migrate_acl(src_table, rule, grants)
 
-    def _get_create_in_place_sql(self, src_table: Table, rule: Rule) -> str:
-        create_sql = str(next(self._backend.fetch(src_table.sql_show_create()))["createtab_stmt"])
-        statements = sqlglot.parse(create_sql, read='databricks')
-        assert len(statements) == 1, 'Expected a single statement'
-        create = statements[0]
-        assert isinstance(create, expressions.Create), 'Expected a CREATE statement'
-        # safely replace current table name with the updated catalog
-        for table_name in create.find_all(expressions.Table):
-            if table_name.db == src_table.database and table_name.name == src_table.name:
-                new_table_name = expressions.Table(
-                    catalog=rule.catalog_name,
-                    db=rule.dst_schema,
-                    this=rule.dst_table,
-                )
-                table_name.replace(new_table_name)
-        # safely replace CREATE with CREATE IF NOT EXISTS
-        create.args['exists'] = True
-        return create.sql('databricks')
-
     def _get_create_ctas_sql(self, src_table: Table, rule: Rule) -> str:
         create_sql = (
             f"CREATE TABLE IF NOT EXISTS {escape_sql_identifier(rule.as_uc_table_key)} "
```

Diff for: src/databricks/labs/ucx/hive_metastore/tables.py (+126)

```diff
@@ -6,6 +6,11 @@
 from enum import Enum, auto
 from functools import partial
 
+import sqlglot
+from sqlglot import expressions
+from sqlglot.expressions import LocationProperty
+from sqlglot.errors import ParseError
+
 from databricks.labs.blueprint.parallel import Threads
 from databricks.labs.lsql.backends import SqlBackend
 from databricks.sdk.errors import NotFound
@@ -18,6 +23,7 @@
 
 class What(Enum):
     EXTERNAL_SYNC = auto()
+    EXTERNAL_HIVESERDE = auto()
     EXTERNAL_NO_SYNC = auto()
     DBFS_ROOT_DELTA = auto()
     DBFS_ROOT_NON_DELTA = auto()
@@ -26,6 +32,15 @@ class What(Enum):
     UNKNOWN = auto()
 
 
+class HiveSerdeType(Enum):
+    PARQUET = auto()
+    AVRO = auto()
+    ORC = auto()
+    OTHER_HIVESERDE = auto()
+    NOT_HIVESERDE = auto()
+    INVALID_HIVESERDE_INFO = auto()
+
+
 class AclMigrationWhat(Enum):
     LEGACY_TACL = auto()
     PRINCIPAL = auto()
@@ -117,6 +132,15 @@ def is_dbfs_root(self) -> bool:
                 return True
         return False
 
+    @property
+    def is_dbfs_mnt(self) -> bool:
+        if not self.location:
+            return False
+        for dbfs_mnt_prefix in self.DBFS_ROOT_PREFIX_EXCEPTIONS:
+            if self.location.startswith(dbfs_mnt_prefix):
+                return True
+        return False
+
     @property
     def is_format_supported_for_sync(self) -> bool:
         if self.table_format is None:
@@ -150,6 +174,8 @@ def what(self) -> What:
             return What.DBFS_ROOT_NON_DELTA
         if self.kind == "TABLE" and self.is_format_supported_for_sync:
             return What.EXTERNAL_SYNC
+        if self.kind == "TABLE" and self.table_format.upper() == "HIVE":
+            return What.EXTERNAL_HIVESERDE
         if self.kind == "TABLE":
             return What.EXTERNAL_NO_SYNC
         if self.kind == "VIEW":
@@ -167,6 +193,106 @@ def sql_migrate_create_like(self, target_table_key):
             f"{escape_sql_identifier(self.key)};"
         )
 
+    def hiveserde_type(self, backend: SqlBackend) -> HiveSerdeType:
+        if self.table_format != "HIVE":
+            return HiveSerdeType.NOT_HIVESERDE
+        # Extract hive serde info; ideally this should be done by the table crawler,
+        # but it is done here to avoid a breaking change to the `tables` table in the inventory schema.
+        describe = {}
+        for key, values, _ in backend.fetch(f"DESCRIBE TABLE EXTENDED {escape_sql_identifier(self.key)}"):
+            describe[key] = values
+        if not {"Serde Library", "InputFormat", "OutputFormat"} <= describe.keys():
+            return HiveSerdeType.INVALID_HIVESERDE_INFO
+        serde = describe["Serde Library"]
+        input_format = describe["InputFormat"]
+        output_format = describe["OutputFormat"]
+        if self._if_parquet_serde(serde, input_format, output_format):
+            return HiveSerdeType.PARQUET
+        if self._if_avro_serde(serde, input_format, output_format):
+            return HiveSerdeType.AVRO
+        if self._if_orc_serde(serde, input_format, output_format):
+            return HiveSerdeType.ORC
+        return HiveSerdeType.OTHER_HIVESERDE
+
+    def _if_parquet_serde(self, serde, input_format, output_format) -> bool:
+        return (
+            serde == "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"
+            and input_format == "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat"
+            and output_format == "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat"
+        )
+
+    def _if_avro_serde(self, serde, input_format, output_format) -> bool:
+        return (
+            serde == "org.apache.hadoop.hive.serde2.avro.AvroSerDe"
+            and input_format == "org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat"
+            and output_format == "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat"
+        )
+
+    def _if_orc_serde(self, serde, input_format, output_format) -> bool:
+        return (
+            serde == "org.apache.hadoop.hive.ql.io.orc.OrcSerde"
+            and input_format == "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat"
+            and output_format == "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat"
+        )
+
+    def sql_migrate_external_hiveserde_in_place(
+        self,
+        catalog_name,
+        dst_schema,
+        dst_table,
+        backend: SqlBackend,
+        hiveserde_type: HiveSerdeType,
+        replace_table_location: str | None = None,
+    ) -> str | None:
+        # "PARQUET", "AVRO", "ORC" can be migrated with the "SHOW CREATE TABLE..." DDL
+        if hiveserde_type in [HiveSerdeType.PARQUET, HiveSerdeType.AVRO, HiveSerdeType.ORC]:
+            return self._ddl_show_create_table(backend, catalog_name, dst_schema, dst_table, replace_table_location)
+
+        # "TEXTFILE" hiveserde needs extra handling when preparing the DDL
+        # TODO: add support for "TEXTFILE" hiveserde, when the data can be parsed as Spark CSV datasource
+
+        # "JSON", "CSV" hiveserde need extra handling when preparing the DDL
+        # TODO: DBR does not bundle the jars for "JSON", "CSV" hiveserde, so it's unlikely we will see those tables.
+        # Although it's possible that users have the jars installed as a cluster library and use those tables in
+        # Databricks, we hold off the implementation for now until we see a real use case.
+        return None
+
+    def _ddl_show_create_table(
+        self, backend: SqlBackend, catalog_name, dst_schema, dst_table, replace_location
+    ) -> str | None:
+        # get raw DDL from "SHOW CREATE TABLE..."
+        createtab_stmt = next(backend.fetch(self.sql_show_create()))["createtab_stmt"]
+        # parse the DDL and replace the old table name with the new UC table name
+        try:
+            statements = sqlglot.parse(createtab_stmt)
+        except (ValueError, ParseError):
+            logger.exception(f"Exception when parsing 'SHOW CREATE TABLE' DDL for {self.key}")
+            return None
+
+        statement = statements[0]
+        if not statement:
+            logger.error(f"sqlglot parsed none statement from 'SHOW CREATE TABLE' DDL for {self.key}")
+            return None
+
+        src_table = statement.find(expressions.Table)
+        if not src_table:
+            logger.error(f"sqlglot failed to extract table object from parsed DDL for {self.key}")
+            return None
+        new_table = expressions.Table(catalog=catalog_name, db=dst_schema, this=dst_table)
+        src_table.replace(new_table)
+
+        if replace_location:
+            # replace dbfs mnt in ddl if any
+            mnt_loc = statement.find(LocationProperty)
+            if not mnt_loc:
+                logger.error(f"sqlglot failed to extract table location object from parsed DDL for {self.key}")
+                return None
+            new_loc = LocationProperty(this=f"'{replace_location}'")
+            mnt_loc.replace(new_loc)
+
+        new_sql = statement.sql('databricks')
+        return new_sql
+
     def sql_migrate_dbfs(self, target_table_key):
         if not self.is_delta:
             msg = f"{self.key} is not DELTA: {self.table_format}"
```
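To see how the new classification fits together, here is a small sketch exercising `what` and `is_dbfs_mnt` on a hand-built `Table`. It assumes the dataclass fields shown (`catalog`, `database`, `name`, `object_type`, `table_format`, `location`) and that `dbfs:/mnt/` is among the `DBFS_ROOT_PREFIX_EXCEPTIONS` prefixes; the table itself is hypothetical:

```python
from databricks.labs.ucx.hive_metastore.tables import Table, What

# hypothetical external hiveserde table whose data lives under a DBFS mount
tbl = Table(
    catalog="hive_metastore",
    database="sales",
    name="transactions",
    object_type="EXTERNAL",
    table_format="HIVE",
    location="dbfs:/mnt/landing/transactions",
)

assert tbl.what == What.EXTERNAL_HIVESERDE  # table_format == "HIVE" routes to the new branch
assert tbl.is_dbfs_mnt  # mount-backed location, so the DDL LOCATION clause gets rewritten
```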
