4
4
from collections .abc import Iterable
5
5
from functools import partial
6
6
7
- import sqlglot
8
- from sqlglot import expressions
9
7
from databricks .labs .blueprint .parallel import Threads
10
8
from databricks .labs .lsql .backends import SqlBackend
11
9
from databricks .sdk import WorkspaceClient
12
10
13
11
from databricks .labs .ucx .framework .utils import escape_sql_identifier
14
- from databricks .labs .ucx .hive_metastore import TablesCrawler
12
+ from databricks .labs .ucx .hive_metastore import TablesCrawler , Mounts
15
13
from databricks .labs .ucx .hive_metastore .grants import Grant , GrantsCrawler , PrincipalACL
14
+ from databricks .labs .ucx .hive_metastore .locations import Mount , ExternalLocations
16
15
from databricks .labs .ucx .hive_metastore .mapping import (
17
16
Rule ,
18
17
TableMapping ,
24
23
MigrationCount ,
25
24
Table ,
26
25
What ,
26
+ HiveSerdeType ,
27
27
)
28
28
from databricks .labs .ucx .hive_metastore .view_migrate import (
29
29
ViewsMigrationSequencer ,
@@ -60,22 +60,45 @@ def index(self):
60
60
# TODO: remove this method
61
61
return self ._migration_status_refresher .index ()
62
62
63
- def migrate_tables (self , what : What , acl_strategy : list [AclMigrationWhat ] | None = None ):
63
+ def migrate_tables (
64
+ self ,
65
+ what : What ,
66
+ acl_strategy : list [AclMigrationWhat ] | None = None ,
67
+ mounts_crawler : Mounts | None = None ,
68
+ hiveserde_in_place_migrate : bool = False ,
69
+ ):
64
70
if what in [What .DB_DATASET , What .UNKNOWN ]:
65
71
logger .error (f"Can't migrate tables with type { what .name } " )
66
72
return None
67
73
all_grants_to_migrate = None if acl_strategy is None else self ._gc .snapshot ()
68
74
all_migrated_groups = None if acl_strategy is None else self ._group .snapshot ()
69
75
all_principal_grants = None if acl_strategy is None else self ._principal_grants .get_interactive_cluster_grants ()
70
76
self ._init_seen_tables ()
77
+ # mounts will be used to replace the mnt based table location in the DDL for hiveserde table in-place migration
78
+ mounts : list [Mount ] = []
79
+ if mounts_crawler :
80
+ mounts = list (mounts_crawler .snapshot ())
71
81
if what == What .VIEW :
72
82
return self ._migrate_views (acl_strategy , all_grants_to_migrate , all_migrated_groups , all_principal_grants )
73
83
return self ._migrate_tables (
74
- what , acl_strategy , all_grants_to_migrate , all_migrated_groups , all_principal_grants
84
+ what ,
85
+ acl_strategy ,
86
+ all_grants_to_migrate ,
87
+ all_migrated_groups ,
88
+ all_principal_grants ,
89
+ mounts ,
90
+ hiveserde_in_place_migrate ,
75
91
)
76
92
77
93
def _migrate_tables (
78
- self , what : What , acl_strategy , all_grants_to_migrate , all_migrated_groups , all_principal_grants
94
+ self ,
95
+ what : What ,
96
+ acl_strategy ,
97
+ all_grants_to_migrate ,
98
+ all_migrated_groups ,
99
+ all_principal_grants ,
100
+ mounts : list [Mount ],
101
+ hiveserde_in_place_migrate : bool = False ,
79
102
):
80
103
tables_to_migrate = self ._tm .get_tables_to_migrate (self ._tc )
81
104
tables_in_scope = filter (lambda t : t .src .what == what , tables_to_migrate )
@@ -84,14 +107,10 @@ def _migrate_tables(
84
107
grants = self ._compute_grants (
85
108
table .src , acl_strategy , all_grants_to_migrate , all_migrated_groups , all_principal_grants
86
109
)
87
- tasks .append (
88
- partial (
89
- self ._migrate_table ,
90
- table ,
91
- grants ,
92
- )
93
- )
110
+ tasks .append (partial (self ._migrate_table , table , grants , mounts , hiveserde_in_place_migrate ))
94
111
Threads .strict ("migrate tables" , tasks )
112
+ if not tasks :
113
+ logger .info (f"No tables found to migrate with type { what .name } " )
95
114
# the below is useful for testing
96
115
return tasks
97
116
@@ -134,7 +153,9 @@ def _compute_grants(
134
153
def _migrate_table (
135
154
self ,
136
155
src_table : TableToMigrate ,
137
- grants : list [Grant ] | None = None ,
156
+ grants : list [Grant ],
157
+ mounts : list [Mount ],
158
+ hiveserde_in_place_migrate : bool = False ,
138
159
):
139
160
if self ._table_already_migrated (src_table .rule .as_uc_table_key ):
140
161
logger .info (f"Table { src_table .src .key } already migrated to { src_table .rule .as_uc_table_key } " )
@@ -145,8 +166,10 @@ def _migrate_table(
145
166
return self ._migrate_table_create_ctas (src_table .src , src_table .rule , grants )
146
167
if src_table .src .what == What .EXTERNAL_SYNC :
147
168
return self ._migrate_external_table (src_table .src , src_table .rule , grants )
148
- if src_table .src .what == What .EXTERNAL_NO_SYNC :
149
- return self ._migrate_non_sync_table (src_table .src , src_table .rule , grants )
169
+ if src_table .src .what == What .EXTERNAL_HIVESERDE :
170
+ return self ._migrate_external_table_hiveserde (
171
+ src_table .src , src_table .rule , grants , mounts , hiveserde_in_place_migrate
172
+ )
150
173
logger .info (f"Table { src_table .src .key } is not supported for migration" )
151
174
return True
152
175
@@ -201,18 +224,60 @@ def _migrate_external_table(self, src_table: Table, rule: Rule, grants: list[Gra
201
224
self ._backend .execute (src_table .sql_alter_from (rule .as_uc_table_key , self ._ws .get_workspace_id ()))
202
225
return self ._migrate_acl (src_table , rule , grants )
203
226
204
- def _migrate_dbfs_root_table (self , src_table : Table , rule : Rule , grants : list [Grant ] | None = None ):
205
- target_table_key = rule .as_uc_table_key
206
- table_migrate_sql = src_table .sql_migrate_dbfs (target_table_key )
207
- logger .debug (f"Migrating managed table { src_table .key } to using SQL query: { table_migrate_sql } " )
227
+ def _migrate_external_table_hiveserde (
228
+ self ,
229
+ src_table : Table ,
230
+ rule : Rule ,
231
+ grants : list [Grant ],
232
+ mounts : list [Mount ],
233
+ hiveserde_in_place_migrate : bool = False ,
234
+ ):
235
+ # The hiveserde_in_place_migrate flag determines whether the current migration uses in-place migration or CTAS.
236
+ # We will provide two workflows for hiveserde table migration:
237
+ # 1. One will migrate all hiveserde tables using CTAS which we officially support.
238
+ # 2. The other one will migrate certain types of hiveserde in place, which is technically working, but the user
239
+ # needs to accept the risk that the old files created by hiveserde may not be processed correctly by Spark
240
+ # datasource in corner cases.
241
+ # User will need to decide which workflow to run first, which will migrate the hiveserde tables and mark the
242
+ # `upgraded_to` property and hence those tables will be skipped in the migration workflow runs later.
243
+ if not hiveserde_in_place_migrate :
244
+ # TODO: Add sql_migrate_external_hiveserde_ctas here
245
+ return False
246
+
247
+ # verify hive serde type
248
+ hiveserde_type = src_table .hiveserde_type (self ._backend )
249
+ if hiveserde_type in [
250
+ HiveSerdeType .NOT_HIVESERDE ,
251
+ HiveSerdeType .OTHER_HIVESERDE ,
252
+ HiveSerdeType .INVALID_HIVESERDE_INFO ,
253
+ ]:
254
+ logger .warning (f"{ src_table .key } table can only be migrated using CTAS." )
255
+ return False
256
+
257
+ # if the src table location is using mount, resolve the mount location so it will be used in the updated DDL
258
+ dst_table_location = None
259
+ if mounts and src_table .is_dbfs_mnt :
260
+ dst_table_location = ExternalLocations .resolve_mount (src_table .location , mounts )
261
+
262
+ table_migrate_sql = src_table .sql_migrate_external_hiveserde_in_place (
263
+ rule .catalog_name , rule .dst_schema , rule .dst_table , self ._backend , hiveserde_type , dst_table_location
264
+ )
265
+ if not table_migrate_sql :
266
+ logger .error (
267
+ f"Failed to generate in-place migration DDL for { src_table .key } , skip the in-place migration. It can be migrated in CTAS workflow"
268
+ )
269
+ return False
270
+
271
+ logger .debug (f"Migrating external table { src_table .key } to using SQL query: { table_migrate_sql } " )
208
272
self ._backend .execute (table_migrate_sql )
209
273
self ._backend .execute (src_table .sql_alter_to (rule .as_uc_table_key ))
210
274
self ._backend .execute (src_table .sql_alter_from (rule .as_uc_table_key , self ._ws .get_workspace_id ()))
211
275
return self ._migrate_acl (src_table , rule , grants )
212
276
213
- def _migrate_non_sync_table (self , src_table : Table , rule : Rule , grants : list [Grant ] | None = None ):
214
- table_migrate_sql = self ._get_create_in_place_sql (src_table , rule )
215
- logger .debug (f"Migrating table (No Sync) { src_table .key } to using SQL query: { table_migrate_sql } " )
277
+ def _migrate_dbfs_root_table (self , src_table : Table , rule : Rule , grants : list [Grant ] | None = None ):
278
+ target_table_key = rule .as_uc_table_key
279
+ table_migrate_sql = src_table .sql_migrate_dbfs (target_table_key )
280
+ logger .debug (f"Migrating managed table { src_table .key } to using SQL query: { table_migrate_sql } " )
216
281
self ._backend .execute (table_migrate_sql )
217
282
self ._backend .execute (src_table .sql_alter_to (rule .as_uc_table_key ))
218
283
self ._backend .execute (src_table .sql_alter_from (rule .as_uc_table_key , self ._ws .get_workspace_id ()))
@@ -226,25 +291,6 @@ def _migrate_table_create_ctas(self, src_table: Table, rule: Rule, grants: list[
226
291
self ._backend .execute (src_table .sql_alter_from (rule .as_uc_table_key , self ._ws .get_workspace_id ()))
227
292
return self ._migrate_acl (src_table , rule , grants )
228
293
229
- def _get_create_in_place_sql (self , src_table : Table , rule : Rule ) -> str :
230
- create_sql = str (next (self ._backend .fetch (src_table .sql_show_create ()))["createtab_stmt" ])
231
- statements = sqlglot .parse (create_sql , read = 'databricks' )
232
- assert len (statements ) == 1 , 'Expected a single statement'
233
- create = statements [0 ]
234
- assert isinstance (create , expressions .Create ), 'Expected a CREATE statement'
235
- # safely replace current table name with the updated catalog
236
- for table_name in create .find_all (expressions .Table ):
237
- if table_name .db == src_table .database and table_name .name == src_table .name :
238
- new_table_name = expressions .Table (
239
- catalog = rule .catalog_name ,
240
- db = rule .dst_schema ,
241
- this = rule .dst_table ,
242
- )
243
- table_name .replace (new_table_name )
244
- # safely replace CREATE with CREATE IF NOT EXISTS
245
- create .args ['exists' ] = True
246
- return create .sql ('databricks' )
247
-
248
294
def _get_create_ctas_sql (self , src_table : Table , rule : Rule ) -> str :
249
295
create_sql = (
250
296
f"CREATE TABLE IF NOT EXISTS { escape_sql_identifier (rule .as_uc_table_key )} "
0 commit comments