-
Notifications
You must be signed in to change notification settings - Fork 749
[GOBBLIN-1961] Allow IcebergDatasetFinder to use separate names for source vs. destination-side DB and table
#3835
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -63,10 +63,8 @@ | |
| @Slf4j | ||
| @Getter | ||
| public class IcebergDataset implements PrioritizedCopyableDataset { | ||
| private final String dbName; | ||
| private final String inputTableName; | ||
| private final IcebergTable srcIcebergTable; | ||
| /** Presumed destination {@link IcebergTable} exists */ | ||
| /* CAUTION: *hopefully* `destIcebergTable` exists... although that's not necessarily been verified yet */ | ||
| private final IcebergTable destIcebergTable; | ||
| protected final Properties properties; | ||
| protected final FileSystem sourceFs; | ||
|
|
@@ -75,9 +73,7 @@ public class IcebergDataset implements PrioritizedCopyableDataset { | |
| /** Destination database name */ | ||
| public static final String DESTINATION_DATABASE_KEY = IcebergDatasetFinder.ICEBERG_DATASET_PREFIX + ".destination.database"; | ||
|
|
||
| public IcebergDataset(String db, String table, IcebergTable srcIcebergTable, IcebergTable destIcebergTable, Properties properties, FileSystem sourceFs) { | ||
| this.dbName = db; | ||
| this.inputTableName = table; | ||
| public IcebergDataset(IcebergTable srcIcebergTable, IcebergTable destIcebergTable, Properties properties, FileSystem sourceFs) { | ||
| this.srcIcebergTable = srcIcebergTable; | ||
| this.destIcebergTable = destIcebergTable; | ||
| this.properties = properties; | ||
|
|
@@ -117,17 +113,17 @@ public Iterator<FileSet<CopyEntity>> getFileSetIterator(FileSystem targetFs, Cop | |
| return createFileSets(targetFs, configuration); | ||
| } | ||
|
|
||
| /** @return unique ID for this dataset, usable as a {@link CopyEntity}.fileset, for atomic publication grouping */ | ||
| /** @return unique ID for dataset (based on the source-side table), usable as a {@link CopyEntity#getFileSet}, for atomic publication grouping */ | ||
| protected String getFileSetId() { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: I feel like this is better encapsulated as getSourceTableId because fileSetId() makes me think about some subset of the table, not just the src table.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the name originally arose because it's the value provided to the does it make more sense in context, or do you still recommend to change?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see, I guess we can keep it the same here for consistency, just that in this usecase it really is referring to the table when this is a table distcp |
||
| return this.dbName + "." + this.inputTableName; | ||
| return this.srcIcebergTable.getTableId().toString(); | ||
| } | ||
|
|
||
| /** | ||
| * Generates {@link FileSet}s, being themselves able to generate {@link CopyEntity}s for all files, data and metadata, | ||
| * comprising the iceberg/table, so as to fully specify remaining table replication. | ||
| */ | ||
| protected Iterator<FileSet<CopyEntity>> createFileSets(FileSystem targetFs, CopyConfiguration configuration) { | ||
| FileSet<CopyEntity> fileSet = new IcebergTableFileSet(this.getInputTableName(), this, targetFs, configuration); | ||
| FileSet<CopyEntity> fileSet = new IcebergTableFileSet(this.getFileSetId(), this, targetFs, configuration); | ||
| return Iterators.singletonIterator(fileSet); | ||
| } | ||
|
|
||
|
|
@@ -140,7 +136,7 @@ Collection<CopyEntity> generateCopyEntities(FileSystem targetFs, CopyConfigurati | |
| String fileSet = this.getFileSetId(); | ||
| List<CopyEntity> copyEntities = Lists.newArrayList(); | ||
| Map<Path, FileStatus> pathToFileStatus = getFilePathsToFileStatus(targetFs, copyConfig); | ||
| log.info("~{}.{}~ found {} candidate source paths", dbName, inputTableName, pathToFileStatus.size()); | ||
| log.info("~{}~ found {} candidate source paths", fileSet, pathToFileStatus.size()); | ||
|
|
||
| Configuration defaultHadoopConfiguration = new Configuration(); | ||
| for (Map.Entry<Path, FileStatus> entry : pathToFileStatus.entrySet()) { | ||
|
|
@@ -165,8 +161,8 @@ Collection<CopyEntity> generateCopyEntities(FileSystem targetFs, CopyConfigurati | |
| copyEntities.add(fileEntity); | ||
| } | ||
| // TODO: Filter properties specific to iceberg registration and avoid serializing every global property | ||
| copyEntities.add(createPostPublishStep(this.dbName, this.inputTableName, this.properties)); | ||
| log.info("~{}.{}~ generated {} copy entities", dbName, inputTableName, copyEntities.size()); | ||
| copyEntities.add(createPostPublishStep()); | ||
| log.info("~{}~ generated {} copy entities", fileSet, copyEntities.size()); | ||
| return copyEntities; | ||
| } | ||
|
|
||
|
|
@@ -187,8 +183,8 @@ protected Map<Path, FileStatus> getFilePathsToFileStatus(FileSystem targetFs, Co | |
| IcebergSnapshotInfo currentSnapshotOverview = icebergTable.getCurrentSnapshotInfoOverviewOnly(); | ||
| if (currentSnapshotOverview.getMetadataPath().map(isPresentOnTarget).orElse(false) && | ||
| isPresentOnTarget.apply(currentSnapshotOverview.getManifestListPath())) { | ||
| log.info("~{}.{}~ skipping entire iceberg, since snapshot '{}' at '{}' and metadata '{}' both present on target", | ||
| dbName, inputTableName, currentSnapshotOverview.getSnapshotId(), | ||
| log.info("~{}~ skipping entire iceberg, since snapshot '{}' at '{}' and metadata '{}' both present on target", | ||
| this.getFileSetId(), currentSnapshotOverview.getSnapshotId(), | ||
| currentSnapshotOverview.getManifestListPath(), | ||
| currentSnapshotOverview.getMetadataPath().orElse("<<ERROR: MISSING!>>")); | ||
| return Maps.newHashMap(); | ||
|
|
@@ -198,7 +194,7 @@ protected Map<Path, FileStatus> getFilePathsToFileStatus(FileSystem targetFs, Co | |
| Iterators.transform(icebergIncrementalSnapshotInfos, snapshotInfo -> { | ||
| // log each snapshot, for context, in case of `FileNotFoundException` during `FileSystem.getFileStatus()` | ||
| String manListPath = snapshotInfo.getManifestListPath(); | ||
| log.info("~{}.{}~ loaded snapshot '{}' at '{}' from metadata path: '{}'", dbName, inputTableName, | ||
| log.info("~{}~ loaded snapshot '{}' at '{}' from metadata path: '{}'", this.getFileSetId(), | ||
| snapshotInfo.getSnapshotId(), manListPath, snapshotInfo.getMetadataPath().orElse("<<inherited>>")); | ||
| // ALGO: an iceberg's files form a tree of four levels: metadata.json -> manifest-list -> manifest -> data; | ||
| // most critically, all are presumed immutable and uniquely named, although any may be replaced. we depend | ||
|
|
@@ -224,18 +220,17 @@ protected Map<Path, FileStatus> getFilePathsToFileStatus(FileSystem targetFs, Co | |
| missingPaths.addAll(mfi.getListedFilePaths()); | ||
| } | ||
| } | ||
| log.info("~{}.{}~ snapshot '{}': collected {} additional source paths", | ||
| dbName, inputTableName, snapshotInfo.getSnapshotId(), missingPaths.size()); | ||
| log.info("~{}~ snapshot '{}': collected {} additional source paths", | ||
| this.getFileSetId(), snapshotInfo.getSnapshotId(), missingPaths.size()); | ||
| return missingPaths.iterator(); | ||
| } else { | ||
| log.info("~{}.{}~ snapshot '{}' already present on target... skipping (including contents)", | ||
| dbName, inputTableName, snapshotInfo.getSnapshotId()); | ||
| log.info("~{}~ snapshot '{}' already present on target... skipping (including contents)", | ||
| this.getFileSetId(), snapshotInfo.getSnapshotId()); | ||
| // IMPORTANT: separately consider metadata path, to handle case of 'metadata-only' snapshot reusing mf-list | ||
| Optional<String> metadataPath = snapshotInfo.getMetadataPath(); | ||
| Optional<String> nonReplicatedMetadataPath = metadataPath.filter(p -> !isPresentOnTarget.apply(p)); | ||
| metadataPath.ifPresent(ignore -> | ||
| log.info("~{}.{}~ metadata IS {} already present on target", dbName, inputTableName, | ||
| nonReplicatedMetadataPath.isPresent() ? "NOT" : "ALSO") | ||
| log.info("~{}~ metadata IS {} already present on target", this.getFileSetId(), nonReplicatedMetadataPath.isPresent() ? "NOT" : "ALSO") | ||
| ); | ||
| return nonReplicatedMetadataPath.map(p -> Lists.newArrayList(p).iterator()).orElse(Collections.emptyIterator()); | ||
| } | ||
|
|
@@ -255,7 +250,7 @@ protected Map<Path, FileStatus> getFilePathsToFileStatus(FileSystem targetFs, Co | |
| try { | ||
| results.put(path, this.sourceFs.getFileStatus(path)); | ||
| if (growthTracker.isAnotherMilestone(results.size())) { | ||
| log.info("~{}.{}~ collected file status on '{}' source paths", dbName, inputTableName, results.size()); | ||
| log.info("~{}~ collected file status on '{}' source paths", this.getFileSetId(), results.size()); | ||
| } | ||
| } catch (FileNotFoundException fnfe) { | ||
| if (!shouldTolerateMissingSourceFiles) { | ||
|
|
@@ -265,7 +260,7 @@ protected Map<Path, FileStatus> getFilePathsToFileStatus(FileSystem targetFs, Co | |
| String total = ++numSourceFilesNotFound + " total"; | ||
| String speculation = "either premature deletion broke time-travel or metadata read interleaved among delete"; | ||
| errorConsolidator.prepLogMsg(path).ifPresent(msg -> | ||
| log.warn("~{}.{}~ source {} ({}... {})", dbName, inputTableName, msg, speculation, total) | ||
| log.warn("~{}~ source {} ({}... {})", this.getFileSetId(), msg, speculation, total) | ||
| ); | ||
| } | ||
| } | ||
|
|
@@ -326,8 +321,8 @@ protected DatasetDescriptor getDestinationDataset(FileSystem targetFs) { | |
| return this.destIcebergTable.getDatasetDescriptor(targetFs); | ||
| } | ||
|
|
||
| private PostPublishStep createPostPublishStep(String dbName, String inputTableName, Properties properties) { | ||
| IcebergRegisterStep icebergRegisterStep = new IcebergRegisterStep(dbName, inputTableName, properties); | ||
| private PostPublishStep createPostPublishStep() { | ||
| IcebergRegisterStep icebergRegisterStep = new IcebergRegisterStep(this.srcIcebergTable.getTableId(), this.destIcebergTable.getTableId(), this.properties); | ||
| return new PostPublishStep(getFileSetId(), Maps.newHashMap(), icebergRegisterStep, 0); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Might be out of scope for this PR, but shouldn't our default behavior not have that assumption and create the table if missing? That's the contract for Hive distcp
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
for some catalogs, like openhouse, that's not actually possible: the tables must have been already mated in a way that we are unable to accomplish ourselves (i.e. from outside the catalog itself)