-
Notifications
You must be signed in to change notification settings - Fork 3k
Core: Freshness-aware table loading in REST catalog #14398
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,9 +26,11 @@ | |
| import java.util.Map; | ||
| import java.util.Set; | ||
| import java.util.function.BiFunction; | ||
| import java.util.function.Consumer; | ||
| import java.util.function.Function; | ||
| import java.util.function.Supplier; | ||
| import java.util.stream.Collectors; | ||
| import org.apache.hc.core5.http.HttpHeaders; | ||
| import org.apache.iceberg.BaseTable; | ||
| import org.apache.iceberg.CatalogProperties; | ||
| import org.apache.iceberg.CatalogUtil; | ||
|
|
@@ -60,13 +62,15 @@ | |
| import org.apache.iceberg.io.StorageCredential; | ||
| import org.apache.iceberg.metrics.MetricsReporter; | ||
| import org.apache.iceberg.metrics.MetricsReporters; | ||
| import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; | ||
| import org.apache.iceberg.relocated.com.google.common.base.Preconditions; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Lists; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Maps; | ||
| import org.apache.iceberg.rest.RESTCatalogProperties.SnapshotMode; | ||
| import org.apache.iceberg.rest.RESTTableCache.TableWithETag; | ||
| import org.apache.iceberg.rest.auth.AuthManager; | ||
| import org.apache.iceberg.rest.auth.AuthManagers; | ||
| import org.apache.iceberg.rest.auth.AuthSession; | ||
|
|
@@ -165,6 +169,8 @@ public class RESTSessionCatalog extends BaseViewSessionCatalog | |
| private Supplier<Map<String, String>> mutationHeaders = Map::of; | ||
| private String namespaceSeparator = null; | ||
|
|
||
| private RESTTableCache tableCache; | ||
|
|
||
| public RESTSessionCatalog() { | ||
| this( | ||
| config -> | ||
|
|
@@ -277,9 +283,22 @@ public void initialize(String name, Map<String, String> unresolved) { | |
| mergedProps, | ||
| RESTCatalogProperties.REST_SCAN_PLANNING_ENABLED, | ||
| RESTCatalogProperties.REST_SCAN_PLANNING_ENABLED_DEFAULT); | ||
|
|
||
| this.tableCache = createTableCache(mergedProps); | ||
| this.closeables.addCloseable(this.tableCache); | ||
|
|
||
| super.initialize(name, mergedProps); | ||
| } | ||
|
|
||
| protected RESTTableCache createTableCache(Map<String, String> props) { | ||
| return new RESTTableCache(props); | ||
| } | ||
|
|
||
| @VisibleForTesting | ||
| RESTTableCache tableCache() { | ||
| return tableCache; | ||
| } | ||
|
|
||
| @Override | ||
| public void setConf(Object newConf) { | ||
| this.conf = newConf; | ||
|
|
@@ -332,6 +351,8 @@ public boolean dropTable(SessionContext context, TableIdentifier identifier) { | |
| return true; | ||
| } catch (NoSuchTableException e) { | ||
| return false; | ||
| } finally { | ||
| invalidateTable(context, identifier); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -353,6 +374,8 @@ public boolean purgeTable(SessionContext context, TableIdentifier identifier) { | |
| return true; | ||
| } catch (NoSuchTableException e) { | ||
| return false; | ||
| } finally { | ||
| invalidateTable(context, identifier); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -370,6 +393,8 @@ public void renameTable(SessionContext context, TableIdentifier from, TableIdent | |
| client | ||
| .withAuthSession(contextualSession) | ||
| .post(paths.rename(), request, null, mutationHeaders, ErrorHandlers.tableErrorHandler()); | ||
|
|
||
| invalidateTable(context, from); | ||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -384,9 +409,15 @@ public boolean tableExists(SessionContext context, TableIdentifier identifier) { | |
| return true; | ||
| } else { | ||
| // fallback in order to work with 1.7.x and older servers | ||
| return super.tableExists(context, identifier); | ||
| if (!super.tableExists(context, identifier)) { | ||
| invalidateTable(context, identifier); | ||
| return false; | ||
| } | ||
|
|
||
| return true; | ||
| } | ||
| } catch (NoSuchTableException e) { | ||
| invalidateTable(context, identifier); | ||
| return false; | ||
| } | ||
| } | ||
|
|
@@ -396,7 +427,11 @@ private static Map<String, String> snapshotModeToParam(SnapshotMode mode) { | |
| } | ||
|
|
||
| private LoadTableResponse loadInternal( | ||
| SessionContext context, TableIdentifier identifier, SnapshotMode mode) { | ||
| SessionContext context, | ||
| TableIdentifier identifier, | ||
| SnapshotMode mode, | ||
| Map<String, String> headers, | ||
| Consumer<Map<String, String>> responseHeaders) { | ||
| Endpoint.check(endpoints, Endpoint.V1_LOAD_TABLE); | ||
| AuthSession contextualSession = authManager.contextualSession(context, catalogAuth); | ||
| return client | ||
|
|
@@ -405,8 +440,9 @@ private LoadTableResponse loadInternal( | |
| paths.table(identifier), | ||
| snapshotModeToParam(mode), | ||
| LoadTableResponse.class, | ||
| Map.of(), | ||
| ErrorHandlers.tableErrorHandler()); | ||
| headers, | ||
| ErrorHandlers.tableErrorHandler(), | ||
| responseHeaders); | ||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -424,8 +460,25 @@ public Table loadTable(SessionContext context, TableIdentifier identifier) { | |
| MetadataTableType metadataType; | ||
| LoadTableResponse response; | ||
| TableIdentifier loadedIdent; | ||
|
|
||
| Map<String, String> responseHeaders = Maps.newHashMap(); | ||
| TableWithETag cachedTable = tableCache.getIfPresent(context.sessionId(), identifier); | ||
|
|
||
| try { | ||
| response = loadInternal(context, identifier, snapshotMode); | ||
| response = | ||
| loadInternal( | ||
| context, | ||
| identifier, | ||
| snapshotMode, | ||
| headersForLoadTable(cachedTable), | ||
| responseHeaders::putAll); | ||
|
|
||
| if (response == null) { | ||
| Preconditions.checkNotNull(cachedTable, "Invalid load table response: null"); | ||
|
|
||
| return cachedTable.supplier().get(); | ||
| } | ||
|
|
||
| loadedIdent = identifier; | ||
| metadataType = null; | ||
|
|
||
|
|
@@ -435,14 +488,33 @@ public Table loadTable(SessionContext context, TableIdentifier identifier) { | |
| // attempt to load a metadata table using the identifier's namespace as the base table | ||
| TableIdentifier baseIdent = TableIdentifier.of(identifier.namespace().levels()); | ||
| try { | ||
| response = loadInternal(context, baseIdent, snapshotMode); | ||
| responseHeaders.clear(); | ||
| cachedTable = tableCache.getIfPresent(context.sessionId(), baseIdent); | ||
|
|
||
| response = | ||
| loadInternal( | ||
| context, | ||
| baseIdent, | ||
| snapshotMode, | ||
| headersForLoadTable(cachedTable), | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This approach requires the table to have an entry in the cachedTable, so that we can retrieve its ETag and include it in the loadTable request. However, what if the ETag is cached externally, and the caller wants to supply the ETag directly when loading the table? This use case is valid and common, the caller may store the ETag in somewhere for performance optimizations and expect to use it in the next loadTable request (may not use the same client).
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for taking a look, @XJDKC !
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, the caller might maintain a cache on the caller side. Many catalogs now support multiplexing, so the caller may have a cache to store the latest table metadata info. For this use case, if the caller got an null pointer or something that indicates that the table is not modified, the caller needs to handle it by itself. e.g., don't need to proceed. Does it make sense to allow the caller to pass in the ETag in the SessionContext? Maybe via a property in the property map? (Noticed that, unfortunately, for RESTCatalog::loadTable, it doesn't allow the caller to pass in a SessionContext at table level, is it possible to add another interface?)
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you for the explanation, @XJDKC !
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I fully understand your points, they make sense! It would be great if we could provide some flexibility or hooks for customization when implementing this!
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could these users call the REST catalog API directly?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks you for describing the use case @XJDKC ! |
||
| responseHeaders::putAll); | ||
|
|
||
| if (response == null) { | ||
| Preconditions.checkNotNull(cachedTable, "Invalid load table response: null"); | ||
|
|
||
| return MetadataTableUtils.createMetadataTableInstance( | ||
| cachedTable.supplier().get(), metadataType); | ||
| } | ||
|
|
||
| loadedIdent = baseIdent; | ||
| } catch (NoSuchTableException ignored) { | ||
| // the base table does not exist | ||
| invalidateTable(context, baseIdent); | ||
| throw original; | ||
| } | ||
| } else { | ||
| // name is not a metadata table | ||
| invalidateTable(context, identifier); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This feels redundant. We've already loaded this identifier and determined it's not in the cache, right?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Technically it's feasible that we get here and the table exists in the cache. See
|
||
| throw original; | ||
| } | ||
| } | ||
|
|
@@ -461,7 +533,7 @@ public Table loadTable(SessionContext context, TableIdentifier identifier) { | |
| .setPreviousFileLocation(null) | ||
| .setSnapshotsSupplier( | ||
| () -> | ||
| loadInternal(context, finalIdentifier, SnapshotMode.ALL) | ||
| loadInternal(context, finalIdentifier, SnapshotMode.ALL, Map.of(), h -> {}) | ||
| .tableMetadata() | ||
| .snapshots()) | ||
| .discardChanges() | ||
|
|
@@ -470,38 +542,52 @@ public Table loadTable(SessionContext context, TableIdentifier identifier) { | |
| tableMetadata = response.tableMetadata(); | ||
| } | ||
|
|
||
| List<Credential> credentials = response.credentials(); | ||
| RESTClient tableClient = client.withAuthSession(tableSession); | ||
| RESTTableOperations ops = | ||
| newTableOps( | ||
| tableClient, | ||
| paths.table(finalIdentifier), | ||
| Map::of, | ||
| mutationHeaders, | ||
| tableFileIO(context, tableConf, response.credentials()), | ||
| tableMetadata, | ||
| endpoints); | ||
| Supplier<BaseTable> tableSupplier = | ||
| createTableSupplier( | ||
| finalIdentifier, tableMetadata, context, tableClient, tableConf, credentials); | ||
|
|
||
| trackFileIO(ops); | ||
|
|
||
| // RestTable should only be returned for non-metadata tables, because client would | ||
| // not have access to metadata files for example manifests, since all it needs is catalog. | ||
| if (metadataType == null) { | ||
| RESTTable restTable = restTableForScanPlanning(ops, finalIdentifier, tableClient); | ||
| if (restTable != null) { | ||
| return restTable; | ||
| } | ||
| String eTag = responseHeaders.getOrDefault(HttpHeaders.ETAG, null); | ||
| if (eTag != null) { | ||
| tableCache.put(context.sessionId(), finalIdentifier, tableSupplier, eTag); | ||
| } | ||
|
|
||
| BaseTable table = | ||
| new BaseTable( | ||
| ops, | ||
| fullTableName(finalIdentifier), | ||
| metricsReporter(paths.metrics(finalIdentifier), tableClient)); | ||
| if (metadataType != null) { | ||
| return MetadataTableUtils.createMetadataTableInstance(table, metadataType); | ||
| return MetadataTableUtils.createMetadataTableInstance(tableSupplier.get(), metadataType); | ||
| } | ||
|
|
||
| return table; | ||
| return tableSupplier.get(); | ||
| } | ||
|
|
||
| private Supplier<BaseTable> createTableSupplier( | ||
| TableIdentifier identifier, | ||
| TableMetadata tableMetadata, | ||
| SessionContext context, | ||
| RESTClient tableClient, | ||
| Map<String, String> tableConf, | ||
| List<Credential> credentials) { | ||
| return () -> { | ||
| RESTTableOperations ops = | ||
| newTableOps( | ||
| tableClient, | ||
| paths.table(identifier), | ||
| Map::of, | ||
| mutationHeaders, | ||
| tableFileIO(context, tableConf, credentials), | ||
| tableMetadata, | ||
| endpoints); | ||
|
|
||
| trackFileIO(ops); | ||
|
|
||
| RESTTable table = restTableForScanPlanning(ops, identifier, tableClient); | ||
| if (table != null) { | ||
| return table; | ||
| } | ||
|
|
||
| return new BaseTable( | ||
| ops, fullTableName(identifier), metricsReporter(paths.metrics(identifier), tableClient)); | ||
| }; | ||
| } | ||
|
|
||
| private RESTTable restTableForScanPlanning( | ||
|
|
@@ -546,7 +632,9 @@ public Catalog.TableBuilder buildTable( | |
| } | ||
|
|
||
| @Override | ||
| public void invalidateTable(SessionContext context, TableIdentifier ident) {} | ||
| public void invalidateTable(SessionContext context, TableIdentifier ident) { | ||
| tableCache.invalidate(context.sessionId(), ident); | ||
| } | ||
|
|
||
| @Override | ||
| public Table registerTable( | ||
|
|
@@ -906,7 +994,8 @@ public Transaction replaceTransaction() { | |
| throw new AlreadyExistsException("View with same name already exists: %s", ident); | ||
| } | ||
|
|
||
| LoadTableResponse response = loadInternal(context, ident, snapshotMode); | ||
| LoadTableResponse response = loadInternal(context, ident, snapshotMode, Map.of(), h -> {}); | ||
|
|
||
| String fullName = fullTableName(ident); | ||
|
|
||
| Map<String, String> tableConf = response.config(); | ||
|
|
@@ -1368,6 +1457,17 @@ public void renameView(SessionContext context, TableIdentifier from, TableIdenti | |
| .post(paths.renameView(), request, null, mutationHeaders, ErrorHandlers.viewErrorHandler()); | ||
| } | ||
|
|
||
| private static Map<String, String> headersForLoadTable(TableWithETag tableWithETag) { | ||
| if (tableWithETag == null) { | ||
| return Map.of(); | ||
| } | ||
|
|
||
| String eTag = tableWithETag.eTag(); | ||
| Preconditions.checkArgument(eTag != null, "Invalid ETag: null"); | ||
|
|
||
| return Map.of(HttpHeaders.IF_NONE_MATCH, eTag); | ||
| } | ||
|
|
||
| private class RESTViewBuilder implements ViewBuilder { | ||
| private final SessionContext context; | ||
| private final TableIdentifier identifier; | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.