Custom catalogs from IcebergSource #1783
```diff
@@ -20,21 +20,43 @@
 package org.apache.iceberg.spark.source;
 
 import java.util.Map;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.iceberg.Table;
-import org.apache.iceberg.catalog.TableIdentifier;
-import org.apache.iceberg.hadoop.HadoopTables;
-import org.apache.iceberg.hive.HiveCatalog;
-import org.apache.iceberg.hive.HiveCatalogs;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.spark.PathIdentifier;
+import org.apache.iceberg.spark.Spark3Util;
+import org.apache.iceberg.spark.SparkSessionCatalog;
 import org.apache.spark.sql.SparkSession;
-import org.apache.spark.sql.connector.catalog.TableProvider;
+import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
+import org.apache.spark.sql.connector.catalog.CatalogManager;
+import org.apache.spark.sql.connector.catalog.CatalogPlugin;
+import org.apache.spark.sql.connector.catalog.Identifier;
+import org.apache.spark.sql.connector.catalog.SupportsCatalogOptions;
+import org.apache.spark.sql.connector.catalog.Table;
+import org.apache.spark.sql.connector.catalog.TableCatalog;
 import org.apache.spark.sql.connector.expressions.Transform;
 import org.apache.spark.sql.sources.DataSourceRegister;
 import org.apache.spark.sql.types.StructType;
 import org.apache.spark.sql.util.CaseInsensitiveStringMap;
 
-public class IcebergSource implements DataSourceRegister, TableProvider {
+/**
+ * The IcebergSource loads/writes tables with format "iceberg". It can load paths and tables.
+ *
+ * How paths/tables are loaded when using spark.read().format("iceberg").load(table):
+ *
+ * table = "file:/path/to/table" -> loads a HadoopTable at the given path
+ * table = "tablename" -> loads currentCatalog.currentNamespace.tablename
+ * table = "catalog.tablename" -> loads "tablename" from the specified catalog
+ * table = "namespace.tablename" -> loads "namespace.tablename" from the current catalog
+ * table = "catalog.namespace.tablename" -> loads "namespace.tablename" from the specified catalog
+ * table = "namespace1.namespace2.tablename" -> loads "namespace1.namespace2.tablename" from the current catalog
+ *
+ * The above list is in order of priority. For example: a matching catalog will take priority over any namespace
+ * resolution.
+ */
+public class IcebergSource implements DataSourceRegister, SupportsCatalogOptions {
+  private static final String DEFAULT_CATALOG_NAME = "default_iceberg";
+  private static final String DEFAULT_CATALOG = "spark.sql.catalog." + DEFAULT_CATALOG_NAME;
 
   @Override
   public String shortName() {
     return "iceberg";
```
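As a quick illustration of the resolution rules in the new class javadoc, a read might look like the following. This is a hedged sketch, not part of the PR; the catalog name `my_catalog` and the table names are hypothetical:

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class IcebergSourceLoadExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("load-example").getOrCreate();

    // contains a "/", so it is treated as a path and loaded as a Hadoop table
    Dataset<Row> byPath = spark.read().format("iceberg").load("file:/path/to/table");

    // bare name: resolves to currentCatalog.currentNamespace.tablename
    Dataset<Row> byName = spark.read().format("iceberg").load("tablename");

    // "my_catalog" is a registered catalog name (hypothetical), so catalog
    // resolution wins over treating "my_catalog" as a namespace
    Dataset<Row> byCatalog = spark.read().format("iceberg").load("my_catalog.db.tablename");

    byPath.show();
    byName.show();
    byCatalog.show();
  }
}
```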
```diff
@@ -56,48 +78,72 @@ public boolean supportsExternalMetadata() {
   }
 
   @Override
-  public SparkTable getTable(StructType schema, Transform[] partitioning, Map<String, String> options) {
-    // Get Iceberg table from options
-    Configuration conf = SparkSession.active().sessionState().newHadoopConf();
-    Table icebergTable = getTableAndResolveHadoopConfiguration(options, conf);
-
-    // Build Spark table based on Iceberg table, and return it
-    // Eagerly refresh the table before reading to ensure views containing this table show up-to-date data
-    return new SparkTable(icebergTable, schema, true);
+  public Table getTable(StructType schema, Transform[] partitioning, Map<String, String> options) {
```
> **Contributor:** When implementing …
>
> **Author:** …
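For context on the interface this PR adopts: Spark's `SupportsCatalogOptions` extends `TableProvider` and lets a source map its read/write options to a catalog and identifier. A simplified sketch of its shape, trimmed for illustration (see Spark's `org.apache.spark.sql.connector.catalog.SupportsCatalogOptions` for the real API):

```java
import org.apache.spark.sql.connector.catalog.Identifier;
import org.apache.spark.sql.connector.catalog.TableProvider;
import org.apache.spark.sql.util.CaseInsensitiveStringMap;

// simplified shape, for illustration only
interface SupportsCatalogOptionsSketch extends TableProvider {
  // extract the table identifier from the read/write options
  Identifier extractIdentifier(CaseInsensitiveStringMap options);

  // extract the name of the catalog that should load the table
  String extractCatalog(CaseInsensitiveStringMap options);
}
```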
```diff
+    Spark3Util.CatalogAndIdentifier catalogIdentifier = catalogAndIdentifier(new CaseInsensitiveStringMap(options));
+    CatalogPlugin catalog = catalogIdentifier.catalog();
+    Identifier ident = catalogIdentifier.identifier();
+
+    try {
+      if (catalog instanceof TableCatalog) {
+        return ((TableCatalog) catalog).loadTable(ident);
+      }
+    } catch (NoSuchTableException e) {
+      // throw an Iceberg NoSuchTableException because the Spark one is checked and can't be thrown from this interface
+      throw new org.apache.iceberg.exceptions.NoSuchTableException(e, "Cannot find table for %s.", ident);
+    }
+
+    // throw an Iceberg NoSuchTableException because the Spark one is checked and can't be thrown from this interface
+    throw new org.apache.iceberg.exceptions.NoSuchTableException("Cannot find table for %s.", ident);
```
> **Contributor:** I think the correct exception is the Spark exception, since this is going to be called from Spark code.
>
> **Author:** The Spark …
>
> **Contributor:** Got it, I agree in that case. Thanks for explaining! You may want to add a comment here to explain this in the code as well.
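The underlying constraint, as a hedged sketch (the class names below are illustrative, not from the PR): Spark's `NoSuchTableException` is a checked exception, and `getTable` is declared without a `throws` clause, so only an unchecked exception such as Iceberg's `NoSuchTableException` (a `RuntimeException`) can escape it:

```java
// illustrative interface mirroring TableProvider.getTable's lack of a throws clause
interface ProviderSketch {
  Object getTable(String name); // no checked exceptions allowed
}

class SourceSketch implements ProviderSketch {
  @Override
  public Object getTable(String name) {
    // a checked exception like org.apache.spark.sql.catalyst.analysis.NoSuchTableException
    // would not compile here; an unchecked one propagates fine:
    throw new RuntimeException(String.format("Cannot find table for %s.", name));
  }
}
```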
```diff
   }
 
-  protected Table findTable(Map<String, String> options, Configuration conf) {
+  private Spark3Util.CatalogAndIdentifier catalogAndIdentifier(CaseInsensitiveStringMap options) {
     Preconditions.checkArgument(options.containsKey("path"), "Cannot open table: path is not set");
+    SparkSession spark = SparkSession.active();
+    setupDefaultSparkCatalog(spark);
     String path = options.get("path");
+    CatalogManager catalogManager = spark.sessionState().catalogManager();
 
     if (path.contains("/")) {
-      HadoopTables tables = new HadoopTables(conf);
-      return tables.load(path);
-    } else {
-      HiveCatalog hiveCatalog = HiveCatalogs.loadCatalog(conf);
-      TableIdentifier tableIdentifier = TableIdentifier.parse(path);
-      return hiveCatalog.loadTable(tableIdentifier);
+      // contains a path. Return iceberg default catalog and a PathIdentifier
+      return new Spark3Util.CatalogAndIdentifier(catalogManager.catalog(DEFAULT_CATALOG_NAME),
+          new PathIdentifier(path));
     }
-  }
 
-  private Table getTableAndResolveHadoopConfiguration(Map<String, String> options, Configuration conf) {
-    // Overwrite configurations from the Spark Context with configurations from the options.
-    mergeIcebergHadoopConfs(conf, options);
+    final Spark3Util.CatalogAndIdentifier catalogAndIdentifier = Spark3Util.catalogAndIdentifier(
+        "path or identifier", spark, path);
 
-    Table table = findTable(options, conf);
-
-    // Set confs from table properties
-    mergeIcebergHadoopConfs(conf, table.properties());
+    if (catalogAndIdentifier.catalog().name().equals("spark_catalog") &&
+        !(catalogAndIdentifier.catalog() instanceof SparkSessionCatalog)) {
+      // catalog is a session catalog but does not support Iceberg. Use Iceberg instead.
+      return new Spark3Util.CatalogAndIdentifier(catalogManager.catalog(DEFAULT_CATALOG_NAME),
+          catalogAndIdentifier.identifier());
+    } else {
+      return catalogAndIdentifier;
+    }
+  }
 
-    // Re-overwrite values set in options and table properties but were not in the environment.
-    mergeIcebergHadoopConfs(conf, options);
+  @Override
+  public Identifier extractIdentifier(CaseInsensitiveStringMap options) {
+    return catalogAndIdentifier(options).identifier();
+  }
 
-    return table;
+  @Override
+  public String extractCatalog(CaseInsensitiveStringMap options) {
+    return catalogAndIdentifier(options).catalog().name();
+  }
 
-  private static void mergeIcebergHadoopConfs(Configuration baseConf, Map<String, String> options) {
-    options.keySet().stream()
-        .filter(key -> key.startsWith("hadoop."))
-        .forEach(key -> baseConf.set(key.replaceFirst("hadoop.", ""), options.get(key)));
+  private static void setupDefaultSparkCatalog(SparkSession spark) {
+    if (spark.conf().contains(DEFAULT_CATALOG)) {
+      return;
+    }
+    ImmutableMap<String, String> config = ImmutableMap.of(
+        "type", "hive",
+        "default-namespace", "default",
+        "parquet-enabled", "true",
+        "cache-enabled", "false" // the source should not use a cache
+    );
+    String catalogName = "org.apache.iceberg.spark.SparkCatalog";
+    spark.conf().set(DEFAULT_CATALOG, catalogName);
+    config.forEach((key, value) -> spark.conf().set(DEFAULT_CATALOG + "." + key, value));
   }
 }
```
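To connect this back to the PR title: with `SupportsCatalogOptions`, reads can target any registered Iceberg catalog rather than only the built-in Hive/Hadoop paths. A hedged sketch of the user-side setup (the catalog name `my_catalog` and the metastore URI are placeholders, not from this PR):

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CustomCatalogExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("custom-catalog-example")
        // register a custom Iceberg catalog using standard Spark catalog properties
        .config("spark.sql.catalog.my_catalog", "org.apache.iceberg.spark.SparkCatalog")
        .config("spark.sql.catalog.my_catalog.type", "hive")
        .config("spark.sql.catalog.my_catalog.uri", "thrift://metastore-host:9083")
        .getOrCreate();

    // IcebergSource extracts catalog "my_catalog" and identifier "db.events"
    // via extractCatalog/extractIdentifier, then loads through that catalog
    Dataset<Row> df = spark.read().format("iceberg").load("my_catalog.db.events");
    df.show();
  }
}
```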
> **Contributor:** Maybe important to note that these are used with this priority as well, i.e. if "catalog.namespace.table" is valid, it will be read and "namespace.namespace.table" will be ignored?
>
> **Author:** Very good point @RussellSpitzer, I have included a note to that effect.
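To make that priority concrete, a hypothetical ambiguous case (the names are illustrative, not from the PR):

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CatalogPriorityExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("priority-example").getOrCreate();

    // Suppose a registered catalog "prod" exists AND the current catalog
    // contains a namespace "prod.db" with a table "events".
    Dataset<Row> df = spark.read().format("iceberg").load("prod.db.events");
    // Catalog resolution takes priority: this loads "db.events" from catalog
    // "prod"; the table "prod.db.events" in the current catalog is ignored.
    df.show();
  }
}
```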