for bigquery table spark jobs, add transient to unblock serialization issues and do not create table when attempting to insert #174
Changes from all commits: d9f0834, aac350a, d770011, e1137db, fe164cf, 9a33536, 945c635, 7958562, fe28953, 3fd9e08
```diff
@@ -88,7 +88,9 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable
   private val blockingCacheEviction: Boolean =
     sparkSession.conf.get("spark.chronon.table_write.cache.blocking", "false").toBoolean

-  private[spark] lazy val tableFormatProvider: FormatProvider = {
+  // Add transient here because the format provider can sometimes not be serializable.
+  // for example, BigQueryImpl during reflecting with bq flavor
+  @transient private[spark] lazy val tableFormatProvider: FormatProvider = {
     val clazzName =
       sparkSession.conf.get("spark.chronon.table.format_provider.class", classOf[DefaultFormatProvider].getName)
     val mirror = runtimeMirror(getClass.getClassLoader)
```
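The `@transient` plus `lazy val` combination is the standard Scala idiom for this: the field is skipped by Java serialization, and because it is lazy it is simply recomputed on first access after deserialization on the executor. A minimal sketch of the pattern outside Spark (the `ExpensiveProvider` and `TableUtilsLike` names are hypothetical, for illustration only):

```scala
import java.io._

// Stands in for a member type that is NOT Serializable,
// like the reflectively-loaded BigQuery format provider.
class ExpensiveProvider { def format: String = "bigquery" }

class TableUtilsLike(val conf: Map[String, String]) extends Serializable {
  // Without @transient, serializing an instance whose lazy val was forced
  // fails with java.io.NotSerializableException: ExpensiveProvider.
  // With @transient, the field is dropped during serialization and
  // lazily re-initialized on the deserialized copy.
  @transient lazy val provider: ExpensiveProvider = new ExpensiveProvider
}

object TransientDemo extends App {
  val utils = new TableUtilsLike(Map("k" -> "v"))
  utils.provider // force initialization on the "driver" side

  // Round-trip through Java serialization, as shipping a Spark closure would.
  val bytes = new ByteArrayOutputStream()
  val oos   = new ObjectOutputStream(bytes)
  oos.writeObject(utils)
  oos.flush()

  val copy = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray))
    .readObject()
    .asInstanceOf[TableUtilsLike]

  println(copy.provider.format) // provider is re-created lazily: prints "bigquery"
}
```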
```diff
@@ -261,6 +263,48 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable
   def firstAvailablePartition(tableName: String, subPartitionFilters: Map[String, String] = Map.empty): Option[String] =
     partitions(tableName, subPartitionFilters).reduceOption((x, y) => Ordering[String].min(x, y))

+  def createTable(df: DataFrame,
+                  tableName: String,
+                  partitionColumns: Seq[String] = Seq.empty,
+                  writeFormatTypeString: String = "",
+                  tableProperties: Map[String, String] = null,
+                  fileFormat: String = "PARQUET",
+                  autoExpand: Boolean = false): Boolean = {
+    val doesTableExist = tableExists(tableName)
+
+    // create table sql doesn't work for bigquery here. instead of creating the table explicitly, we can rely on the
+    // bq connector to indirectly create the table and eventually write the data
+    if (writeFormatTypeString.toUpperCase == "BIGQUERY") {
+      logger.info(s"Skipping table creation in BigQuery for $tableName. tableExists=$doesTableExist")
+      return doesTableExist
+    }
+
+    if (!doesTableExist) {
+      val creationSql = createTableSql(tableName, df.schema, partitionColumns, tableProperties, fileFormat)
+      try {
+        sql(creationSql)
+      } catch {
+        case _: TableAlreadyExistsException =>
+          logger.info(s"Table $tableName already exists, skipping creation")
+        case e: Exception =>
+          logger.error(s"Failed to create table $tableName", e)
+          throw e
+      }
+    }
+
+    // TODO: we need to also allow for bigquery tables to have their table properties (or tags) to be persisted too.
+    // https://app.asana.com/0/1208949807589885/1209111629687568/f
+    if (tableProperties != null && tableProperties.nonEmpty) {
+      sql(alterTablePropertiesSql(tableName, tableProperties))
+    }
+    if (autoExpand) {
+      expandTable(tableName, df.schema)
+    }
+
+    true
+  }
+
   // Needs provider
   def insertPartitions(df: DataFrame,
                        tableName: String,
```

Review thread on the `alterTablePropertiesSql` call (resolved):

Collaborator: I think we actually want to persist some of these table properties though. The behavior here isn't quite equivalent for BigQuery in that any custom properties we pass through here ultimately don't make it to the BigQuery table. That might be problematic down the line.

Collaborator: @david-zlai mind just adding a TODO here so we can take care of it for BigQuery?
```diff
@@ -279,30 +323,18 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable
       df
     }

-    if (!tableExists(tableName)) {
-      val creationSql = createTableSql(tableName, dfRearranged.schema, partitionColumns, tableProperties, fileFormat)
-      try {
-        sql(creationSql)
-      } catch {
-        case _: TableAlreadyExistsException =>
-          logger.info(s"Table $tableName already exists, skipping creation")
-        case e: Exception =>
-          logger.error(s"Failed to create table $tableName", e)
-          throw e
-      }
-    }
-    if (tableProperties != null && tableProperties.nonEmpty) {
-      sql(alterTablePropertiesSql(tableName, tableProperties))
-    }
-
-    if (autoExpand) {
-      expandTable(tableName, dfRearranged.schema)
-    }
+    val isTableCreated = createTable(dfRearranged,
+                                     tableName,
+                                     partitionColumns,
+                                     tableFormatProvider.writeFormat(tableName).createTableTypeString,
+                                     tableProperties,
+                                     fileFormat,
+                                     autoExpand)

-    val finalizedDf = if (autoExpand) {
+    val finalizedDf = if (autoExpand && isTableCreated) {
       // reselect the columns so that an deprecated columns will be selected as NULL before write
-      val updatedSchema = getSchemaFromTable(tableName)
-      val finalColumns = updatedSchema.fieldNames.map(fieldName => {
+      val tableSchema = getSchemaFromTable(tableName)
+      val finalColumns = tableSchema.fieldNames.map(fieldName => {
         if (dfRearranged.schema.fieldNames.contains(fieldName)) {
           col(fieldName)
         } else {
```

Review thread on `val finalizedDf = if (autoExpand && isTableCreated) {`:

Collaborator: @david-zlai will this be correct if the table already exists and therefore does not need to be created? I think looking at the code yes, but could you add a unit test to verify that?

Contributor (Author): added a test
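The test the author refers to is not included in this diff. A minimal sketch of the property under review, assuming ScalaTest and a local Spark session (the spec name and harness setup are hypothetical, and this is not a drop-in test for the repo's own test infrastructure): `createTable` should return `true` on the already-exists path too, so the `autoExpand` reselection downstream is not skipped.

```scala
import org.apache.spark.sql.SparkSession
import org.scalatest.flatspec.AnyFlatSpec

class TableUtilsCreateTableSpec extends AnyFlatSpec {
  private val spark = SparkSession.builder()
    .master("local[1]")
    .appName("createTable-test")
    .getOrCreate()

  import spark.implicits._

  "createTable" should "return true when the table already exists" in {
    val tableUtils = new TableUtils(spark) // assumes the constructor shown in this diff
    val df = Seq((1, "a"), (2, "b")).toDF("id", "name")

    // First call creates the table; second call hits the already-exists path.
    // Both should return true, so callers gating on the result behave the same.
    assert(tableUtils.createTable(df, "default.create_table_twice"))
    assert(tableUtils.createTable(df, "default.create_table_twice"))
  }
}
```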
```diff
@@ -362,13 +394,12 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable
                           saveMode: SaveMode = SaveMode.Overwrite,
                           fileFormat: String = "PARQUET"): Unit = {

-    if (!tableExists(tableName)) {
-      sql(createTableSql(tableName, df.schema, Seq.empty[String], tableProperties, fileFormat))
-    } else {
-      if (tableProperties != null && tableProperties.nonEmpty) {
-        sql(alterTablePropertiesSql(tableName, tableProperties))
-      }
-    }
+    createTable(df,
+                tableName,
+                Seq.empty[String],
+                tableFormatProvider.writeFormat(tableName).createTableTypeString,
+                tableProperties,
+                fileFormat)

     repartitionAndWrite(df, tableName, saveMode, None)
   }
```