69 changes: 43 additions & 26 deletions spark/src/main/scala/ai/chronon/spark/TableUtils.scala
@@ -88,7 +88,9 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable
   private val blockingCacheEviction: Boolean =
     sparkSession.conf.get("spark.chronon.table_write.cache.blocking", "false").toBoolean

-  private[spark] lazy val tableFormatProvider: FormatProvider = {
+  // Add transient here because it can contain BigQueryImpl during reflection with bq flavor
> Contributor: maybe swap the 'BigqueryImpl' with the generic phrase - concrete format provider which might not be serializable? (As this isn't restricted to BQ)
>
> Author: removing
+  // and that can't be serialized by Spark
+  @transient private[spark] lazy val tableFormatProvider: FormatProvider = {
     val clazzName =
       sparkSession.conf.get("spark.chronon.table.format_provider.class", classOf[DefaultFormatProvider].getName)
     val mirror = runtimeMirror(getClass.getClassLoader)
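For context on the @transient change: once a lazy val on a Serializable class has been materialized, its backing field is serialized along with the enclosing object when Spark ships it to executors, so a provider holding a non-serializable client (such as a BigQuery client) fails task serialization. A minimal sketch of the failure mode and the fix, with hypothetical names, not code from this PR:

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical stand-in for a format provider wrapping a client that does
// not implement java.io.Serializable (the BigQueryImpl case mentioned above).
class NonSerializableProvider

class Utils(@transient val spark: SparkSession) extends Serializable {
  // Without @transient: after first access, the initialized field is serialized
  // with the object and Spark throws java.io.NotSerializableException.
  // With @transient: the field is skipped during serialization and the lazy val
  // is simply re-initialized on first access after deserialization.
  @transient lazy val provider: NonSerializableProvider = new NonSerializableProvider
}
```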
@@ -261,6 +263,35 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable
   def firstAvailablePartition(tableName: String, subPartitionFilters: Map[String, String] = Map.empty): Option[String] =
     partitions(tableName, subPartitionFilters).reduceOption((x, y) => Ordering[String].min(x, y))

+  def createTable(df: DataFrame,
+                  tableName: String,
+                  partitionColumns: Seq[String] = Seq.empty,
+                  writeFormatTypeString: String = "",
+                  tableProperties: Map[String, String] = null,
+                  fileFormat: String = "PARQUET",
+                  autoExpand: Boolean = false): Unit = {
+    // create table sql doesn't work for bigquery here. instead of creating the table explicitly, we can rely on the
+    // bq connector to indirectly create the table and eventually write the data
+    if (!tableExists(tableName) && writeFormatTypeString.toUpperCase != "BIGQUERY") {
+      val creationSql = createTableSql(tableName, df.schema, partitionColumns, tableProperties, fileFormat)
+      try {
+        sql(creationSql)
+      } catch {
+        case _: TableAlreadyExistsException =>
+          logger.info(s"Table $tableName already exists, skipping creation")
+        case e: Exception =>
+          logger.error(s"Failed to create table $tableName", e)
+          throw e
+      }
+    }
+    if (tableProperties != null && tableProperties.nonEmpty) {
+      sql(alterTablePropertiesSql(tableName, tableProperties))
> Collaborator: I think we actually want to persist some of these table properties though. The behavior here isn't quite equivalent for bigquery in that any custom properties we pass through here ultimately don't make it to the bigquery table. That might be problematic down the line.
>
> Collaborator: @david-zlai mind just adding a TODO here so we can take care of it for BigQuery?
+    }
+    if (autoExpand) {
+      expandTable(tableName, df.schema)
+    }
+  }
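For reference, the indirect path the in-code comment relies on: with the spark-bigquery-connector, the table is created as a side effect of writing the DataFrame (CREATE_IF_NEEDED semantics), so no explicit CREATE TABLE is issued for the BIGQUERY format. A hedged sketch, assuming the open-source connector's options; the table id is hypothetical:

```scala
import org.apache.spark.sql.SaveMode

// Sketch only: the connector creates the target table if it does not exist
// and then writes the rows, which is why createTable can skip explicit DDL.
df.write
  .format("bigquery")
  .option("writeMethod", "direct") // "indirect" also works, plus a temporaryGcsBucket
  .mode(SaveMode.Append)
  .save("my_dataset.my_table")     // hypothetical dataset.table
```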

+  // Needs provider
   def insertPartitions(df: DataFrame,
                        tableName: String,
@@ -279,25 +310,15 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable
       df
     }

-    if (!tableExists(tableName)) {
-      val creationSql = createTableSql(tableName, dfRearranged.schema, partitionColumns, tableProperties, fileFormat)
-      try {
-        sql(creationSql)
-      } catch {
-        case _: TableAlreadyExistsException =>
-          logger.info(s"Table $tableName already exists, skipping creation")
-        case e: Exception =>
-          logger.error(s"Failed to create table $tableName", e)
-          throw e
-      }
-    }
-    if (tableProperties != null && tableProperties.nonEmpty) {
-      sql(alterTablePropertiesSql(tableName, tableProperties))
-    }
+    val writeFormatTypeString = tableFormatProvider.writeFormat(tableName).createTableTypeString

-    if (autoExpand) {
-      expandTable(tableName, dfRearranged.schema)
-    }
+    createTable(dfRearranged,
+                tableName,
+                partitionColumns,
+                writeFormatTypeString,
+                tableProperties,
+                fileFormat,
+                autoExpand)

     val finalizedDf = if (autoExpand) {
> Collaborator: the chain of logic here doesn't match the original then - in this if branch we are trying to retrieve the schema from the table, but it's possible it doesn't yet exist, since autoExpand may be set true for a BigQuery table. getSchemaFromTable will probably throw an exception if the table doesn't exist. So we probably need to pull this df logic into createTable as well.
>
> Author: great catch
>
> Author: weird this piece of code is only in insertPartitions but not at all in insertUnPartitioned 🙃
>
> Author: oh...insertUnpartitioned doesn't even have autoExpand in it

       // reselect the columns so that any deprecated columns will be selected as NULL before write
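A minimal sketch of the guard the first comment above is asking for, assuming TableUtils' existing tableExists and getSchemaFromTable helpers; an illustration of the idea, not the PR's final fix:

```scala
import org.apache.spark.sql.functions.{col, lit}

// Only read the schema back from the catalog when the table really exists;
// for BigQuery, createTable defers creation to the connector, so at this
// point the table may not exist yet and getSchemaFromTable would throw.
val finalizedDf = if (autoExpand && tableExists(tableName)) {
  val tableSchema = getSchemaFromTable(tableName)
  // reselect so that columns present in the table but missing from the
  // DataFrame are written as typed NULLs
  val finalColumns = tableSchema.fieldNames.map { name =>
    if (dfRearranged.columns.contains(name)) col(name)
    else lit(null).cast(tableSchema(name).dataType).as(name)
  }
  dfRearranged.select(finalColumns: _*)
} else {
  dfRearranged
}
```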
@@ -362,13 +383,9 @@ class TableUtils(@transient val sparkSession: SparkSession) extends Serializable
                           saveMode: SaveMode = SaveMode.Overwrite,
                           fileFormat: String = "PARQUET"): Unit = {

-    if (!tableExists(tableName)) {
-      sql(createTableSql(tableName, df.schema, Seq.empty[String], tableProperties, fileFormat))
-    } else {
-      if (tableProperties != null && tableProperties.nonEmpty) {
-        sql(alterTablePropertiesSql(tableName, tableProperties))
-      }
-    }
+    val writeFormatTypeString = tableFormatProvider.writeFormat(tableName).createTableTypeString
+
+    createTable(df, tableName, Seq.empty[String], writeFormatTypeString, tableProperties, fileFormat)

     repartitionAndWrite(df, tableName, saveMode, None)
   }
51 changes: 49 additions & 2 deletions spark/src/test/scala/ai/chronon/spark/test/TableUtilsTest.scala
@@ -31,8 +31,7 @@ import org.apache.spark.sql.Row
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.functions.col
 import org.apache.spark.sql.types
-import org.junit.Assert.assertEquals
-import org.junit.Assert.assertTrue
+import org.junit.Assert.{assertEquals, assertFalse, assertTrue}
 import org.junit.Test

 import scala.util.Try
@@ -431,4 +430,52 @@ class TableUtilsTest {
tableUtils.sql("CREATE TEMPORARY FUNCTION test AS 'ai.chronon.spark.test.SimpleAddUDF'")
}

+  @Test
+  def testCreateTable(): Unit = {
+    val tableName = "db.test_create_table"
+    spark.sql("CREATE DATABASE IF NOT EXISTS db")
+
+    val columns = Array(
+      StructField("long_field", LongType),
+      StructField("int_field", IntType),
+      StructField("string_field", StringType)
+    )
+    val df = makeDf(
+      spark,
+      StructType(
+        tableName,
+        columns
+      ),
+      List(
+        Row(1L, 2, "3")
+      )
+    )
+    tableUtils.createTable(df, tableName)
+    assertTrue(spark.catalog.tableExists(tableName))
+  }
> Contributor: 🛠️ Refactor suggestion: Add table cleanup and data verification.
>
> The test should:
>   1. Clean up the table after the test
>   2. Verify the schema and data, not just table existence
>
> Suggested change:
 @Test
 def testCreateTable(): Unit = {
   val tableName = "db.test_create_table"
   spark.sql("CREATE DATABASE IF NOT EXISTS db")
+  try {
     val columns = Array(
       StructField("long_field", LongType),
       StructField("int_field", IntType),
       StructField("string_field", StringType)
     )
     val df = makeDf(
       spark,
       StructType(
         tableName,
         columns
       ),
       List(
         Row(1L, 2, "3")
       )
     )
     tableUtils.createTable(df, tableName)
     assertTrue(spark.catalog.tableExists(tableName))
+    val createdTable = spark.table(tableName)
+    assertEquals(df.schema, createdTable.schema)
+    assertEquals(df.collect().toSeq, createdTable.collect().toSeq)
+  } finally {
+    spark.sql(s"DROP TABLE IF EXISTS $tableName")
+  }
 }


+  @Test
+  def testCreateTableBigQuery(): Unit = {
+    val tableName = "db.test_create_table_bigquery"
+    spark.sql("CREATE DATABASE IF NOT EXISTS db")
+
+    val columns = Array(
+      StructField("long_field", LongType),
+      StructField("int_field", IntType),
+      StructField("string_field", StringType)
+    )
+    val df = makeDf(
+      spark,
+      StructType(
+        tableName,
+        columns
+      ),
+      List(
+        Row(1L, 2, "3")
+      )
+    )
+    tableUtils.createTable(df, tableName, writeFormatTypeString = "BIGQUERY")
+    assertFalse(spark.catalog.tableExists(tableName))
+  }

}