Implement Delta Lake in Azure Synapse Analytics Spark.
Time to configure access to the blob storage account and load the raw taxi data (train.csv) into a DataFrame.
import org.apache.spark.sql._
import org.apache.spark.sql.functions._

// Register the storage account key so Spark can read from the wasbs:// path
spark.conf.set(
  "fs.azure.account.key.waginput.blob.core.windows.net",
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")

// Read the raw CSV with a header row and an inferred schema
val df = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("wasbs://[email protected]/train.csv")

display(df)
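Before deriving anything from the data, it is worth checking what inferSchema actually produced, for example with printSchema:

// Print the inferred column names and types
df.printSchema()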
import org.apache.spark.sql.SaveMode

// Derive a Date column from pickup_datetime; it becomes the partition column for the Delta table
val df1 = df.withColumn("Date", col("pickup_datetime").cast("date"))
display(df1)
// Write the DataFrame as a Delta table, partitioned by Date
df1.write.format("delta").mode("overwrite").partitionBy("Date").save("/delta/taxidata/")

// Read it back from the Delta location
val df_delta = spark.read.format("delta").load("/delta/taxidata/")
display(df_delta)
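Because the table is partitioned by Date, a filter on that column should only read the files belonging to the matching partition. A minimal sketch (the date value here is just a placeholder, pick one that exists in your data):

import org.apache.spark.sql.functions.col

// Partition pruning: only the Date=2015-06-01 partition is scanned (placeholder date)
display(df_delta.filter(col("Date") === "2015-06-01"))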
display(spark.sql("DROP TABLE IF EXISTS taxidata"))
display(spark.sql("CREATE TABLE taxidata USING DELTA LOCATION '/delta/taxidata/'"))
display(spark.sql("TRUNCATE TABLE taxidata"))
df_delta.count()
// Load the same source file again to simulate an incoming batch of historical events
val historical_events = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("wasbs://[email protected]/train.csv")

val historical_events1 = historical_events.withColumn("Date", col("pickup_datetime").cast("date"))
import io.delta.tables._
import org.apache.spark.sql.functions._

// Merge the batch into the Delta table: rows that do not already exist
// (matched on key plus pickup timestamp and pickup/dropoff coordinates) are inserted
DeltaTable.forPath(spark, "/delta/taxidata/")
  .as("org")
  .merge(
    historical_events1.as("updates"),
    "org.key = updates.key and org.pickup_datetime = updates.pickup_datetime and org.pickup_longitude = updates.pickup_longitude and org.pickup_latitude = updates.pickup_latitude and org.dropoff_longitude = updates.dropoff_longitude and org.dropoff_latitude = updates.dropoff_latitude")
  .whenNotMatched()
  .insertAll()
  .execute()
// Row count after the merge
df_delta.count()
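The merge above only inserts rows that are missing. If incoming batches could also carry corrected values for existing trips, the same API can update matched rows as well; a sketch of that variant (not part of the flow above), matching only on the key column:

import io.delta.tables._  // already imported above

DeltaTable.forPath(spark, "/delta/taxidata/")
  .as("org")
  .merge(historical_events1.as("updates"), "org.key = updates.key")
  .whenMatched().updateAll()      // overwrite existing rows with the incoming values
  .whenNotMatched().insertAll()   // insert rows that are new
  .execute()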
// Alternatively, append the batch directly; unlike the merge, this adds the rows as new files without any duplicate check
historical_events1.write.format("delta").mode("append").partitionBy("Date").save("/delta/taxidata/")
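Each of these writes (the initial overwrite, the truncate, the merge, the append) produces a new version of the table, so earlier states stay queryable. A small sketch using time travel to read the very first version and compare counts (version 0 is assumed to be the initial load):

// Read the table as it was at version 0 (the initial overwrite)
val df_v0 = spark.read.format("delta").option("versionAsOf", 0).load("/delta/taxidata/")
df_v0.count()

// Current version for comparison
spark.read.format("delta").load("/delta/taxidata/").count()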
Have fun