-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-31183][SQL] Rebase date/timestamp from/to Julian calendar in Avro #27953
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
23b013b
9f1c0ea
a9b4b8a
de064d1
5837181
7c74989
61cf83b
9a96af0
2e1cee1
dac03f2
dd24d91
19a5ff7
2464c90
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,15 +17,15 @@ | |
| package org.apache.spark.sql.avro | ||
|
|
||
| import java.io.File | ||
| import java.sql.Timestamp | ||
| import java.sql.{Date, Timestamp} | ||
|
|
||
| import org.apache.avro.{LogicalTypes, Schema} | ||
| import org.apache.avro.Conversions.DecimalConversion | ||
| import org.apache.avro.file.DataFileWriter | ||
| import org.apache.avro.generic.{GenericData, GenericDatumWriter, GenericRecord} | ||
|
|
||
| import org.apache.spark.{SparkConf, SparkException} | ||
| import org.apache.spark.sql.{QueryTest, Row} | ||
| import org.apache.spark.sql.{DataFrame, QueryTest, Row} | ||
| import org.apache.spark.sql.catalyst.util.DateTimeUtils | ||
| import org.apache.spark.sql.internal.SQLConf | ||
| import org.apache.spark.sql.test.SharedSparkSession | ||
|
|
@@ -348,6 +348,100 @@ abstract class AvroLogicalTypeSuite extends QueryTest with SharedSparkSession { | |
| assert(msg.contains("Unscaled value too large for precision")) | ||
| } | ||
| } | ||
|
|
||
| private def readResourceAvroFile(name: String): DataFrame = { | ||
| val url = Thread.currentThread().getContextClassLoader.getResource(name) | ||
| spark.read.format("avro").load(url.toString) | ||
| } | ||
|
|
||
| test("SPARK-31183: compatibility with Spark 2.4 in reading dates/timestamps") { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Missed one thing. I think the test is not very related to logical types and probably should be put elsewhere. @MaxGekk can you move the test in your next PR?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you mean only this test, correct?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. All the new tests added here. The are more about compatibility, not logical type. |
||
| withSQLConf(SQLConf.LEGACY_AVRO_REBASE_DATETIME.key -> "true") { | ||
| checkAnswer( | ||
| readResourceAvroFile("before_1582_date_v2_4.avro"), | ||
| Row(java.sql.Date.valueOf("1001-01-01"))) | ||
| checkAnswer( | ||
| readResourceAvroFile("before_1582_ts_micros_v2_4.avro"), | ||
| Row(java.sql.Timestamp.valueOf("1001-01-01 01:02:03.123456"))) | ||
| checkAnswer( | ||
| readResourceAvroFile("before_1582_ts_millis_v2_4.avro"), | ||
| Row(java.sql.Timestamp.valueOf("1001-01-01 01:02:03.124"))) | ||
| } | ||
| } | ||
|
|
||
| test("SPARK-31183: rebasing microseconds timestamps in write") { | ||
| val tsStr = "1001-01-01 01:02:03.123456" | ||
| val nonRebased = "1001-01-07 01:09:05.123456" | ||
| withTempPath { dir => | ||
| val path = dir.getAbsolutePath | ||
| withSQLConf(SQLConf.LEGACY_AVRO_REBASE_DATETIME.key -> "true") { | ||
| Seq(tsStr).toDF("tsS") | ||
| .select($"tsS".cast("timestamp").as("ts")) | ||
| .write.format("avro") | ||
| .save(path) | ||
|
|
||
| checkAnswer(spark.read.format("avro").load(path), Row(Timestamp.valueOf(tsStr))) | ||
| } | ||
| withSQLConf(SQLConf.LEGACY_AVRO_REBASE_DATETIME.key -> "false") { | ||
| checkAnswer(spark.read.format("avro").load(path), Row(Timestamp.valueOf(nonRebased))) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| test("SPARK-31183: rebasing milliseconds timestamps in write") { | ||
| val tsStr = "1001-01-01 01:02:03.123456" | ||
| val rebased = "1001-01-01 01:02:03.123" | ||
| val nonRebased = "1001-01-07 01:09:05.123" | ||
| Seq( | ||
| """{"type": "long","logicalType": "timestamp-millis"}""", | ||
| """"long"""").foreach { tsType => | ||
| val timestampSchema = s""" | ||
| |{ | ||
| | "namespace": "logical", | ||
| | "type": "record", | ||
| | "name": "test", | ||
| | "fields": [ | ||
| | {"name": "ts", "type": $tsType} | ||
| | ] | ||
| |}""".stripMargin | ||
| withTempPath { dir => | ||
| val path = dir.getAbsolutePath | ||
| withSQLConf(SQLConf.LEGACY_AVRO_REBASE_DATETIME.key -> "true") { | ||
| Seq(tsStr).toDF("tsS") | ||
| .select($"tsS".cast("timestamp").as("ts")) | ||
| .write | ||
| .option("avroSchema", timestampSchema) | ||
| .format("avro") | ||
| .save(path) | ||
|
|
||
| checkAnswer( | ||
| spark.read.schema("ts timestamp").format("avro").load(path), | ||
| Row(Timestamp.valueOf(rebased))) | ||
| } | ||
| withSQLConf(SQLConf.LEGACY_AVRO_REBASE_DATETIME.key -> "false") { | ||
| checkAnswer( | ||
| spark.read.schema("ts timestamp").format("avro").load(path), | ||
| Row(Timestamp.valueOf(nonRebased))) | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| test("SPARK-31183: rebasing dates in write") { | ||
| withTempPath { dir => | ||
| val path = dir.getAbsolutePath | ||
| withSQLConf(SQLConf.LEGACY_AVRO_REBASE_DATETIME.key -> "true") { | ||
| Seq("1001-01-01").toDF("dateS") | ||
| .select($"dateS".cast("date").as("date")) | ||
| .write.format("avro") | ||
| .save(path) | ||
|
|
||
| checkAnswer(spark.read.format("avro").load(path), Row(Date.valueOf("1001-01-01"))) | ||
| } | ||
| withSQLConf(SQLConf.LEGACY_AVRO_REBASE_DATETIME.key -> "false") { | ||
| checkAnswer(spark.read.format("avro").load(path), Row(Date.valueOf("1001-01-07"))) | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| class AvroV1LogicalTypeSuite extends AvroLogicalTypeSuite { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One more thing, why don't we return a function rather than checking
`rebaseDateTime` every time?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I can move the flag checking out of the function body in a follow-up PR, or in the same one, for #27953 (comment)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it's easy to switch with almost no additional complexity. Seems fine to change rather than relying on other optimization like JIT, or having a bad example.