[SPARK-31076][SQL] Convert Catalyst's DATE/TIMESTAMP to Java Date/Timestamp via local date-time #27807
```diff
@@ -23,7 +23,7 @@ class HiveResultSuite extends SharedSparkSession {
   import testImplicits._

   test("date formatting in hive result") {
-    val dates = Seq("2018-12-28", "1582-10-13", "1582-10-14", "1582-10-15")
+    val dates = Seq("2018-12-28", "1582-10-03", "1582-10-04", "1582-10-15")
     val df = dates.toDF("a").selectExpr("cast(a as date) as b")
     val executedPlan1 = df.queryExecution.executedPlan
     val result = HiveResult.hiveResultString(executedPlan1)
```
```diff
@@ -36,8 +36,8 @@ class HiveResultSuite extends SharedSparkSession {
   test("timestamp formatting in hive result") {
     val timestamps = Seq(
       "2018-12-28 01:02:03",
-      "1582-10-13 01:02:03",
-      "1582-10-14 01:02:03",
+      "1582-10-03 01:02:03",
```
**Member (Author):** Conversions of timestamps in the range 1582-10-04 to 1582-10-15 are implementation-specific because of the calendar switch.
```diff
+      "1582-10-04 01:02:03",
       "1582-10-15 01:02:03")
     val df = timestamps.toDF("a").selectExpr("cast(a as timestamp) as b")
     val executedPlan1 = df.queryExecution.executedPlan
```
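Why the test dates (in both the date and the timestamp tests) moved out of the range 1582-10-05 to 1582-10-14: those days were skipped by the Gregorian reform and do not exist in the hybrid Julian/Gregorian calendar behind `java.sql.Date` and `java.sql.Timestamp`, so their handling is implementation-specific. A minimal sketch of the effect, assuming the JVM's default lenient `GregorianCalendar` (the exact normalized date printed may vary by JDK and time zone):

```scala
import java.sql.Date

println(Date.valueOf("1582-10-04")) // last Julian day; exists in both calendars
println(Date.valueOf("1582-10-14")) // inside the 10-day gap; the lenient hybrid
                                    // calendar silently shifts it, e.g. to 1582-10-24
println(Date.valueOf("1582-10-15")) // first day of the Gregorian calendar
```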
```diff
@@ -21,6 +21,7 @@
 import org.apache.hadoop.hive.ql.exec.vector.*;

+import org.apache.spark.sql.catalyst.util.DateTimeUtils;
 import org.apache.spark.sql.types.DataType;
 import org.apache.spark.sql.types.Decimal;
 import org.apache.spark.sql.types.TimestampType;
```
```diff
@@ -136,7 +137,7 @@ public int getInt(int rowId) {
   public long getLong(int rowId) {
     int index = getRowIndex(rowId);
     if (isTimestamp) {
-      return timestampData.time[index] * 1000 + timestampData.nanos[index] / 1000 % 1000;
```
**Contributor:** I took a look at the ORC type spec, but it doesn't mention the calendar. The physical timestamp type looks very similar to [...]. How about the write side?

**Member (Author):** The write side already uses `toJavaTimestamp`: spark/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcSerializer.scala, lines 144 to 148 at 300ec1a.

**Member:** Yes, @cloud-fan. Apache ORC inherited Apache Hive's original design.
```diff
+      return DateTimeUtils.fromJavaTimestamp(timestampData.asScratchTimestamp(index));
     } else {
       return longData.vector[index];
     }
```
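For context, the change replaces instant-based arithmetic with `DateTimeUtils.fromJavaTimestamp`, which after this PR rebases through the wall-clock (local) date-time so that pre-1582 timestamps keep their textual form. A hedged sketch of the two readings, not the exact Spark code, assuming the JVM default time zone is UTC:

```scala
import java.sql.Timestamp
import java.time.ZoneOffset

val ts = Timestamp.valueOf("1001-01-01 01:02:03")

// Old reading: microseconds derived from the instant, which java.sql.Timestamp
// computes with the hybrid Julian/Gregorian calendar.
val oldMicros = ts.getTime * 1000 + ts.getNanos / 1000 % 1000

// New reading (the idea behind fromJavaTimestamp after this PR): keep the
// wall-clock fields and reinterpret them in the Proleptic Gregorian calendar.
val ldt = ts.toLocalDateTime
val newMicros = ldt.toEpochSecond(ZoneOffset.UTC) * 1000000L + ldt.getNano / 1000

// For dates before 1582-10-15 the two results differ by whole days.
println((oldMicros - newMicros) / 86400000000L)
```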
```diff
@@ -18,7 +18,8 @@
 package org.apache.spark.sql.hive

 import java.lang.reflect.{ParameterizedType, Type, WildcardType}
-import java.util.concurrent.TimeUnit._
+import java.time.LocalDate
+import java.util.Calendar

 import scala.collection.JavaConverters._
```
```diff
@@ -181,6 +182,33 @@ import org.apache.spark.unsafe.types.UTF8String
  */
 private[hive] trait HiveInspectors {

+  private final val JULIAN_CUTOVER_DAY =
+    rebaseGregorianToJulianDays(DateTimeUtils.GREGORIAN_CUTOVER_DAY.toInt)
+
+  private def rebaseJulianToGregorianDays(daysSinceEpoch: Int): Int = {
+    val millis = Math.multiplyExact(daysSinceEpoch, DateTimeConstants.MILLIS_PER_DAY)
+    val utcCal = new Calendar.Builder()
+      .setCalendarType("gregory")
+      .setTimeZone(DateTimeUtils.TimeZoneUTC)
+      .setInstant(millis)
+      .build()
+    val localDate = LocalDate.of(
+      utcCal.get(Calendar.YEAR),
+      utcCal.get(Calendar.MONTH) + 1,
+      utcCal.get(Calendar.DAY_OF_MONTH))
+    Math.toIntExact(localDate.toEpochDay)
+  }
+
+  private def rebaseGregorianToJulianDays(daysSinceEpoch: Int): Int = {
+    val localDate = LocalDate.ofEpochDay(daysSinceEpoch)
+    val utcCal = new Calendar.Builder()
+      .setCalendarType("gregory")
+      .setTimeZone(DateTimeUtils.TimeZoneUTC)
+      .setDate(localDate.getYear, localDate.getMonthValue - 1, localDate.getDayOfMonth)
+      .build()
+    Math.toIntExact(Math.floorDiv(utcCal.getTimeInMillis, DateTimeConstants.MILLIS_PER_DAY))
+  }
+
   def javaTypeToDataType(clz: Type): DataType = clz match {
     // writable
     case c: Class[_] if c == classOf[hadoopIo.DoubleWritable] => DoubleType
```
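To make the rebase helpers concrete, here is a self-contained check of the -141714/-141704 example cited in the `getDateWritable()` comment below. This is a sketch assuming a UTC time zone; the helper names `toJulianDays`/`toGregorianDays` and the inlined millis-per-day constant are illustrative, mirroring the two methods above:

```scala
import java.time.LocalDate
import java.util.{Calendar, TimeZone}

val MillisPerDay = 86400000L
val utc = TimeZone.getTimeZone("UTC")

// Proleptic Gregorian -> hybrid (Julian before 1582-10-15), as in rebaseGregorianToJulianDays.
def toJulianDays(days: Int): Int = {
  val ld = LocalDate.ofEpochDay(days)
  val cal = new Calendar.Builder()
    .setCalendarType("gregory") // java.util's "gregory" is the hybrid calendar
    .setTimeZone(utc)
    .setDate(ld.getYear, ld.getMonthValue - 1, ld.getDayOfMonth)
    .build()
  Math.toIntExact(Math.floorDiv(cal.getTimeInMillis, MillisPerDay))
}

// Hybrid -> Proleptic Gregorian, as in rebaseJulianToGregorianDays.
def toGregorianDays(days: Int): Int = {
  val cal = new Calendar.Builder()
    .setCalendarType("gregory")
    .setTimeZone(utc)
    .setInstant(days * MillisPerDay)
    .build()
  LocalDate.of(cal.get(Calendar.YEAR), cal.get(Calendar.MONTH) + 1,
    cal.get(Calendar.DAY_OF_MONTH)).toEpochDay.toInt
}

val gregDays = LocalDate.of(1582, 1, 1).toEpochDay.toInt // -141714
assert(toJulianDays(gregDays) == -141704)    // same local date, Julian day count
assert(toGregorianDays(-141704) == gregDays) // and the round trip back
```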
```diff
@@ -466,7 +494,7 @@ private[hive] trait HiveInspectors {
       _ => constant
     case poi: WritableConstantTimestampObjectInspector =>
       val t = poi.getWritableConstantValue
-      val constant = SECONDS.toMicros(t.getSeconds) + NANOSECONDS.toMicros(t.getNanos)
+      val constant = DateTimeUtils.fromJavaTimestamp(t.getTimestamp)
```
**Member (Author):** The bug fix should be made independently of this PR, I think.
```diff
       _ => constant
     case poi: WritableConstantIntObjectInspector =>
       val constant = poi.getWritableConstantValue.get()
```
```diff
@@ -618,7 +646,14 @@ private[hive] trait HiveInspectors {
     case x: DateObjectInspector if x.preferWritable() =>
       data: Any => {
         if (data != null) {
-          DateTimeUtils.fromJavaDate(x.getPrimitiveWritableObject(data).get())
+          // Rebasing written days via conversion to local dates.
+          // See the comment for `getDateWritable()`.
+          val daysSinceEpoch = x.getPrimitiveWritableObject(data).getDays
+          if (daysSinceEpoch < JULIAN_CUTOVER_DAY) {
+            rebaseJulianToGregorianDays(daysSinceEpoch)
+          } else {
+            daysSinceEpoch
+          }
         } else {
           null
         }
```
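A small illustration of why the day count read from Hive needs rebasing at all: Hive's `DateWritable` carries the hybrid-calendar count that `java.sql.Date` produces, while Catalyst expects the Proleptic Gregorian count for the same date label. A sketch, assuming a UTC default time zone:

```scala
import java.sql.Date
import java.time.LocalDate

// Hive's DateWritable stores days since the epoch computed with the hybrid
// Julian/Gregorian calendar, the same count java.sql.Date produces.
val hybridDays = Math.floorDiv(Date.valueOf("1582-01-01").getTime, 86400000L)
println(hybridDays) // -141704 with a UTC default time zone

// Catalyst in Spark 3.0 represents the same label in Proleptic Gregorian days:
println(LocalDate.parse("1582-01-01").toEpochDay) // -141714
```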
```diff
@@ -634,8 +669,7 @@ private[hive] trait HiveInspectors {
     case x: TimestampObjectInspector if x.preferWritable() =>
       data: Any => {
         if (data != null) {
-          val t = x.getPrimitiveWritableObject(data)
-          SECONDS.toMicros(t.getSeconds) + NANOSECONDS.toMicros(t.getNanos)
+          DateTimeUtils.fromJavaTimestamp(x.getPrimitiveWritableObject(data).getTimestamp)
         } else {
           null
         }
```
```diff
@@ -1012,7 +1046,27 @@ private[hive] trait HiveInspectors {
   }

   private def getDateWritable(value: Any): hiveIo.DateWritable =
-    if (value == null) null else new hiveIo.DateWritable(value.asInstanceOf[Int])
+    if (value == null) {
+      null
+    } else {
+      // Rebasing days since the epoch to store the same number of days
+      // as Spark 2.4 and earlier versions did. Spark 3.0 switched to the
+      // Proleptic Gregorian calendar (see SPARK-26651), and as a consequence
+      // this affects dates before 1582-10-15. Spark 2.4 and earlier versions
+      // use the Julian calendar for dates before 1582-10-15, so the same local
+      // date may be mapped to different numbers of days since the epoch in
+      // the two calendars. For example:
+      //   Proleptic Gregorian calendar: 1582-01-01 -> -141714
+      //   Julian calendar: 1582-01-01 -> -141704
+      // The code below converts -141714 to -141704.
```
**Member (Author):** A Gregorian year is shorter than a Julian year (365.2425 days vs. 365.25 days), so going back from the epoch, the same local date is fewer days away in the Julian calendar.
```diff
+      val daysSinceEpoch = value.asInstanceOf[Int]
+      val rebasedDays = if (daysSinceEpoch < DateTimeUtils.GREGORIAN_CUTOVER_DAY) {
+        rebaseGregorianToJulianDays(daysSinceEpoch)
+      } else {
+        daysSinceEpoch
+      }
+      new hiveIo.DateWritable(rebasedDays)
+    }

   private def getTimestampWritable(value: Any): hiveIo.TimestampWritable =
     if (value == null) {
```
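To quantify the reviewer's note above: a Julian year averages 365.25 days and a Gregorian year 365.2425, so the calendars drift apart by (365.25 - 365.2425) × 400 = 3 days every 400 years (the Gregorian calendar drops the leap day in 1700, 1800, and 1900 but keeps it in 1600). The hybrid/proleptic difference is therefore 10 days at the 1582 cutover and shrinks step by step going back in time, reaching zero in the third century.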