diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/DaysWritable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/DaysWritable.scala new file mode 100644 index 0000000000000..53a0deb270fd1 --- /dev/null +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/DaysWritable.scala @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive + +import java.io.{DataInput, DataOutput, IOException} +import java.sql.Date +import java.time.LocalDate +import java.util.Calendar + +import org.apache.hadoop.hive.serde2.io.DateWritable +import org.apache.hadoop.io.WritableUtils + +import org.apache.spark.sql.catalyst.util.{DateTimeConstants, DateTimeUtils} + +/** + * The class accepts/returns days in Gregorian calendar and rebase them + * via conversion to local date in Julian calendar for dates before 1582-10-15 + * in read/write for backward compatibility with Spark 2.4 and earlier versions. + * + * @param gregorianDays The number of days since the epoch 1970-01-01 in + * Gregorian calendar. + * @param julianDays The number of days since the epoch 1970-01-01 in + * Julian calendar. + */ +private[hive] class DaysWritable( + var gregorianDays: Int, + var julianDays: Int) + extends DateWritable { + + def this(gregorianDays: Int) = + this(gregorianDays, DaysWritable.rebaseGregorianToJulianDays(gregorianDays)) + def this(dateWritable: DateWritable) = { + this( + gregorianDays = dateWritable match { + case daysWritable: DaysWritable => daysWritable.gregorianDays + case dateWritable: DateWritable => + DaysWritable.rebaseJulianToGregorianDays(dateWritable.getDays) + }, + julianDays = dateWritable.getDays) + } + + override def getDays: Int = julianDays + override def get(): Date = new Date(DateWritable.daysToMillis(julianDays)) + + @throws[IOException] + override def write(out: DataOutput): Unit = { + WritableUtils.writeVInt(out, julianDays) + } + + @throws[IOException] + override def readFields(in: DataInput): Unit = { + julianDays = WritableUtils.readVInt(in) + gregorianDays = DaysWritable.rebaseJulianToGregorianDays(julianDays) + } +} + +private[hive] object DaysWritable { + // Rebasing days since the epoch to store the same number of days + // as by Spark 2.4 and earlier versions. Spark 3.0 switched to + // Proleptic Gregorian calendar (see SPARK-26651), and as a consequence of that, + // this affects dates before 1582-10-15. Spark 2.4 and earlier versions use + // Julian calendar for dates before 1582-10-15. So, the same local date may + // be mapped to different number of days since the epoch in different calendars. + // For example: + // Proleptic Gregorian calendar: 1582-01-01 -> -141714 + // Julian calendar: 1582-01-01 -> -141704 + // The code below converts -141714 to -141704. + def rebaseGregorianToJulianDays(daysSinceEpoch: Int): Int = { + if (daysSinceEpoch < DateTimeUtils.GREGORIAN_CUTOVER_DAY) { + val millis = Math.multiplyExact(daysSinceEpoch, DateTimeConstants.MILLIS_PER_DAY) + val utcCal = new Calendar.Builder() + .setCalendarType("gregory") + .setTimeZone(DateTimeUtils.TimeZoneUTC) + .setInstant(millis) + .build() + val localDate = LocalDate.of( + utcCal.get(Calendar.YEAR), + utcCal.get(Calendar.MONTH) + 1, + utcCal.get(Calendar.DAY_OF_MONTH)) + Math.toIntExact(localDate.toEpochDay) + } else { + daysSinceEpoch + } + } + + def rebaseJulianToGregorianDays(daysSinceEpoch: Int): Int = { + if (daysSinceEpoch < JULIAN_CUTOVER_DAY) { + val localDate = LocalDate.ofEpochDay(daysSinceEpoch) + val utcCal = new Calendar.Builder() + .setCalendarType("gregory") + .setTimeZone(DateTimeUtils.TimeZoneUTC) + .setDate(localDate.getYear, localDate.getMonthValue - 1, localDate.getDayOfMonth) + .build() + Math.toIntExact(Math.floorDiv(utcCal.getTimeInMillis, DateTimeConstants.MILLIS_PER_DAY)) + } else { + daysSinceEpoch + } + } + + final val JULIAN_CUTOVER_DAY = + rebaseGregorianToJulianDays(DateTimeUtils.GREGORIAN_CUTOVER_DAY.toInt) +} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index e217c5279046e..e3e9a3144728d 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -18,8 +18,6 @@ package org.apache.spark.sql.hive import java.lang.reflect.{ParameterizedType, Type, WildcardType} -import java.time.LocalDate -import java.util.Calendar import scala.collection.JavaConverters._ @@ -182,33 +180,6 @@ import org.apache.spark.unsafe.types.UTF8String */ private[hive] trait HiveInspectors { - private final val JULIAN_CUTOVER_DAY = - rebaseGregorianToJulianDays(DateTimeUtils.GREGORIAN_CUTOVER_DAY.toInt) - - private def rebaseJulianToGregorianDays(daysSinceEpoch: Int): Int = { - val localDate = LocalDate.ofEpochDay(daysSinceEpoch) - val utcCal = new Calendar.Builder() - .setCalendarType("gregory") - .setTimeZone(DateTimeUtils.TimeZoneUTC) - .setDate(localDate.getYear, localDate.getMonthValue - 1, localDate.getDayOfMonth) - .build() - Math.toIntExact(Math.floorDiv(utcCal.getTimeInMillis, DateTimeConstants.MILLIS_PER_DAY)) - } - - private def rebaseGregorianToJulianDays(daysSinceEpoch: Int): Int = { - val millis = Math.multiplyExact(daysSinceEpoch, DateTimeConstants.MILLIS_PER_DAY) - val utcCal = new Calendar.Builder() - .setCalendarType("gregory") - .setTimeZone(DateTimeUtils.TimeZoneUTC) - .setInstant(millis) - .build() - val localDate = LocalDate.of( - utcCal.get(Calendar.YEAR), - utcCal.get(Calendar.MONTH) + 1, - utcCal.get(Calendar.DAY_OF_MONTH)) - Math.toIntExact(localDate.toEpochDay) - } - def javaTypeToDataType(clz: Type): DataType = clz match { // writable case c: Class[_] if c == classOf[hadoopIo.DoubleWritable] => DoubleType @@ -646,14 +617,7 @@ private[hive] trait HiveInspectors { case x: DateObjectInspector if x.preferWritable() => data: Any => { if (data != null) { - // Rebasing written days via conversion to local dates. - // See the comment for `getDateWritable()`. - val daysSinceEpoch = x.getPrimitiveWritableObject(data).getDays - if (daysSinceEpoch < JULIAN_CUTOVER_DAY) { - rebaseJulianToGregorianDays(daysSinceEpoch) - } else { - daysSinceEpoch - } + new DaysWritable(x.getPrimitiveWritableObject(data)).gregorianDays } else { null } @@ -1045,27 +1009,11 @@ private[hive] trait HiveInspectors { new hadoopIo.BytesWritable(value.asInstanceOf[Array[Byte]]) } - private def getDateWritable(value: Any): hiveIo.DateWritable = + private def getDateWritable(value: Any): DaysWritable = if (value == null) { null } else { - // Rebasing days since the epoch to store the same number of days - // as by Spark 2.4 and earlier versions. Spark 3.0 switched to - // Proleptic Gregorian calendar (see SPARK-26651), and as a consequence of that, - // this affects dates before 1582-10-15. Spark 2.4 and earlier versions use - // Julian calendar for dates before 1582-10-15. So, the same local date may - // be mapped to different number of days since the epoch in different calendars. - // For example: - // Proleptic Gregorian calendar: 1582-01-01 -> -141714 - // Julian calendar: 1582-01-01 -> -141704 - // The code below converts -141714 to -141704. - val daysSinceEpoch = value.asInstanceOf[Int] - val rebasedDays = if (daysSinceEpoch < DateTimeUtils.GREGORIAN_CUTOVER_DAY) { - rebaseGregorianToJulianDays(daysSinceEpoch) - } else { - daysSinceEpoch - } - new hiveIo.DateWritable(rebasedDays) + new DaysWritable(value.asInstanceOf[Int]) } private def getTimestampWritable(value: Any): hiveIo.TimestampWritable =