diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
index 4badcbaa89aa4..0fc040ffb747b 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala
@@ -149,12 +149,15 @@ object RandomDataGenerator {
    * @param dataType the type to generate values for
    * @param nullable whether null values should be generated
    * @param rand an optional random number generator
+   * @param validJulianDatetime whether to generate dates and timestamps that are valid
+   *                            in the Julian calendar.
    * @return a function which can be called to generate random values.
    */
   def forType(
       dataType: DataType,
       nullable: Boolean = true,
-      rand: Random = new Random): Option[() => Any] = {
+      rand: Random = new Random,
+      validJulianDatetime: Boolean = false): Option[() => Any] = {
     val valueGenerator: Option[() => Any] = dataType match {
       case StringType => Some(() => rand.nextString(rand.nextInt(MAX_STR_LEN)))
       case BinaryType => Some(() => {
@@ -182,25 +185,33 @@ object RandomDataGenerator {
           "1970-01-01", // the epoch date
           "9999-12-31" // the last supported date according to SQL standard
         )
+        def getRandomDate(rand: Random): java.sql.Date = {
+          val date = DateTimeUtils.toJavaDate(uniformDaysRand(rand))
+          // The generated `date` is based on the hybrid calendar Julian + Gregorian since
+          // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used
+          // by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `date` to
+          // a local date in Proleptic Gregorian calendar to satisfy this requirement. Some
+          // years are leap years in Julian calendar but not in Proleptic Gregorian calendar.
+          // As the consequence of that, 29 February of such years might not exist in Proleptic
+          // Gregorian calendar. When this happens, we shift the date by one day.
+          Try { date.toLocalDate; date }.getOrElse(new Date(date.getTime + MILLIS_PER_DAY))
+        }
         if (SQLConf.get.getConf(SQLConf.DATETIME_JAVA8API_ENABLED)) {
           randomNumeric[LocalDate](
             rand,
-            (rand: Random) => LocalDate.ofEpochDay(uniformDaysRand(rand)),
+            (rand: Random) => {
+              val days = if (validJulianDatetime) {
+                DateTimeUtils.fromJavaDate(getRandomDate(rand))
+              } else {
+                uniformDaysRand(rand)
+              }
+              LocalDate.ofEpochDay(days)
+            },
             specialDates.map(LocalDate.parse))
         } else {
           randomNumeric[java.sql.Date](
             rand,
-            (rand: Random) => {
-              val date = DateTimeUtils.toJavaDate(uniformDaysRand(rand))
-              // The generated `date` is based on the hybrid calendar Julian + Gregorian since
-              // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used
-              // by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `date` to
-              // a local date in Proleptic Gregorian calendar to satisfy this requirement. Some
-              // years are leap years in Julian calendar but not in Proleptic Gregorian calendar.
-              // As the consequence of that, 29 February of such years might not exist in Proleptic
-              // Gregorian calendar. When this happens, we shift the date by one day.
-              Try { date.toLocalDate; date }.getOrElse(new Date(date.getTime + MILLIS_PER_DAY))
-            },
+            getRandomDate,
             specialDates.map(java.sql.Date.valueOf))
         }
       case TimestampType =>
@@ -222,10 +233,29 @@ object RandomDataGenerator {
           "1970-01-01 00:00:00", // the epoch timestamp
           "9999-12-31 23:59:59" // the last supported timestamp according to SQL standard
         )
+        def getRandomTimestamp(rand: Random): java.sql.Timestamp = {
+          // DateTimeUtils.toJavaTimestamp takes microsecond.
+          val ts = DateTimeUtils.toJavaTimestamp(uniformMicrosRand(rand))
+          // The generated `ts` is based on the hybrid calendar Julian + Gregorian since
+          // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used
+          // by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `ts` to
+          // a local timestamp in Proleptic Gregorian calendar to satisfy this requirement. Some
+          // years are leap years in Julian calendar but not in Proleptic Gregorian calendar.
+          // As the consequence of that, 29 February of such years might not exist in Proleptic
+          // Gregorian calendar. When this happens, we shift the timestamp `ts` by one day.
+          Try { ts.toLocalDateTime; ts }.getOrElse(new Timestamp(ts.getTime + MILLIS_PER_DAY))
+        }
         if (SQLConf.get.getConf(SQLConf.DATETIME_JAVA8API_ENABLED)) {
           randomNumeric[Instant](
             rand,
-            (rand: Random) => DateTimeUtils.microsToInstant(uniformMicrosRand(rand)),
+            (rand: Random) => {
+              val micros = if (validJulianDatetime) {
+                DateTimeUtils.fromJavaTimestamp(getRandomTimestamp(rand))
+              } else {
+                uniformMicrosRand(rand)
+              }
+              DateTimeUtils.microsToInstant(micros)
+            },
             specialTs.map { s =>
               val ldt = LocalDateTime.parse(s.replace(" ", "T"))
               ldt.atZone(ZoneId.systemDefault()).toInstant
@@ -233,18 +263,7 @@ object RandomDataGenerator {
         } else {
           randomNumeric[java.sql.Timestamp](
             rand,
-            (rand: Random) => {
-              // DateTimeUtils.toJavaTimestamp takes microsecond.
-              val ts = DateTimeUtils.toJavaTimestamp(uniformMicrosRand(rand))
-              // The generated `ts` is based on the hybrid calendar Julian + Gregorian since
-              // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used
-              // by Spark SQL since version 3.0 (see SPARK-26651). We try to convert `ts` to
-              // a local timestamp in Proleptic Gregorian calendar to satisfy this requirement. Some
-              // years are leap years in Julian calendar but not in Proleptic Gregorian calendar.
-              // As the consequence of that, 29 February of such years might not exist in Proleptic
-              // Gregorian calendar. When this happens, we shift the timestamp `ts` by one day.
-              Try { ts.toLocalDateTime; ts }.getOrElse(new Timestamp(ts.getTime + MILLIS_PER_DAY))
-            },
+            getRandomTimestamp,
             specialTs.map(java.sql.Timestamp.valueOf))
         }
       case CalendarIntervalType => Some(() => {
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala
index b65a00457c72c..9befd910fab31 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.sources
 
 import java.io.File
+import java.util.Locale
 
 import scala.util.Random
 
@@ -160,7 +161,17 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes
       val dataGenerator = RandomDataGenerator.forType(
         dataType = dataType,
         nullable = true,
-        new Random(seed)
+        new Random(seed),
+        // TODO(SPARK-34440): Allow saving/loading datetime in ORC w/o rebasing
+        // The ORC datasource always performs datetime rebasing that can lead to
+        // shifting of the original dates/timestamps. For instance, 1582-10-06 is a valid
+        // date in the Proleptic Gregorian calendar but it does not exist in the Julian
+        // calendar. The ORC datasource shifts the date to the next valid date 1582-10-15
+        // during rebasing of this date to the Julian calendar. Since the test compares
+        // the original date before saving and the date loaded back from the ORC files,
+        // we set `validJulianDatetime` to `true` to generate only Proleptic Gregorian
+        // dates that exist in the Julian calendar and will not be changed during rebasing.
+        validJulianDatetime = dataSourceName.toLowerCase(Locale.ROOT).contains("orc")
       ).getOrElse {
         fail(s"Failed to create data generator for schema $dataType")
       }
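
For orientation, a minimal sketch (not part of the patch) of how a test could call `forType` with the new flag; the object name, seed, and error handling are illustrative assumptions, and the snippet only compiles in test scope because `RandomDataGenerator` lives under catalyst's test sources:

import scala.util.Random

import org.apache.spark.sql.RandomDataGenerator
import org.apache.spark.sql.types.DateType

object ValidJulianDateExample {
  // Hypothetical caller: request dates that stay valid after rebasing to the
  // Julian calendar, mirroring the ORC-specific branch in HadoopFsRelationTest.
  val dateGen: () => Any = RandomDataGenerator
    .forType(DateType, nullable = false, rand = new Random(0L), validJulianDatetime = true)
    .getOrElse(sys.error("no generator for DateType"))

  // Depending on spark.sql.datetime.java8API.enabled, each call yields either a
  // java.sql.Date or a java.time.LocalDate that exists in both calendars.
  def main(args: Array[String]): Unit = println(dateGen())
}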