From 86d41fde5968608bd44d171bdeec4a78e211bbdd Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Fri, 9 Apr 2021 22:23:48 +0300 Subject: [PATCH 1/4] Impl Hive style --- .../spark/sql/catalyst/expressions/Cast.scala | 20 +++++---- .../sql/catalyst/util/IntervalUtils.scala | 43 +++++++++++++++---- .../catalyst/util/IntervalUtilsSuite.scala | 33 ++++++++++++++ .../spark/sql/execution/HiveResult.scala | 5 ++- 4 files changed, 82 insertions(+), 19 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 1e1b7eeca0f35..5d799c768af30 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -30,6 +30,7 @@ import org.apache.spark.sql.catalyst.trees.TreeNodeTag import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.catalyst.util.DateTimeConstants._ import org.apache.spark.sql.catalyst.util.DateTimeUtils._ +import org.apache.spark.sql.catalyst.util.IntervalStringStyles.ANSI_STYLE import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -407,9 +408,11 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit case udt: UserDefinedType[_] => buildCast[Any](_, o => UTF8String.fromString(udt.deserialize(o).toString)) case YearMonthIntervalType => - buildCast[Int](_, i => UTF8String.fromString(IntervalUtils.toYearMonthIntervalString(i))) + buildCast[Int](_, i => UTF8String.fromString( + IntervalUtils.toYearMonthIntervalString(i, ANSI_STYLE))) case DayTimeIntervalType => - buildCast[Long](_, i => UTF8String.fromString(IntervalUtils.toDayTimeIntervalString(i))) + buildCast[Long](_, i => UTF8String.fromString( + IntervalUtils.toDayTimeIntervalString(i, ANSI_STYLE))) case _ => buildCast[Any](_, o => UTF8String.fromString(o.toString)) } @@ -1125,14 +1128,13 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit (c, evPrim, evNull) => { code"$evPrim = UTF8String.fromString($udtRef.deserialize($c).toString());" } - case YearMonthIntervalType => + case i @ (YearMonthIntervalType | DayTimeIntervalType) => val iu = IntervalUtils.getClass.getName.stripSuffix("$") - (c, evPrim, _) => - code"""$evPrim = UTF8String.fromString($iu.toYearMonthIntervalString($c));""" - case DayTimeIntervalType => - val iu = IntervalUtils.getClass.getName.stripSuffix("$") - (c, evPrim, _) => - code"""$evPrim = UTF8String.fromString($iu.toDayTimeIntervalString($c));""" + val iss = IntervalStringStyles.getClass.getName.stripSuffix("$") + val subType = if (i.isInstanceOf[YearMonthIntervalType]) "YearMonth" else "DayTime" + val f = s"to${subType}IntervalString" + val style = s"$iss$$.MODULE$$.ANSI_STYLE()" + (c, evPrim, _) => code"""$evPrim = UTF8String.fromString($iu.$f($c, $style));""" case _ => (c, evPrim, evNull) => code"$evPrim = UTF8String.fromString(String.valueOf($c));" } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala index b96a7b9eda324..e52d3c881742d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala @@ -25,10 +25,17 @@ import scala.util.control.NonFatal import org.apache.spark.sql.catalyst.util.DateTimeConstants._ import org.apache.spark.sql.catalyst.util.DateTimeUtils.millisToMicros +import org.apache.spark.sql.catalyst.util.IntervalStringStyles.{ANSI_STYLE, HIVE_STYLE, IntervalStyle} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.Decimal import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} +// The style of textual representation of intervals +object IntervalStringStyles extends Enumeration { + type IntervalStyle = Value + val ANSI_STYLE, HIVE_STYLE = Value +} + object IntervalUtils { object IntervalUnit extends Enumeration { @@ -840,16 +847,21 @@ object IntervalUtils { * which conforms to the ANSI SQL standard. * * @param months The number of months, positive or negative + * @param style The style of textual representation of the interval * @return Year-month interval string */ - def toYearMonthIntervalString(months: Int): String = { + def toYearMonthIntervalString(months: Int, style: IntervalStyle): String = { var sign = "" var absMonths: Long = months if (months < 0) { sign = "-" absMonths = -absMonths } - s"INTERVAL '$sign${absMonths / MONTHS_PER_YEAR}-${absMonths % MONTHS_PER_YEAR}' YEAR TO MONTH" + val payload = s"$sign${absMonths / MONTHS_PER_YEAR}-${absMonths % MONTHS_PER_YEAR}" + style match { + case ANSI_STYLE => s"INTERVAL '$payload' YEAR TO MONTH" + case HIVE_STYLE => payload + } } /** @@ -857,9 +869,10 @@ object IntervalUtils { * which conforms to the ANSI SQL standard. * * @param micros The number of microseconds, positive or negative + * @param style The style of textual representation of the interval * @return Day-time interval string */ - def toDayTimeIntervalString(micros: Long): String = { + def toDayTimeIntervalString(micros: Long, style: IntervalStyle): String = { var sign = "" var rest = micros if (micros < 0) { @@ -867,20 +880,34 @@ object IntervalUtils { // Especial handling of minimum `Long` value because negate op overflows `Long`. // seconds = 106751991 * (24 * 60 * 60) + 4 * 60 * 60 + 54 = 9223372036854 // microseconds = -9223372036854000000L-775808 == Long.MinValue - return "INTERVAL '-106751991 04:00:54.775808' DAY TO SECOND" + val minIntervalString = style match { + case ANSI_STYLE => "INTERVAL '-106751991 04:00:54.775808' DAY TO SECOND" + case HIVE_STYLE => "-106751991 04:00:54.775808000" + } + return minIntervalString } else { sign = "-" rest = -rest } } - val seconds = rest % MICROS_PER_MINUTE + val secondsWithFraction = rest % MICROS_PER_MINUTE rest /= MICROS_PER_MINUTE val minutes = rest % MINUTES_PER_HOUR rest /= MINUTES_PER_HOUR val hours = rest % HOURS_PER_DAY val days = rest / HOURS_PER_DAY - val leadSecZero = if (seconds < 10 * MICROS_PER_SECOND) "0" else "" - val secStr = java.math.BigDecimal.valueOf(seconds, 6).stripTrailingZeros().toPlainString() - f"INTERVAL '$sign$days $hours%02d:$minutes%02d:$leadSecZero$secStr' DAY TO SECOND" + val leadSecZero = if (secondsWithFraction < 10 * MICROS_PER_SECOND) "0" else "" + val intervalString = style match { + case ANSI_STYLE => + val secStr = java.math.BigDecimal.valueOf(secondsWithFraction, 6) + .stripTrailingZeros() + .toPlainString() + f"INTERVAL '$sign$days $hours%02d:$minutes%02d:$leadSecZero$secStr' DAY TO SECOND" + case HIVE_STYLE => + val seconds = secondsWithFraction / MICROS_PER_SECOND + val nanos = (secondsWithFraction % MICROS_PER_SECOND) * NANOS_PER_MICROS + f"$sign$days $hours%02d:$minutes%02d:$seconds%02d.$nanos%09d" + } + intervalString } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/IntervalUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/IntervalUtilsSuite.scala index 607337383a78a..b3c59bced5d57 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/IntervalUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/IntervalUtilsSuite.scala @@ -24,6 +24,7 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.plans.SQLHelper import org.apache.spark.sql.catalyst.util.DateTimeConstants._ import org.apache.spark.sql.catalyst.util.DateTimeUtils.millisToMicros +import org.apache.spark.sql.catalyst.util.IntervalStringStyles.{ANSI_STYLE, HIVE_STYLE} import org.apache.spark.sql.catalyst.util.IntervalUtils._ import org.apache.spark.sql.catalyst.util.IntervalUtils.IntervalUnit._ import org.apache.spark.sql.internal.SQLConf @@ -441,4 +442,36 @@ class IntervalUtilsSuite extends SparkFunSuite with SQLHelper { assert(durationToMicros(duration) === micros) } } + + test("SPARK-XXXXX: format year-month intervals") { + Seq( + 0 -> ("0-0", "INTERVAL '0-0' YEAR TO MONTH"), + -11 -> ("-0-11", "INTERVAL '-0-11' YEAR TO MONTH"), + 11 -> ("0-11", "INTERVAL '0-11' YEAR TO MONTH"), + -13 -> ("-1-1", "INTERVAL '-1-1' YEAR TO MONTH"), + 13 -> ("1-1", "INTERVAL '1-1' YEAR TO MONTH"), + -24 -> ("-2-0", "INTERVAL '-2-0' YEAR TO MONTH"), + 24 -> ("2-0", "INTERVAL '2-0' YEAR TO MONTH"), + Int.MinValue -> ("-178956970-8", "INTERVAL '-178956970-8' YEAR TO MONTH"), + Int.MaxValue -> ("178956970-7", "INTERVAL '178956970-7' YEAR TO MONTH") + ).foreach { case (months, (hiveIntervalStr, ansiIntervalStr)) => + assert(toYearMonthIntervalString(months, ANSI_STYLE) === ansiIntervalStr) + assert(toYearMonthIntervalString(months, HIVE_STYLE) === hiveIntervalStr) + } + } + + test("SPARK-XXXXX: format day-time intervals") { + Seq( + 0L -> ("0 00:00:00.000000000", "INTERVAL '0 00:00:00' DAY TO SECOND"), + -1L -> ("-0 00:00:00.000001000", "INTERVAL '-0 00:00:00.000001' DAY TO SECOND"), + 10 * MICROS_PER_MILLIS -> ("0 00:00:00.010000000", "INTERVAL '0 00:00:00.01' DAY TO SECOND"), + (-123 * MICROS_PER_DAY - 3 * MICROS_PER_SECOND) -> + ("-123 00:00:03.000000000", "INTERVAL '-123 00:00:03' DAY TO SECOND"), + Long.MinValue -> ("-106751991 04:00:54.775808000", + "INTERVAL '-106751991 04:00:54.775808' DAY TO SECOND") + ).foreach { case (micros, (hiveIntervalStr, ansiIntervalStr)) => + assert(toDayTimeIntervalString(micros, ANSI_STYLE) === ansiIntervalStr) + assert(toDayTimeIntervalString(micros, HIVE_STYLE) === hiveIntervalStr) + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala index 48da6f0410690..52394c1b7f7f4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala @@ -23,6 +23,7 @@ import java.time.{Duration, Instant, LocalDate, Period, ZoneOffset} import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, TimestampFormatter} +import org.apache.spark.sql.catalyst.util.IntervalStringStyles.HIVE_STYLE import org.apache.spark.sql.catalyst.util.IntervalUtils.{durationToMicros, periodToMonths, toDayTimeIntervalString, toYearMonthIntervalString} import org.apache.spark.sql.execution.command.{DescribeCommandBase, ExecutedCommandExec, ShowTablesCommand, ShowViewsCommand} import org.apache.spark.sql.execution.datasources.v2.{DescribeTableExec, ShowTablesExec} @@ -119,9 +120,9 @@ object HiveResult { s""""${t.name}":${toHiveString((v, t.dataType), true, formatters)}""" }.mkString("{", ",", "}") case (period: Period, YearMonthIntervalType) => - toYearMonthIntervalString(periodToMonths(period)) + toYearMonthIntervalString(periodToMonths(period), HIVE_STYLE) case (duration: Duration, DayTimeIntervalType) => - toDayTimeIntervalString(durationToMicros(duration)) + toDayTimeIntervalString(durationToMicros(duration), HIVE_STYLE) case (other, _: UserDefinedType[_]) => other.toString } } From 69dc247c5d135cb9f2f0e1dea3d4619de8a62953 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Sat, 10 Apr 2021 00:25:10 +0300 Subject: [PATCH 2/4] Fix HiveResultSuite --- .../org/apache/spark/sql/execution/HiveResultSuite.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala index 187fda749a983..159f87b599253 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala @@ -113,16 +113,16 @@ class HiveResultSuite extends SharedSparkSession { test("SPARK-34984: year-month interval formatting in hive result") { val df = Seq(Period.ofYears(-10).minusMonths(1)).toDF("i") val plan1 = df.queryExecution.executedPlan - assert(hiveResultString(plan1) === Seq("INTERVAL '-10-1' YEAR TO MONTH")) + assert(hiveResultString(plan1) === Seq("-10-1")) val plan2 = df.selectExpr("array(i)").queryExecution.executedPlan - assert(hiveResultString(plan2) === Seq("[INTERVAL '-10-1' YEAR TO MONTH]")) + assert(hiveResultString(plan2) === Seq("[-10-1]")) } test("SPARK-34984: day-time interval formatting in hive result") { val df = Seq(Duration.ofDays(5).plusMillis(10)).toDF("i") val plan1 = df.queryExecution.executedPlan - assert(hiveResultString(plan1) === Seq("INTERVAL '5 00:00:00.01' DAY TO SECOND")) + assert(hiveResultString(plan1) === Seq("5 00:00:00.010000000")) val plan2 = df.selectExpr("array(i)").queryExecution.executedPlan - assert(hiveResultString(plan2) === Seq("[INTERVAL '5 00:00:00.01' DAY TO SECOND]")) + assert(hiveResultString(plan2) === Seq("[5 00:00:00.010000000]")) } } From 1a90228ef0c1fd8d003332f2d55f1da73350bc11 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Sat, 10 Apr 2021 10:44:26 +0300 Subject: [PATCH 3/4] Set JIRA id --- .../apache/spark/sql/catalyst/util/IntervalUtilsSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/IntervalUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/IntervalUtilsSuite.scala index b3c59bced5d57..a04c4b002d00c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/IntervalUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/IntervalUtilsSuite.scala @@ -443,7 +443,7 @@ class IntervalUtilsSuite extends SparkFunSuite with SQLHelper { } } - test("SPARK-XXXXX: format year-month intervals") { + test("SPARK-35016: format year-month intervals") { Seq( 0 -> ("0-0", "INTERVAL '0-0' YEAR TO MONTH"), -11 -> ("-0-11", "INTERVAL '-0-11' YEAR TO MONTH"), @@ -460,7 +460,7 @@ class IntervalUtilsSuite extends SparkFunSuite with SQLHelper { } } - test("SPARK-XXXXX: format day-time intervals") { + test("SPARK-35016: format day-time intervals") { Seq( 0L -> ("0 00:00:00.000000000", "INTERVAL '0 00:00:00' DAY TO SECOND"), -1L -> ("-0 00:00:00.000001000", "INTERVAL '-0 00:00:00.000001' DAY TO SECOND"), From 3daaf3d0581318ccfdf1609ff7542597807023aa Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Sat, 10 Apr 2021 10:44:45 +0300 Subject: [PATCH 4/4] Add JIRA id to existing tests --- .../org/apache/spark/sql/execution/HiveResultSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala index 159f87b599253..f16265ee6131c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala @@ -110,7 +110,7 @@ class HiveResultSuite extends SharedSparkSession { } } - test("SPARK-34984: year-month interval formatting in hive result") { + test("SPARK-34984, SPARK-35016: year-month interval formatting in hive result") { val df = Seq(Period.ofYears(-10).minusMonths(1)).toDF("i") val plan1 = df.queryExecution.executedPlan assert(hiveResultString(plan1) === Seq("-10-1")) @@ -118,7 +118,7 @@ class HiveResultSuite extends SharedSparkSession { assert(hiveResultString(plan2) === Seq("[-10-1]")) } - test("SPARK-34984: day-time interval formatting in hive result") { + test("SPARK-34984, SPARK-35016: day-time interval formatting in hive result") { val df = Seq(Duration.ofDays(5).plusMillis(10)).toDF("i") val plan1 = df.queryExecution.executedPlan assert(hiveResultString(plan1) === Seq("5 00:00:00.010000000"))