Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
800f6e5
Strict fromDayTimeString
MaxGekk Nov 11, 2019
8cf38db
Add tests
MaxGekk Nov 11, 2019
3a1a710
Update tests in IntervalUtilsSuite
MaxGekk Nov 11, 2019
4e051d6
Fix ExpressionParserSuite
MaxGekk Nov 11, 2019
a1ef591
Add tests for invalid input into literals.sql
MaxGekk Nov 11, 2019
e88c8e0
Regen interval.sql.out
MaxGekk Nov 11, 2019
c87cf8c
Add the config spark.sql.legacy.fromDayTimeString.enabled
MaxGekk Nov 12, 2019
3f66881
Restore previous version of fromDayTimeString
MaxGekk Nov 12, 2019
da9b801
Restore old tests in IntervalUtilsSuite
MaxGekk Nov 12, 2019
c8afd33
Enable legacy behavior for postgresql tests
MaxGekk Nov 12, 2019
988d532
Regen interval.sql.out
MaxGekk Nov 12, 2019
ce28745
Set LEGACY_FROM_DAYTIME_STRING to true in ThriftServerQueryTestSuite
MaxGekk Nov 12, 2019
58ba7b4
Address Wenchen's review comment about a test
MaxGekk Nov 12, 2019
60fe0c1
Add comments for parseDayTimeLegacy
MaxGekk Nov 12, 2019
e95068e
Avoid unnecessary catching of exceptions
MaxGekk Nov 12, 2019
880e7ed
Add comments for parseDayTime()
MaxGekk Nov 12, 2019
d00c95d
Merge remote-tracking branch 'remotes/origin/master' into strict-from…
MaxGekk Nov 12, 2019
a37bad4
Regen literals.sql.out
MaxGekk Nov 12, 2019
8e733c1
Remove exact ops
MaxGekk Nov 12, 2019
6b5b7ef
Update the SQL migration guide
MaxGekk Nov 12, 2019
a2ce9ae
Merge remote-tracking branch 'remotes/origin/master' into strict-from…
MaxGekk Nov 13, 2019
fc77452
Merge remote-tracking branch 'remotes/origin/master' into strict-from…
MaxGekk Nov 15, 2019
6be5f4e
truncation -> truncated
MaxGekk Nov 15, 2019
d253094
Make the config internal
MaxGekk Nov 15, 2019
dfd0dce
Use the legacy method in the PostgreSQL dialect
MaxGekk Nov 15, 2019
d1145cd
Update the SQL migration guide
MaxGekk Nov 15, 2019
c94f1df
Reorganize tests in interval.sql
MaxGekk Nov 15, 2019
833c7b0
Regen interval.sql.out
MaxGekk Nov 15, 2019
5b26335
Check in the PostgreSQL dialect as well
MaxGekk Nov 17, 2019
f401bd2
Black list interval.sql in ThriftServerQueryTestSuite
MaxGekk Nov 17, 2019
ca46f44
Set settings explicitly in tests
MaxGekk Nov 19, 2019
eadaa92
Merge remote-tracking branch 'remotes/origin/master' into strict-from…
MaxGekk Nov 19, 2019
8f10259
Regen interval.sql.out
MaxGekk Nov 19, 2019
d3d730a
Merge remote-tracking branch 'remotes/origin/master' into strict-from…
MaxGekk Nov 20, 2019
32b4d2f
Regen interval.sql.out
MaxGekk Nov 20, 2019
e39ca52
Merge remote-tracking branch 'remotes/origin/master' into strict-from…
MaxGekk Nov 22, 2019
e012f8b
Remove explicit set
MaxGekk Nov 22, 2019
73ef32f
Regen interval.sql.out
MaxGekk Nov 22, 2019
d27d434
Merge remote-tracking branch 'remotes/origin/master' into strict-from…
MaxGekk Nov 22, 2019
9d8394e
Merge remote-tracking branch 'remotes/origin/master' into strict-from…
MaxGekk Dec 10, 2019
ef2cbe1
Remove usage of usePostgreSQLDialect
MaxGekk Dec 10, 2019
f9510e3
Regen interval.sql.out
MaxGekk Dec 10, 2019
c16f2a7
Replace 999999999 by 123456789 in seconds fractions in tests
MaxGekk Dec 11, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/sql-migration-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,8 @@ license: |

- Since Spark 3.0, the unary arithmetic operator plus (`+`) only accepts string, numeric and interval type values as inputs. Besides, `+` with an integral string representation will be coerced to a double value, e.g. `+'1'` results in `1.0`. In Spark version 2.4 and earlier, this operator is ignored. There is no type checking for it, thus values of all types with a `+` prefix are valid, e.g. `+ array(1, 2)` is valid and results in `[1, 2]`. Besides, there is no type coercion for it at all, e.g. in Spark 2.4, the result of `+'1'` is the string `1`.

- Since Spark 3.0, day-time interval strings are converted to intervals with respect to the `from` and `to` bounds. If an input string does not match the pattern defined by the specified bounds, a `ParseException` is thrown. For example, `interval '2 10:20' hour to minute` raises the exception because the expected format is `[+|-]h[h]:m[m]`. In Spark version 2.4, the `from` bound was not taken into account, and the `to` bound was used to truncate the resulting interval. For instance, the day-time interval string from the example above is converted to `interval 10 hours 20 minutes`. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.fromDayTimeString.enabled` to `true`.

## Upgrading from Spark SQL 2.4 to 2.4.1

- The value of `spark.executor.heartbeatInterval`, when specified without units like "30" rather than "30s", was
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import java.util.concurrent.TimeUnit
import scala.util.control.NonFatal

import org.apache.spark.sql.catalyst.util.DateTimeConstants._
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.Decimal
import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}

Expand Down Expand Up @@ -155,9 +156,6 @@ object IntervalUtils {
fromDayTimeString(s, DAY, SECOND)
}

// Day-time format `[sign][days ][hours:]minutes:seconds[.fraction]`, anchored to the whole input.
// Capturing groups: 1 = sign, 3 = days, 5 = hours, 6 = minutes, 7 = seconds, 9 = fraction digits.
// NOTE(review): inside a character class `|` is a literal, so `[+|-]` also matches a leading '|'.
private val dayTimePattern =
"^([+|-])?((\\d+) )?((\\d+):)?(\\d+):(\\d+)(\\.(\\d+))?$".r

/**
* Parse dayTime string in form: [-]d HH:mm:ss.nnnnnnnnn and [-]HH:mm:ss.nnnnnnnnn
*
Expand All @@ -168,9 +166,35 @@ object IntervalUtils {
* - MINUTE TO SECOND
*/
def fromDayTimeString(input: String, from: IntervalUnit, to: IntervalUnit): CalendarInterval = {
  // The legacy flag selects between the two parsers; both receive the same arguments.
  val legacyEnabled = SQLConf.get.getConf(SQLConf.LEGACY_FROM_DAYTIME_STRING)
  if (!legacyEnabled) {
    // Default path: the input must match the pattern selected by (`from`, `to`).
    parseDayTime(input, from, to)
  } else {
    parseDayTimeLegacy(input, from, to)
  }
}

// Pattern used by the legacy parser: `[sign][days ][hours:]minutes:seconds[.fraction]`,
// anchored to the whole input. The day and hour fields are optional, regardless of the bounds.
// Capturing groups: 1 = sign, 3 = days, 5 = hours, 6 = minutes, 7 = seconds, 9 = fraction digits.
// NOTE(review): inside a character class `|` is a literal, so `[+|-]` also matches a leading '|';
// left untouched here because this pattern exists to reproduce the old behavior.
private val dayTimePatternLegacy =
"^([+|-])?((\\d+) )?((\\d+):)?(\\d+):(\\d+)(\\.(\\d+))?$".r

/**
* Legacy method of parsing a string in a day-time format. It ignores the `from` bound,
* and takes into account only the `to` bound by truncating the result. For example,
* if the input string is "2 12:30:15", `from` is "hour" and `to` is "second", the result
* is "2 days 12 hours 30 minutes".
*
* @param input The day-time string
* @param from The interval unit from which the input string begins
* @param to The interval unit at which the input string ends
* @return an instance of `CalendarInterval` if parsing completes successfully otherwise
* the exception `IllegalArgumentException` is raised.
*/
private def parseDayTimeLegacy(
input: String,
from: IntervalUnit,
to: IntervalUnit): CalendarInterval = {
require(input != null, "Interval day-time string must be not null")
assert(input.length == input.trim.length)
val m = dayTimePattern.pattern.matcher(input)
val m = dayTimePatternLegacy.pattern.matcher(input)
require(m.matches, s"Interval string must match day-time format of 'd h:m:s.n': $input")

try {
Expand Down Expand Up @@ -222,6 +246,78 @@ object IntervalUtils {
}
}

// Building blocks of the strict day-time patterns. Every field is a named capturing group
// so that a matcher can be queried by field name ("sign", "day", "hour", "minute", "second").
//
// The sign class must be `[+-]`: inside a character class `|` is a literal pipe rather than
// alternation, so writing `[+|-]` would also accept a leading '|' as a sign.
private val signRe = "(?<sign>[+-])"
private val dayRe = "(?<day>\\d+)"
// Hours and minutes are limited to one or two digits by the pattern itself.
private val hourRe = "(?<hour>\\d{1,2})"
private val minuteRe = "(?<minute>\\d{1,2})"
// One or two digits of seconds, optionally followed by a fraction of 1 to 9 digits.
private val secondRe = "(?<second>(\\d{1,2})(\\.(\\d{1,9}))?)"

// Maps each supported (from, to) pair of bounds to the regex the whole input must match.
// `$$` is an escaped `$` in the `s` interpolator, i.e. the end-of-input anchor.
private val dayTimePattern = Map(
  (MINUTE, SECOND) -> s"^$signRe?$minuteRe:$secondRe$$".r,
  (HOUR, MINUTE) -> s"^$signRe?$hourRe:$minuteRe$$".r,
  (HOUR, SECOND) -> s"^$signRe?$hourRe:$minuteRe:$secondRe$$".r,
  (DAY, HOUR) -> s"^$signRe?$dayRe $hourRe$$".r,
  (DAY, MINUTE) -> s"^$signRe?$dayRe $hourRe:$minuteRe$$".r,
  (DAY, SECOND) -> s"^$signRe?$dayRe $hourRe:$minuteRe:$secondRe$$".r
)

// Lists every interval unit whose enumeration id lies between `start.id` and `end.id`,
// both inclusive, in ascending id order. The result is empty when `start.id > end.id`.
private def unitsRange(start: IntervalUnit, end: IntervalUnit): Seq[IntervalUnit] = {
  for (unitId <- start.id to end.id) yield IntervalUnit(unitId)
}

/**
 * Parses an input string in the day-time format defined by the `from` and `to` bounds.
 * It supports the following formats:
 * - [+|-]D+ H[H]:m[m]:s[s][.SSSSSSSSS] for DAY TO SECOND
 * - [+|-]D+ H[H]:m[m] for DAY TO MINUTE
 * - [+|-]D+ H[H] for DAY TO HOUR
 * - [+|-]H[H]:m[m]:s[s][.SSSSSSSSS] for HOUR TO SECOND
 * - [+|-]H[H]:m[m] for HOUR TO MINUTE
 * - [+|-]m[m]:s[s][.SSSSSSSSS] for MINUTE TO SECOND
 *
 * Note: the seconds fraction is truncated to microseconds.
 *
 * @param input The input string to parse.
 * @param from The interval unit from which the input string begins.
 * @param to The interval unit at which the input string ends.
 * @return an instance of `CalendarInterval` if the input string was parsed successfully
 *         otherwise throws an exception.
 * @throws IllegalArgumentException The input is null, the (`from`, `to`) pair is not supported,
 *                                  the input does not match the expected format, or a field
 *                                  value is outside of its valid range.
 * @throws ArithmeticException Accumulating the microseconds overflows a `Long`
 *                             (raised by `Math.addExact`).
 */
private def parseDayTime(
    input: String,
    from: IntervalUnit,
    to: IntervalUnit): CalendarInterval = {
  require(input != null, "Interval day-time string must be not null")
  val regexp = dayTimePattern.get(from -> to)
  require(regexp.isDefined, s"Cannot support (interval '$input' $from to $to) expression")
  // Safe: the `require` above has already rejected the undefined case.
  val pattern = regexp.get.pattern
  val m = pattern.matcher(input)
  require(m.matches, s"Interval string must match day-time format of '$pattern': $input")

  // Fold over the units in the order produced by `unitsRange(to, from)`, carrying the parsed
  // days and the running microseconds total. The regex group names equal the unit names.
  val (days, micros) = unitsRange(to, from).foldLeft((0, 0L)) {
    case ((daysAcc, microsAcc), unit) =>
      // Evaluated lazily so that unsupported units never touch the matcher.
      def field: String = m.group(unit.toString)
      unit match {
        case DAY =>
          (toLongWithRange(unit, field, 0, Int.MaxValue).toInt, microsAcc)
        case HOUR =>
          val hours = toLongWithRange(unit, field, 0, 23)
          (daysAcc, Math.addExact(microsAcc, hours * MICROS_PER_HOUR))
        case MINUTE =>
          val minutes = toLongWithRange(unit, field, 0, 59)
          (daysAcc, Math.addExact(microsAcc, minutes * MICROS_PER_MINUTE))
        case SECOND =>
          (daysAcc, Math.addExact(microsAcc, parseSecondNano(field)))
        case _ =>
          throw new IllegalArgumentException(
            s"Cannot support (interval '$input' $from to $to) expression")
      }
  }

  // The sign group is absent (null) when the input carries no explicit sign; the sign applies
  // to both the days and the microseconds fields.
  val sign = if (Option(m.group("sign")).contains("-")) -1 else 1
  new CalendarInterval(0, sign * days, sign * micros)
}

// Parses a string with nanoseconds, truncates the result and returns microseconds
private def parseNanos(nanosStr: String, isNegative: Boolean): Long = {
if (nanosStr != null) {
Expand All @@ -237,6 +333,30 @@ object IntervalUtils {
}
}

/**
 * Converts a seconds string with an optional fraction (`ss`, `ss.nnnnnnnnn` or `.nnnnnnnnn`)
 * to microseconds. The input is split on '.'; the whole-seconds part is range-checked by
 * `toLongWithRange` and the fraction is delegated to `parseNanos`.
 *
 * @throws IllegalArgumentException if the input does not split into one or two parts.
 */
private def parseSecondNano(secondNano: String): Long = {
  // Whole seconds, bounded so that the multiplication by MICROS_PER_SECOND cannot overflow.
  def wholeSecondsToMicros(text: String): Long = {
    val minSeconds = Long.MinValue / MICROS_PER_SECOND
    val maxSeconds = Long.MaxValue / MICROS_PER_SECOND
    toLongWithRange(SECOND, text, minSeconds, maxSeconds) * MICROS_PER_SECOND
  }

  val parts = secondNano.split("\\.")
  parts.length match {
    case 1 =>
      wholeSecondsToMicros(parts(0))
    case 2 if parts(0) == "" =>
      // Fraction only, e.g. ".5".
      parseNanos(parts(1), false)
    case 2 =>
      val wholeMicros = wholeSecondsToMicros(parts(0))
      // NOTE(review): the fraction takes its sign from the parsed value, so for an input
      // such as "-0.5" the whole part is 0 and `false` is passed as `isNegative` - confirm
      // whether signed input can reach this method.
      Math.addExact(wholeMicros, parseNanos(parts(1), wholeMicros < 0))
    case _ =>
      throw new IllegalArgumentException(
        "Interval string does not match second-nano format of ss.nnnnnnnnn")
  }
}

/**
* Gets interval duration
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2087,6 +2087,17 @@ object SQLConf {
.stringConf
.createWithDefault(
"https://maven-central.storage-download.googleapis.com/repos/central/data/")

// Internal switch (default `false`) that selects the legacy conversion of day-time strings
// to intervals. The trailing space after "skip" keeps the concatenated doc text readable
// ("skip all", not "skipall").
val LEGACY_FROM_DAYTIME_STRING =
  buildConf("spark.sql.legacy.fromDayTimeString.enabled")
    .internal()
    .doc("When true, the `from` bound is not taken into account in conversion of " +
      "a day-time string to an interval, and the `to` bound is used to skip " +
      "all interval units out of the specified range. If it is set to `false`, " +
      "`ParseException` is thrown if the input does not match to the pattern " +
      "defined by `from` and `to`.")
    .booleanConf
    .createWithDefault(false)
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -716,7 +716,7 @@ class ExpressionParserSuite extends AnalysisTest {
"0:0:0",
"0:0:1")
hourTimeValues.foreach { value =>
val result = Literal(IntervalUtils.fromDayTimeString(value))
val result = Literal(IntervalUtils.fromDayTimeString(value, HOUR, SECOND))
checkIntervals(s"'$value' hour to second", result)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,14 @@ package org.apache.spark.sql.catalyst.util
import java.util.concurrent.TimeUnit

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.plans.SQLHelper
import org.apache.spark.sql.catalyst.util.DateTimeConstants._
import org.apache.spark.sql.catalyst.util.IntervalUtils._
import org.apache.spark.sql.catalyst.util.IntervalUtils.IntervalUnit._
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}

class IntervalUtilsSuite extends SparkFunSuite {
class IntervalUtilsSuite extends SparkFunSuite with SQLHelper {

private def checkFromString(input: String, expected: CalendarInterval): Unit = {
assert(stringToInterval(UTF8String.fromString(input)) === expected)
Expand Down Expand Up @@ -160,43 +162,45 @@ class IntervalUtilsSuite extends SparkFunSuite {
}
}

test("from day-time string") {
assert(fromDayTimeString("5 12:40:30.999999999") ===
new CalendarInterval(
0,
5,
12 * MICROS_PER_HOUR +
40 * MICROS_PER_MINUTE +
30 * MICROS_PER_SECOND + 999999L))
assert(fromDayTimeString("10 0:12:0.888") ===
new CalendarInterval(
0,
10,
12 * MICROS_PER_MINUTE + 888 * MICROS_PER_MILLIS))
assert(fromDayTimeString("-3 0:0:0") === new CalendarInterval(0, -3, 0L))

try {
fromDayTimeString("5 30:12:20")
fail("Expected to throw an exception for the invalid input")
} catch {
case e: IllegalArgumentException =>
assert(e.getMessage.contains("hour 30 outside range"))
}

try {
fromDayTimeString("5 30-12")
fail("Expected to throw an exception for the invalid input")
} catch {
case e: IllegalArgumentException =>
assert(e.getMessage.contains("must match day-time format"))
}

try {
fromDayTimeString("5 1:12:20", HOUR, MICROSECOND)
fail("Expected to throw an exception for the invalid convention type")
} catch {
case e: IllegalArgumentException =>
assert(e.getMessage.contains("Cannot support (interval"))
test("from day-time string - legacy") {
  // Evaluates `parse` and requires an IllegalArgumentException whose message contains
  // `expectedMsg`; if nothing is thrown the test fails with `hint`.
  def expectIllegalArgument(expectedMsg: String, hint: String)(
      parse: => CalendarInterval): Unit = {
    try {
      parse
      fail(hint)
    } catch {
      case e: IllegalArgumentException =>
        assert(e.getMessage.contains(expectedMsg))
    }
  }

  // All checks below run with the legacy parser switched on.
  withSQLConf(SQLConf.LEGACY_FROM_DAYTIME_STRING.key -> "true") {
    // A 9-digit fraction is expected to be truncated to microseconds.
    val fiveDays = new CalendarInterval(
      0,
      5,
      12 * MICROS_PER_HOUR +
        40 * MICROS_PER_MINUTE +
        30 * MICROS_PER_SECOND + 999999L)
    assert(fromDayTimeString("5 12:40:30.999999999") === fiveDays)

    val tenDays = new CalendarInterval(
      0,
      10,
      12 * MICROS_PER_MINUTE + 888 * MICROS_PER_MILLIS)
    assert(fromDayTimeString("10 0:12:0.888") === tenDays)

    val minusThreeDays = new CalendarInterval(0, -3, 0L)
    assert(fromDayTimeString("-3 0:0:0") === minusThreeDays)

    expectIllegalArgument(
      "hour 30 outside range",
      "Expected to throw an exception for the invalid input") {
      fromDayTimeString("5 30:12:20")
    }

    expectIllegalArgument(
      "must match day-time format",
      "Expected to throw an exception for the invalid input") {
      fromDayTimeString("5 30-12")
    }

    expectIllegalArgument(
      "Cannot support (interval",
      "Expected to throw an exception for the invalid convention type") {
      fromDayTimeString("5 1:12:20", HOUR, MICROSECOND)
    }
  }
}

Expand Down Expand Up @@ -384,4 +388,61 @@ class IntervalUtilsSuite extends SparkFunSuite {
val i9 = new CalendarInterval(0, 0, -3000 * MICROS_PER_HOUR)
assert(IntervalUtils.toMultiUnitsString(i9) === "-3000 hours")
}

test("from day-time string") {
// Parses `input` with the given bounds and compares the result with the interval obtained
// from the multi-unit string `expected`. The bounds are attached to any assertion failure.
def check(input: String, from: IntervalUnit, to: IntervalUnit, expected: String): Unit = {
  val clue = s"from = $from, to = $to"
  withClue(clue) {
    val expectedText = UTF8String.fromString(expected)
    val actual = fromDayTimeString(input, from, to)
    val wanted = safeStringToInterval(expectedText)
    assert(actual === wanted)
  }
}
// Requires that parsing `input` with the given bounds throws an IllegalArgumentException
// whose message contains `errMsg`. Any other exception type is left to propagate.
def checkFail(
    input: String,
    from: IntervalUnit,
    to: IntervalUnit,
    errMsg: String): Unit = {
  val thrown: Option[IllegalArgumentException] =
    try {
      fromDayTimeString(input, from, to)
      None
    } catch {
      case e: IllegalArgumentException => Some(e)
    }
  thrown match {
    case Some(e) => assert(e.getMessage.contains(errMsg))
    case None => fail("Expected to throw an exception for the invalid input")
  }
}

// HOUR TO MINUTE: unsigned, '+' and '-' inputs; a leading day field must be rejected.
check("12:40", HOUR, MINUTE, "12 hours 40 minutes")
check("+12:40", HOUR, MINUTE, "12 hours 40 minutes")
check("-12:40", HOUR, MINUTE, "-12 hours -40 minutes")
checkFail("5 12:40", HOUR, MINUTE, "must match day-time format")

// HOUR TO SECOND: a 9-digit fraction is expected to be truncated to microseconds.
check("12:40:30.999999999", HOUR, SECOND, "12 hours 40 minutes 30.999999 seconds")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we test 12:40:30.0123456789 as well?

// HOUR TO SECOND: signed inputs, an unexpected day field, and a 10-digit fraction
// (one digit more than the seconds pattern allows).
check("+12:40:30.123456789", HOUR, SECOND, "12 hours 40 minutes 30.123456 seconds")
check("-12:40:30.123456789", HOUR, SECOND, "-12 hours -40 minutes -30.123456 seconds")
checkFail("5 12:40:30", HOUR, SECOND, "must match day-time format")
checkFail("12:40:30.0123456789", HOUR, SECOND, "must match day-time format")

// MINUTE TO SECOND: an extra leading hour field must be rejected.
check("40:30.123456789", MINUTE, SECOND, "40 minutes 30.123456 seconds")
check("+40:30.123456789", MINUTE, SECOND, "40 minutes 30.123456 seconds")
check("-40:30.123456789", MINUTE, SECOND, "-40 minutes -30.123456 seconds")
checkFail("12:40:30", MINUTE, SECOND, "must match day-time format")

// DAY TO HOUR: a trailing minute field must be rejected.
check("5 12", DAY, HOUR, "5 days 12 hours")
check("+5 12", DAY, HOUR, "5 days 12 hours")
check("-5 12", DAY, HOUR, "-5 days -12 hours")
checkFail("5 12:30", DAY, HOUR, "must match day-time format")

// DAY TO MINUTE: a missing minute field must be rejected.
check("5 12:40", DAY, MINUTE, "5 days 12 hours 40 minutes")
check("+5 12:40", DAY, MINUTE, "5 days 12 hours 40 minutes")
check("-5 12:40", DAY, MINUTE, "-5 days -12 hours -40 minutes")
checkFail("5 12", DAY, MINUTE, "must match day-time format")

// DAY TO SECOND: fractions of 3, 6 and 9 digits; missing minute and second fields are rejected.
check("5 12:40:30.123", DAY, SECOND, "5 days 12 hours 40 minutes 30.123 seconds")
check("+5 12:40:30.123456", DAY, SECOND, "5 days 12 hours 40 minutes 30.123456 seconds")
check("-5 12:40:30.123456789", DAY, SECOND, "-5 days -12 hours -40 minutes -30.123456 seconds")
checkFail("5 12", DAY, SECOND, "must match day-time format")

// Remaining failures: a well-formed input with an out-of-range hour, a wrong separator,
// and a pair of bounds for which no pattern is registered.
checkFail("5 30:12:20", DAY, SECOND, "hour 30 outside range")
checkFail("5 30-12", DAY, SECOND, "must match day-time format")
checkFail("5 1:12:20", HOUR, MICROSECOND, "Cannot support (interval")
}
}
15 changes: 10 additions & 5 deletions sql/core/src/test/resources/sql-tests/inputs/interval.sql
Original file line number Diff line number Diff line change
Expand Up @@ -97,17 +97,22 @@ select interval 1 year 2 month 3 week 4 day 5 hour 6 minute 7 seconds 8 millisec
select interval '30' year '25' month '-100' day '40' hour '80' minute '299.889987299' second;
select interval '0 0:0:0.1' day to second;
select interval '10-9' year to month;
select interval '20 15' day to hour;
select interval '20 15:40' day to minute;
select interval '20 15:40:32.99899999' day to second;
select interval '15:40' hour to minute;
select interval '15:40:32.99899999' hour to second;
select interval '40:32.99899999' minute to second;
select interval '40:32' minute to second;
select interval 30 day day;

-- invalid day-time string intervals
select interval '20 15:40:32.99899999' day to hour;
select interval '20 15:40:32.99899999' day to minute;
select interval '20 15:40:32.99899999' day to second;
select interval '15:40:32.99899999' hour to minute;
select interval '15:40.99899999' hour to second;
select interval '15:40' hour to second;
select interval '15:40:32.99899999' hour to second;
select interval '20 40:32.99899999' minute to second;
select interval '40:32.99899999' minute to second;
select interval '40:32' minute to second;
select interval 30 day day;

-- ns is not supported
select interval 10 nanoseconds;
Expand Down
Loading