Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
package org.apache.spark.unsafe.types;

import java.io.Serializable;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

Expand Down Expand Up @@ -73,45 +72,53 @@ private static long toLong(String s) {
* This method is case-insensitive.
*/
public static CalendarInterval fromString(String s) {
if (s == null) {
return null;
}
s = s.trim();
Matcher m = p.matcher(s);
if (!m.matches() || s.compareToIgnoreCase("interval") == 0) {
try {
return fromCaseInsensitiveString(s);
} catch (IllegalArgumentException e) {
return null;
} else {
long months = toLong(m.group(1)) * 12 + toLong(m.group(2));
long microseconds = toLong(m.group(3)) * MICROS_PER_WEEK;
microseconds += toLong(m.group(4)) * MICROS_PER_DAY;
microseconds += toLong(m.group(5)) * MICROS_PER_HOUR;
microseconds += toLong(m.group(6)) * MICROS_PER_MINUTE;
microseconds += toLong(m.group(7)) * MICROS_PER_SECOND;
microseconds += toLong(m.group(8)) * MICROS_PER_MILLI;
microseconds += toLong(m.group(9));
return new CalendarInterval((int) months, microseconds);
}
}

/**
* Convert a string to CalendarInterval. Unlike fromString, this method can handle
* Convert a string to CalendarInterval. This method can handle
* strings without the `interval` prefix and throws IllegalArgumentException
* when the input string is not a valid interval.
*
* @throws IllegalArgumentException if the string is not a valid interval.
*/
public static CalendarInterval fromCaseInsensitiveString(String s) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this the only place where we parse interval strings? I thought we parsed them with the ANTLR parser.

Copy link
Member Author

@MaxGekk MaxGekk Oct 10, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ANTLR parser does this as well, but it parses SQL elements like:

spark-sql> select interval 10 days 1 second;
interval 1 weeks 3 days 1 seconds

This is the only place where we parse string values:

spark-sql> select interval 'interval 10 days 1 second';
interval 1 weeks 3 days 1 seconds

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks duplicated. Shall we add a parseInterval method to the ParserInterface interface and call the parser here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe something has been duplicated and can be reused, but that would be too heavy a refactoring for this PR.

For instance, AstBuilder.visitInterval receives interval units that are already split, whereas CalendarInterval.fromString() uses a regular expression to parse and split them:

private static Pattern p = Pattern.compile("interval" + unitRegex("year") + unitRegex("month") +
unitRegex("week") + unitRegex("day") + unitRegex("hour") + unitRegex("minute") +
unitRegex("second") + unitRegex("millisecond") + unitRegex("microsecond"),
Pattern.CASE_INSENSITIVE);

If you don't mind, I would try to do that in a separate PR.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR introduced code duplication #8034 for your code #7355 5 years ago.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, your regexp does not tolerate a different order of interval units; see:

spark-sql> select interval 'interval 1 microsecond 2 months';
NULL
spark-sql> select interval 1 microsecond 2 months;
interval 2 months 1 microseconds

Copy link
Member Author

@MaxGekk MaxGekk Oct 10, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's keep them separate for now. I will try to write flexible, common code in the near future for parsing string intervals that could also handle the other features found in #26055.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SGTM

if (s == null || s.trim().isEmpty()) {
throw new IllegalArgumentException("Interval cannot be null or blank.");
if (s == null) {
throw new IllegalArgumentException("Interval cannot be null");
}
String sInLowerCase = s.trim().toLowerCase(Locale.ROOT);
String interval =
sInLowerCase.startsWith("interval ") ? sInLowerCase : "interval " + sInLowerCase;
CalendarInterval cal = fromString(interval);
if (cal == null) {
String trimmed = s.trim();
if (trimmed.isEmpty()) {
throw new IllegalArgumentException("Interval cannot be blank");
}
String prefix = "interval";
String intervalStr = trimmed;
// Checks that the given interval string does not start with the `interval` prefix
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not just call trimmed.toLowerCase.startsWith("interval")? For perf reasons?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I don't want to lowercase the entire string and allocate memory for a new one just to compare a small prefix.

if (!intervalStr.regionMatches(true, 0, prefix, 0, prefix.length())) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what does this condition mean?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added comments about this.

// Prepend `interval` if it is not present because
// the regular expression strictly requires it.
Copy link
Member Author

@MaxGekk MaxGekk Oct 10, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have not figured out how to modify the regular expression to make the interval prefix optional.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This probably needs the branch-reset feature (https://www.regular-expressions.info/branchreset.html), which Java's regexps don't have.

Copy link
Contributor

@cloud-fan cloud-fan Oct 10, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about something like

String intervalStr = trimmed.toLowerCase();
if (intervalStr.startsWith("interval")) {
  intervalStr = intervalStr.drop(8)
}
// parse the interval string assuming there is no leading "interval"

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

String intervalStr = trimmed.toLowerCase();

Your code is more expensive because it lowercases the whole input string.

// parse the interval string assuming there is no leading "interval"

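There is a problem with the current regexp when you delete the "interval" anchor. Every unit group begins with `\s+`, so a trimmed input that starts directly with a number has no leading whitespace to consume. Without this anchor, the regexp therefore cannot match valid inputs: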

scala> import java.util.regex._
import java.util.regex._

scala> def unitRegex(unit: String) = "(?:\\s+(-?\\d+)\\s+" + unit + "s?)?"
unitRegex: (unit: String)String

scala> val p = Pattern.compile(unitRegex("year") + unitRegex("month") +
     |     unitRegex("week") + unitRegex("day") + unitRegex("hour") + unitRegex("minute") +
     |     unitRegex("second") + unitRegex("millisecond") + unitRegex("microsecond"),
     |     Pattern.CASE_INSENSITIVE)
p: java.util.regex.Pattern = (?:\s+(-?\d+)\s+years?)?(?:\s+(-?\d+)\s+months?)?(?:\s+(-?\d+)\s+weeks?)?(?:\s+(-?\d+)\s+days?)?(?:\s+(-?\d+)\s+hours?)?(?:\s+(-?\d+)\s+minutes?)?(?:\s+(-?\d+)\s+seconds?)?(?:\s+(-?\d+)\s+milliseconds?)?(?:\s+(-?\d+)\s+microseconds?)?

scala> val m = p.matcher("1 month 1 second")
m: java.util.regex.Matcher = java.util.regex.Matcher[pattern=(?:\s+(-?\d+)\s+years?)?(?:\s+(-?\d+)\s+months?)?(?:\s+(-?\d+)\s+weeks?)?(?:\s+(-?\d+)\s+days?)?(?:\s+(-?\d+)\s+hours?)?(?:\s+(-?\d+)\s+minutes?)?(?:\s+(-?\d+)\s+seconds?)?(?:\s+(-?\d+)\s+milliseconds?)?(?:\s+(-?\d+)\s+microseconds?)? region=0,16 lastmatch=]

scala> m.matches()
res7: Boolean = false

If we added it back:

scala> val p = Pattern.compile("interval" + unitRegex("year") + unitRegex("month") +
     |     unitRegex("week") + unitRegex("day") + unitRegex("hour") + unitRegex("minute") +
     |     unitRegex("second") + unitRegex("millisecond") + unitRegex("microsecond"),
     |     Pattern.CASE_INSENSITIVE)
p: java.util.regex.Pattern = interval(?:\s+(-?\d+)\s+years?)?(?:\s+(-?\d+)\s+months?)?(?:\s+(-?\d+)\s+weeks?)?(?:\s+(-?\d+)\s+days?)?(?:\s+(-?\d+)\s+hours?)?(?:\s+(-?\d+)\s+minutes?)?(?:\s+(-?\d+)\s+seconds?)?(?:\s+(-?\d+)\s+milliseconds?)?(?:\s+(-?\d+)\s+microseconds?)?

scala> val m = p.matcher("interval 1 month 1 second")
m: java.util.regex.Matcher = java.util.regex.Matcher[pattern=interval(?:\s+(-?\d+)\s+years?)?(?:\s+(-?\d+)\s+months?)?(?:\s+(-?\d+)\s+weeks?)?(?:\s+(-?\d+)\s+days?)?(?:\s+(-?\d+)\s+hours?)?(?:\s+(-?\d+)\s+minutes?)?(?:\s+(-?\d+)\s+seconds?)?(?:\s+(-?\d+)\s+milliseconds?)?(?:\s+(-?\d+)\s+microseconds?)? region=0,25 lastmatch=]

scala> m.matches()
res8: Boolean = true

Now it matches. That's why I had to add the `interval` prefix instead of removing it.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you just start the regex with `(interval)?`? Then the first matching group is either null or "interval", and the rest should match the same way.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as I remember I tried this regex, and it didn't work. Have you tried it?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have checked, it doesn't work:

scala> def unitRegex(unit: String) = "(?:\\s+(-?\\d+)\\s+" + unit + "s?)?"
unitRegex: (unit: String)String

scala> val p = Pattern.compile("(interval)?" + unitRegex("year") + unitRegex("month") +
     |     unitRegex("week") + unitRegex("day") + unitRegex("hour") + unitRegex("minute") +
     |     unitRegex("second") + unitRegex("millisecond") + unitRegex("microsecond"),
     |     Pattern.CASE_INSENSITIVE)
p: java.util.regex.Pattern = (interval)?(?:\s+(-?\d+)\s+years?)?(?:\s+(-?\d+)\s+months?)?(?:\s+(-?\d+)\s+weeks?)?(?:\s+(-?\d+)\s+days?)?(?:\s+(-?\d+)\s+hours?)?(?:\s+(-?\d+)\s+minutes?)?(?:\s+(-?\d+)\s+seconds?)?(?:\s+(-?\d+)\s+milliseconds?)?(?:\s+(-?\d+)\s+microseconds?)?

scala> val m = p.matcher("1 month 1 second")
m: java.util.regex.Matcher = java.util.regex.Matcher[pattern=(interval)?(?:\s+(-?\d+)\s+years?)?(?:\s+(-?\d+)\s+months?)?(?:\s+(-?\d+)\s+weeks?)?(?:\s+(-?\d+)\s+days?)?(?:\s+(-?\d+)\s+hours?)?(?:\s+(-?\d+)\s+minutes?)?(?:\s+(-?\d+)\s+seconds?)?(?:\s+(-?\d+)\s+milliseconds?)?(?:\s+(-?\d+)\s+microseconds?)? region=0,16 lastmatch=]

scala> m.matches()
res0: Boolean = false

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried simply "(interval)?(.+)".r and it worked as expected on inputs like "abc" and "interval abc". It's a toy example, though, and I'm not sure whether it interacts unexpectedly with the rest of the matching. No big deal; just leave it.

intervalStr = prefix + " " + trimmed;
} else if (intervalStr.length() == prefix.length()) {
throw new IllegalArgumentException("Interval string must have time units");
}

Matcher m = p.matcher(intervalStr);
if (!m.matches()) {
throw new IllegalArgumentException("Invalid interval: " + s);
}
return cal;

long months = toLong(m.group(1)) * 12 + toLong(m.group(2));
long microseconds = toLong(m.group(3)) * MICROS_PER_WEEK;
microseconds += toLong(m.group(4)) * MICROS_PER_DAY;
microseconds += toLong(m.group(5)) * MICROS_PER_HOUR;
microseconds += toLong(m.group(6)) * MICROS_PER_MINUTE;
microseconds += toLong(m.group(7)) * MICROS_PER_SECOND;
microseconds += toLong(m.group(8)) * MICROS_PER_MILLI;
microseconds += toLong(m.group(9));
return new CalendarInterval((int) months, microseconds);
}

public static long toLongWithRange(String fieldName,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

import org.junit.Test;

import java.util.Arrays;

import static org.junit.Assert.*;
import static org.apache.spark.unsafe.types.CalendarInterval.*;

Expand Down Expand Up @@ -72,36 +74,26 @@ public void fromStringTest() {
testSingleUnit("millisecond", 3, 0, 3 * MICROS_PER_MILLI);
testSingleUnit("microsecond", 3, 0, 3);

String input;

input = "interval -5 years 23 month";
CalendarInterval result = new CalendarInterval(-5 * 12 + 23, 0);
assertEquals(fromString(input), result);

input = "interval -5 years 23 month ";
assertEquals(fromString(input), result);

input = " interval -5 years 23 month ";
assertEquals(fromString(input), result);
Arrays.asList(
"interval -5 years 23 month",
" -5 years 23 month",
"interval -5 years 23 month ",
" -5 years 23 month ",
" interval -5 years 23 month ").forEach(input ->
assertEquals(fromString(input), result)
);

// Error cases
input = "interval 3month 1 hour";
assertNull(fromString(input));

input = "interval 3 moth 1 hour";
assertNull(fromString(input));

input = "interval";
assertNull(fromString(input));

input = "int";
assertNull(fromString(input));

input = "";
assertNull(fromString(input));

input = null;
assertNull(fromString(input));
Arrays.asList(
"interval 3month 1 hour",
"3month 1 hour",
"interval 3 moth 1 hour",
"3 moth 1 hour",
"interval",
"int",
"",
null).forEach(input -> assertNull(fromString(input)));
}

@Test
Expand All @@ -115,7 +107,9 @@ public void fromCaseInsensitiveStringTest() {
fromCaseInsensitiveString(input);
fail("Expected to throw an exception for the invalid input");
} catch (IllegalArgumentException e) {
assertTrue(e.getMessage().contains("cannot be null or blank"));
String msg = e.getMessage();
if (input == null) assertTrue(msg.contains("cannot be null"));
else assertTrue(msg.contains("cannot be blank"));
}
}

Expand All @@ -124,7 +118,12 @@ public void fromCaseInsensitiveStringTest() {
fromCaseInsensitiveString(input);
fail("Expected to throw an exception for the invalid input");
} catch (IllegalArgumentException e) {
assertTrue(e.getMessage().contains("Invalid interval"));
String msg = e.getMessage();
if (input.trim().equalsIgnoreCase("interval")) {
assertTrue(msg.contains("Interval string must have time units"));
} else {
assertTrue(msg.contains("Invalid interval:"));
}
}
}
}
Expand Down Expand Up @@ -268,11 +267,13 @@ public void subtractTest() {
}

private static void testSingleUnit(String unit, int number, int months, long microseconds) {
String input1 = "interval " + number + " " + unit;
String input2 = "interval " + number + " " + unit + "s";
CalendarInterval result = new CalendarInterval(months, microseconds);
assertEquals(fromString(input1), result);
assertEquals(fromString(input2), result);
Arrays.asList("interval ", "").forEach(prefix -> {
String input1 = prefix + number + " " + unit;
String input2 = prefix + number + " " + unit + "s";
CalendarInterval result = new CalendarInterval(months, microseconds);
assertEquals(fromString(input1), result);
assertEquals(fromString(input2), result);
});
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -672,6 +672,8 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {
"interval 1 years 3 months -3 days")
checkEvaluation(Cast(Literal("INTERVAL 1 Second 1 microsecond"), CalendarIntervalType),
new CalendarInterval(0, 1000001))
checkEvaluation(Cast(Literal("1 MONTH 1 Microsecond"), CalendarIntervalType),
new CalendarInterval(1, 1))
}

test("cast string to boolean") {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -432,8 +432,9 @@ class ExpressionParserSuite extends AnalysisTest {
intercept("timestamP '2016-33-11 20:54:00.000'")

// Interval.
assertEqual("InterVal 'interval 3 month 1 hour'",
Literal(CalendarInterval.fromString("interval 3 month 1 hour")))
val intervalLiteral = Literal(CalendarInterval.fromString("interval 3 month 1 hour"))
assertEqual("InterVal 'interval 3 month 1 hour'", intervalLiteral)
assertEqual("INTERVAL '3 month 1 hour'", intervalLiteral)
assertEqual("Interval 'interval 3 monthsss 1 hoursss'",
Literal(null, CalendarIntervalType))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,6 @@ interval 3 years 1 hours
-- !query 45
select interval '3 year 1 hour'
-- !query 45 schema
struct<CAST(NULL AS INTERVAL):interval>
struct<interval 3 years 1 hours:interval>
-- !query 45 output
NULL
interval 3 years 1 hours
Expand Down