-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-10439] [sql] Add bound checks to DateTimeUtils. #8606
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
ffe39e4
196ba9e
63d0c39
00757d2
edeb06e
922c09e
3869816
6064ef2
7a4cf9e
fa8d6f6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,7 @@ | |
|
|
||
| package org.apache.spark.sql.catalyst.util | ||
|
|
||
| import java.lang.{Long => JLong} | ||
| import java.sql.{Date, Timestamp} | ||
| import java.text.{DateFormat, SimpleDateFormat} | ||
| import java.util.{TimeZone, Calendar} | ||
|
|
@@ -56,6 +57,12 @@ object DateTimeUtils { | |
|
|
||
| @transient lazy val defaultTimeZone = TimeZone.getDefault | ||
|
|
||
| // Constants defining the allowed ranges for timestampts that fit in parquet files. Mostly | ||
| // used by tests to avoid duplication. | ||
| private[spark] final val MIN_TIMESTAMP: Long = | ||
| -(DateTimeUtils.JULIAN_DAY_OF_EPOCH * DateTimeUtils.SECONDS_PER_DAY * 1000) | ||
| private[spark] final val MAX_TIMESTAMP: Long = java.lang.Long.MAX_VALUE / 1000 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, although we can do but, we cannot do For text based format, the upper bound of timestamp value is much smaller.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, the ranges might be different for the string parser, but is there any documentation about what that range would be? |
||
|
|
||
| // Java TimeZone has no mention of thread safety. Use thread local instance to be safe. | ||
| private val threadLocalLocalTimeZone = new ThreadLocal[TimeZone] { | ||
| override protected def initialValue: TimeZone = { | ||
|
|
@@ -82,7 +89,9 @@ object DateTimeUtils { | |
| // SPARK-6785: use Math.floor so negative number of days (dates before 1970) | ||
| // will correctly work as input for function toJavaDate(Int) | ||
| val millisLocal = millisUtc + threadLocalLocalTimeZone.get().getOffset(millisUtc) | ||
| Math.floor(millisLocal.toDouble / MILLIS_PER_DAY).toInt | ||
| val days = Math.floor(millisLocal.toDouble / MILLIS_PER_DAY) | ||
| require(days <= Integer.MAX_VALUE && days >= Integer.MIN_VALUE, "Date exceeeds allowed range.") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd assume the millisUtc is a valid timestamp, then this check is not needed. |
||
| days.toInt | ||
| } | ||
|
|
||
| // reverse of millisToDays | ||
|
|
@@ -172,6 +181,8 @@ object DateTimeUtils { | |
| */ | ||
| def fromJavaTimestamp(t: Timestamp): SQLTimestamp = { | ||
| if (t != null) { | ||
| require(t.getTime() <= JLong.MAX_VALUE / 1000 && t.getTime() >= JLong.MIN_VALUE / 1000, | ||
| s"Timestamp exceeds allowed range (${t.getTime()}).") | ||
| t.getTime() * 1000L + (t.getNanos().toLong / 1000) % 1000L | ||
| } else { | ||
| 0L | ||
|
|
@@ -193,9 +204,15 @@ object DateTimeUtils { | |
| */ | ||
| def toJulianDay(us: SQLTimestamp): (Int, Long) = { | ||
| val seconds = us / MICROS_PER_SECOND | ||
| val day = seconds / SECONDS_PER_DAY + JULIAN_DAY_OF_EPOCH | ||
| val secondsInDay = seconds % SECONDS_PER_DAY | ||
| val nanos = (us % MICROS_PER_SECOND) * 1000L | ||
| var day = seconds / SECONDS_PER_DAY + JULIAN_DAY_OF_EPOCH | ||
| var secondsInDay = seconds % SECONDS_PER_DAY | ||
| var nanos = (us % MICROS_PER_SECOND) * 1000L | ||
| if (us < 0) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch! |
||
| day -= 1 | ||
| secondsInDay += SECONDS_PER_DAY | ||
| nanos += (SECONDS_PER_DAY * MICROS_PER_SECOND * 1000L) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is not correct, it should be:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch; my code can return
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Created https://issues.apache.org/jira/browse/SPARK-10522 for this, then we could back port the fix for 1.5. |
||
| require(day >= 0, "Timestamp exceeds allowed range.") | ||
| } | ||
| (day.toInt, secondsInDay * NANOS_PER_SECOND + nanos) | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I gave it a try and I got
So, is it the right lower bound?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's the right value if you consider the math. But I've seen really weird behavior in how the Java classes print very large (positive or negative) timestamps, and I didn't find anything in the javadocs about limits, so I'm not sure what's the best way to proceed. We can choose an arbitrary minimum and maximum, but what would those be based on?
For example,
java.sql.Timestampwill print the same formatted date string for both-793495812000and-123456789012000L.