-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-8995][SQL] cast date strings like '2015-01-01 12:15:31' to date #7353
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
c1083fb
30e5aec
f7452fa
71f89c1
cfbaed7
0e30c0a
71622c0
34ec573
01c9ff3
ef05753
d20b8b4
ca1ae69
14f333b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,8 @@ import java.sql.{Date, Timestamp} | |
| import java.text.{DateFormat, SimpleDateFormat} | ||
| import java.util.{Calendar, TimeZone} | ||
|
|
||
| import org.apache.spark.unsafe.types.UTF8String | ||
|
|
||
| /** | ||
| * Helper functions for converting between internal and external date and time representations. | ||
| * Dates are exposed externally as java.sql.Date and are represented internally as the number of | ||
|
|
@@ -180,4 +182,169 @@ object DateTimeUtils { | |
| val nanos = (us % MICROS_PER_SECOND) * 1000L | ||
| (day.toInt, secondsInDay * NANOS_PER_SECOND + nanos) | ||
| } | ||
|
|
||
| /** | ||
| * Parses a given UTF8 date string to the corresponding [[Timestamp]] object. The format of the | ||
| * date has to be one of the following: `yyyy`, `yyyy-[m]m`, `yyyy-[m]m-[d]d`, `yyyy-[m]m-[d]d `, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's better to have each format in a single line. |
||
| * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][ms]`, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we have a way to support 1us precision? MySQL does support that. |
||
| * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][ms]Z`, | ||
| * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][ms]-[h]h:[m]m`, | ||
| * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][ms]+[h]h:[m]m`, | ||
| * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][ms]`, | ||
| * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][ms]Z`, | ||
| * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][ms]-[h]h:[m]m`, | ||
| * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][ms]+[h]h:[m]m`, | ||
| */ | ||
| def stringToTimestamp(s: UTF8String): Timestamp = { | ||
| if (s == null) { | ||
| return null | ||
| } | ||
| var timeZone: Option[Byte] = None | ||
| val segments: Array[Int] = Array[Int](1, 1, 1, 0, 0, 0, 0, 0, 0) | ||
| var i = 0 | ||
| var currentSegmentValue = 0 | ||
| val bytes = s.getBytes | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's cool to work on bytes. |
||
| var j = 0 | ||
| var digitsMilli = 0 | ||
| while (j < bytes.length) { | ||
| val b = bytes(j) | ||
| val parsedValue = b - 48 | ||
| if (parsedValue < 0 || parsedValue > 9) { | ||
| if (i < 2) { | ||
| if (b == '-') { | ||
| segments(i) = currentSegmentValue | ||
| currentSegmentValue = 0 | ||
| i += 1 | ||
| } else { | ||
| return null | ||
| } | ||
| } else if (i == 2) { | ||
| if (b == ' ' || b == 'T') { | ||
| segments(i) = currentSegmentValue | ||
| currentSegmentValue = 0 | ||
| i += 1 | ||
| } else { | ||
| return null | ||
| } | ||
| } else if (i < 5) { | ||
| if (b == ':') { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We will support
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is equals to
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Or we can use |
||
| segments(i) = currentSegmentValue | ||
| currentSegmentValue = 0 | ||
| i += 1 | ||
| } else { | ||
| return null | ||
| } | ||
| } else if (i < 7) { | ||
| if (b == 'Z') { | ||
| segments(i) = currentSegmentValue | ||
| currentSegmentValue = 0 | ||
| i += 1 | ||
| timeZone = Some(43) | ||
| } else if (b == '-' || b == '+') { | ||
| segments(i) = currentSegmentValue | ||
| currentSegmentValue = 0 | ||
| i += 1 | ||
| timeZone = Some(b) | ||
| } else if (b == '.' && i == 5) { | ||
| segments(i) = currentSegmentValue | ||
| currentSegmentValue = 0 | ||
| i += 1 | ||
| } else { | ||
| return null | ||
| } | ||
| if (i == 6 && b != '.') { | ||
| i += 1 | ||
| } | ||
| } else if (i > 6) { | ||
| if (b == ':') { | ||
| segments(i) = currentSegmentValue | ||
| currentSegmentValue = 0 | ||
| i += 1 | ||
| } else { | ||
| return null | ||
| } | ||
| } | ||
| } else { | ||
| if (i == 6) { | ||
| digitsMilli += 1 | ||
| } | ||
| currentSegmentValue = currentSegmentValue * 10 + parsedValue | ||
| } | ||
| j += 1 | ||
| } | ||
| if (i > 8) { | ||
| return null | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we just throw away the garbage on the right side? As we do for DateType.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Okay. If there is a space the garbage is ignored |
||
| } | ||
| segments(i) = currentSegmentValue | ||
|
|
||
| // Hive compatibility 2011-05-06 07:08:09.1000 == 2011-05-06 07:08:09.1 | ||
| if (digitsMilli == 4) { | ||
| segments(6) = segments(6) / 10 | ||
| } | ||
|
|
||
| // 18:3:1.1 is equals to 18:3:1:100 | ||
| if (digitsMilli == 1) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this could be (in microsecond):
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good idea! |
||
| segments(6) = segments(6) * 100 | ||
| } else if (digitsMilli == 2) { | ||
| segments(6) = segments(6) * 10 | ||
| } | ||
|
|
||
| if (segments(0) < 0 || segments(0) > 9999 || segments(1) < 1 || segments(1) > 12 || | ||
| segments(2) < 1 || segments(2) > 31 || segments(3) < 0 || segments(3) > 23 || | ||
| segments(4) < 0 || segments(4) > 59 || segments(5) < 0 || segments(5) > 59 || | ||
| segments(6) < 0 || segments(6) > 999 || segments(7) < 0 || segments(7) > 14 || | ||
| segments(8) < 0 || segments(8) > 59) { | ||
| return null | ||
| } | ||
| val c = if (timeZone.isEmpty) { | ||
| Calendar.getInstance() | ||
| } else { | ||
| Calendar.getInstance( | ||
| TimeZone.getTimeZone(f"GMT${timeZone.get.toChar}${segments(7)}%02d:${segments(8)}%02d")) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This could fail
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, if the given timezone is invalid, but should the result be null instead of an error? Or when will it fail?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The returned time zone could be null; can we have a test for that? It should return null, but does it throw an NPE now?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It doesn't seem to crash. It falls back to GMT (see http://docs.oracle.com/javase/7/docs/api/java/util/TimeZone.html#getTimeZone(java.lang.String)). But I added a filter:
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's better to return
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is now checked by the check above. TimeZone allows |
||
| } | ||
| c.set(segments(0), segments(1) - 1, segments(2), segments(3), segments(4), segments(5)) | ||
| c.set(Calendar.MILLISECOND, segments(6)) | ||
| new Timestamp(c.getTimeInMillis) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we return microseconds instead of Timestamp? |
||
| } | ||
|
|
||
| /** | ||
| * Parses a given UTF8 date string to the corresponding [[Date]] object. The format of the date | ||
| * has to be one of the following: `yyyy`, `yyyy-[m]m`, `yyyy-[m]m-[d]d`, `yyyy-[m]m-[d]d `, | ||
| * `yyyy-[m]m-[d]d *`, `yyyy-[m]m-[d]dT*` | ||
| */ | ||
| def stringToDate(s: UTF8String): Date = { | ||
| if (s == null) { | ||
| return null | ||
| } | ||
| val segments: Array[Int] = Array[Int](1, 1, 1) | ||
| var i = 0 | ||
| var currentSegmentValue = 0 | ||
| val bytes = s.getBytes | ||
| var j = 0 | ||
| while (j < bytes.length && (i < 3 && !(bytes(j) == ' ' || bytes(j) == 'T'))) { | ||
| val b = bytes(j) | ||
| if (i < 2 && b == '-') { | ||
| segments(i) = currentSegmentValue | ||
| currentSegmentValue = 0 | ||
| i += 1 | ||
| } else { | ||
| val parsedValue = b - 48 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes that would look better, but I was wondering if there is any performance/memory implication?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think the compiler could take care of it (constant folding) |
||
| if (parsedValue < 0 || parsedValue > 9) { | ||
| return null | ||
| } else { | ||
| currentSegmentValue = currentSegmentValue * 10 + parsedValue | ||
| } | ||
| } | ||
| j += 1 | ||
| } | ||
| segments(i) = currentSegmentValue | ||
| if (segments(0) < 0 || segments(0) > 9999 || segments(1) < 1 || segments(1) > 12 || | ||
| segments(2) < 1 || segments(2) > 31) { | ||
| return null | ||
| } | ||
| val c = Calendar.getInstance() | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you add some tests for a date at the beginning of Daylight Saving Time, to make sure that we can get the correct date back?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added tests. On |
||
| c.set(segments(0), segments(1) - 1, segments(2), 0, 0, 0) | ||
| c.set(Calendar.MILLISECOND, 0) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this necessary? |
||
| new Date(c.getTimeInMillis) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. c.getTime()
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we return the date as |
||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a Timestamp object, should we have a better name? parsedTime?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm going to adjust this.