Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -164,17 +164,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
// TimestampConverter
private[this] def castToTimestamp(from: DataType): Any => Any = from match {
case StringType =>
buildCast[UTF8String](_, utfs => {
// Throw away extra if more than 9 decimal places
val s = utfs.toString
val periodIdx = s.indexOf(".")
var n = s
if (periodIdx != -1 && n.length() - periodIdx > 9) {
n = n.substring(0, periodIdx + 10)
}
try DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf(n))
catch { case _: java.lang.IllegalArgumentException => null }
})
buildCast[UTF8String](_, utfs => DateTimeUtils.stringToTimestamp(utfs).orNull)
case BooleanType =>
buildCast[Boolean](_, b => if (b) 1L else 0)
case LongType =>
Expand Down Expand Up @@ -222,10 +212,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
// DateConverter
private[this] def castToDate(from: DataType): Any => Any = from match {
case StringType =>
buildCast[UTF8String](_, s =>
try DateTimeUtils.fromJavaDate(Date.valueOf(s.toString))
catch { case _: java.lang.IllegalArgumentException => null }
)
buildCast[UTF8String](_, s => DateTimeUtils.stringToDate(s).orNull)
case TimestampType =>
// Discard any precision finer than seconds, according to Hive.
// Timestamp.nanos is in 0 to 999,999,999, no more than a second.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ import java.sql.{Date, Timestamp}
import java.text.{DateFormat, SimpleDateFormat}
import java.util.{Calendar, TimeZone}

import org.apache.spark.unsafe.types.UTF8String

/**
* Helper functions for converting between internal and external date and time representations.
* Dates are exposed externally as java.sql.Date and are represented internally as the number of
Expand Down Expand Up @@ -180,4 +182,200 @@ object DateTimeUtils {
val nanos = (us % MICROS_PER_SECOND) * 1000L
(day.toInt, secondsInDay * NANOS_PER_SECOND + nanos)
}

/**
 * Parses a given UTF8 timestamp string to the corresponding [[Long]] value, expressed in
 * microseconds since the epoch.
 * The return type is [[Option]] in order to distinguish between 0L and null. The following
 * formats are allowed:
 *
 * `yyyy`
 * `yyyy-[m]m`
 * `yyyy-[m]m-[d]d`
 * `yyyy-[m]m-[d]d `
 * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]`
 * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z`
 * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m`
 * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m`
 * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]`
 * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z`
 * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m`
 * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m`
 * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]`
 * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z`
 * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m`
 * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m`
 * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]`
 * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z`
 * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m`
 * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m`
 *
 * Inputs without an explicit zone are interpreted in the JVM default time zone. Time-only
 * inputs keep the date fields of the calendar as returned by `Calendar.getInstance`.
 *
 * @param s the string to parse; may be null
 * @return the parsed instant in microseconds, or None if `s` is null or malformed
 */
def stringToTimestamp(s: UTF8String): Option[Long] = {
  if (s == null) {
    return None
  }
  // Sign byte ('+' or '-') of an explicit zone offset; None means "use the default zone".
  var timeZone: Option[Byte] = None
  // Segment layout: 0 = year, 1 = month, 2 = day, 3 = hour, 4 = minute, 5 = second,
  // 6 = fraction of a second, 7 = zone offset hours, 8 = zone offset minutes.
  val segments: Array[Int] = Array[Int](1, 1, 1, 0, 0, 0, 0, 0, 0)
  var i = 0
  var currentSegmentValue = 0
  val bytes = s.getBytes
  var j = 0
  var digitsMilli = 0
  var justTime = false
  while (j < bytes.length) {
    val b = bytes(j)
    val parsedValue = b - '0'.toByte
    if (parsedValue < 0 || parsedValue > 9) {
      // Non-digit: it must be the separator that is legal after the current segment.
      if (j == 0 && b == 'T') {
        // A leading 'T' introduces a time-only value; skip the three date segments.
        justTime = true
        i += 3
      } else if (i < 2) {
        if (b == '-') {
          segments(i) = currentSegmentValue
          currentSegmentValue = 0
          i += 1
        } else if (i == 0 && b == ':') {
          // The first number was an hour, not a year: this is a time-only value.
          justTime = true
          segments(3) = currentSegmentValue
          currentSegmentValue = 0
          i = 4
        } else {
          return None
        }
      } else if (i == 2) {
        if (b == ' ' || b == 'T') {
          segments(i) = currentSegmentValue
          currentSegmentValue = 0
          i += 1
        } else {
          return None
        }
      } else if (i == 3 || i == 4) {
        if (b == ':') {
          segments(i) = currentSegmentValue
          currentSegmentValue = 0
          i += 1
        } else {
          return None
        }
      } else if (i == 5 || i == 6) {
        if (b == 'Z') {
          segments(i) = currentSegmentValue
          currentSegmentValue = 0
          i += 1
          // 'Z' is treated as a "+00:00" offset.
          timeZone = Some('+'.toByte)
        } else if (b == '-' || b == '+') {
          segments(i) = currentSegmentValue
          currentSegmentValue = 0
          i += 1
          timeZone = Some(b)
        } else if (b == '.' && i == 5) {
          segments(i) = currentSegmentValue
          currentSegmentValue = 0
          i += 1
        } else {
          return None
        }
        // A zone marker directly after the seconds skips the (absent) fraction segment.
        if (i == 6 && b != '.') {
          i += 1
        }
      } else if (i == 7 && (b == ':' || b == ' ')) {
        segments(i) = currentSegmentValue
        currentSegmentValue = 0
        i += 1
      } else {
        // Anything after the zone-offset minutes is malformed. Rejecting it here also keeps
        // the segment index inside the array instead of failing with an index error.
        return None
      }
    } else {
      if (i == 6) {
        digitsMilli += 1
      }
      currentSegmentValue = currentSegmentValue * 10 + parsedValue
    }
    j += 1
  }

  segments(i) = currentSegmentValue

  // Scale the fraction to exactly six digits (microseconds).
  while (digitsMilli < 6) {
    segments(6) *= 10
    digitsMilli += 1
  }

  if (!justTime && (segments(0) < 1000 || segments(0) > 9999 || segments(1) < 1 ||
      segments(1) > 12 || segments(2) < 1 || segments(2) > 31)) {
    return None
  }

  // TimeZone.getTimeZone silently falls back to GMT for IDs it cannot understand, so the
  // offset is range-checked here to report malformed offsets as None instead.
  if (segments(3) < 0 || segments(3) > 23 || segments(4) < 0 || segments(4) > 59 ||
      segments(5) < 0 || segments(5) > 59 || segments(6) < 0 || segments(6) > 999999 ||
      segments(7) < 0 || segments(7) > 23 || segments(8) < 0 || segments(8) > 59) {
    return None
  }

  val c = timeZone match {
    case None =>
      Calendar.getInstance()
    case Some(sign) =>
      Calendar.getInstance(
        TimeZone.getTimeZone(f"GMT${sign.toChar}${segments(7)}%02d:${segments(8)}%02d"))
  }
  // Calendar.getInstance() carries the current milliseconds; drop them so the value below is
  // a whole number of seconds and the sub-second part comes only from the parsed fraction.
  c.set(Calendar.MILLISECOND, 0)

  if (justTime) {
    // HOUR_OF_DAY is the 24-hour field; HOUR is the 12-hour field and would depend on AM/PM.
    c.set(Calendar.HOUR_OF_DAY, segments(3))
    c.set(Calendar.MINUTE, segments(4))
    c.set(Calendar.SECOND, segments(5))
  } else {
    c.set(segments(0), segments(1) - 1, segments(2), segments(3), segments(4), segments(5))
  }

  Some(c.getTimeInMillis * 1000L + segments(6))
}

/**
 * Parses a given UTF8 date string to the corresponding [[Int]] value, expressed as the number
 * of days since 1970-01-01.
 * The return type is [[Option]] in order to distinguish between 0 and null. The following
 * formats are allowed:
 *
 * `yyyy`
 * `yyyy-[m]m`
 * `yyyy-[m]m-[d]d`
 * `yyyy-[m]m-[d]d `
 * `yyyy-[m]m-[d]d *`
 * `yyyy-[m]m-[d]dT*`
 *
 * Parsing stops at the first ' ' or 'T'; anything after it is ignored.
 *
 * @param s the string to parse; may be null
 * @return the parsed day number, or None if `s` is null or malformed
 */
def stringToDate(s: UTF8String): Option[Int] = {
  if (s == null) {
    return None
  }
  // Segment layout: 0 = year, 1 = month, 2 = day; month and day default to 1 when omitted.
  val segments: Array[Int] = Array[Int](1, 1, 1)
  var i = 0
  var currentSegmentValue = 0
  val bytes = s.getBytes
  var j = 0
  while (j < bytes.length && !(bytes(j) == ' ' || bytes(j) == 'T')) {
    val b = bytes(j)
    if (i < 2 && b == '-') {
      segments(i) = currentSegmentValue
      currentSegmentValue = 0
      i += 1
    } else {
      val parsedValue = b - '0'.toByte
      if (parsedValue < 0 || parsedValue > 9) {
        return None
      } else {
        currentSegmentValue = currentSegmentValue * 10 + parsedValue
      }
    }
    j += 1
  }
  segments(i) = currentSegmentValue
  if (segments(0) < 1000 || segments(0) > 9999 || segments(1) < 1 || segments(1) > 12 ||
      segments(2) < 1 || segments(2) > 31) {
    return None
  }
  // Build midnight of the parsed date in GMT so that dividing by the length of a day yields
  // the calendar day number regardless of the JVM default time zone or any daylight saving
  // transition on that date.
  val c = Calendar.getInstance(TimeZone.getTimeZone("GMT"))
  c.set(segments(0), segments(1) - 1, segments(2), 0, 0, 0)
  // Calendar.getInstance carries the current milliseconds; clear them so the division is
  // exact, including for dates before the epoch.
  c.set(Calendar.MILLISECOND, 0)
  Some((c.getTimeInMillis / (24L * 60 * 60 * 1000)).toInt)
}
}
Loading