Skip to content
Closed
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -165,15 +165,8 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
private[this] def castToTimestamp(from: DataType): Any => Any = from match {
case StringType =>
buildCast[UTF8String](_, utfs => {
// Throw away extra if more than 9 decimal places
val s = utfs.toString
val periodIdx = s.indexOf(".")
var n = s
if (periodIdx != -1 && n.length() - periodIdx > 9) {
n = n.substring(0, periodIdx + 10)
}
try DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf(n))
catch { case _: java.lang.IllegalArgumentException => null }
val parsedDateString = DateTimeUtils.stringToTimestamp(utfs)
if (parsedDateString == null) null else DateTimeUtils.fromJavaTimestamp(parsedDateString)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a Timestamp object, should we have a better name? parsedTime?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm going to adjust this.

})
case BooleanType =>
buildCast[Boolean](_, b => if (b) 1L else 0)
Expand Down Expand Up @@ -222,10 +215,10 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
// DateConverter
private[this] def castToDate(from: DataType): Any => Any = from match {
case StringType =>
buildCast[UTF8String](_, s =>
try DateTimeUtils.fromJavaDate(Date.valueOf(s.toString))
catch { case _: java.lang.IllegalArgumentException => null }
)
buildCast[UTF8String](_, s => {
val parsedDate = DateTimeUtils.stringToDate(s)
if (parsedDate == null) null else DateTimeUtils.fromJavaDate(parsedDate)
})
case TimestampType =>
// Throw away any precision finer than seconds, according to Hive.
// Timestamp.nanos is in 0 to 999,999,999, no more than a second.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ import java.sql.{Date, Timestamp}
import java.text.{DateFormat, SimpleDateFormat}
import java.util.{Calendar, TimeZone}

import org.apache.spark.unsafe.types.UTF8String

/**
* Helper functions for converting between internal and external date and time representations.
* Dates are exposed externally as java.sql.Date and are represented internally as the number of
Expand Down Expand Up @@ -180,4 +182,169 @@ object DateTimeUtils {
val nanos = (us % MICROS_PER_SECOND) * 1000L
(day.toInt, secondsInDay * NANOS_PER_SECOND + nanos)
}

/**
* Parses a given UTF8 date string to the corresponding [[Timestamp]] object. The format of the
* date has to be one of the following: `yyyy`, `yyyy-[m]m`, `yyyy-[m]m-[d]d`, `yyyy-[m]m-[d]d `,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's better to have each format in a single line.

* `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][ms]`,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we have a way to support 1us precision? MySQL does support that.

* `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][ms]Z`,
* `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][ms]-[h]h:[m]m`,
* `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][ms]+[h]h:[m]m`,
* `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][ms]`,
* `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][ms]Z`,
* `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][ms]-[h]h:[m]m`,
* `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][ms]+[h]h:[m]m`,
*/
def stringToTimestamp(s: UTF8String): Timestamp = {
if (s == null) {
return null
}
var timeZone: Option[Byte] = None
val segments: Array[Int] = Array[Int](1, 1, 1, 0, 0, 0, 0, 0, 0)
var i = 0
var currentSegmentValue = 0
val bytes = s.getBytes
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's cool to work on bytes.

var j = 0
var digitsMilli = 0
while (j < bytes.length) {
val b = bytes(j)
val parsedValue = b - 48
if (parsedValue < 0 || parsedValue > 9) {
if (i < 2) {
if (b == '-') {
segments(i) = currentSegmentValue
currentSegmentValue = 0
i += 1
} else {
return null
}
} else if (i == 2) {
if (b == ' ' || b == 'T') {
segments(i) = currentSegmentValue
currentSegmentValue = 0
i += 1
} else {
return null
}
} else if (i < 5) {
if (b == ':') {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We will support 2014:1:1 here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is equal to i == 3 || i == 4, because of the preceding if and else-if. I am going to adjust the checks so that
they are more readable.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or we can use switch instead of if.

segments(i) = currentSegmentValue
currentSegmentValue = 0
i += 1
} else {
return null
}
} else if (i < 7) {
if (b == 'Z') {
segments(i) = currentSegmentValue
currentSegmentValue = 0
i += 1
timeZone = Some(43)
} else if (b == '-' || b == '+') {
segments(i) = currentSegmentValue
currentSegmentValue = 0
i += 1
timeZone = Some(b)
} else if (b == '.' && i == 5) {
segments(i) = currentSegmentValue
currentSegmentValue = 0
i += 1
} else {
return null
}
if (i == 6 && b != '.') {
i += 1
}
} else if (i > 6) {
if (b == ':') {
segments(i) = currentSegmentValue
currentSegmentValue = 0
i += 1
} else {
return null
}
}
} else {
if (i == 6) {
digitsMilli += 1
}
currentSegmentValue = currentSegmentValue * 10 + parsedValue
}
j += 1
}
if (i > 8) {
return null
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we just throw away the garbage on the right side? As we do for DateType.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay. If there is a space the garbage is ignored

}
segments(i) = currentSegmentValue

// Hive compatibility 2011-05-06 07:08:09.1000 == 2011-05-06 07:08:09.1
if (digitsMilli == 4) {
segments(6) = segments(6) / 10
}

// 18:3:1.1 is equal to 18:3:1.100
if (digitsMilli == 1) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this could be (in microsecond):

while (digitsMilli < 6) {
  segments(6) *= 10;
  digitsMilli += 1;
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea!

segments(6) = segments(6) * 100
} else if (digitsMilli == 2) {
segments(6) = segments(6) * 10
}

if (segments(0) < 0 || segments(0) > 9999 || segments(1) < 1 || segments(1) > 12 ||
segments(2) < 1 || segments(2) > 31 || segments(3) < 0 || segments(3) > 23 ||
segments(4) < 0 || segments(4) > 59 || segments(5) < 0 || segments(5) > 59 ||
segments(6) < 0 || segments(6) > 999 || segments(7) < 0 || segments(7) > 14 ||
segments(8) < 0 || segments(8) > 59) {
return null
}
val c = if (timeZone.isEmpty) {
Calendar.getInstance()
} else {
Calendar.getInstance(
TimeZone.getTimeZone(f"GMT${timeZone.get.toChar}${segments(7)}%02d:${segments(8)}%02d"))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could fail

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, if the given timezone is invalid, but should the result be null instead of an error? Or when will it fail?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The returned timezone could be null, can we have a test for that?

It should return null, but does it throw an NPE now?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't seem to crash. It falls back to GMT (UTC)

scala> TimeZone.getTimeZone("asd")
res0: java.util.TimeZone = sun.util.calendar.ZoneInfo[id="GMT",offset=0,dstSavings=0,useDaylight=false,transitions=0,lastRule=null]

and



scala> TimeZone.getTimeZone("GMT+99:09")
res2: java.util.TimeZone = sun.util.calendar.ZoneInfo[id="GMT",offset=0,dstSavings=0,useDaylight=false,transitions=0,lastRule=null]

http://docs.oracle.com/javase/7/docs/api/java/util/TimeZone.html#getTimeZone(java.lang.String)
the specified TimeZone, or the _GMT zone if the given ID cannot be understood._

But I added a filter:
segments(7) < 0 || segments(7) > 14 || segments(8) < 0 || segments(8) > 59
This isn't very consistent. We should either allow everything and fall back to GMT, or filter
in a more sophisticated way, shouldn't we?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's better to return null if we can tell that it failed to look up a timezone.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is now checked by the check above. TimeZone allows 0-23 as hour and 0-59 as minute. If this is not given we are going to return None

}
c.set(segments(0), segments(1) - 1, segments(2), segments(3), segments(4), segments(5))
c.set(Calendar.MILLISECOND, segments(6))
new Timestamp(c.getTimeInMillis)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we return microseconds instead of Timestamp?

}

/**
* Parses a given UTF8 date string to the corresponding [[Date]] object. The format of the date
* has to be one of the following: `yyyy`, `yyyy-[m]m`, `yyyy-[m]m-[d]d`, `yyyy-[m]m-[d]d `,
* `yyyy-[m]m-[d]d *`, `yyyy-[m]m-[d]dT*`
*/
def stringToDate(s: UTF8String): Date = {
if (s == null) {
return null
}
val segments: Array[Int] = Array[Int](1, 1, 1)
var i = 0
var currentSegmentValue = 0
val bytes = s.getBytes
var j = 0
while (j < bytes.length && (i < 3 && !(bytes(j) == ' ' || bytes(j) == 'T'))) {
val b = bytes(j)
if (i < 2 && b == '-') {
segments(i) = currentSegmentValue
currentSegmentValue = 0
i += 1
} else {
val parsedValue = b - 48
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is '0'.toByte more meaningful than 48?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes that would look better, but I was wondering if there is any performance/memory implication?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the compiler could take care of it (constant folding)

if (parsedValue < 0 || parsedValue > 9) {
return null
} else {
currentSegmentValue = currentSegmentValue * 10 + parsedValue
}
}
j += 1
}
segments(i) = currentSegmentValue
if (segments(0) < 0 || segments(0) > 9999 || segments(1) < 1 || segments(1) > 12 ||
segments(2) < 1 || segments(2) > 31) {
return null
}
val c = Calendar.getInstance()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add some tests for the date that is the beginning of Daylight Saving Time? To make sure that we can get the correct date back.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added tests. On 2015-3-8, I check if 2:30 == 3:30. In November 2:30 is ambiguous, because of that, I check if we get the same result as the Calendar. Besides that I check if we can cast from TimeStamp to String to TimeStamp.

c.set(segments(0), segments(1) - 1, segments(2), 0, 0, 0)
c.set(Calendar.MILLISECOND, 0)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this necessary?

new Date(c.getTimeInMillis)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

c.getTime()

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we return the date as Int (the internal type for DateType)?

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,13 @@
package org.apache.spark.sql.catalyst.expressions

import java.sql.{Timestamp, Date}
import java.util.{TimeZone, Calendar}

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

/**
* Test suite for data type casting expression [[Cast]].
Expand All @@ -41,6 +43,136 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
checkEvaluation(cast(v, Literal(expected).dataType), expected)
}

test("cast string to date") {
  // Expected java.sql.Date: midnight of the given year / zero-based month / day, computed with a
  // Calendar in the JVM default time zone.
  def midnightOf(year: Int, month: Int, day: Int): Date = {
    val cal = Calendar.getInstance()
    cal.set(year, month, day, 0, 0, 0)
    cal.set(Calendar.MILLISECOND, 0)
    new Date(cal.getTimeInMillis)
  }

  // Missing month and/or day segments are expected to default to 1.
  checkEvaluation(Cast(Literal("2015"), DateType), midnightOf(2015, 0, 1))
  checkEvaluation(Cast(Literal("2015-03"), DateType), midnightOf(2015, 2, 1))

  // A full date, optionally followed by ' ' or 'T' and arbitrary trailing text.
  val fullDateInputs = Seq(
    "2015-03-18",
    "2015-03-18 ",
    "2015-03-18 123142",
    "2015-03-18T123123",
    "2015-03-18T")
  fullDateInputs.foreach { input =>
    checkEvaluation(Cast(Literal(input), DateType), midnightOf(2015, 2, 18))
  }

  // Inputs that are expected to evaluate to null.
  val rejectedInputs = Seq(
    "2015-03-18X",
    "2015/03/18",
    "2015.03.18",
    "20150318",
    "2015-031-8")
  rejectedInputs.foreach { input =>
    checkEvaluation(Cast(Literal(input), DateType), null)
  }
}

test("cast string to timestamp") {
  // Expected java.sql.Timestamp for the given wall-clock fields in 2015 (zero-based month).
  // The fields are interpreted in `zoneId` when one is given, otherwise in the JVM default zone.
  def expectedAt(
      zoneId: Option[String],
      month: Int,
      day: Int,
      hour: Int,
      minute: Int,
      second: Int,
      millis: Int): Timestamp = {
    val cal = zoneId match {
      case Some(id) => Calendar.getInstance(TimeZone.getTimeZone(id))
      case None => Calendar.getInstance()
    }
    cal.set(2015, month, day, hour, minute, second)
    cal.set(Calendar.MILLISECOND, millis)
    new Timestamp(cal.getTimeInMillis)
  }

  // Checks that casting every input string to TimestampType evaluates to `expected`.
  def assertCasts(expected: Timestamp)(inputs: String*): Unit =
    inputs.foreach { input =>
      checkEvaluation(Cast(Literal(input), TimestampType), expected)
    }

  // Date-only inputs: omitted segments default to the first month / first day at midnight.
  assertCasts(expectedAt(None, 0, 1, 0, 0, 0, 0))("2015")
  assertCasts(expectedAt(None, 2, 1, 0, 0, 0, 0))("2015-03")
  assertCasts(expectedAt(None, 2, 18, 0, 0, 0, 0))(
    "2015-03-18",
    "2015-03-18 ",
    "2015-03-18T")

  // Date and time without an explicit zone, separated by ' ' or 'T'.
  assertCasts(expectedAt(None, 2, 18, 12, 3, 17, 0))(
    "2015-03-18 12:03:17",
    "2015-03-18T12:03:17")

  // Explicit zone: 'Z' or a signed offset, with or without zero padding.
  assertCasts(expectedAt(Some("UTC"), 2, 18, 12, 3, 17, 0))(
    "2015-03-18T12:03:17Z",
    "2015-03-18 12:03:17Z")
  assertCasts(expectedAt(Some("GMT-01:00"), 2, 18, 12, 3, 17, 0))(
    "2015-03-18T12:03:17-1:0",
    "2015-03-18T12:03:17-01:00")
  assertCasts(expectedAt(Some("GMT+07:30"), 2, 18, 12, 3, 17, 0))(
    "2015-03-18T12:03:17+07:30")
  assertCasts(expectedAt(Some("GMT+07:03"), 2, 18, 12, 3, 17, 0))(
    "2015-03-18T12:03:17+7:3")

  // Same shapes again, this time with a fractional-second part.
  assertCasts(expectedAt(None, 2, 18, 12, 3, 17, 123))(
    "2015-03-18 12:03:17.123",
    "2015-03-18T12:03:17.123")
  assertCasts(expectedAt(Some("UTC"), 2, 18, 12, 3, 17, 456))(
    "2015-03-18T12:03:17.456Z",
    "2015-03-18 12:03:17.456Z")
  assertCasts(expectedAt(Some("GMT-01:00"), 2, 18, 12, 3, 17, 123))(
    "2015-03-18T12:03:17.123-1:0",
    "2015-03-18T12:03:17.123-01:00")
  assertCasts(expectedAt(Some("GMT+07:30"), 2, 18, 12, 3, 17, 123))(
    "2015-03-18T12:03:17.123+07:30")
  assertCasts(expectedAt(Some("GMT+07:03"), 2, 18, 12, 3, 17, 123))(
    "2015-03-18T12:03:17.123+7:3")

  // Inputs that are expected to evaluate to null.
  val rejectedInputs = Seq(
    "2015-03-18 123142",
    "2015-03-18T123123",
    "2015-03-18X",
    "2015/03/18",
    "2015.03.18",
    "20150318",
    "2015-031-8",
    "2015-03-18T12:03:17-20:0",
    "2015-03-18T12:03:17-0:70",
    "2015-03-18T12:03:17-1:0:0")
  rejectedInputs.foreach { input =>
    checkEvaluation(Cast(Literal(input), TimestampType), null)
  }
}

test("cast from int") {
checkCast(0, false)
checkCast(1, true)
Expand Down Expand Up @@ -175,7 +307,7 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
checkEvaluation(
cast(cast(cast(cast(cast(cast("5", TimestampType), ByteType),
DecimalType.Unlimited), LongType), StringType), ShortType),
null)
0.toShort)
checkEvaluation(cast(cast(cast(cast(cast(cast("5", DecimalType.Unlimited),
ByteType), TimestampType), LongType), StringType), ShortType),
0.toShort)
Expand Down
Loading