Skip to content
This repository was archived by the owner on May 9, 2024. It is now read-only.

Commit c1083fb

Browse files
committed
[SPARK-8995] cast date strings like '2015-01-01 12:15:31' to date or timestamp
1 parent 851e247 commit c1083fb

File tree

4 files changed

+161
-10
lines changed

4 files changed

+161
-10
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -165,15 +165,22 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
165165
private[this] def castToTimestamp(from: DataType): Any => Any = from match {
166166
case StringType =>
167167
buildCast[UTF8String](_, utfs => {
168-
// Throw away extra if more than 9 decimal places
169-
val s = utfs.toString
170-
val periodIdx = s.indexOf(".")
171-
var n = s
172-
if (periodIdx != -1 && n.length() - periodIdx > 9) {
173-
n = n.substring(0, periodIdx + 10)
168+
val parsedDateString = DateTimeUtils.stringToMillis(utfs) * 1000
169+
if (parsedDateString == null.asInstanceOf[Long]) {
170+
// Throw away extra if more than 9 decimal places
171+
val s = utfs.toString
172+
val periodIdx = s.indexOf(".")
173+
var n = s
174+
if (periodIdx != -1 && n.length() - periodIdx > 9) {
175+
n = n.substring(0, periodIdx + 10)
176+
}
177+
try DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf(n))
178+
catch {
179+
case _: java.lang.IllegalArgumentException => null
180+
}
181+
} else {
182+
parsedDateString
174183
}
175-
try DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf(n))
176-
catch { case _: java.lang.IllegalArgumentException => null }
177184
})
178185
case BooleanType =>
179186
buildCast[Boolean](_, b => if (b) 1L else 0)
@@ -223,8 +230,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
223230
private[this] def castToDate(from: DataType): Any => Any = from match {
224231
case StringType =>
225232
buildCast[UTF8String](_, s =>
226-
try DateTimeUtils.fromJavaDate(Date.valueOf(s.toString))
227-
catch { case _: java.lang.IllegalArgumentException => null }
233+
DateTimeUtils.millisToDays(DateTimeUtils.stringToMillis(s))
228234
)
229235
case TimestampType =>
230236
// throw valid precision more than seconds, according to Hive.

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ import java.sql.{Date, Timestamp}
2121
import java.text.{DateFormat, SimpleDateFormat}
2222
import java.util.{Calendar, TimeZone}
2323

24+
import org.apache.spark.unsafe.types.UTF8String
25+
2426
/**
2527
* Helper functions for converting between internal and external date and time representations.
2628
* Dates are exposed externally as java.sql.Date and are represented internally as the number of
@@ -180,4 +182,52 @@ object DateTimeUtils {
180182
val nanos = (us % MICROS_PER_SECOND) * 1000L
181183
(day.toInt, secondsInDay * NANOS_PER_SECOND + nanos)
182184
}
185+
186+
/**
 * Parses a UTF8 date/time string into milliseconds since the epoch, interpreted in the default
 * time zone of the JVM (the one `Calendar.getInstance()` uses). The string has to be shaped
 * like `yyyy-[m]m-[d]d`, `yyyy-[m]m-[d]d [H]H:[m]m:[s]s` or `[H]H:[m]m:[s]s`. If only a date
 * is given, the time is set to midnight. If only a time is given (24-hour clock), the date is
 * set to today. The millisecond part of a parsed result is always zero.
 *
 * Only the characters are validated, not the value ranges: the default `Calendar` is lenient,
 * so an out-of-range field (for example month 13) rolls over instead of being rejected.
 *
 * @param s the string to parse, may be null
 * @return the parsed instant in milliseconds, or `0L` (which is what `null.asInstanceOf[Long]`
 *         evaluates to) if `s` is null, contains a character other than a digit, `-`, `:` or
 *         a space, or has fewer than three or more than six segments. Callers cannot tell this
 *         sentinel apart from an input that really denotes the epoch.
 */
def stringToMillis(s: UTF8String): Long = {
  // `null.asInstanceOf[Long]` unboxes to 0L; spelled out so the sentinel is explicit.
  val invalid = 0L
  if (s == null) {
    invalid
  } else {
    val bytes = s.getBytes
    // year, month, day, hour, minute, second (or hour, minute, second for time-only input)
    val segments = new Array[Int](6)
    var segmentIdx = 0
    var current = 0
    var justTime = false
    var valid = true
    var pos = 0
    // Plain while loop: no intermediate collection and no exception-based nonlocal return.
    while (valid && pos < bytes.length) {
      val b = bytes(pos)
      if (b == '-' || b == ':' || b == ' ') {
        if (segmentIdx >= segments.length - 1) {
          // More separators than the six segments can hold: reject instead of overflowing.
          valid = false
        } else {
          segments(segmentIdx) = current
          current = 0
          // A colon as the very first separator means the input carries no date part.
          if (segmentIdx == 0 && b == ':') {
            justTime = true
          }
          segmentIdx += 1
        }
      } else {
        val digit = b - '0'
        if (digit < 0 || digit > 9) {
          valid = false
        } else {
          current = current * 10 + digit
        }
      }
      pos += 1
    }
    if (!valid || segmentIdx < 2) {
      invalid
    } else {
      segments(segmentIdx) = current
      val c = Calendar.getInstance()
      if (justTime) {
        // HOUR_OF_DAY is the 24-hour field; HOUR is the 12-hour one and would be added on top
        // of the current AM/PM half of the day.
        c.set(Calendar.HOUR_OF_DAY, segments(0))
        c.set(Calendar.MINUTE, segments(1))
        c.set(Calendar.SECOND, segments(2))
      } else {
        c.set(segments(0), segments(1) - 1, segments(2), segments(3), segments(4), segments(5))
      }
      // Calendar.set leaves MILLISECOND at the wall-clock value of getInstance(); clear it so
      // the same input always yields the same result.
      c.set(Calendar.MILLISECOND, 0)
      c.getTimeInMillis
    }
  }
}
183233
}

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,13 @@
1818
package org.apache.spark.sql.catalyst.expressions
1919

2020
import java.sql.{Timestamp, Date}
21+
import java.util.Calendar
2122

2223
import org.apache.spark.SparkFunSuite
2324
import org.apache.spark.sql.catalyst.InternalRow
2425
import org.apache.spark.sql.catalyst.util.DateTimeUtils
2526
import org.apache.spark.sql.types._
27+
import org.apache.spark.unsafe.types.UTF8String
2628

2729
/**
2830
* Test suite for data type casting expression [[Cast]].
@@ -41,6 +43,42 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper {
4143
checkEvaluation(cast(v, Literal(expected).dataType), expected)
4244
}
4345

46+
test("cast string to date") {
  // One calendar is shared by all expectations, so the fields that a `set` call does not
  // touch keep the values they had when the calendar was created.
  val calendar = Calendar.getInstance()

  // Expected day number for a date; `month` is zero-based as in java.util.Calendar.
  // NOTE(review): the `- 1` and the reliance on the calendar's remaining time-of-day fields
  // make this expectation sensitive to the default time zone -- confirm it is intended.
  def expectedDay(year: Int, month: Int, day: Int): Int = {
    calendar.set(year, month, day)
    (calendar.getTimeInMillis / 1000L / 3600L / 24L - 1).toInt
  }

  // Expected whole seconds since the epoch for a full date-time.
  def expectedSeconds(year: Int, month: Int, day: Int, h: Int, m: Int, s: Int): Long = {
    calendar.set(year, month, day, h, m, s)
    calendar.getTimeInMillis / 1000L
  }

  // Casts `str` to a timestamp and reduces the result to whole seconds.
  def castToSeconds(str: String): Long =
    evaluate(Cast(Literal(str), TimestampType)).asInstanceOf[Long] / 1000000L

  // Date-only strings first, then strings whose time part has to be ignored by the cast.
  val dateCases = Seq(
    ("2015-1-1", (2015, 0, 1)),
    ("2015-02-01", (2015, 1, 1)),
    ("2015-06-15", (2015, 5, 15)),
    ("2015-1-1 13:10:58", (2015, 0, 1)),
    ("2015-02-01 13:10:58", (2015, 1, 1)),
    ("2015-06-15 13:10:58", (2015, 5, 15)))
  dateCases.foreach { case (str, (year, month, day)) =>
    checkEvaluation(Cast(Literal(str), DateType), expectedDay(year, month, day))
  }

  // Strings cast to timestamps are compared at second precision.
  val timestampCases = Seq(
    ("2015-1-1 13:10:58", (2015, 0, 1, 13, 10, 58)),
    ("2015-2-1 12:9:15", (2015, 1, 1, 12, 9, 15)),
    ("2015-6-15 17:58:46", (2015, 5, 15, 17, 58, 46)))
  timestampCases.foreach { case (str, (year, month, day, h, m, s)) =>
    assert(castToSeconds(str) == expectedSeconds(year, month, day, h, m, s))
  }
}
81+
4482
test("cast from int") {
4583
checkCast(0, false)
4684
checkCast(1, true)

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,10 @@ package org.apache.spark.sql.catalyst.util
1919

2020
import java.sql.{Date, Timestamp}
2121
import java.text.SimpleDateFormat
22+
import java.util.Calendar
2223

2324
import org.apache.spark.SparkFunSuite
25+
import org.apache.spark.unsafe.types.UTF8String
2426

2527
class DateTimeUtilsSuite extends SparkFunSuite {
2628

@@ -86,4 +88,59 @@ class DateTimeUtilsSuite extends SparkFunSuite {
8688
checkFromToJavaDate(new Date(df1.parse("1776-07-04 10:30:00").getTime))
8789
checkFromToJavaDate(new Date(df2.parse("1776-07-04 18:30:00 UTC").getTime))
8890
}
91+
92+
test("string to millis") {
  // Parses `str` and drops the sub-second part, so only whole seconds are compared.
  def parsedSeconds(str: String): Long =
    DateTimeUtils.stringToMillis(UTF8String.fromString(str)) / 1000

  // Expected whole seconds for a full date-time; `month` is zero-based as in
  // java.util.Calendar.
  def secondsAt(year: Int, month: Int, day: Int, h: Int, m: Int, s: Int): Long = {
    val c = Calendar.getInstance()
    c.set(year, month, day, h, m, s)
    c.getTimeInMillis / 1000
  }

  // Expected whole seconds for a time on today's date. HOUR_OF_DAY is the 24-hour field;
  // the 12-hour HOUR field would be added on top of the current AM/PM half of the day and
  // therefore point at tomorrow whenever the test runs after noon.
  def secondsToday(h: Int, m: Int, s: Int): Long = {
    val c = Calendar.getInstance()
    c.set(Calendar.HOUR_OF_DAY, h)
    c.set(Calendar.MINUTE, m)
    c.set(Calendar.SECOND, s)
    c.getTimeInMillis / 1000
  }

  // Malformed and null input yield the sentinel `null.asInstanceOf[Long]`, i.e. 0L.
  assert(DateTimeUtils.stringToMillis(UTF8String.fromString("18,00:00")) ==
    null.asInstanceOf[Long])
  assert(DateTimeUtils.stringToMillis(UTF8String.fromString(null)) ==
    null.asInstanceOf[Long])

  // Date only: the time defaults to midnight.
  assert(parsedSeconds("2015-01-01") == secondsAt(2015, 0, 1, 0, 0, 0))
  assert(parsedSeconds("2015-3-18") == secondsAt(2015, 2, 18, 0, 0, 0))
  assert(parsedSeconds("2015-03-18") == secondsAt(2015, 2, 18, 0, 0, 0))
  assert(parsedSeconds("2015-12-24") == secondsAt(2015, 11, 24, 0, 0, 0))

  // Date and time, with and without zero padding.
  assert(parsedSeconds("2015-01-01 12:30:58") == secondsAt(2015, 0, 1, 12, 30, 58))
  assert(parsedSeconds("2015-3-18 9:7:2") == secondsAt(2015, 2, 18, 9, 7, 2))
  assert(parsedSeconds("2015-03-18 09:07:02") == secondsAt(2015, 2, 18, 9, 7, 2))
  assert(parsedSeconds("2015-12-24 18:00:00") == secondsAt(2015, 11, 24, 18, 0, 0))

  // Time only: the date defaults to today.
  assert(parsedSeconds("12:30:58") == secondsToday(12, 30, 58))
  assert(parsedSeconds("9:7:2") == secondsToday(9, 7, 2))
  assert(parsedSeconds("09:07:02") == secondsToday(9, 7, 2))
  assert(parsedSeconds("18:00:00") == secondsToday(18, 0, 0))
}
89146
}

0 commit comments

Comments
 (0)