Skip to content
This repository was archived by the owner on May 9, 2024. It is now read-only.

Commit 4ea6480

Browse files
tarekbeckerdavies
authored and committed
[SPARK-8995] [SQL] cast date strings like '2015-01-01 12:15:31' to date
Jira https://issues.apache.org/jira/browse/SPARK-8995 In PR #6981 we noticed that we cannot cast date strings that contain a time, like '2015-03-18 12:39:40' to date. Besides, it's not possible to cast a string like '18:03:20' to a timestamp. If a time is passed without a date, today is inferred as date. Author: Tarek Auel <[email protected]> Author: Tarek Auel <[email protected]> Closes apache#7353 from tarekauel/SPARK-8995 and squashes the following commits: 14f333b [Tarek Auel] [SPARK-8995] added tests for daylight saving time ca1ae69 [Tarek Auel] [SPARK-8995] style fix d20b8b4 [Tarek Auel] [SPARK-8995] bug fix: distinguish between 0 and null ef05753 [Tarek Auel] [SPARK-8995] added check for year >= 1000 01c9ff3 [Tarek Auel] [SPARK-8995] support for time strings 34ec573 [Tarek Auel] fixed style 71622c0 [Tarek Auel] improved timestamp and date parsing 0e30c0a [Tarek Auel] Hive compatibility cfbaed7 [Tarek Auel] fixed wrong checks 71f89c1 [Tarek Auel] [SPARK-8995] minor style fix f7452fa [Tarek Auel] [SPARK-8995] removed old timestamp parsing 30e5aec [Tarek Auel] [SPARK-8995] date and timestamp cast c1083fb [Tarek Auel] [SPARK-8995] cast date strings like '2015-01-01 12:15:31' to date or timestamp
1 parent 0115516 commit 4ea6480

File tree

4 files changed

+562
-15
lines changed

4 files changed

+562
-15
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -167,17 +167,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
167167
// TimestampConverter
168168
private[this] def castToTimestamp(from: DataType): Any => Any = from match {
169169
case StringType =>
170-
buildCast[UTF8String](_, utfs => {
171-
// Throw away extra if more than 9 decimal places
172-
val s = utfs.toString
173-
val periodIdx = s.indexOf(".")
174-
var n = s
175-
if (periodIdx != -1 && n.length() - periodIdx > 9) {
176-
n = n.substring(0, periodIdx + 10)
177-
}
178-
try DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf(n))
179-
catch { case _: java.lang.IllegalArgumentException => null }
180-
})
170+
buildCast[UTF8String](_, utfs => DateTimeUtils.stringToTimestamp(utfs).orNull)
181171
case BooleanType =>
182172
buildCast[Boolean](_, b => if (b) 1L else 0)
183173
case LongType =>
@@ -220,10 +210,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
220210
// DateConverter
221211
private[this] def castToDate(from: DataType): Any => Any = from match {
222212
case StringType =>
223-
buildCast[UTF8String](_, s =>
224-
try DateTimeUtils.fromJavaDate(Date.valueOf(s.toString))
225-
catch { case _: java.lang.IllegalArgumentException => null }
226-
)
213+
buildCast[UTF8String](_, s => DateTimeUtils.stringToDate(s).orNull)
227214
case TimestampType =>
228215
// throw valid precision more than seconds, according to Hive.
229216
// Timestamp.nanos is in 0 to 999,999,999, no more than a second.

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ import java.sql.{Date, Timestamp}
2121
import java.text.{DateFormat, SimpleDateFormat}
2222
import java.util.{Calendar, TimeZone}
2323

24+
import org.apache.spark.unsafe.types.UTF8String
25+
2426
/**
2527
* Helper functions for converting between internal and external date and time representations.
2628
* Dates are exposed externally as java.sql.Date and are represented internally as the number of
@@ -180,4 +182,200 @@ object DateTimeUtils {
180182
val nanos = (us % MICROS_PER_SECOND) * 1000L
181183
(day.toInt, secondsInDay * NANOS_PER_SECOND + nanos)
182184
}
185+
186+
/**
 * Parses a given UTF8 timestamp string to the corresponding [[Long]] value, expressed as
 * microseconds since the epoch (whole seconds of the resolved instant * 1000000 + the parsed
 * fraction). The return type is [[Option]] in order to distinguish between 0L and null.
 * `None` is returned for null input and for any string that does not match one of the
 * following formats:
 *
 * `yyyy`
 * `yyyy-[m]m`
 * `yyyy-[m]m-[d]d`
 * `yyyy-[m]m-[d]d `
 * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]`
 * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z`
 * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m`
 * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m`
 * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]`
 * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z`
 * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m`
 * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m`
 * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]`
 * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z`
 * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m`
 * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m`
 * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]`
 * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z`
 * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m`
 * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m`
 *
 * When the string carries no zone offset, the JVM default time zone is used. When only a time
 * is given, the date fields of the freshly created calendar (i.e. the current date) are kept.
 */
def stringToTimestamp(s: UTF8String): Option[Long] = {
  if (s == null) {
    return None
  }
  // Sign byte ('+' or '-') of an explicit zone offset; None means "no zone in the input".
  var timeZone: Option[Byte] = None
  // Index: 0 year, 1 month, 2 day, 3 hour, 4 minute, 5 second, 6 fraction (microseconds),
  // 7 zone-offset hours, 8 zone-offset minutes.
  val segments: Array[Int] = Array[Int](1, 1, 1, 0, 0, 0, 0, 0, 0)
  // Index of the segment currently being accumulated.
  var i = 0
  var currentSegmentValue = 0
  val bytes = s.getBytes
  var j = 0
  // Number of fraction digits seen; used to scale the fraction to microseconds.
  var digitsMilli = 0
  var justTime = false
  while (j < bytes.length) {
    val b = bytes(j)
    val parsedValue = b - '0'.toByte
    if (parsedValue < 0 || parsedValue > 9) {
      // Non-digit: it must be the separator that is legal after the current segment.
      if (j == 0 && b == 'T') {
        // Leading 'T': time-only input, skip straight to the hour segment.
        justTime = true
        i += 3
      } else if (i < 2) {
        if (b == '-') {
          segments(i) = currentSegmentValue
          currentSegmentValue = 0
          i += 1
        } else if (i == 0 && b == ':') {
          // The first number was an hour, not a year: time-only input.
          justTime = true
          segments(3) = currentSegmentValue
          currentSegmentValue = 0
          i = 4
        } else {
          return None
        }
      } else if (i == 2) {
        if (b == ' ' || b == 'T') {
          segments(i) = currentSegmentValue
          currentSegmentValue = 0
          i += 1
        } else {
          return None
        }
      } else if (i == 3 || i == 4) {
        if (b == ':') {
          segments(i) = currentSegmentValue
          currentSegmentValue = 0
          i += 1
        } else {
          return None
        }
      } else if (i == 5 || i == 6) {
        if (b == 'Z') {
          segments(i) = currentSegmentValue
          currentSegmentValue = 0
          i += 1
          // 'Z' is treated as a +00:00 offset.
          timeZone = Some('+'.toByte)
        } else if (b == '-' || b == '+') {
          segments(i) = currentSegmentValue
          currentSegmentValue = 0
          i += 1
          timeZone = Some(b)
        } else if (b == '.' && i == 5) {
          segments(i) = currentSegmentValue
          currentSegmentValue = 0
          i += 1
        } else {
          return None
        }
        // A zone marker directly after the seconds skips the (absent) fraction segment.
        if (i == 6 && b != '.') {
          i += 1
        }
      } else if (i == 7 && (b == ':' || b == ' ')) {
        segments(i) = currentSegmentValue
        currentSegmentValue = 0
        i += 1
      } else {
        // Nothing may follow the zone-offset minutes. Rejecting here also keeps `i` within
        // the bounds of `segments` (a further separator used to index past the array).
        return None
      }
    } else {
      // No valid segment exceeds 999999, so a value that would overflow Int is invalid input;
      // reject it instead of letting it wrap around into an accepted range.
      if (currentSegmentValue > (Int.MaxValue - parsedValue) / 10) {
        return None
      }
      if (i == 6) {
        digitsMilli += 1
      }
      currentSegmentValue = currentSegmentValue * 10 + parsedValue
    }
    j += 1
  }

  segments(i) = currentSegmentValue

  // Right-pad the fraction to six digits so that it is expressed in microseconds.
  while (digitsMilli < 6) {
    segments(6) *= 10
    digitsMilli += 1
  }

  if (!justTime && (segments(0) < 1000 || segments(0) > 9999 || segments(1) < 1 ||
      segments(1) > 12 || segments(2) < 1 || segments(2) > 31)) {
    return None
  }

  if (segments(3) < 0 || segments(3) > 23 || segments(4) < 0 || segments(4) > 59 ||
      segments(5) < 0 || segments(5) > 59 || segments(6) < 0 || segments(6) > 999999 ||
      segments(7) < 0 || segments(7) > 23 || segments(8) < 0 || segments(8) > 59) {
    return None
  }

  val c = timeZone match {
    case None =>
      Calendar.getInstance()
    case Some(sign) =>
      Calendar.getInstance(
        TimeZone.getTimeZone(f"GMT${sign.toChar}${segments(7)}%02d:${segments(8)}%02d"))
  }

  if (justTime) {
    // HOUR_OF_DAY is the 24-hour field; HOUR is the 12-hour field and would be combined
    // with the calendar's current AM/PM value.
    c.set(Calendar.HOUR_OF_DAY, segments(3))
    c.set(Calendar.MINUTE, segments(4))
    c.set(Calendar.SECOND, segments(5))
  } else {
    c.set(segments(0), segments(1) - 1, segments(2), segments(3), segments(4), segments(5))
  }
  // Drop the wall-clock milliseconds inherited from getInstance() so that the division below
  // is exact (it would otherwise truncate toward zero for instants before the epoch).
  c.set(Calendar.MILLISECOND, 0)

  Some(c.getTimeInMillis / 1000 * 1000000 + segments(6))
}
334+
335+
/**
 * Parses a given UTF8 date string to the corresponding [[Int]] value, expressed as the number
 * of days since the epoch for the parsed calendar date. The return type is [[Option]] in
 * order to distinguish between 0 and null. `None` is returned for null input and for any
 * string that does not match one of the following formats:
 *
 * `yyyy`,
 * `yyyy-[m]m`
 * `yyyy-[m]m-[d]d`
 * `yyyy-[m]m-[d]d `
 * `yyyy-[m]m-[d]d *`
 * `yyyy-[m]m-[d]dT*`
 *
 * Everything after the first ' ' or 'T' that follows a complete `yyyy-[m]m-[d]d` date is
 * ignored. The year must be in 1000..9999, the month in 1..12 and the day in 1..31.
 */
def stringToDate(s: UTF8String): Option[Int] = {
  if (s == null) {
    return None
  }
  // Index: 0 year, 1 month, 2 day. Missing month/day default to 1.
  val segments: Array[Int] = Array[Int](1, 1, 1)
  // Index of the segment currently being accumulated.
  var i = 0
  var currentSegmentValue = 0
  val bytes = s.getBytes
  var j = 0
  // Scan the date part only; a ' ' or 'T' ends it.
  while (j < bytes.length && bytes(j) != ' ' && bytes(j) != 'T') {
    val b = bytes(j)
    if (i < 2 && b == '-') {
      segments(i) = currentSegmentValue
      currentSegmentValue = 0
      i += 1
    } else {
      val parsedValue = b - '0'.toByte
      if (parsedValue < 0 || parsedValue > 9) {
        return None
      } else if (currentSegmentValue > (Int.MaxValue - parsedValue) / 10) {
        // No valid segment exceeds 9999, so a value that would overflow Int is invalid
        // input; reject it instead of letting it wrap around into an accepted range.
        return None
      } else {
        currentSegmentValue = currentSegmentValue * 10 + parsedValue
      }
    }
    j += 1
  }
  // A trailing time part is only allowed after a full `yyyy-[m]m-[d]d` date; the `yyyy` and
  // `yyyy-[m]m` forms must consume the entire input.
  if (i < 2 && j < bytes.length) {
    return None
  }
  segments(i) = currentSegmentValue
  if (segments(0) < 1000 || segments(0) > 9999 || segments(1) < 1 || segments(1) > 12 ||
      segments(2) < 1 || segments(2) > 31) {
    return None
  }
  // Resolve midnight in GMT, not in the JVM default zone: the epoch-millis value is divided
  // into whole days below, and local midnight in a zone east of GMT falls on the previous
  // GMT day.
  val c = Calendar.getInstance(TimeZone.getTimeZone("GMT"))
  c.set(segments(0), segments(1) - 1, segments(2), 0, 0, 0)
  // Drop the wall-clock milliseconds inherited from getInstance() so that the division is
  // exact (it would otherwise truncate toward zero for dates before the epoch).
  c.set(Calendar.MILLISECOND, 0)
  Some((c.getTimeInMillis / 1000 / 3600 / 24).toInt)
}
183381
}

0 commit comments

Comments
 (0)