Skip to content

Commit 71622c0

Browse files
committed
improved timestamp and date parsing
1 parent 0e30c0a commit 71622c0

File tree

3 files changed

+135
-109
lines changed

3 files changed

+135
-109
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -164,10 +164,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
164164
// TimestampConverter
165165
private[this] def castToTimestamp(from: DataType): Any => Any = from match {
166166
case StringType =>
167-
buildCast[UTF8String](_, utfs => {
168-
val parsedDateString = DateTimeUtils.stringToTimestamp(utfs)
169-
if (parsedDateString == null) null else DateTimeUtils.fromJavaTimestamp(parsedDateString)
170-
})
167+
buildCast[UTF8String](_, utfs => DateTimeUtils.stringToTimestamp(utfs))
171168
case BooleanType =>
172169
buildCast[Boolean](_, b => if (b) 1L else 0)
173170
case LongType =>
@@ -215,10 +212,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
215212
// DateConverter
216213
private[this] def castToDate(from: DataType): Any => Any = from match {
217214
case StringType =>
218-
buildCast[UTF8String](_, s => {
219-
val parsedDate = DateTimeUtils.stringToDate(s)
220-
if (parsedDate == null) null else DateTimeUtils.fromJavaDate(parsedDate)
221-
})
215+
buildCast[UTF8String](_, s => DateTimeUtils.stringToDate(s))
222216
case TimestampType =>
223217
// Discard any precision finer than seconds, to match Hive.
224218
// Timestamp.nanos is in 0 to 999,999,999, no more than a second.

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala

Lines changed: 46 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -185,19 +185,23 @@ object DateTimeUtils {
185185

186186
/**
187187
* Parses a given UTF8 date string to a timestamp in microseconds since the epoch (a Long). The format of the
188-
* date has to be one of the following: `yyyy`, `yyyy-[m]m`, `yyyy-[m]m-[d]d`, `yyyy-[m]m-[d]d `,
189-
* `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][ms]`,
190-
* `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][ms]Z`,
191-
* `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][ms]-[h]h:[m]m`,
192-
* `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][ms]+[h]h:[m]m`,
193-
* `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][ms]`,
194-
* `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][ms]Z`,
195-
* `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][ms]-[h]h:[m]m`,
196-
* `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][ms]+[h]h:[m]m`,
188+
* date has to be one of the following:
189+
* `yyyy`
190+
* `yyyy-[m]m`
191+
* `yyyy-[m]m-[d]d`
192+
* `yyyy-[m]m-[d]d `
193+
* `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]`
194+
* `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z`
195+
* `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m`
196+
* `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m`
197+
* `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]`
198+
* `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z`
199+
* `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m`
200+
* `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m`
197201
*/
198-
def stringToTimestamp(s: UTF8String): Timestamp = {
202+
def stringToTimestamp(s: UTF8String): Long = {
199203
if (s == null) {
200-
return null
204+
return null.asInstanceOf[Long]
201205
}
202206
var timeZone: Option[Byte] = None
203207
val segments: Array[Int] = Array[Int](1, 1, 1, 0, 0, 0, 0, 0, 0)
@@ -208,33 +212,33 @@ object DateTimeUtils {
208212
var digitsMilli = 0
209213
while (j < bytes.length) {
210214
val b = bytes(j)
211-
val parsedValue = b - 48
215+
val parsedValue = b - '0'.toByte
212216
if (parsedValue < 0 || parsedValue > 9) {
213217
if (i < 2) {
214218
if (b == '-') {
215219
segments(i) = currentSegmentValue
216220
currentSegmentValue = 0
217221
i += 1
218222
} else {
219-
return null
223+
return null.asInstanceOf[Long]
220224
}
221225
} else if (i == 2) {
222226
if (b == ' ' || b == 'T') {
223227
segments(i) = currentSegmentValue
224228
currentSegmentValue = 0
225229
i += 1
226230
} else {
227-
return null
231+
return null.asInstanceOf[Long]
228232
}
229-
} else if (i < 5) {
233+
} else if (i == 3 || i == 4) {
230234
if (b == ':') {
231235
segments(i) = currentSegmentValue
232236
currentSegmentValue = 0
233237
i += 1
234238
} else {
235-
return null
239+
return null.asInstanceOf[Long]
236240
}
237-
} else if (i < 7) {
241+
} else if (i == 5 || i == 6) {
238242
if (b == 'Z') {
239243
segments(i) = currentSegmentValue
240244
currentSegmentValue = 0
@@ -250,18 +254,18 @@ object DateTimeUtils {
250254
currentSegmentValue = 0
251255
i += 1
252256
} else {
253-
return null
257+
return null.asInstanceOf[Long]
254258
}
255259
if (i == 6 && b != '.') {
256260
i += 1
257261
}
258-
} else if (i > 6) {
259-
if (b == ':') {
262+
} else {
263+
if (b == ':' || b == ' ') {
260264
segments(i) = currentSegmentValue
261265
currentSegmentValue = 0
262266
i += 1
263267
} else {
264-
return null
268+
return null.asInstanceOf[Long]
265269
}
266270
}
267271
} else {
@@ -272,29 +276,20 @@ object DateTimeUtils {
272276
}
273277
j += 1
274278
}
275-
if (i > 8) {
276-
return null
277-
}
278-
segments(i) = currentSegmentValue
279279

280-
// Hive compatibility 2011-05-06 07:08:09.1000 == 2011-05-06 07:08:09.1
281-
if (digitsMilli == 4) {
282-
segments(6) = segments(6) / 10
283-
}
280+
segments(i) = currentSegmentValue
284281

285-
// 18:3:1.1 is equals to 18:3:1:100
286-
if (digitsMilli == 1) {
287-
segments(6) = segments(6) * 100
288-
} else if (digitsMilli == 2) {
289-
segments(6) = segments(6) * 10
282+
while (digitsMilli < 6) {
283+
segments(6) *= 10
284+
digitsMilli += 1
290285
}
291286

292287
if (segments(0) < 0 || segments(0) > 9999 || segments(1) < 1 || segments(1) > 12 ||
293288
segments(2) < 1 || segments(2) > 31 || segments(3) < 0 || segments(3) > 23 ||
294289
segments(4) < 0 || segments(4) > 59 || segments(5) < 0 || segments(5) > 59 ||
295-
segments(6) < 0 || segments(6) > 999 || segments(7) < 0 || segments(7) > 14 ||
290+
segments(6) < 0 || segments(6) > 999999 || segments(7) < 0 || segments(7) > 14 ||
296291
segments(8) < 0 || segments(8) > 59) {
297-
return null
292+
return null.asInstanceOf[Long]
298293
}
299294
val c = if (timeZone.isEmpty) {
300295
Calendar.getInstance()
@@ -303,18 +298,23 @@ object DateTimeUtils {
303298
TimeZone.getTimeZone(f"GMT${timeZone.get.toChar}${segments(7)}%02d:${segments(8)}%02d"))
304299
}
305300
c.set(segments(0), segments(1) - 1, segments(2), segments(3), segments(4), segments(5))
306-
c.set(Calendar.MILLISECOND, segments(6))
307-
new Timestamp(c.getTimeInMillis)
301+
c.set(Calendar.MILLISECOND, segments(6) / 1000)
302+
c.getTimeInMillis * 1000 + segments(6) % 1000
308303
}
309304

310305
/**
311306
* Parses a given UTF8 date string to a number of days since the epoch (an Int). The format of the date
312-
* has to be one of the following: `yyyy`, `yyyy-[m]m`, `yyyy-[m]m-[d]d`, `yyyy-[m]m-[d]d `,
313-
* `yyyy-[m]m-[d]d *`, `yyyy-[m]m-[d]dT*`
307+
* has to be one of the following:
308+
* `yyyy`
309+
* `yyyy-[m]m`
310+
* `yyyy-[m]m-[d]d`
311+
* `yyyy-[m]m-[d]d `
312+
* `yyyy-[m]m-[d]d *`
313+
* `yyyy-[m]m-[d]dT*`
314314
*/
315-
def stringToDate(s: UTF8String): Date = {
315+
def stringToDate(s: UTF8String): Int = {
316316
if (s == null) {
317-
return null
317+
return null.asInstanceOf[Int]
318318
}
319319
val segments: Array[Int] = Array[Int](1, 1, 1)
320320
var i = 0
@@ -328,9 +328,9 @@ object DateTimeUtils {
328328
currentSegmentValue = 0
329329
i += 1
330330
} else {
331-
val parsedValue = b - 48
331+
val parsedValue = b - '0'.toByte
332332
if (parsedValue < 0 || parsedValue > 9) {
333-
return null
333+
return null.asInstanceOf[Int]
334334
} else {
335335
currentSegmentValue = currentSegmentValue * 10 + parsedValue
336336
}
@@ -340,11 +340,11 @@ object DateTimeUtils {
340340
segments(i) = currentSegmentValue
341341
if (segments(0) < 0 || segments(0) > 9999 || segments(1) < 1 || segments(1) > 12 ||
342342
segments(2) < 1 || segments(2) > 31) {
343-
return null
343+
return null.asInstanceOf[Int]
344344
}
345345
val c = Calendar.getInstance()
346346
c.set(segments(0), segments(1) - 1, segments(2), 0, 0, 0)
347347
c.set(Calendar.MILLISECOND, 0)
348-
new Date(c.getTimeInMillis)
348+
(c.getTimeInMillis / 1000 / 3600 / 24).toInt
349349
}
350350
}

0 commit comments

Comments
 (0)