|
17 | 17 |
|
18 | 18 | package org.apache.spark.sql.parquet |
19 | 19 |
|
| 20 | +import java.sql.Timestamp |
| 21 | +import java.util.{TimeZone, Calendar} |
| 22 | + |
20 | 23 | import scala.collection.mutable.{Buffer, ArrayBuffer, HashMap} |
21 | 24 |
|
| 25 | +import jodd.datetime.JDateTime |
22 | 26 | import parquet.column.Dictionary |
23 | 27 | import parquet.io.api.{PrimitiveConverter, GroupConverter, Binary, Converter} |
24 | 28 | import parquet.schema.MessageType |
25 | 29 |
|
26 | 30 | import org.apache.spark.sql.catalyst.expressions._ |
27 | 31 | import org.apache.spark.sql.parquet.CatalystConverter.FieldType |
28 | 32 | import org.apache.spark.sql.types._ |
| 33 | +import org.apache.spark.sql.parquet.timestamp.NanoTime |
29 | 34 |
|
30 | 35 | /** |
31 | 36 | * Collection of converters of Parquet types (group and primitive types) that |
@@ -123,6 +128,12 @@ private[sql] object CatalystConverter { |
123 | 128 | parent.updateDecimal(fieldIndex, value, d) |
124 | 129 | } |
125 | 130 | } |
| 131 | + case TimestampType => { |
| 132 | + new CatalystPrimitiveConverter(parent, fieldIndex) { |
| 133 | + override def addBinary(value: Binary): Unit = |
| 134 | + parent.updateTimestamp(fieldIndex, value) |
| 135 | + } |
| 136 | + } |
126 | 137 | // All other primitive types use the default converter |
127 | 138 | case ctype: PrimitiveType => { // note: need the type tag here! |
128 | 139 | new CatalystPrimitiveConverter(parent, fieldIndex) |
@@ -197,9 +208,11 @@ private[parquet] abstract class CatalystConverter extends GroupConverter { |
197 | 208 | protected[parquet] def updateString(fieldIndex: Int, value: String): Unit = |
198 | 209 | updateField(fieldIndex, value) |
199 | 210 |
|
200 | | - protected[parquet] def updateDecimal(fieldIndex: Int, value: Binary, ctype: DecimalType): Unit = { |
| 211 | + protected[parquet] def updateTimestamp(fieldIndex: Int, value: Binary): Unit = |
| 212 | + updateField(fieldIndex, readTimestamp(value)) |
| 213 | + |
| 214 | + protected[parquet] def updateDecimal(fieldIndex: Int, value: Binary, ctype: DecimalType): Unit = |
201 | 215 | updateField(fieldIndex, readDecimal(new Decimal(), value, ctype)) |
202 | | - } |
203 | 216 |
|
204 | 217 | protected[parquet] def isRootConverter: Boolean = parent == null |
205 | 218 |
|
@@ -232,6 +245,13 @@ private[parquet] abstract class CatalystConverter extends GroupConverter { |
232 | 245 | unscaled = (unscaled << (64 - numBits)) >> (64 - numBits) |
233 | 246 | dest.set(unscaled, precision, scale) |
234 | 247 | } |
| 248 | + |
| 249 | + /** |
| 250 | + * Read a Timestamp value from a Parquet Int96Value |
| 251 | + */ |
| 252 | + protected[parquet] def readTimestamp(value: Binary): Timestamp = { |
| 253 | + CatalystTimestampConverter.convertToTimestamp(value) |
| 254 | + } |
235 | 255 | } |
236 | 256 |
|
237 | 257 | /** |
@@ -384,6 +404,9 @@ private[parquet] class CatalystPrimitiveRowConverter( |
384 | 404 | override protected[parquet] def updateString(fieldIndex: Int, value: String): Unit = |
385 | 405 | current.setString(fieldIndex, value) |
386 | 406 |
|
| 407 | + override protected[parquet] def updateTimestamp(fieldIndex: Int, value: Binary): Unit = |
| 408 | + current.update(fieldIndex, readTimestamp(value)) |
| 409 | + |
387 | 410 | override protected[parquet] def updateDecimal( |
388 | 411 | fieldIndex: Int, value: Binary, ctype: DecimalType): Unit = { |
389 | 412 | var decimal = current(fieldIndex).asInstanceOf[Decimal] |
@@ -454,6 +477,75 @@ private[parquet] object CatalystArrayConverter { |
454 | 477 | val INITIAL_ARRAY_SIZE = 20 |
455 | 478 | } |
456 | 479 |
|
private[parquet] object CatalystTimestampConverter {
  // TODO most part of this comes from Hive-0.14
  // Hive code might have some issues, so we need to keep an eye on it.
  // Also we use NanoTime and Int96Values from parquet-examples.
  // We utilize jodd to convert between NanoTime and Timestamp

  // Per-thread cached GMT calendar: Calendar is mutable and not thread-safe,
  // and these conversions run concurrently on task threads.
  val parquetTsCalendar = new ThreadLocal[Calendar]
  def getCalendar = {
    // this is a cache for the calendar instance.
    if (parquetTsCalendar.get == null) {
      parquetTsCalendar.set(Calendar.getInstance(TimeZone.getTimeZone("GMT")))
    }
    parquetTsCalendar.get
  }

  // Declared as Long: products such as
  // NANOS_PER_SECOND * SECONDS_PER_MINUTE * MINUTES_PER_HOUR (= 3.6e12)
  // would otherwise be evaluated in Int arithmetic and overflow before
  // being widened for the Long division/modulo below.
  val NANOS_PER_SECOND: Long = 1000000000
  val SECONDS_PER_MINUTE: Long = 60
  val MINUTES_PER_HOUR: Long = 60
  val NANOS_PER_MILLI: Long = 1000000

  /**
   * Read a [[java.sql.Timestamp]] from a Parquet INT96 value
   * (julian day number + nanoseconds within the day), interpreted in GMT.
   */
  def convertToTimestamp(value: Binary): Timestamp = {
    val nt = NanoTime.fromBinary(value)
    val timeOfDayNanos = nt.getTimeOfDayNanos
    val julianDay = nt.getJulianDay
    val jDateTime = new JDateTime(julianDay.toDouble)
    val calendar = getCalendar
    calendar.set(Calendar.YEAR, jDateTime.getYear)
    calendar.set(Calendar.MONTH, jDateTime.getMonth - 1)
    calendar.set(Calendar.DAY_OF_MONTH, jDateTime.getDay)
    // The cached calendar carries a stale MILLISECOND field (current wall-clock
    // millis on first use, leftovers from setTime afterwards); zero it so
    // getTimeInMillis below is a whole-second value.
    calendar.set(Calendar.MILLISECOND, 0)

    // written in command style
    var remainder = timeOfDayNanos
    calendar.set(
      Calendar.HOUR_OF_DAY,
      (remainder / (NANOS_PER_SECOND * SECONDS_PER_MINUTE * MINUTES_PER_HOUR)).toInt)
    remainder = remainder % (NANOS_PER_SECOND * SECONDS_PER_MINUTE * MINUTES_PER_HOUR)
    calendar.set(
      Calendar.MINUTE, (remainder / (NANOS_PER_SECOND * SECONDS_PER_MINUTE)).toInt)
    remainder = remainder % (NANOS_PER_SECOND * SECONDS_PER_MINUTE)
    calendar.set(Calendar.SECOND, (remainder / NANOS_PER_SECOND).toInt)
    val nanos = remainder % NANOS_PER_SECOND
    // Timestamp stores the ENTIRE sub-second fraction in its nanos field and
    // setNanos replaces that field wholesale. Construct from the whole-second
    // millis and hand setNanos the full nano remainder; splitting a millis
    // part into the constructor would be silently discarded by setNanos.
    val ts = new Timestamp(calendar.getTimeInMillis)
    ts.setNanos(nanos.toInt)
    ts
  }

  /**
   * Write a [[java.sql.Timestamp]] as a Parquet INT96 value
   * (julian day number + nanoseconds within the day), interpreted in GMT.
   */
  def convertFromTimestamp(ts: Timestamp): Binary = {
    val calendar = getCalendar
    calendar.setTime(ts)
    val jDateTime = new JDateTime(calendar.get(Calendar.YEAR),
      calendar.get(Calendar.MONTH) + 1, calendar.get(Calendar.DAY_OF_MONTH))
    // Hive-0.14 didn't set hour before get day number, while the day number should
    // has something to do with hour, since julian day number grows at 12h GMT
    // here we just follow what hive does.
    val julianDay = jDateTime.getJulianDayNumber

    val hour = calendar.get(Calendar.HOUR_OF_DAY)
    val minute = calendar.get(Calendar.MINUTE)
    val second = calendar.get(Calendar.SECOND)
    // Timestamp.getNanos already holds the COMPLETE fractional second
    // (milliseconds included), so do not add getTime % 1000 on top of it:
    // that would double-count the millisecond part (and `getTime % 1000`
    // goes negative for pre-epoch timestamps).
    val nanos = ts.getNanos.toLong
    // Hive-0.14 would use hours directly, that might be wrong, since the day starts
    // from 12h in Julian. here we just follow what hive does.
    val nanosOfDay = nanos + second * NANOS_PER_SECOND +
      minute * NANOS_PER_SECOND * SECONDS_PER_MINUTE +
      hour * NANOS_PER_SECOND * SECONDS_PER_MINUTE * MINUTES_PER_HOUR
    NanoTime(julianDay, nanosOfDay).toBinary
  }
}
| 548 | + |
457 | 549 | /** |
458 | 550 | * A `parquet.io.api.GroupConverter` that converts a single-element groups that |
459 | 551 | * match the characteristics of an array (see |
|
0 commit comments