Skip to content

Commit e5cea56

Browse files
Davies Liu and Cheng Lian
authored and committed
[SPARK-10177] [SQL] fix reading Timestamp in parquet from Hive
We misunderstood the Julian days and nanoseconds of the day in parquet (as TimestampType) from Hive/Impala, they are overlapped, so can't be added together directly. In order to avoid the confusing rounding when do the converting, we use `2440588` as the Julian Day of epoch of unix timestamp (which should be 2440587.5). Author: Davies Liu <[email protected]> Author: Cheng Lian <[email protected]> Closes #8400 from davies/timestamp_parquet. (cherry picked from commit 2f493f7) Signed-off-by: Cheng Lian <[email protected]>
1 parent 2032d66 commit e5cea56

File tree

3 files changed

+14
-8
lines changed

3 files changed

+14
-8
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ object DateTimeUtils {
3737
type SQLTimestamp = Long
3838

3939
// see http://stackoverflow.com/questions/466321/convert-unix-timestamp-to-julian
40-
final val JULIAN_DAY_OF_EPOCH = 2440587 // and .5
40+
// it's 2440587.5, rounding up to compatible with Hive
41+
final val JULIAN_DAY_OF_EPOCH = 2440588
4142
final val SECONDS_PER_DAY = 60 * 60 * 24L
4243
final val MICROS_PER_SECOND = 1000L * 1000L
4344
final val NANOS_PER_SECOND = MICROS_PER_SECOND * 1000L
@@ -183,15 +184,15 @@ object DateTimeUtils {
183184
*/
184185
def fromJulianDay(day: Int, nanoseconds: Long): SQLTimestamp = {
185186
// use Long to avoid rounding errors
186-
val seconds = (day - JULIAN_DAY_OF_EPOCH).toLong * SECONDS_PER_DAY - SECONDS_PER_DAY / 2
187+
val seconds = (day - JULIAN_DAY_OF_EPOCH).toLong * SECONDS_PER_DAY
187188
seconds * MICROS_PER_SECOND + nanoseconds / 1000L
188189
}
189190

190191
/**
191192
* Returns Julian day and nanoseconds in a day from the number of microseconds
192193
*/
193194
def toJulianDay(us: SQLTimestamp): (Int, Long) = {
194-
val seconds = us / MICROS_PER_SECOND + SECONDS_PER_DAY / 2
195+
val seconds = us / MICROS_PER_SECOND
195196
val day = seconds / SECONDS_PER_DAY + JULIAN_DAY_OF_EPOCH
196197
val secondsInDay = seconds % SECONDS_PER_DAY
197198
val nanos = (us % MICROS_PER_SECOND) * 1000L

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,13 +49,18 @@ class DateTimeUtilsSuite extends SparkFunSuite {
4949
test("us and julian day") {
5050
val (d, ns) = toJulianDay(0)
5151
assert(d === JULIAN_DAY_OF_EPOCH)
52-
assert(ns === SECONDS_PER_DAY / 2 * NANOS_PER_SECOND)
52+
assert(ns === 0)
5353
assert(fromJulianDay(d, ns) == 0L)
5454

55-
val t = new Timestamp(61394778610000L) // (2015, 6, 11, 10, 10, 10, 100)
55+
val t = Timestamp.valueOf("2015-06-11 10:10:10.100")
5656
val (d1, ns1) = toJulianDay(fromJavaTimestamp(t))
57-
val t2 = toJavaTimestamp(fromJulianDay(d1, ns1))
58-
assert(t.equals(t2))
57+
val t1 = toJavaTimestamp(fromJulianDay(d1, ns1))
58+
assert(t.equals(t1))
59+
60+
val t2 = Timestamp.valueOf("2015-06-11 20:10:10.100")
61+
val (d2, ns2) = toJulianDay(fromJavaTimestamp(t2))
62+
val t22 = toJavaTimestamp(fromJulianDay(d2, ns2))
63+
assert(t2.equals(t22))
5964
}
6065

6166
test("SPARK-6785: java date conversion before and after epoch") {

sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ class ParquetHiveCompatibilitySuite extends ParquetCompatibilityTest with Before
113113
"BOOLEAN", "TINYINT", "SMALLINT", "INT", "BIGINT", "FLOAT", "DOUBLE", "STRING")
114114
}
115115

116-
ignore("SPARK-10177 timestamp") {
116+
test("SPARK-10177 timestamp") {
117117
testParquetHiveCompatibility(Row(Timestamp.valueOf("2015-08-24 00:31:00")), "TIMESTAMP")
118118
}
119119

0 commit comments

Comments (0)