Commit 0bdbf21

Make parquet tests less order dependent
1 parent b42eeab commit 0bdbf21
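
Several of these Parquet round-trip tests collected query results and asserted on them by positional index, implicitly assuming a stable row order that a Parquet scan does not guarantee. This change rewrites the "Read/Write All Types" tests to compare against the expected rows via checkAnswer, moves the Array[Byte] column into a dedicated BinaryData case class with its own test (array equality on the JVM is reference-based), sorts the collected rows before the positional assertions in the INSERT test, and removes the order-dependent "Saving case class RDD table to file and reading it back in" test.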

File tree: 1 file changed (+31 −63)

sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala

Lines changed: 31 additions & 63 deletions
@@ -58,8 +58,7 @@ case class AllDataTypes(
     doubleField: Double,
     shortField: Short,
     byteField: Byte,
-    booleanField: Boolean,
-    binaryField: Array[Byte])
+    booleanField: Boolean)
 
 case class AllDataTypesWithNonPrimitiveType(
     stringField: String,
@@ -70,13 +69,14 @@ case class AllDataTypesWithNonPrimitiveType(
     shortField: Short,
     byteField: Byte,
     booleanField: Boolean,
-    binaryField: Array[Byte],
     array: Seq[Int],
     arrayContainsNull: Seq[Option[Int]],
     map: Map[Int, Long],
     mapValueContainsNull: Map[Int, Option[Long]],
     data: Data)
 
+case class BinaryData(binaryData: Array[Byte])
+
 class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterAll {
   TestData // Load test data tables.
 
@@ -108,23 +108,23 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterAll {
   test("Read/Write All Types") {
     val tempDir = getTempFilePath("parquetTest").getCanonicalPath
     val range = (0 to 255)
-    TestSQLContext.sparkContext.parallelize(range)
-      .map(x => AllDataTypes(s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0,
-        (0 to x).map(_.toByte).toArray))
-      .saveAsParquetFile(tempDir)
-    val result = parquetFile(tempDir).collect()
-    range.foreach {
-      i =>
-        assert(result(i).getString(0) == s"$i", s"row $i String field did not match, got ${result(i).getString(0)}")
-        assert(result(i).getInt(1) === i)
-        assert(result(i).getLong(2) === i.toLong)
-        assert(result(i).getFloat(3) === i.toFloat)
-        assert(result(i).getDouble(4) === i.toDouble)
-        assert(result(i).getShort(5) === i.toShort)
-        assert(result(i).getByte(6) === i.toByte)
-        assert(result(i).getBoolean(7) === (i % 2 == 0))
-        assert(result(i)(8) === (0 to i).map(_.toByte).toArray)
-    }
+    val data = sparkContext.parallelize(range)
+      .map(x => AllDataTypes(s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0))
+
+    data.saveAsParquetFile(tempDir)
+
+    checkAnswer(
+      parquetFile(tempDir),
+      data.toSchemaRDD.collect().toSeq)
+  }
+
+  test("read/write binary data") {
+    // Since equality for Array[Byte] is broken we test this separately.
+    val tempDir = getTempFilePath("parquetTest").getCanonicalPath
+    sparkContext.parallelize(BinaryData("test".getBytes("utf8")) :: Nil).saveAsParquetFile(tempDir)
+    parquetFile(tempDir)
+      .map(r => new String(r(0).asInstanceOf[Array[Byte]], "utf8"))
+      .collect().toSeq == Seq("test")
   }
 
   test("Treat binary as string") {
@@ -275,34 +275,19 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterAll {
   test("Read/Write All Types with non-primitive type") {
     val tempDir = getTempFilePath("parquetTest").getCanonicalPath
     val range = (0 to 255)
-    TestSQLContext.sparkContext.parallelize(range)
+    val data = sparkContext.parallelize(range)
       .map(x => AllDataTypesWithNonPrimitiveType(
         s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0,
-        (0 to x).map(_.toByte).toArray,
         (0 until x),
         (0 until x).map(Option(_).filter(_ % 3 == 0)),
         (0 until x).map(i => i -> i.toLong).toMap,
         (0 until x).map(i => i -> Option(i.toLong)).toMap + (x -> None),
         Data((0 until x), Nested(x, s"$x"))))
-      .saveAsParquetFile(tempDir)
-    val result = parquetFile(tempDir).collect()
-    range.foreach {
-      i =>
-        assert(result(i).getString(0) == s"$i", s"row $i String field did not match, got ${result(i).getString(0)}")
-        assert(result(i).getInt(1) === i)
-        assert(result(i).getLong(2) === i.toLong)
-        assert(result(i).getFloat(3) === i.toFloat)
-        assert(result(i).getDouble(4) === i.toDouble)
-        assert(result(i).getShort(5) === i.toShort)
-        assert(result(i).getByte(6) === i.toByte)
-        assert(result(i).getBoolean(7) === (i % 2 == 0))
-        assert(result(i)(8) === (0 to i).map(_.toByte).toArray)
-        assert(result(i)(9) === (0 until i))
-        assert(result(i)(10) === (0 until i).map(i => if (i % 3 == 0) i else null))
-        assert(result(i)(11) === (0 until i).map(i => i -> i.toLong).toMap)
-        assert(result(i)(12) === (0 until i).map(i => i -> i.toLong).toMap + (i -> null))
-        assert(result(i)(13) === new GenericRow(Array[Any]((0 until i), new GenericRow(Array[Any](i, s"$i")))))
-    }
+    data.saveAsParquetFile(tempDir)
+
+    checkAnswer(
+      parquetFile(tempDir),
+      data.toSchemaRDD.collect().toSeq)
   }
 
   test("self-join parquet files") {
@@ -399,23 +384,6 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterAll {
     }
   }
 
-  test("Saving case class RDD table to file and reading it back in") {
-    val file = getTempFilePath("parquet")
-    val path = file.toString
-    val rdd = TestSQLContext.sparkContext.parallelize((1 to 100))
-      .map(i => TestRDDEntry(i, s"val_$i"))
-    rdd.saveAsParquetFile(path)
-    val readFile = parquetFile(path)
-    readFile.registerTempTable("tmpx")
-    val rdd_copy = sql("SELECT * FROM tmpx").collect()
-    val rdd_orig = rdd.collect()
-    for(i <- 0 to 99) {
-      assert(rdd_copy(i).apply(0) === rdd_orig(i).key, s"key error in line $i")
-      assert(rdd_copy(i).apply(1) === rdd_orig(i).value, s"value error in line $i")
-    }
-    Utils.deleteRecursively(file)
-  }
-
   test("Read a parquet file instead of a directory") {
     val file = getTempFilePath("parquet")
     val path = file.toString
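
The removed test compared `rdd_copy(i)` against `rdd_orig(i)` positionally inside a for loop, the same order-dependent pattern the rest of this commit eliminates; its round-trip coverage is handled by the checkAnswer-based tests above.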
@@ -453,14 +421,14 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterAll {
     // TODO: why does collecting break things? It seems InsertIntoParquet::execute() is
     // executed twice otherwise?!
     sql("INSERT INTO dest SELECT * FROM source")
-    val rdd_copy2 = sql("SELECT * FROM dest").collect()
+    val rdd_copy2 = sql("SELECT * FROM dest").collect().sortBy(_.getInt(0))
     assert(rdd_copy2.size === 200)
     assert(rdd_copy2(0).apply(0) === 1)
     assert(rdd_copy2(0).apply(1) === "val_1")
-    assert(rdd_copy2(99).apply(0) === 100)
-    assert(rdd_copy2(99).apply(1) === "val_100")
-    assert(rdd_copy2(100).apply(0) === 1)
-    assert(rdd_copy2(100).apply(1) === "val_1")
+    assert(rdd_copy2(99).apply(0) === 50)
+    assert(rdd_copy2(99).apply(1) === "val_50")
+    assert(rdd_copy2(199).apply(0) === 100)
+    assert(rdd_copy2(199).apply(1) === "val_100")
     Utils.deleteRecursively(dirname)
   }
 
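The updated expectations follow from the sort: `dest` holds two copies of the keys 1 to 100, so after `sortBy(_.getInt(0))` indices 2k and 2k+1 both hold key k+1, putting 50 at index 99 and 100 at index 199. A one-line check of the arithmetic:

    // Two copies of 1..100, sorted ascending: index 99 -> 50, index 199 -> 100.
    val sorted = ((1 to 100) ++ (1 to 100)).sorted
    assert(sorted(99) == 50 && sorted(199) == 100)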