@@ -58,8 +58,7 @@ case class AllDataTypes(
     doubleField: Double,
     shortField: Short,
     byteField: Byte,
-    booleanField: Boolean,
-    binaryField: Array[Byte])
+    booleanField: Boolean)
 
 case class AllDataTypesWithNonPrimitiveType(
     stringField: String,
@@ -70,13 +69,14 @@ case class AllDataTypesWithNonPrimitiveType(
     shortField: Short,
     byteField: Byte,
     booleanField: Boolean,
-    binaryField: Array[Byte],
     array: Seq[Int],
     arrayContainsNull: Seq[Option[Int]],
     map: Map[Int, Long],
     mapValueContainsNull: Map[Int, Option[Long]],
     data: Data)
 
+case class BinaryData(binaryData: Array[Byte])
+
 class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterAll {
   TestData // Load test data tables.
 
@@ -108,23 +108,23 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterAll {
   test("Read/Write All Types") {
     val tempDir = getTempFilePath("parquetTest").getCanonicalPath
     val range = (0 to 255)
-    TestSQLContext.sparkContext.parallelize(range)
-      .map(x => AllDataTypes(s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0,
-        (0 to x).map(_.toByte).toArray))
-      .saveAsParquetFile(tempDir)
-    val result = parquetFile(tempDir).collect()
-    range.foreach {
-      i =>
-        assert(result(i).getString(0) == s"$i", s"row $i String field did not match, got ${result(i).getString(0)}")
-        assert(result(i).getInt(1) === i)
-        assert(result(i).getLong(2) === i.toLong)
-        assert(result(i).getFloat(3) === i.toFloat)
-        assert(result(i).getDouble(4) === i.toDouble)
-        assert(result(i).getShort(5) === i.toShort)
-        assert(result(i).getByte(6) === i.toByte)
-        assert(result(i).getBoolean(7) === (i % 2 == 0))
-        assert(result(i)(8) === (0 to i).map(_.toByte).toArray)
-    }
+    val data = sparkContext.parallelize(range)
+      .map(x => AllDataTypes(s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0))
+
+    data.saveAsParquetFile(tempDir)
+
+    checkAnswer(
+      parquetFile(tempDir),
+      data.toSchemaRDD.collect().toSeq)
+  }
+
+  test("read/write binary data") {
+    // Since equality for Array[Byte] is broken we test this separately.
+    val tempDir = getTempFilePath("parquetTest").getCanonicalPath
+    sparkContext.parallelize(BinaryData("test".getBytes("utf8")) :: Nil).saveAsParquetFile(tempDir)
+    assert(parquetFile(tempDir)
+      .map(r => new String(r(0).asInstanceOf[Array[Byte]], "utf8"))
+      .collect().toSeq == Seq("test"))
   }
 
   test("Treat binary as string") {
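The comment in the new test refers to the fact that arrays on the JVM compare by reference, not by content, so a row-level equality check such as checkAnswer cannot match two structurally equal byte arrays. A minimal standalone Scala sketch of the behavior (illustrative only, not part of this patch):

    val a = "test".getBytes("utf8")
    val b = "test".getBytes("utf8")
    // == on arrays is reference equality, so this is false despite equal contents.
    a == b              // false
    // Content comparison needs sameElements (or java.util.Arrays.equals).
    a.sameElements(b)   // true

This is why the new test round-trips the bytes back through new String(...) and compares strings instead of rows.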
@@ -275,34 +275,19 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterAll {
   test("Read/Write All Types with non-primitive type") {
     val tempDir = getTempFilePath("parquetTest").getCanonicalPath
     val range = (0 to 255)
-    TestSQLContext.sparkContext.parallelize(range)
+    val data = sparkContext.parallelize(range)
       .map(x => AllDataTypesWithNonPrimitiveType(
         s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0,
-        (0 to x).map(_.toByte).toArray,
         (0 until x),
         (0 until x).map(Option(_).filter(_ % 3 == 0)),
         (0 until x).map(i => i -> i.toLong).toMap,
         (0 until x).map(i => i -> Option(i.toLong)).toMap + (x -> None),
         Data((0 until x), Nested(x, s"$x"))))
-      .saveAsParquetFile(tempDir)
-    val result = parquetFile(tempDir).collect()
-    range.foreach {
-      i =>
-        assert(result(i).getString(0) == s"$i", s"row $i String field did not match, got ${result(i).getString(0)}")
-        assert(result(i).getInt(1) === i)
-        assert(result(i).getLong(2) === i.toLong)
-        assert(result(i).getFloat(3) === i.toFloat)
-        assert(result(i).getDouble(4) === i.toDouble)
-        assert(result(i).getShort(5) === i.toShort)
-        assert(result(i).getByte(6) === i.toByte)
-        assert(result(i).getBoolean(7) === (i % 2 == 0))
-        assert(result(i)(8) === (0 to i).map(_.toByte).toArray)
-        assert(result(i)(9) === (0 until i))
-        assert(result(i)(10) === (0 until i).map(i => if (i % 3 == 0) i else null))
-        assert(result(i)(11) === (0 until i).map(i => i -> i.toLong).toMap)
-        assert(result(i)(12) === (0 until i).map(i => i -> i.toLong).toMap + (i -> null))
-        assert(result(i)(13) === new GenericRow(Array[Any]((0 until i), new GenericRow(Array[Any](i, s"$i")))))
-    }
+    data.saveAsParquetFile(tempDir)
+
+    checkAnswer(
+      parquetFile(tempDir),
+      data.toSchemaRDD.collect().toSeq)
   }
 
   test("self-join parquet files") {
@@ -399,23 +384,6 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterAll {
     }
   }
 
-  test("Saving case class RDD table to file and reading it back in") {
-    val file = getTempFilePath("parquet")
-    val path = file.toString
-    val rdd = TestSQLContext.sparkContext.parallelize((1 to 100))
-      .map(i => TestRDDEntry(i, s"val_$i"))
-    rdd.saveAsParquetFile(path)
-    val readFile = parquetFile(path)
-    readFile.registerTempTable("tmpx")
-    val rdd_copy = sql("SELECT * FROM tmpx").collect()
-    val rdd_orig = rdd.collect()
-    for (i <- 0 to 99) {
-      assert(rdd_copy(i).apply(0) === rdd_orig(i).key, s"key error in line $i")
-      assert(rdd_copy(i).apply(1) === rdd_orig(i).value, s"value error in line $i")
-    }
-    Utils.deleteRecursively(file)
-  }
-
   test("Read a parquet file instead of a directory") {
     val file = getTempFilePath("parquet")
     val path = file.toString
@@ -453,14 +421,14 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterAll {
     // TODO: why does collecting break things? It seems InsertIntoParquet::execute() is
     // executed twice otherwise?!
     sql("INSERT INTO dest SELECT * FROM source")
-    val rdd_copy2 = sql("SELECT * FROM dest").collect()
+    val rdd_copy2 = sql("SELECT * FROM dest").collect().sortBy(_.getInt(0))
     assert(rdd_copy2.size === 200)
     assert(rdd_copy2(0).apply(0) === 1)
     assert(rdd_copy2(0).apply(1) === "val_1")
-    assert(rdd_copy2(99).apply(0) === 100)
-    assert(rdd_copy2(99).apply(1) === "val_100")
-    assert(rdd_copy2(100).apply(0) === 1)
-    assert(rdd_copy2(100).apply(1) === "val_1")
+    assert(rdd_copy2(99).apply(0) === 50)
+    assert(rdd_copy2(99).apply(1) === "val_50")
+    assert(rdd_copy2(199).apply(0) === 100)
+    assert(rdd_copy2(199).apply(1) === "val_100")
     Utils.deleteRecursively(dirname)
   }
 
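A quick sanity check of the new expected values: the double insert leaves two copies of each key 1 to 100, so after sortBy(_.getInt(0)) the keys run 1, 1, 2, 2, ..., 100, 100; key k occupies indices 2k-2 and 2k-1, putting 50 at index 99 and 100 at index 199. An illustrative one-liner in plain Scala collections (not part of the patch):

    val keys = ((1 to 100) ++ (1 to 100)).sorted
    assert(keys(99) == 50 && keys(199) == 100)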