diff --git a/sql/catalyst/benchmarks/GenericArrayDataBenchmark-jdk11-results.txt b/sql/catalyst/benchmarks/GenericArrayDataBenchmark-jdk11-results.txt new file mode 100644 index 0000000000000..6a5a9b781e894 --- /dev/null +++ b/sql/catalyst/benchmarks/GenericArrayDataBenchmark-jdk11-results.txt @@ -0,0 +1,10 @@ +OpenJDK 64-Bit Server VM 11.0.5+10 on Mac OS X 10.14.6 +Intel(R) Core(TM) i5-8210Y CPU @ 1.60GHz +constructor: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +arrayOfAny 6 7 1 1770.9 0.6 1.0X +arrayOfAnyAsObject 6 7 2 1709.3 0.6 1.0X +arrayOfAnyAsSeq 5 6 2 2195.5 0.5 1.2X +arrayOfInt 452 469 13 22.1 45.2 0.0X +arrayOfIntAsObject 678 690 11 14.7 67.8 0.0X + diff --git a/sql/catalyst/benchmarks/GenericArrayDataBenchmark-results.txt b/sql/catalyst/benchmarks/GenericArrayDataBenchmark-results.txt new file mode 100644 index 0000000000000..02971749662f5 --- /dev/null +++ b/sql/catalyst/benchmarks/GenericArrayDataBenchmark-results.txt @@ -0,0 +1,10 @@ +Java HotSpot(TM) 64-Bit Server VM 1.8.0_191-b12 on Mac OS X 10.14.6 +Intel(R) Core(TM) i5-8210Y CPU @ 1.60GHz +constructor: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +arrayOfAny 7 8 2 1471.6 0.7 1.0X +arrayOfAnyAsObject 197 207 9 50.7 19.7 0.0X +arrayOfAnyAsSeq 25 27 2 398.0 2.5 0.3X +arrayOfInt 613 630 15 16.3 61.3 0.0X +arrayOfIntAsObject 866 872 8 11.5 86.6 0.0X + diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala index 83ad08d8e1758..1f88a700847de 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala @@ -23,16 +23,6 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types.{DataType, Decimal} import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} -private object GenericArrayData { - - // SPARK-16634: Workaround for JVM bug present in some 1.7 versions. - def anyToSeq(seqOrArray: Any): Seq[Any] = seqOrArray match { - case seq: Seq[Any] => seq - case array: Array[_] => array.toSeq - } - -} - class GenericArrayData(val array: Array[Any]) extends ArrayData { def this(seq: Seq[Any]) = this(seq.toArray) @@ -47,7 +37,11 @@ class GenericArrayData(val array: Array[Any]) extends ArrayData { def this(primitiveArray: Array[Byte]) = this(primitiveArray.toSeq) def this(primitiveArray: Array[Boolean]) = this(primitiveArray.toSeq) - def this(seqOrArray: Any) = this(GenericArrayData.anyToSeq(seqOrArray)) + def this(seqOrArray: Any) = this(seqOrArray match { + case seq: Seq[Any] => seq.toArray + case array: Array[Any] => array // array of objects, so no need to convert + case array: Array[_] => array.toSeq.toArray[Any] // array of primitives, so box them + }) override def copy(): ArrayData = { val newValues = new Array[Any](array.length) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataBenchmark.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataBenchmark.scala new file mode 100644 index 0000000000000..3ad045f29c07d --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/GenericArrayDataBenchmark.scala @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.util + +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} + +/** + * Benchmark for [[GenericArrayData]]. + * To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class <this class> --jars <spark core test jar> + * 2. build/sbt "catalyst/test:runMain <this class>" + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "catalyst/test:runMain <this class>" + * Results will be written to "benchmarks/GenericArrayDataBenchmark-results.txt". 
+ * }}} + */ +object GenericArrayDataBenchmark extends BenchmarkBase { + + // Benchmarks of GenericArrayData's constructors (see SPARK-30413): + def constructorBenchmark(): Unit = { + val valuesPerIteration: Long = 1000 * 1000 * 10 + val arraySize = 10 + val benchmark = new Benchmark("constructor", valuesPerIteration, output = output) + + benchmark.addCase("arrayOfAny") { _ => + val arr: Array[Any] = new Array[Any](arraySize) + var n = 0 + while (n < valuesPerIteration) { + new GenericArrayData(arr) + n += 1 + } + } + + benchmark.addCase("arrayOfAnyAsObject") { _ => + val arr: Object = new Array[Any](arraySize) + var n = 0 + while (n < valuesPerIteration) { + new GenericArrayData(arr) + n += 1 + } + } + + benchmark.addCase("arrayOfAnyAsSeq") { _ => + val arr: Seq[Any] = new Array[Any](arraySize) + var n = 0 + while (n < valuesPerIteration) { + new GenericArrayData(arr) + n += 1 + } + } + + benchmark.addCase("arrayOfInt") { _ => + val arr: Array[Int] = new Array[Int](arraySize) + var n = 0 + while (n < valuesPerIteration) { + new GenericArrayData(arr) + n += 1 + } + } + + benchmark.addCase("arrayOfIntAsObject") { _ => + val arr: Object = new Array[Int](arraySize) + var n = 0 + while (n < valuesPerIteration) { + new GenericArrayData(arr) + n += 1 + } + } + + benchmark.run() + } + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + constructorBenchmark() + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala index c4a2df6642519..98ac2ecd2955c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala @@ -591,7 +591,10 @@ private[parquet] class ParquetRowConverter( // The parquet map may contains null or duplicated map keys. 
When it happens, the behavior is // undefined. // TODO (SPARK-26174): disallow it with a config. - updater.set(ArrayBasedMapData(currentKeys.toArray, currentValues.toArray)) + updater.set( + new ArrayBasedMapData( + new GenericArrayData(currentKeys.toArray), + new GenericArrayData(currentValues.toArray))) } override def start(): Unit = {