Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
OpenJDK 64-Bit Server VM 11.0.5+10 on Mac OS X 10.14.6
Intel(R) Core(TM) i5-8210Y CPU @ 1.60GHz
constructor: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
arrayOfAny 6 7 1 1770.9 0.6 1.0X
arrayOfAnyAsObject 6 7 2 1709.3 0.6 1.0X
arrayOfAnyAsSeq 5 6 2 2195.5 0.5 1.2X
arrayOfInt 452 469 13 22.1 45.2 0.0X
arrayOfIntAsObject 678 690 11 14.7 67.8 0.0X

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here are the JDK11 benchmark results before this patch's changes:

[info] OpenJDK 64-Bit Server VM 11.0.5+10 on Mac OS X 10.14.6
[info] Intel(R) Core(TM) i5-8210Y CPU @ 1.60GHz
[info] constructor:                              Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] arrayOfAny                                            6              6           1       1776.5           0.6       1.0X
[info] arrayOfAnyAsObject                                  374            386           9         26.7          37.4       0.0X
[info] arrayOfAnyAsSeq                                       5              5           0       2211.1           0.5       1.2X
[info] arrayOfInt                                          472            494          29         21.2          47.2       0.0X
[info] arrayOfIntAsObject                                  798            799           1         12.5          79.8       0.0X

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you so much!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A pleasant discovery is that this patch has an even larger positive impact on JDK11: on JDK11, this PR's changes seem to more-or-less completely eliminate the performance gap between the this(Any) and this(Array[Any]) constructors 🎉

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh, the result looks nice.

10 changes: 10 additions & 0 deletions sql/catalyst/benchmarks/GenericArrayDataBenchmark-results.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Java HotSpot(TM) 64-Bit Server VM 1.8.0_191-b12 on Mac OS X 10.14.6
Intel(R) Core(TM) i5-8210Y CPU @ 1.60GHz
constructor: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
arrayOfAny 7 8 2 1471.6 0.7 1.0X
arrayOfAnyAsObject 197 207 9 50.7 19.7 0.0X
arrayOfAnyAsSeq 25 27 2 398.0 2.5 0.3X
arrayOfInt 613 630 15 16.3 61.3 0.0X
arrayOfIntAsObject 866 872 8 11.5 86.6 0.0X

Copy link
Contributor Author

@JoshRosen JoshRosen Jan 19, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here are the JDK8 benchmark results before this patch's changes:

[info] Java HotSpot(TM) 64-Bit Server VM 1.8.0_191-b12 on Mac OS X 10.14.6
[info] Intel(R) Core(TM) i5-8210Y CPU @ 1.60GHz
[info] constructor:                              Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] arrayOfAny                                            8             12           4       1281.5           0.8       1.0X
[info] arrayOfAnyAsObject                                  531            605          76         18.8          53.1       0.0X
[info] arrayOfAnyAsSeq                                      26             35          20        384.4           2.6       0.3X
[info] arrayOfInt                                          703            779         117         14.2          70.3       0.0X
[info] arrayOfIntAsObject                                 1336           1397          86          7.5         133.6       0.0X

Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,6 @@ import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.types.{DataType, Decimal}
import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}

private object GenericArrayData {

// SPARK-16634: Workaround for JVM bug present in some 1.7 versions.
def anyToSeq(seqOrArray: Any): Seq[Any] = seqOrArray match {
case seq: Seq[Any] => seq
case array: Array[_] => array.toSeq
}

}

class GenericArrayData(val array: Array[Any]) extends ArrayData {

def this(seq: Seq[Any]) = this(seq.toArray)
Expand All @@ -47,7 +37,11 @@ class GenericArrayData(val array: Array[Any]) extends ArrayData {
def this(primitiveArray: Array[Byte]) = this(primitiveArray.toSeq)
def this(primitiveArray: Array[Boolean]) = this(primitiveArray.toSeq)

def this(seqOrArray: Any) = this(GenericArrayData.anyToSeq(seqOrArray))
def this(seqOrArray: Any) = this(seqOrArray match {
case seq: Seq[Any] => seq.toArray
case array: Array[Any] => array // array of objects, so no need to convert
case array: Array[_] => array.toSeq.toArray[Any] // array of primitives, so box them
})

override def copy(): ArrayData = {
val newValues = new Array[Any](array.length)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.catalyst.util

import org.apache.spark.benchmark.{Benchmark, BenchmarkBase}

/**
* Benchmark for [[GenericArrayData]].
* To run this benchmark:
* {{{
* 1. without sbt:
* bin/spark-submit --class <this class> --jars <spark core test jar> <spark catalyst test jar>
* 2. build/sbt "catalyst/test:runMain <this class>"
* 3. generate result:
* SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "catalyst/test:runMain <this class>"
* Results will be written to "benchmarks/GenericArrayDataBenchmark-results.txt".
* }}}
*/
object GenericArrayDataBenchmark extends BenchmarkBase {

// Benchmarks of GenericArrayData's constructors (see SPARK-30413):
def constructorBenchmark(): Unit = {
val valuesPerIteration: Long = 1000 * 1000 * 10
val arraySize = 10
val benchmark = new Benchmark("constructor", valuesPerIteration, output = output)

benchmark.addCase("arrayOfAny") { _ =>
val arr: Array[Any] = new Array[Any](arraySize)
var n = 0
while (n < valuesPerIteration) {
new GenericArrayData(arr)
n += 1
}
}

benchmark.addCase("arrayOfAnyAsObject") { _ =>
val arr: Object = new Array[Any](arraySize)
var n = 0
while (n < valuesPerIteration) {
new GenericArrayData(arr)
n += 1
}
}

benchmark.addCase("arrayOfAnyAsSeq") { _ =>
val arr: Seq[Any] = new Array[Any](arraySize)
var n = 0
while (n < valuesPerIteration) {
new GenericArrayData(arr)
n += 1
}
}

benchmark.addCase("arrayOfInt") { _ =>
val arr: Array[Int] = new Array[Int](arraySize)
var n = 0
while (n < valuesPerIteration) {
new GenericArrayData(arr)
n += 1
}
}

benchmark.addCase("arrayOfIntAsObject") { _ =>
val arr: Object = new Array[Int](arraySize)
var n = 0
while (n < valuesPerIteration) {
new GenericArrayData(arr)
n += 1
}
}

benchmark.run()
}

override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
constructorBenchmark()
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -591,7 +591,10 @@ private[parquet] class ParquetRowConverter(
// The parquet map may contains null or duplicated map keys. When it happens, the behavior is
// undefined.
// TODO (SPARK-26174): disallow it with a config.
updater.set(ArrayBasedMapData(currentKeys.toArray, currentValues.toArray))
updater.set(
new ArrayBasedMapData(
new GenericArrayData(currentKeys.toArray),
new GenericArrayData(currentValues.toArray)))
}

override def start(): Unit = {
Expand Down