
Commit 3ca84b2 (parent: 162caf7)

Only zero the used portion of groupingKeyConversionScratchSpace

2 files changed: +36 -8 lines

sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java (8 additions, 4 deletions)

@@ -139,13 +139,17 @@ private static long[] convertToUnsafeRow(Row javaRow, StructType schema) {
    * return the same object.
    */
   public UnsafeRow getAggregationBuffer(Row groupingKey) {
-    // Zero out the buffer that's used to hold the current row. This is necessary in order
-    // to ensure that rows hash properly, since garbage data from the previous row could
-    // otherwise end up as padding in this row.
-    Arrays.fill(groupingKeyConversionScratchSpace, 0);
     final int groupingKeySize = groupingKeyToUnsafeRowConverter.getSizeRequirement(groupingKey);
+    // Make sure that the buffer is large enough to hold the key. If it's not, grow it:
     if (groupingKeySize > groupingKeyConversionScratchSpace.length) {
+      // This new array will be initially zero, so there's no need to zero it out here
       groupingKeyConversionScratchSpace = new long[groupingKeySize];
+    } else {
+      // Zero out the buffer that's used to hold the current row. This is necessary in order
+      // to ensure that rows hash properly, since garbage data from the previous row could
+      // otherwise end up as padding in this row. As a performance optimization, we only zero out
+      // the portion of the buffer that we'll actually write to.
+      Arrays.fill(groupingKeyConversionScratchSpace, 0, groupingKeySize, 0);
     }
     final long actualGroupingKeySize = groupingKeyToUnsafeRowConverter.writeRow(
       groupingKey,
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMapSuite.scala (28 additions, 4 deletions)

@@ -17,6 +17,9 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import scala.collection.JavaConverters._
+import scala.util.Random
+
 import org.apache.spark.unsafe.memory.{MemoryManager, MemoryAllocator}
 import org.scalatest.{BeforeAndAfterEach, FunSuite, Matchers}
 
@@ -59,8 +62,8 @@ class UnsafeFixedWidthAggregationMapSuite extends FunSuite with Matchers with Be
       aggBufferSchema,
       groupKeySchema,
       memoryManager,
-      1024,
-      false
+      1024, // initial capacity
+      false // disable perf metrics
     )
     assert(!map.iterator().hasNext)
     map.free()
@@ -72,8 +75,8 @@ class UnsafeFixedWidthAggregationMapSuite extends FunSuite with Matchers with Be
       aggBufferSchema,
       groupKeySchema,
       memoryManager,
-      1024,
-      false
+      1024, // initial capacity
+      false // disable perf metrics
     )
     val groupKey = new GenericRow(Array[Any](UTF8String("cats")))
 
@@ -92,4 +95,25 @@ class UnsafeFixedWidthAggregationMapSuite extends FunSuite with Matchers with Be
     map.free()
   }
 
+  test("inserting large random keys") {
+    val map = new UnsafeFixedWidthAggregationMap(
+      emptyAggregationBuffer,
+      aggBufferSchema,
+      groupKeySchema,
+      memoryManager,
+      128, // initial capacity
+      false // disable perf metrics
+    )
+    val rand = new Random(42)
+    val groupKeys: Set[String] = Seq.fill(512)(rand.nextString(1024)).toSet
+    groupKeys.foreach { keyString =>
+      map.getAggregationBuffer(new GenericRow(Array[Any](UTF8String(keyString))))
+    }
+    val seenKeys: Set[String] = map.iterator().asScala.map { entry =>
+      entry.key.getString(0)
+    }.toSet
+    seenKeys.size should be (groupKeys.size)
+    seenKeys should be (groupKeys)
+  }
+
 }
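
The new test exercises both branches of the patched code: 512 random 1024-character keys drive growth of the scratch buffer and its subsequent reuse across keys of varying size, and the final set comparison would catch any key corrupted by stale padding. The failure mode being guarded against can be shown in a few lines (a hypothetical toy demonstration, not code from this commit):

    import java.util.Arrays;

    public class StaleScratchDemo {
      public static void main(String[] args) {
        long[] scratch = new long[4];
        Arrays.fill(scratch, 0xDEADBEEFL); // leftovers from a previous, longer key
        int sizeRequired = 2;              // the next key needs only two words
        // Without this fill, padding bytes that the row writer does not
        // overwrite could still hold garbage, so two logically equal keys
        // could hash and compare differently.
        Arrays.fill(scratch, 0, sizeRequired, 0L);
        System.out.println(Arrays.toString(scratch));
        // prints: [0, 0, 3735928559, 3735928559]
      }
    }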
