
Commit 92d5a06

Address a number of minor code review comments.

1 parent 1f4b716

8 files changed: +79 -64 lines


sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java

Lines changed: 4 additions & 4 deletions

@@ -154,7 +154,7 @@ public UnsafeRow getAggregationBuffer(Row groupingKey) {
     if (!loc.isDefined()) {
       // This is the first time that we've seen this grouping key, so we'll insert a copy of the
       // empty aggregation buffer into the map:
-      loc.storeKeyAndValue(
+      loc.putNewKey(
         groupingKeyConversionScratchSpace,
         PlatformDependent.LONG_ARRAY_OFFSET,
         groupingKeySize,

@@ -166,7 +166,7 @@ public UnsafeRow getAggregationBuffer(Row groupingKey) {

     // Reset the pointer to point to the value that we just stored or looked up:
     final MemoryLocation address = loc.getValueAddress();
-    currentAggregationBuffer.set(
+    currentAggregationBuffer.pointTo(
       address.getBaseObject(),
       address.getBaseOffset(),
       aggregationBufferSchema.length(),

@@ -201,13 +201,13 @@ public MapEntry next() {
       final BytesToBytesMap.Location loc = mapLocationIterator.next();
       final MemoryLocation keyAddress = loc.getKeyAddress();
       final MemoryLocation valueAddress = loc.getValueAddress();
-      entry.key.set(
+      entry.key.pointTo(
         keyAddress.getBaseObject(),
         keyAddress.getBaseOffset(),
         groupingKeySchema.length(),
         groupingKeySchema
       );
-      entry.value.set(
+      entry.value.pointTo(
         valueAddress.getBaseObject(),
         valueAddress.getBaseOffset(),
         aggregationBufferSchema.length(),
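
A note on the hunk above: the rename from `set` to `pointTo` underlines that the map's iterator reuses a single pair of UnsafeRow objects as moving pointers rather than allocating a fresh pair per entry. A minimal consumption sketch under that reading (the populated `map` variable, the `iterator()` accessor, and the single-long key/value layout are illustrative assumptions, not part of this commit):

// Each call to next() repoints entry.key / entry.value at new backing memory,
// so any data needed across iterations must be copied out first.
Iterator<UnsafeFixedWidthAggregationMap.MapEntry> iter = map.iterator();  // assumed accessor
while (iter.hasNext()) {
  UnsafeFixedWidthAggregationMap.MapEntry entry = iter.next();
  long groupKey = entry.key.getLong(0);    // valid only until the next call to next()
  long aggValue = entry.value.getLong(0);  // likewise: these rows are reused pointers
}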

sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java

Lines changed: 36 additions & 13 deletions

@@ -17,17 +17,6 @@

 package org.apache.spark.sql.catalyst.expressions;

-
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.types.DataType;
-import static org.apache.spark.sql.types.DataTypes.*;
-
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-import org.apache.spark.sql.types.UTF8String;
-import org.apache.spark.unsafe.PlatformDependent;
-import org.apache.spark.unsafe.bitset.BitSetMethods;
-import org.apache.spark.unsafe.string.UTF8StringMethods;
 import scala.collection.Map;
 import scala.collection.Seq;
 import scala.collection.mutable.ArraySeq;

@@ -40,12 +29,20 @@
 import java.util.List;
 import java.util.Set;

+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.types.DataType;
+import static org.apache.spark.sql.types.DataTypes.*;
+import org.apache.spark.sql.types.StructType;
+import org.apache.spark.sql.types.UTF8String;
+import org.apache.spark.unsafe.PlatformDependent;
+import org.apache.spark.unsafe.bitset.BitSetMethods;
+import org.apache.spark.unsafe.string.UTF8StringMethods;

 // TODO: pick a better name for this class, since this is potentially confusing.
 // Maybe call it UnsafeMutableRow?

 /**
- * An Unsafe implementation of Row which is backed by raw memory instead of Java objets.
+ * An Unsafe implementation of Row which is backed by raw memory instead of Java objects.
  *
  * Each tuple has three parts: [null bit set] [values] [variable length portion]
  *

@@ -56,6 +53,9 @@
  * primitive types, such as long, double, or int, we store the value directly in the word. For
  * fields with non-primitive or variable-length values, we store a relative offset (w.r.t. the
  * base address of the row) that points to the beginning of the variable-length field.
+ *
+ * Instances of `UnsafeRow` act as pointers to row data stored in this format, similar to how
+ * `Writable` objects work in Hadoop.
  */
 public final class UnsafeRow implements MutableRow {

@@ -64,6 +64,11 @@ public final class UnsafeRow implements MutableRow {
   private int numFields;
   /** The width of the null tracking bit set, in bytes */
   private int bitSetWidthInBytes;
+  /**
+   * This optional schema is required if you want to call generic get() and set() methods on
+   * this UnsafeRow, but is optional if callers will only use type-specific getTYPE() and setTYPE()
+   * methods.
+   */
   @Nullable
   private StructType schema;

@@ -103,9 +108,27 @@ public static int calculateBitSetWidthInBytes(int numFields) {
     readableFieldTypes.addAll(settableFieldTypes);
   }

+  /**
+   * Construct a new UnsafeRow. The resulting row won't be usable until `pointTo()` has been called,
+   * since the value returned by this constructor is equivalent to a null pointer.
+   */
   public UnsafeRow() { }

-  public void set(Object baseObject, long baseOffset, int numFields, StructType schema) {
+  /**
+   * Update this UnsafeRow to point to different backing data.
+   *
+   * @param baseObject the base object
+   * @param baseOffset the offset within the base object
+   * @param numFields the number of fields in this row
+   * @param schema an optional schema; this is necessary if you want to call generic get() or set()
+   *               methods on this row, but is optional if the caller will only use type-specific
+   *               getTYPE() and setTYPE() methods.
+   */
+  public void pointTo(
+      Object baseObject,
+      long baseOffset,
+      int numFields,
+      @Nullable StructType schema) {
     assert numFields >= 0 : "numFields should >= 0";
     assert schema == null || schema.fields().length == numFields;
     this.bitSetWidthInBytes = calculateBitSetWidthInBytes(numFields);
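
The new name makes UnsafeRow's two-phase lifecycle explicit: construct an empty row, then aim it at backing memory with `pointTo()`, repointing as often as needed. A minimal sketch, assuming `buffer` already holds row bytes written by an UnsafeRowConverter (the offsets and field count here are illustrative):

// An UnsafeRow is a reusable pointer, not a container.
final long[] buffer = new long[64];     // assumed backing storage with pre-written row data
final UnsafeRow row = new UnsafeRow();  // equivalent to a null pointer until pointTo() is called
row.pointTo(buffer, PlatformDependent.LONG_ARRAY_OFFSET, 2, null);  // null schema: type-specific accessors only
final long first = row.getLong(0);
// Repointing the same instance at different backing data costs no allocation:
row.pointTo(buffer, PlatformDependent.LONG_ARRAY_OFFSET + 128, 2, null);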

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverter.scala

Lines changed: 1 addition & 1 deletion

@@ -180,7 +180,7 @@ class UnsafeRowConverter(fieldTypes: Array[DataType]) {
   }

   def writeRow(row: Row, baseObject: Object, baseOffset: Long): Long = {
-    unsafeRow.set(baseObject, baseOffset, writers.length, null)
+    unsafeRow.pointTo(baseObject, baseOffset, writers.length, null)
     var fieldNumber = 0
     var appendCursor: Int = fixedLengthSize
     while (fieldNumber < writers.length) {

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala

Lines changed: 2 additions & 2 deletions

@@ -38,7 +38,7 @@ class UnsafeRowConverterSuite extends FunSuite with Matchers {
     val numBytesWritten = converter.writeRow(row, buffer, PlatformDependent.LONG_ARRAY_OFFSET)
     numBytesWritten should be (sizeRequired)
     val unsafeRow = new UnsafeRow()
-    unsafeRow.set(buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, null)
+    unsafeRow.pointTo(buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, null)
     unsafeRow.getLong(0) should be (0)
     unsafeRow.getLong(1) should be (1)
     unsafeRow.getInt(2) should be (2)

@@ -59,7 +59,7 @@ class UnsafeRowConverterSuite extends FunSuite with Matchers {
     val numBytesWritten = converter.writeRow(row, buffer, PlatformDependent.LONG_ARRAY_OFFSET)
     numBytesWritten should be (sizeRequired)
     val unsafeRow = new UnsafeRow()
-    unsafeRow.set(buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, null)
+    unsafeRow.pointTo(buffer, PlatformDependent.LONG_ARRAY_OFFSET, fieldTypes.length, null)
     unsafeRow.getLong(0) should be (0)
     unsafeRow.getString(1) should be ("Hello")
     unsafeRow.getString(2) should be ("World")

unsafe/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java

Lines changed: 24 additions & 26 deletions

@@ -17,14 +17,6 @@

 package org.apache.spark.unsafe.map;

-import org.apache.spark.unsafe.*;
-import org.apache.spark.unsafe.array.ByteArrayMethods;
-import org.apache.spark.unsafe.array.LongArray;
-import org.apache.spark.unsafe.bitset.BitSet;
-import org.apache.spark.unsafe.hash.Murmur3_x86_32;
-import org.apache.spark.unsafe.memory.*;
-
-import java.lang.IllegalStateException;
 import java.lang.Long;
 import java.lang.Object;
 import java.lang.Override;

@@ -33,8 +25,17 @@
 import java.util.LinkedList;
 import java.util.List;

+import org.apache.spark.unsafe.*;
+import org.apache.spark.unsafe.array.ByteArrayMethods;
+import org.apache.spark.unsafe.array.LongArray;
+import org.apache.spark.unsafe.bitset.BitSet;
+import org.apache.spark.unsafe.hash.Murmur3_x86_32;
+import org.apache.spark.unsafe.memory.*;
+
 /**
- * A bytes to bytes hash map where keys and values are contiguous regions of bytes.
+ * An append-only hash map where keys and values are contiguous regions of bytes.
+ *
+ * This class is not thread-safe.
  *
  * This is backed by a power-of-2-sized hash table, using quadratic probing with triangular numbers,
  * which is guaranteed to exhaust the space.

@@ -350,36 +351,34 @@ public long getValueLength() {
     }

     /**
-     * Sets the value defined at this position. This method may only be called once for a given
-     * key; if you want to update the value associated with a key, then you can directly manipulate
-     * the bytes stored at the value address.
+     * Store a new key and value. This method may only be called once for a given key; if you want
+     * to update the value associated with a key, then you can directly manipulate the bytes stored
+     * at the value address.
      *
-     * It is only valid to call this method after having first called `lookup()` using the same key.
+     * It is only valid to call this method immediately after calling `lookup()` using the same key.
      *
      * After calling this method, calls to `get[Key|Value]Address()` and `get[Key|Value]Length`
-     * will return information on the data stored by this `storeKeyAndValue` call.
+     * will return information on the data stored by this `putNewKey` call.
      *
      * As an example usage, here's the proper way to store a new key:
      *
      * <code>
      *   Location loc = map.lookup(keyBaseOffset, keyBaseObject, keyLengthInBytes);
      *   if (!loc.isDefined()) {
-     *     loc.storeKeyAndValue(keyBaseOffset, keyBaseObject, keyLengthInBytes, ...)
+     *     loc.putNewKey(keyBaseOffset, keyBaseObject, keyLengthInBytes, ...)
      *   }
      * </code>
      *
      * Unspecified behavior if the key is not defined.
      */
-    public void storeKeyAndValue(
-        Object keyBaseObject,
-        long keyBaseOffset,
-        int keyLengthBytes, // TODO(josh): words? bytes? eventually, we'll want to be more consistent about this
-        Object valueBaseObject,
-        long valueBaseOffset,
-        long valueLengthBytes) {
-      if (isDefined) {
-        throw new IllegalStateException("Can only set value once for a key");
-      }
+    public void putNewKey(
+        Object keyBaseObject,
+        long keyBaseOffset,
+        int keyLengthBytes, // TODO(josh): words? bytes? eventually, we'll want to be more consistent about this
+        Object valueBaseObject,
+        long valueBaseOffset,
+        long valueLengthBytes) {
+      assert (!isDefined) : "Can only set value once for a key";
       isDefined = true;
       assert (keyLengthBytes % 8 == 0);
       assert (valueLengthBytes % 8 == 0);

@@ -388,7 +387,6 @@ public void storeKeyAndValue(
       // must be stored in the same memory page.
       final long requiredSize = 8 + 8 + keyLengthBytes + valueLengthBytes;
       assert(requiredSize <= PAGE_SIZE_BYTES);
-      // Bookeeping
       size++;
       bitset.set(pos);

unsafe/src/main/java/org/apache/spark/unsafe/map/HashMapGrowthStrategy.java

Lines changed: 1 addition & 1 deletion (whitespace)

@@ -27,7 +27,7 @@ public interface HashMapGrowthStrategy {

   /**
    * Double the size of the hash map every time.
    */
-  HashMapGrowthStrategy DOUBLING = new Doubling();
+  HashMapGrowthStrategy DOUBLING = new Doubling();

   class Doubling implements HashMapGrowthStrategy {
     @Override

unsafe/src/main/java/org/apache/spark/unsafe/string/UTF8StringMethods.java

Lines changed: 6 additions & 12 deletions

@@ -17,11 +17,13 @@

 package org.apache.spark.unsafe.string;

+import java.io.UnsupportedEncodingException;
+import java.lang.Object;
+import java.lang.String;
+
 import org.apache.spark.unsafe.PlatformDependent;
 import org.apache.spark.unsafe.array.ByteArrayMethods;

-import java.io.UnsupportedEncodingException;import java.lang.Object;import java.lang.String;
-
 /**
  * A String encoded in UTF-8 as long representing the string's length, followed by a
  * contiguous region of bytes; see http://en.wikipedia.org/wiki/UTF-8 for details.

@@ -33,14 +35,6 @@ private UTF8StringMethods() {
     // See UTF8StringPointer for a more object-oriented interface to UTF8String data.
   }

-  /**
-   * Return the length of the string, in bytes (NOT characters), not including
-   * the space to store the length itself.
-   */
-  static long getLengthInBytes(Object baseObject, long baseOffset) {
-    return PlatformDependent.UNSAFE.getLong(baseObject, baseOffset);
-  }
-
   public static int compare(
       Object leftBaseObject,
       long leftBaseOffset,

@@ -68,7 +62,7 @@ public static boolean startsWith(
       int prefixLengthInBytes) {
     if (prefixLengthInBytes > strLengthInBytes) {
       return false;
-    } {
+    } else {
       return ByteArrayMethods.arrayEquals(
         strBaseObject,
         strBaseOffset,

@@ -87,7 +81,7 @@ public static boolean endsWith(
       int suffixLengthInBytes) {
     if (suffixLengthInBytes > strLengthInBytes) {
       return false;
-    } {
+    } else {
       return ByteArrayMethods.arrayEquals(
         strBaseObject,
         strBaseOffset + strLengthInBytes - suffixLengthInBytes,
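
Worth noting: the `} {` these two fixes replace was a bare block, not a syntax error, and since the branch above it returns, the old code behaved identically; the `else` is purely a readability repair. A self-contained sketch of the pitfall (hypothetical method, not from this commit):

// In Java, "} {" starts a bare block, so this compiles and (because the first
// branch returns) behaves exactly like if/else — it just reads like a typo.
static boolean demo(int prefixLen, int strLen) {
  if (prefixLen > strLen) {
    return false;
  } {                        // bare block: legal, but easily mistaken for a lost "else"
    return prefixLen >= 0;   // always reached when the condition above is false
  }
}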

unsafe/src/test/java/org/apache/spark/unsafe/map/AbstractTestBytesToBytesMap.java

Lines changed: 5 additions & 5 deletions

@@ -96,7 +96,7 @@ public void setAndRetrieveAKey() {
     final BytesToBytesMap.Location loc =
       map.lookup(keyData, BYTE_ARRAY_OFFSET, recordLengthBytes);
     Assert.assertFalse(loc.isDefined());
-    loc.storeKeyAndValue(
+    loc.putNewKey(
       keyData,
       BYTE_ARRAY_OFFSET,
       recordLengthBytes,

@@ -119,7 +119,7 @@ public void setAndRetrieveAKey() {
     Assert.assertArrayEquals(valueData, getByteArray(loc.getValueAddress(), recordLengthBytes));

     try {
-      loc.storeKeyAndValue(
+      loc.putNewKey(
         keyData,
         BYTE_ARRAY_OFFSET,
         recordLengthBytes,

@@ -146,7 +146,7 @@ public void iteratorTest() throws Exception {
     final BytesToBytesMap.Location loc =
       map.lookup(value, PlatformDependent.LONG_ARRAY_OFFSET, 8);
     Assert.assertFalse(loc.isDefined());
-    loc.storeKeyAndValue(
+    loc.putNewKey(
       value,
       PlatformDependent.LONG_ARRAY_OFFSET,
       8,

@@ -196,15 +196,15 @@ public void randomizedStressTest() {
         key.length
       );
       Assert.assertFalse(loc.isDefined());
-      loc.storeKeyAndValue(
+      loc.putNewKey(
         key,
         BYTE_ARRAY_OFFSET,
         key.length,
         value,
         BYTE_ARRAY_OFFSET,
         value.length
       );
-      // After calling storeKeyAndValue, the following should be true, even before calling
+      // After calling putNewKey, the following should be true, even before calling
       // lookup():
       Assert.assertTrue(loc.isDefined());
       Assert.assertEquals(key.length, loc.getKeyLength());
