Commit 4c37ba6

Add tests for sorting on all primitive types.
1 parent 6890863 commit 4c37ba6

5 files changed: +171 −125 lines changed


core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillWriter.java

Lines changed: 1 addition & 6 deletions

@@ -37,8 +37,7 @@ final class UnsafeSorterSpillWriter {
 
   // Small writes to DiskBlockObjectWriter will be fairly inefficient. Since there doesn't seem to
   // be an API to directly transfer bytes from managed memory to the disk writer, we buffer
-  // data through a byte array. This array does not need to be large enough to hold a single
-  // record;
+  // data through a byte array.
   private byte[] writeBuffer = new byte[DISK_WRITE_BUFFER_SIZE];
 
   private final File file;
@@ -115,10 +114,6 @@ public void close() throws IOException {
     writeBuffer = null;
   }
 
-  public long numberOfSpilledBytes() {
-    return file.length();
-  }
-
   public UnsafeSorterSpillReader getReader(BlockManager blockManager) throws IOException {
     return new UnsafeSorterSpillReader(blockManager, file, blockId);
   }
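The comment trimmed above describes the spill path: records are copied from managed memory to the DiskBlockObjectWriter in chunks of at most DISK_WRITE_BUFFER_SIZE bytes, so the write buffer never needs to hold a whole record. A minimal Scala sketch of that chunked-copy pattern, with hypothetical callbacks standing in for the real memory-copy and disk-writer calls (an illustration, not the actual UnsafeSorterSpillWriter code):

// Sketch only: copy one record to disk through a small fixed-size buffer.
// copyFromMemory and writeToDisk are hypothetical stand-ins.
def spillRecord(
    copyFromMemory: (Long, Array[Byte], Int) => Unit, // (srcOffset, dest, numBytes)
    writeToDisk: (Array[Byte], Int) => Unit,          // (buffer, numBytes)
    recordLength: Int,
    bufferSize: Int = 1024): Unit = {
  val buffer = new Array[Byte](bufferSize)
  var copied = 0
  while (copied < recordLength) {
    val toTransfer = math.min(bufferSize, recordLength - copied)
    copyFromMemory(copied.toLong, buffer, toTransfer) // fill the buffer with the next chunk
    writeToDisk(buffer, toTransfer)                   // hand the chunk to the disk writer
    copied += toTransfer
  }
}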

sql/catalyst/src/main/java/org/apache/spark/sql/execution/UnsafeExternalRowSorter.java

Lines changed: 7 additions & 5 deletions

@@ -20,7 +20,6 @@
 import java.io.IOException;
 import java.util.Arrays;
 
-import scala.Function1;
 import scala.collection.Iterator;
 import scala.math.Ordering;
 
@@ -47,17 +46,20 @@ final class UnsafeExternalRowSorter {
 
   private final StructType schema;
   private final UnsafeRowConverter rowConverter;
-  private final Function1<InternalRow, Long> prefixComputer;
+  private final PrefixComputer prefixComputer;
   private final ObjectPool objPool = new ObjectPool(128);
   private final UnsafeExternalSorter sorter;
   private byte[] rowConversionBuffer = new byte[1024 * 8];
 
+  public static abstract class PrefixComputer {
+    abstract long computePrefix(InternalRow row);
+  }
+
   public UnsafeExternalRowSorter(
       StructType schema,
       Ordering<InternalRow> ordering,
       PrefixComparator prefixComparator,
-      // TODO: if possible, avoid this boxing of the return value
-      Function1<InternalRow, Long> prefixComputer) throws IOException {
+      PrefixComputer prefixComputer) throws IOException {
     this.schema = schema;
     this.rowConverter = new UnsafeRowConverter(schema);
     this.prefixComputer = prefixComputer;
@@ -90,7 +92,7 @@ void insertRow(InternalRow row) throws IOException {
     final int bytesWritten = rowConverter.writeRow(
       row, rowConversionBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, objPool);
     assert (bytesWritten == sizeRequirement);
-    final long prefix = prefixComputer.apply(row);
+    final long prefix = prefixComputer.computePrefix(row);
     sorter.insertRecord(
       rowConversionBuffer,
       PlatformDependent.BYTE_ARRAY_OFFSET,
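The PrefixComputer class added above replaces the old Function1<InternalRow, Long>: computePrefix returns a primitive long, so the per-row prefix is no longer boxed into a java.lang.Long (the removed TODO). A hedged Scala sketch of how a caller can implement it, assuming the first sort column is a non-null long (illustrative only; the real computers come from SortPrefixUtils, used in the next file, and the import path is assumed from the surrounding code):

// Illustrative PrefixComputer: use the raw long value of column 0 as the sort
// prefix. Assumes a non-null LongType column; not the actual Spark code.
import org.apache.spark.sql.catalyst.InternalRow

val longColumnPrefixComputer = new UnsafeExternalRowSorter.PrefixComputer {
  override def computePrefix(row: InternalRow): Long = row.getLong(0)
}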

sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala

Lines changed: 6 additions & 5 deletions

@@ -27,7 +27,6 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.plans.physical._
 import org.apache.spark.util.collection.ExternalSorter
-import org.apache.spark.util.collection.unsafe.sort.PrefixComparator
 import org.apache.spark.util.{CompletionIterator, MutablePair}
 import org.apache.spark.{HashPartitioner, SparkEnv}
 
@@ -271,11 +270,13 @@ case class UnsafeExternalSort(
     assert(codegenEnabled, "UnsafeExternalSort requires code generation to be enabled")
     def doSort(iterator: Iterator[InternalRow]): Iterator[InternalRow] = {
       val ordering = newOrdering(sortOrder, child.output)
-      val prefixComparator = new PrefixComparator {
-        override def compare(prefix1: Long, prefix2: Long): Int = 0
+      val prefixComparator = SortPrefixUtils.getPrefixComparator(sortOrder.head)
+      val prefixComputer = {
+        val prefixComputer = SortPrefixUtils.getPrefixComputer(sortOrder.head)
+        new UnsafeExternalRowSorter.PrefixComputer {
+          override def computePrefix(row: InternalRow): Long = prefixComputer(row)
+        }
       }
-      // TODO: do real prefix comparison. For dev/testing purposes, this is a dummy implementation.
-      def prefixComputer(row: InternalRow): Long = 0
       new UnsafeExternalRowSorter(schema, ordering, prefixComparator, prefixComputer).sort(iterator)
     }
     child.execute().mapPartitions(doSort, preservesPartitioning = true)
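SortPrefixUtils.getPrefixComparator and getPrefixComputer map the leading SortOrder to a type-specific comparator and prefix function; their implementation is not part of the hunks shown here. As a hedged sketch only, a prefix pair for an ascending sort on a non-null LongType column could look like this (signatures inferred from the surrounding code, not copied from SortPrefixUtils):

// Hedged sketch, not the actual SortPrefixUtils code: for an ascending sort on
// a non-null LongType column, the prefix can be the column value itself, and
// the prefix comparator is plain signed-long comparison.
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.util.collection.unsafe.sort.PrefixComparator

def longPrefixComputer(ordinal: Int): InternalRow => Long =
  (row: InternalRow) => row.getLong(ordinal)

val longPrefixComparator = new PrefixComparator {
  override def compare(prefix1: Long, prefix2: Long): Int =
    java.lang.Long.compare(prefix1, prefix2)
}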

sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala

Lines changed: 132 additions & 42 deletions

@@ -17,18 +17,16 @@
 
 package org.apache.spark.sql.execution
 
-import scala.language.implicitConversions
-import scala.reflect.runtime.universe.TypeTag
-import scala.util.control.NonFatal
-
 import org.apache.spark.SparkFunSuite
-
 import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
 import org.apache.spark.sql.catalyst.expressions.BoundReference
 import org.apache.spark.sql.catalyst.util._
-
 import org.apache.spark.sql.test.TestSQLContext
-import org.apache.spark.sql.{DataFrameHolder, Row, DataFrame}
+import org.apache.spark.sql.{DataFrame, DataFrameHolder, Row}
+
+import scala.language.implicitConversions
+import scala.reflect.runtime.universe.TypeTag
+import scala.util.control.NonFatal
 
 /**
  * Base class for writing tests for individual physical operators. For an example of how this
@@ -77,13 +75,93 @@ class SparkPlanTest extends SparkFunSuite {
       case None =>
     }
   }
+
+  /**
+   * Runs the plan and makes sure the answer matches the result produced by a reference plan.
+   * @param input the input data to be used.
+   * @param planFunction a function which accepts the input SparkPlan and uses it to instantiate
+   *                     the physical operator that's being tested.
+   * @param expectedPlanFunction a function which accepts the input SparkPlan and uses it to
+   *                             instantiate a reference implementation of the physical operator
+   *                             that's being tested. The result of executing this plan will be
+   *                             treated as the source-of-truth for the test.
+   */
+  protected def checkAnswer(
+      input: DataFrame,
+      planFunction: SparkPlan => SparkPlan,
+      expectedPlanFunction: SparkPlan => SparkPlan): Unit = {
+    SparkPlanTest.checkAnswer(input, planFunction, expectedPlanFunction) match {
+      case Some(errorMessage) => fail(errorMessage)
+      case None =>
+    }
+  }
 }
 
 /**
  * Helper methods for writing tests of individual physical operators.
  */
 object SparkPlanTest {
 
+  /**
+   * Runs the plan and makes sure the answer matches the result produced by a reference plan.
+   * @param input the input data to be used.
+   * @param planFunction a function which accepts the input SparkPlan and uses it to instantiate
+   *                     the physical operator that's being tested.
+   * @param expectedPlanFunction a function which accepts the input SparkPlan and uses it to
+   *                             instantiate a reference implementation of the physical operator
+   *                             that's being tested. The result of executing this plan will be
+   *                             treated as the source-of-truth for the test.
+   */
+  def checkAnswer(
+      input: DataFrame,
+      planFunction: SparkPlan => SparkPlan,
+      expectedPlanFunction: SparkPlan => SparkPlan): Option[String] = {
+
+    val outputPlan = planFunction(input.queryExecution.sparkPlan)
+    val expectedOutputPlan = expectedPlanFunction(input.queryExecution.sparkPlan)
+
+    val expectedAnswer: Seq[Row] = try {
+      executePlan(input, expectedOutputPlan)
+    } catch {
+      case NonFatal(e) =>
+        val errorMessage =
+          s"""
+             | Exception thrown while executing Spark plan to calculate expected answer:
+             | $expectedOutputPlan
+             | == Exception ==
+             | $e
+             | ${org.apache.spark.sql.catalyst.util.stackTraceToString(e)}
+          """.stripMargin
+        return Some(errorMessage)
+    }
+
+    val actualAnswer: Seq[Row] = try {
+      executePlan(input, outputPlan)
+    } catch {
+      case NonFatal(e) =>
+        val errorMessage =
+          s"""
+             | Exception thrown while executing Spark plan:
+             | $outputPlan
+             | == Exception ==
+             | $e
+             | ${org.apache.spark.sql.catalyst.util.stackTraceToString(e)}
          """.stripMargin
+        return Some(errorMessage)
+    }
+
+    compareAnswers(actualAnswer, expectedAnswer).map { errorMessage =>
+      s"""
+         | Results do not match.
+         | Actual result Spark plan:
+         | $outputPlan
+         | Expected result Spark plan:
+         | $expectedOutputPlan
+         | $errorMessage
      """.stripMargin
+    }
+  }
+
   /**
    * Runs the plan and makes sure the answer matches the expected result.
    * @param input the input data to be used.
@@ -98,22 +176,33 @@ object SparkPlanTest {
 
     val outputPlan = planFunction(input.queryExecution.sparkPlan)
 
-    // A very simple resolver to make writing tests easier. In contrast to the real resolver
-    // this is always case sensitive and does not try to handle scoping or complex type resolution.
-    val resolvedPlan = outputPlan transform {
-      case plan: SparkPlan =>
-        val inputMap = plan.children.flatMap(_.output).zipWithIndex.map {
-          case (a, i) =>
-            (a.name, BoundReference(i, a.dataType, a.nullable))
-        }.toMap
+    val sparkAnswer: Seq[Row] = try {
+      executePlan(input, outputPlan)
+    } catch {
+      case NonFatal(e) =>
+        val errorMessage =
+          s"""
+             | Exception thrown while executing Spark plan:
+             | $outputPlan
+             | == Exception ==
+             | $e
+             | ${org.apache.spark.sql.catalyst.util.stackTraceToString(e)}
          """.stripMargin
+        return Some(errorMessage)
+    }
 
-      plan.transformExpressions {
-        case UnresolvedAttribute(Seq(u)) =>
-          inputMap.getOrElse(u,
-            sys.error(s"Invalid Test: Cannot resolve $u given input $inputMap"))
-      }
+    compareAnswers(sparkAnswer, expectedAnswer).map { errorMessage =>
+      s"""
+         | Results do not match for Spark plan:
+         | $outputPlan
+         | $errorMessage
      """.stripMargin
     }
+  }
 
+  private def compareAnswers(
+      sparkAnswer: Seq[Row],
+      expectedAnswer: Seq[Row]): Option[String] = {
     def prepareAnswer(answer: Seq[Row]): Seq[Row] = {
       // Converts data to types that we can do equality comparison using Scala collections.
       // For BigDecimal type, the Scala type has a better definition of equality test (similar to
@@ -130,38 +219,39 @@
       }
       converted.sortBy(_.toString())
     }
-
-    val sparkAnswer: Seq[Row] = try {
-      resolvedPlan.executeCollect().toSeq
-    } catch {
-      case NonFatal(e) =>
-        val errorMessage =
-          s"""
-             | Exception thrown while executing Spark plan:
-             | $outputPlan
-             | == Exception ==
-             | $e
-             | ${org.apache.spark.sql.catalyst.util.stackTraceToString(e)}
-          """.stripMargin
-        return Some(errorMessage)
-    }
-
     if (prepareAnswer(expectedAnswer) != prepareAnswer(sparkAnswer)) {
       val errorMessage =
         s"""
-           | Results do not match for Spark plan:
-           | $outputPlan
           | == Results ==
           | ${sideBySide(
-            s"== Correct Answer - ${expectedAnswer.size} ==" +:
+            s"== Expected Answer - ${expectedAnswer.size} ==" +:
              prepareAnswer(expectedAnswer).map(_.toString()),
-            s"== Spark Answer - ${sparkAnswer.size} ==" +:
+            s"== Actual Answer - ${sparkAnswer.size} ==" +:
              prepareAnswer(sparkAnswer).map(_.toString())).mkString("\n")}
         """.stripMargin
-      return Some(errorMessage)
+      Some(errorMessage)
+    } else {
+      None
     }
+  }
 
-    None
+  private def executePlan(input: DataFrame, outputPlan: SparkPlan): Seq[Row] = {
+    // A very simple resolver to make writing tests easier. In contrast to the real resolver
+    // this is always case sensitive and does not try to handle scoping or complex type resolution.
+    val resolvedPlan = outputPlan transform {
+      case plan: SparkPlan =>
+        val inputMap = plan.children.flatMap(_.output).zipWithIndex.map {
+          case (a, i) =>
+            (a.name, BoundReference(i, a.dataType, a.nullable))
+        }.toMap
+
+        plan.transformExpressions {
+          case UnresolvedAttribute(Seq(u)) =>
+            inputMap.getOrElse(u,
+              sys.error(s"Invalid Test: Cannot resolve $u given input $inputMap"))
+        }
+    }
+    resolvedPlan.executeCollect().toSeq
   }
 }

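The new reference-plan overload of checkAnswer is what the commit's primitive-type sort tests build on: instead of a hand-written expected answer, the operator under test is compared against a trusted operator run on the same input. A hypothetical usage sketch follows; the data, column names, the Seq-to-DataFrame conversion, and the exact constructor arguments of UnsafeExternalSort and Sort are assumptions, since the commit's actual test file is not shown in this excerpt.

// Hypothetical test built on the new checkAnswer(input, planFunction,
// expectedPlanFunction) overload. Constructor arguments and test data are
// illustrative assumptions, not the commit's actual test code. toDF is
// assumed to be available through SparkPlanTest's DataFrameHolder implicits.
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions.{Ascending, SortOrder}

class ExampleUnsafeSortSuite extends SparkPlanTest {
  test("unsafe external sort matches the reference Sort operator") {
    val input = Seq((3, "c"), (1, "a"), (2, "b")).toDF("key", "value")
    val sortOrder = Seq(SortOrder(UnresolvedAttribute("key"), Ascending))
    checkAnswer(
      input,
      (child: SparkPlan) => UnsafeExternalSort(sortOrder, global = true, child),
      (child: SparkPlan) => Sort(sortOrder, global = true, child))
  }
}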