
Commit f03cdfa

[SPARK-2314][SQL] Override collect and take in sql.py, forwarding to SchemaRDD's collect.
1 parent 60050f4 commit f03cdfa


2 files changed: +67 / -15 lines


python/pyspark/sql.py

Lines changed: 42 additions & 5 deletions
@@ -1550,6 +1550,18 @@ def id(self):
         self._id = self._jrdd.id()
         return self._id
 
+    def limit(self, num):
+        """Limit the result count to the number specified.
+
+        >>> srdd = sqlCtx.inferSchema(rdd)
+        >>> srdd.limit(2).collect()
+        [Row(field1=1, field2=u'row1'), Row(field1=2, field2=u'row2')]
+        >>> srdd.limit(0).collect()
+        []
+        """
+        rdd = self._jschema_rdd.limit(num)
+        return SchemaRDD(rdd, self.sql_ctx)
+
     def saveAsParquetFile(self, path):
         """Save the contents as a Parquet file, preserving the schema.
 
@@ -1626,15 +1638,40 @@ def count(self):
         return self._jschema_rdd.count()
 
     def collect(self):
-        """
-        Return a list that contains all of the rows in this RDD.
+        """Return a list that contains all of the rows in this RDD.
 
-        Each object in the list is on Row, the fields can be accessed as
+        Each object in the list is a Row, the fields can be accessed as
         attributes.
+
+        Unlike the base RDD implementation of collect, this implementation
+        leverages the query optimizer to perform a collect on the SchemaRDD,
+        which supports features such as filter pushdown.
+
+        >>> srdd = sqlCtx.inferSchema(rdd)
+        >>> srdd.collect()
+        [Row(field1=1, field2=u'row1'), ..., Row(field1=3, field2=u'row3')]
         """
-        rows = RDD.collect(self)
+        from pyspark.context import JavaStackTrace
+        with JavaStackTrace(self.context) as st:
+            bytesInJava = self._jschema_rdd.collectToPython().iterator()
         cls = _create_cls(self.schema())
-        return map(cls, rows)
+        return map(cls, self._collect_iterator_through_file(bytesInJava))
+
+    def take(self, num):
+        """Take the first num rows of the RDD.
+
+        Each object in the list is a Row, the fields can be accessed as
+        attributes.
+
+        Unlike the base RDD implementation of take, this implementation
+        leverages the query optimizer to perform a collect on a SchemaRDD,
+        which supports features such as filter pushdown.
+
+        >>> srdd = sqlCtx.inferSchema(rdd)
+        >>> srdd.take(2)
+        [Row(field1=1, field2=u'row1'), Row(field1=2, field2=u'row2')]
+        """
+        return self.limit(num).collect()
 
     # Convert each object in the RDD to a Row with the right class
     # for this SchemaRDD, so that fields can be accessed as attributes.
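
Taken together, the new Python API reads as follows. This is a minimal usage sketch mirroring the doctests above; it assumes an existing SparkContext named sc, and the sample records are the same illustrative rows used in the doctests:

    from pyspark.sql import SQLContext

    sqlCtx = SQLContext(sc)  # assumes a running SparkContext `sc`

    # Sample records mirroring the doctest data; inferSchema derives the schema.
    rdd = sc.parallelize([{"field1": 1, "field2": "row1"},
                          {"field1": 2, "field2": "row2"},
                          {"field1": 3, "field2": "row3"}])
    srdd = sqlCtx.inferSchema(rdd)

    srdd.limit(2).collect()  # [Row(field1=1, field2=u'row1'), Row(field1=2, field2=u'row2')]
    srdd.take(2)             # same rows; take(num) now forwards to limit(num).collect()
    srdd.collect()           # all rows, fetched through the optimized SchemaRDD collect path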

sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala

Lines changed: 25 additions & 10 deletions
@@ -377,15 +377,15 @@ class SchemaRDD(
   def toJavaSchemaRDD: JavaSchemaRDD = new JavaSchemaRDD(sqlContext, logicalPlan)
 
   /**
-   * Converts a JavaRDD to a PythonRDD. It is used by pyspark.
+   * Helper for converting a Row to a simple Array suitable for pyspark serialization.
    */
-  private[sql] def javaToPython: JavaRDD[Array[Byte]] = {
+  private def rowToJArray(row: Row, structType: StructType): Array[Any] = {
     import scala.collection.Map
 
     def toJava(obj: Any, dataType: DataType): Any = (obj, dataType) match {
       case (null, _) => null
 
-      case (obj: Row, struct: StructType) => rowToArray(obj, struct)
+      case (obj: Row, struct: StructType) => rowToJArray(obj, struct)
 
       case (seq: Seq[Any], array: ArrayType) =>
         seq.map(x => toJava(x, array.elementType)).asJava
@@ -402,22 +402,37 @@ class SchemaRDD(
       case (other, _) => other
     }
 
-    def rowToArray(row: Row, structType: StructType): Array[Any] = {
-      val fields = structType.fields.map(field => field.dataType)
-      row.zip(fields).map {
-        case (obj, dataType) => toJava(obj, dataType)
-      }.toArray
-    }
+    val fields = structType.fields.map(field => field.dataType)
+    row.zip(fields).map {
+      case (obj, dataType) => toJava(obj, dataType)
+    }.toArray
+  }
 
+  /**
+   * Converts a JavaRDD to a PythonRDD. It is used by pyspark.
+   */
+  private[sql] def javaToPython: JavaRDD[Array[Byte]] = {
     val rowSchema = StructType.fromAttributes(this.queryExecution.analyzed.output)
     this.mapPartitions { iter =>
       val pickle = new Pickler
       iter.map { row =>
-        rowToArray(row, rowSchema)
+        rowToJArray(row, rowSchema)
       }.grouped(100).map(batched => pickle.dumps(batched.toArray))
     }
   }
 
+  /**
+   * Serializes the Array[Row] returned by SchemaRDD's optimized collect(), using the same
+   * format as javaToPython. It is used by pyspark.
+   */
+  private[sql] def collectToPython: JList[Array[Byte]] = {
+    val rowSchema = StructType.fromAttributes(this.queryExecution.analyzed.output)
+    val pickle = new Pickler
+    new java.util.ArrayList(collect().map { row =>
+      rowToJArray(row, rowSchema)
+    }.grouped(100).map(batched => pickle.dumps(batched.toArray)).toIterable)
+  }
+
   /**
    * Creates SchemaRDD by applying own schema to derived RDD. Typically used to wrap return value
    * of base RDD functions that do not change schema.
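
On the wire, collectToPython returns a list of pickle blobs, one per batch of up to 100 rows (grouped(100) followed by pickle.dumps on each batch), in the same format that javaToPython streams per partition. The following is a rough illustration of the decoding step on the Python side using only the standard pickle module; the helper name and inputs are hypothetical, not Spark internals:

    import pickle

    def rows_from_batches(byte_batches):
        # Each blob holds one pickled batch of up to 100 row arrays;
        # unpickle each batch and flatten back into a single list of rows.
        rows = []
        for blob in byte_batches:
            rows.extend(pickle.loads(blob))
        return rows

    # Example: two rows pickled as one batch round-trip back to plain tuples.
    print(rows_from_batches([pickle.dumps([(1, u"row1"), (2, u"row2")])]))
    # -> [(1, u'row1'), (2, u'row2')]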
