[SPARK-21583][SQL] Create a ColumnarBatch from ArrowColumnVectors #18787
Changes from 7 commits
ColumnarBatch.java:

@@ -65,15 +65,35 @@ public final class ColumnarBatch {
   final Row row;
 
   public static ColumnarBatch allocate(StructType schema, MemoryMode memMode) {
-    return new ColumnarBatch(schema, DEFAULT_BATCH_SIZE, memMode);
+    return allocate(schema, memMode, DEFAULT_BATCH_SIZE);
   }
 
   public static ColumnarBatch allocate(StructType type) {
-    return new ColumnarBatch(type, DEFAULT_BATCH_SIZE, DEFAULT_MEMORY_MODE);
+    return allocate(type, DEFAULT_MEMORY_MODE, DEFAULT_BATCH_SIZE);
   }
 
   public static ColumnarBatch allocate(StructType schema, MemoryMode memMode, int maxRows) {
-    return new ColumnarBatch(schema, maxRows, memMode);
+    ColumnVector[] columns = allocateCols(schema, maxRows, memMode);
+    return new ColumnarBatch(schema, columns, maxRows);
   }
 
+  private static ColumnVector[] allocateCols(StructType schema, int maxRows, MemoryMode memMode) {
+    ColumnVector[] columns = new ColumnVector[schema.size()];
+    for (int i = 0; i < schema.fields().length; ++i) {
+      StructField field = schema.fields()[i];
+      columns[i] = ColumnVector.allocate(maxRows, field.dataType(), memMode);
+    }
+    return columns;
+  }
+
+  public static ColumnarBatch createReadOnly(
+      StructType schema,
+      ReadOnlyColumnVector[] columns,
+      int numRows) {
+    assert(schema.length() == columns.length);
+    ColumnarBatch batch = new ColumnarBatch(schema, columns, numRows);
+    batch.setNumRows(numRows);
+    return batch;
+  }
+
   /**

@@ -505,18 +525,12 @@ public void filterNullsInColumn(int ordinal) {
     nullFilteredColumns.add(ordinal);
   }
 
-  private ColumnarBatch(StructType schema, int maxRows, MemoryMode memMode) {
+  private ColumnarBatch(StructType schema, ColumnVector[] columns, int capacity) {
     this.schema = schema;
-    this.capacity = maxRows;
-    this.columns = new ColumnVector[schema.size()];
+    this.columns = columns;
+    this.capacity = capacity;
     this.nullFilteredColumns = new HashSet<>();
-    this.filteredRows = new boolean[maxRows];
-
-    for (int i = 0; i < schema.fields().length; ++i) {
-      StructField field = schema.fields()[i];
-      columns[i] = ColumnVector.allocate(maxRows, field.dataType(), memMode);
-    }
-
+    this.filteredRows = new boolean[this.capacity];
     this.row = new Row(this);
   }
 }
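For context, here is a minimal end-to-end sketch of what the new `createReadOnly` factory enables, pieced together from the tests added in this PR. The allocator and field names ("example", "id") are illustrative, not taken from the change itself:

```scala
import org.apache.arrow.vector.NullableIntVector

import org.apache.spark.sql.execution.arrow.ArrowUtils
import org.apache.spark.sql.execution.vectorized.{ArrowColumnVector, ColumnarBatch, ReadOnlyColumnVector}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

// Build and populate an Arrow vector first (illustrative names/values).
val allocator = ArrowUtils.rootAllocator.newChildAllocator("example", 0, Long.MaxValue)
val vector = ArrowUtils.toArrowField("id", IntegerType, nullable = true)
  .createVector(allocator).asInstanceOf[NullableIntVector]
vector.allocateNew()
val mutator = vector.getMutator()
(0 until 5).foreach(i => mutator.setSafe(i, i))
mutator.setValueCount(5)

// Wrap the already-populated vector in a read-only ColumnarBatch; the row
// count is fixed up front because the data is already there.
val schema = StructType(Seq(StructField("id", IntegerType)))
val columnVector = new ArrowColumnVector(vector)
val batch = ColumnarBatch.createReadOnly(
  schema, Array[ReadOnlyColumnVector](columnVector), 5)
assert(batch.numRows() == 5)

// Release the Arrow memory when done, as in the test code below.
columnVector.close()
allocator.close()
```

Note how the refactoring supports this: the private constructor no longer allocates columns itself; allocation is factored into `allocateCols`, so `createReadOnly` can hand the constructor externally populated, Arrow-backed vectors instead.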
ArrowConvertersSuite.scala:

@@ -22,15 +22,18 @@ import java.sql.{Date, Timestamp}
 import java.text.SimpleDateFormat
 import java.util.Locale
 
+import scala.collection.JavaConverters._
+
 import com.google.common.io.Files
 import org.apache.arrow.memory.RootAllocator
-import org.apache.arrow.vector.{VectorLoader, VectorSchemaRoot}
+import org.apache.arrow.vector.{NullableIntVector, VectorLoader, VectorSchemaRoot}
 import org.apache.arrow.vector.file.json.JsonFileReader
 import org.apache.arrow.vector.util.Validator
 import org.scalatest.BeforeAndAfterAll
 
-import org.apache.spark.SparkException
+import org.apache.spark.{SparkException, TaskContext}
 import org.apache.spark.sql.{DataFrame, Row}
+import org.apache.spark.sql.execution.vectorized.{ArrowColumnVector, ColumnarBatch, ReadOnlyColumnVector}
 import org.apache.spark.sql.test.SharedSQLContext
 import org.apache.spark.sql.types.{BinaryType, IntegerType, StructField, StructType}
 import org.apache.spark.util.Utils

@@ -1629,6 +1632,42 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll {
     }
   }
 
+  test("roundtrip payloads") {
+    val allocator = ArrowUtils.rootAllocator.newChildAllocator("int", 0, Long.MaxValue)
+    val vector = ArrowUtils.toArrowField("int", IntegerType, nullable = true)
+      .createVector(allocator).asInstanceOf[NullableIntVector]
+    vector.allocateNew()
+    val mutator = vector.getMutator()
+
+    (0 until 10).foreach { i =>
+      mutator.setSafe(i, i)
+    }
+    mutator.setNull(10)
+    mutator.setValueCount(11)
+
+    val schema = StructType(Seq(StructField("int", IntegerType)))
+
+    val columnarBatch = ColumnarBatch.createReadOnly(
+      schema, Array[ReadOnlyColumnVector](new ArrowColumnVector(vector)), 11)
+
+    val context = TaskContext.empty()
+
+    val payloadIter = ArrowConverters.toPayloadIterator(
+      columnarBatch.rowIterator().asScala, schema, 0, context)
+
+    val (rowIter, schemaRead) = ArrowConverters.fromPayloadIterator(payloadIter, context)
+
+    assert(schema.equals(schemaRead))
+
+    rowIter.zipWithIndex.foreach { case (row, i) =>
+      if (i == 10) {
+        assert(row.isNullAt(0))
+      } else {
+        assert(row.getInt(0) == i)
+      }
+    }
+  }
+
   /** Test that a converted DataFrame to Arrow record batch equals batch read from JSON file */
   private def collectAndValidate(df: DataFrame, json: String, file: String): Unit = {
     // NOTE: coalesce to single partition because can only load 1 batch in validator
ColumnarBatchSuite.scala:

@@ -25,10 +25,13 @@ import scala.collection.JavaConverters._
 import scala.collection.mutable
 import scala.util.Random
 
+import org.apache.arrow.vector.NullableIntVector
+
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.memory.MemoryMode
 import org.apache.spark.sql.{RandomDataGenerator, Row}
 import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.execution.arrow.ArrowUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.Platform
 import org.apache.spark.unsafe.types.CalendarInterval

@@ -1248,4 +1251,55 @@ class ColumnarBatchSuite extends SparkFunSuite {
         s"vectorized reader"))
     }
   }
+
+  test("create read-only batch") {
+    val allocator = ArrowUtils.rootAllocator.newChildAllocator("int", 0, Long.MaxValue)
+    val vector1 = ArrowUtils.toArrowField("int1", IntegerType, nullable = true)
+      .createVector(allocator).asInstanceOf[NullableIntVector]
+    vector1.allocateNew()
+    val mutator1 = vector1.getMutator()
+    val vector2 = ArrowUtils.toArrowField("int2", IntegerType, nullable = true)
+      .createVector(allocator).asInstanceOf[NullableIntVector]
+    vector2.allocateNew()
+    val mutator2 = vector2.getMutator()
+
+    (0 until 10).foreach { i =>
+      mutator1.setSafe(i, i)
+      mutator2.setSafe(i + 1, i)
+    }
+    mutator1.setNull(10)
+    mutator1.setValueCount(11)
+    mutator2.setNull(0)
+    mutator2.setValueCount(11)
+
+    val columnVectors = Seq(new ArrowColumnVector(vector1), new ArrowColumnVector(vector2))
+
+    val schema = StructType(Seq(StructField("int1", IntegerType), StructField("int2", IntegerType)))
+    val batch = ColumnarBatch.createReadOnly(
+      schema, columnVectors.toArray[ReadOnlyColumnVector], 11)
+
+    assert(batch.numCols() == 2)
+    assert(batch.numRows() == 11)
+
+    val rowIter = batch.rowIterator().asScala
+    rowIter.zipWithIndex.foreach { case (row, i) =>
+      if (i == 10) {
+        assert(row.isNullAt(0))
+      } else {
+        assert(row.getInt(0) == i)
+      }
+      if (i == 0) {
+        assert(row.isNullAt(1))
+      } else {
+        assert(row.getInt(1) == i - 1)
+      }
+    }
+
+    intercept[java.lang.AssertionError] {
+      batch.getRow(100)
Review comments on this line:

Member: Hi, @BryanCutler and @ueshin.

Author: Hmm, that is strange. I'll take a look, thanks.

Member: Thanks! It seems to happen with Maven only; sbt-hadoop-2.6 passed.

Author: It's probably because the assert is being compiled out. This should probably not be in the test then.

Author: I think the problem is that if the Java assertion is compiled out, then no error is produced and the test fails.

Author: I just made #19098 to remove this check - it's not really testing the functionality added here anyway, but maybe another test should be added to check for index-out-of-bounds errors.

A sketch of how such a check could be guarded when JVM assertions are disabled follows the end of this diff.
+    }
+
+    columnVectors.foreach(_.close())
+    allocator.close()
+  }
 }
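Regarding the assertion discussion above: one possible follow-up (a sketch only; this is not what the PR or #19098 actually does) is to run the out-of-bounds expectation only when JVM assertions are enabled, since the Java `assert` that `getRow` relies on is elided unless the test JVM runs with `-ea`. This reuses `batch` from the test above:

```scala
// Sketch: only expect an AssertionError when assertions are enabled for
// ColumnarBatch; otherwise getRow(100) raises nothing and the intercept fails.
if (classOf[ColumnarBatch].desiredAssertionStatus()) {
  intercept[java.lang.AssertionError] {
    batch.getRow(100)
  }
}
```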
Review discussion on createReadOnly:

Comment: Do we need to restrict this to only ReadOnlyColumnVector?

Comment: Is it necessary? What impact will it cause?

Reply: It doesn't need to be restricted, but if they are ReadOnlyColumnVectors then it means they are already populated and it is safe to call setNumRows(numRows) here. If it took in any ColumnVector, it might cause issues by someone passing in unallocated vectors.