-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-8492] [SQL] support binaryType in UnsafeRow #6911
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
447dea0
6abfe93
22e4c0a
180b49d
98a964b
519f698
d68706f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -47,7 +47,8 @@ | |
| * In the `values` region, we store one 8-byte word per field. For fields that hold fixed-length | ||
| * primitive types, such as long, double, or int, we store the value directly in the word. For | ||
| * fields with non-primitive or variable-length values, we store a relative offset (w.r.t. the | ||
| * base address of the row) that points to the beginning of the variable-length field. | ||
| * base address of the row) that points to the beginning of the variable-length field, and its | ||
| * length in bytes (the offset and the length are combined into a single long). | ||
| * | ||
| * Instances of `UnsafeRow` act as pointers to row data stored in this format. | ||
| */ | ||
|
|
@@ -92,6 +93,7 @@ public static int calculateBitSetWidthInBytes(int numFields) { | |
| */ | ||
| public static final Set<DataType> readableFieldTypes; | ||
|
|
||
| // TODO: support DecimalType | ||
| static { | ||
| settableFieldTypes = Collections.unmodifiableSet( | ||
| new HashSet<DataType>( | ||
|
|
@@ -111,7 +113,8 @@ public static int calculateBitSetWidthInBytes(int numFields) { | |
| // We support get() on a superset of the types for which we support set(): | ||
| final Set<DataType> _readableFieldTypes = new HashSet<DataType>( | ||
| Arrays.asList(new DataType[]{ | ||
| StringType | ||
| StringType, | ||
| BinaryType | ||
| })); | ||
| _readableFieldTypes.addAll(settableFieldTypes); | ||
| readableFieldTypes = Collections.unmodifiableSet(_readableFieldTypes); | ||
|
|
@@ -221,11 +224,6 @@ public void setFloat(int ordinal, float value) { | |
| PlatformDependent.UNSAFE.putFloat(baseObject, getFieldOffset(ordinal), value); | ||
| } | ||
|
|
||
| @Override | ||
| public void setString(int ordinal, String value) { | ||
| throw new UnsupportedOperationException(); | ||
| } | ||
|
|
||
| @Override | ||
| public int size() { | ||
| return numFields; | ||
|
|
@@ -249,6 +247,8 @@ public Object get(int i) { | |
| return null; | ||
| } else if (dataType == StringType) { | ||
| return getUTF8String(i); | ||
| } else if (dataType == BinaryType) { | ||
| return getBinary(i); | ||
| } else { | ||
| throw new UnsupportedOperationException(); | ||
| } | ||
|
|
@@ -311,19 +311,23 @@ public double getDouble(int i) { | |
| } | ||
|
|
||
| public UTF8String getUTF8String(int i) { | ||
| return UTF8String.fromBytes(getBinary(i)); | ||
| } | ||
|
|
||
| public byte[] getBinary(int i) { | ||
| assertIndexIsValid(i); | ||
| final long offsetToStringSize = getLong(i); | ||
| final int stringSizeInBytes = | ||
| (int) PlatformDependent.UNSAFE.getLong(baseObject, baseOffset + offsetToStringSize); | ||
| final byte[] strBytes = new byte[stringSizeInBytes]; | ||
| final long offsetAndSize = getLong(i); | ||
| final int offset = (int)(offsetAndSize >> 32); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we need to mask out the upper 32 bits before converting to a long? I guess the uppermost bit probably can't be 1 because the offset can't be negative, so I guess we don't need to worry about sign-extension during the shift.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No. |
||
| final int size = (int)(offsetAndSize & ((1L << 32) - 1)); | ||
| final byte[] bytes = new byte[size]; | ||
| PlatformDependent.copyMemory( | ||
| baseObject, | ||
| baseOffset + offsetToStringSize + 8, // The `+ 8` is to skip past the size to get the data | ||
| strBytes, | ||
| baseOffset + offset, | ||
| bytes, | ||
| PlatformDependent.BYTE_ARRAY_OFFSET, | ||
| stringSizeInBytes | ||
| size | ||
| ); | ||
| return UTF8String.fromBytes(strBytes); | ||
| return bytes; | ||
| } | ||
|
|
||
| @Override | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,8 +17,6 @@ | |
|
|
||
| package org.apache.spark.sql.catalyst.expressions | ||
|
|
||
| import org.apache.spark.sql.catalyst.util.DateUtils | ||
| import org.apache.spark.sql.catalyst.InternalRow | ||
| import org.apache.spark.sql.types._ | ||
| import org.apache.spark.unsafe.PlatformDependent | ||
| import org.apache.spark.unsafe.array.ByteArrayMethods | ||
|
|
@@ -72,6 +70,19 @@ class UnsafeRowConverter(fieldTypes: Array[DataType]) { | |
| */ | ||
| def writeRow(row: InternalRow, baseObject: Object, baseOffset: Long): Int = { | ||
| unsafeRow.pointTo(baseObject, baseOffset, writers.length, null) | ||
|
|
||
| if (writers.length > 0) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is it possible to declare a row which has no columns? I'm just wondering if we ever run into a case where we need to worry about
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's possible to have a Row with no columns:
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think that we need to change this to
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Actually, since
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. EDIT: actually, I think this is fine, since UnsafeRow will allocate no space for the bitset for an empty row: https://github.com/davies/spark/blob/d68706fee5261d253278ff1d3af83a1a2846c5a7/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java#L82.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, I had checked that. |
||
| // zero-out the bitset | ||
| var n = writers.length / 64 | ||
| while (n >= 0) { | ||
| PlatformDependent.UNSAFE.putLong( | ||
| unsafeRow.getBaseObject, | ||
| unsafeRow.getBaseOffset + n * 8, | ||
| 0L) | ||
| n -= 1 | ||
| } | ||
| } | ||
|
|
||
| var fieldNumber = 0 | ||
| var appendCursor: Int = fixedLengthSize | ||
| while (fieldNumber < writers.length) { | ||
|
|
@@ -122,6 +133,7 @@ private object UnsafeColumnWriter { | |
| case FloatType => FloatUnsafeColumnWriter | ||
| case DoubleType => DoubleUnsafeColumnWriter | ||
| case StringType => StringUnsafeColumnWriter | ||
| case BinaryType => BinaryUnsafeColumnWriter | ||
| case DateType => IntUnsafeColumnWriter | ||
| case TimestampType => LongUnsafeColumnWriter | ||
| case t => | ||
|
|
@@ -141,6 +153,7 @@ private object LongUnsafeColumnWriter extends LongUnsafeColumnWriter | |
| private object FloatUnsafeColumnWriter extends FloatUnsafeColumnWriter | ||
| private object DoubleUnsafeColumnWriter extends DoubleUnsafeColumnWriter | ||
| private object StringUnsafeColumnWriter extends StringUnsafeColumnWriter | ||
| private object BinaryUnsafeColumnWriter extends BinaryUnsafeColumnWriter | ||
|
|
||
| private abstract class PrimitiveUnsafeColumnWriter extends UnsafeColumnWriter { | ||
| // Primitives don't write to the variable-length region: | ||
|
|
@@ -235,30 +248,47 @@ private class DoubleUnsafeColumnWriter private() extends PrimitiveUnsafeColumnWr | |
| } | ||
| } | ||
|
|
||
| private class StringUnsafeColumnWriter private() extends UnsafeColumnWriter { | ||
| private abstract class BytesUnsafeColumnWriter extends UnsafeColumnWriter { | ||
|
|
||
| def getBytes(source: InternalRow, column: Int): Array[Byte] | ||
|
|
||
| def getSize(source: InternalRow, column: Int): Int = { | ||
| val numBytes = source.get(column).asInstanceOf[UTF8String].getBytes.length | ||
| 8 + ByteArrayMethods.roundNumberOfBytesToNearestWord(numBytes) | ||
| val numBytes = getBytes(source, column).length | ||
| ByteArrayMethods.roundNumberOfBytesToNearestWord(numBytes) | ||
| } | ||
|
|
||
| override def write( | ||
| source: InternalRow, | ||
| target: UnsafeRow, | ||
| column: Int, | ||
| appendCursor: Int): Int = { | ||
| val value = source.get(column).asInstanceOf[UTF8String] | ||
| val baseObject = target.getBaseObject | ||
| val baseOffset = target.getBaseOffset | ||
| val numBytes = value.getBytes.length | ||
| PlatformDependent.UNSAFE.putLong(baseObject, baseOffset + appendCursor, numBytes) | ||
| val offset = target.getBaseOffset + appendCursor | ||
| val bytes = getBytes(source, column) | ||
| val numBytes = bytes.length | ||
| if ((numBytes & 0x07) > 0) { | ||
| // zero-out the padding bytes | ||
| PlatformDependent.UNSAFE.putLong(target.getBaseObject, offset + ((numBytes >> 3) << 3), 0L) | ||
| } | ||
| PlatformDependent.copyMemory( | ||
| value.getBytes, | ||
| bytes, | ||
| PlatformDependent.BYTE_ARRAY_OFFSET, | ||
| baseObject, | ||
| baseOffset + appendCursor + 8, | ||
| target.getBaseObject, | ||
| offset, | ||
| numBytes | ||
| ) | ||
| target.setLong(column, appendCursor) | ||
| 8 + ByteArrayMethods.roundNumberOfBytesToNearestWord(numBytes) | ||
| target.setLong(column, (appendCursor.toLong << 32L) | numBytes.toLong) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I guess that |
||
| ByteArrayMethods.roundNumberOfBytesToNearestWord(numBytes) | ||
| } | ||
| } | ||
|
|
||
| private class StringUnsafeColumnWriter private() extends BytesUnsafeColumnWriter { | ||
| def getBytes(source: InternalRow, column: Int): Array[Byte] = { | ||
| source.getAs[UTF8String](column).getBytes | ||
| } | ||
| } | ||
|
|
||
| private class BinaryUnsafeColumnWriter private() extends BytesUnsafeColumnWriter { | ||
| def getBytes(source: InternalRow, column: Int): Array[Byte] = { | ||
| source.getAs[Array[Byte]](column) | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think that it's safe to remove this zeroing as long as we assume that a) every column will actually end up writing to the row, b) for null columns, we zero out the fixed-length section, and c) we zero the bitset when starting to write the row. All three of these assumptions seem to hold, so this seems fine.