-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-21271][SQL] Ensure Unsafe.sizeInBytes is a multiple of 8 #18503
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
eb87c63
6cde32f
ccc820f
3bd72c9
8a7a948
7f5a269
762f02a
0159701
54be80e
cc467de
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -167,6 +167,7 @@ public UnsafeRow() {} | |
| */ | ||
| public void pointTo(Object baseObject, long baseOffset, int sizeInBytes) { | ||
| assert numFields >= 0 : "numFields (" + numFields + ") should >= 0"; | ||
| assert sizeInBytes % 8 == 0 : "sizeInBytes (" + sizeInBytes + ") should be a multiple of 8"; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we only need the assertion here, in
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, done. |
||
| this.baseObject = baseObject; | ||
| this.baseOffset = baseOffset; | ||
| this.sizeInBytes = sizeInBytes; | ||
|
|
@@ -183,6 +184,7 @@ public void pointTo(byte[] buf, int sizeInBytes) { | |
| } | ||
|
|
||
| public void setTotalSize(int sizeInBytes) { | ||
| assert sizeInBytes % 8 == 0 : "sizeInBytes (" + sizeInBytes + ") should be a multiple of 8"; | ||
| this.sizeInBytes = sizeInBytes; | ||
| } | ||
|
|
||
|
|
@@ -538,6 +540,7 @@ public void copyFrom(UnsafeRow row) { | |
| row.baseObject, row.baseOffset, this.baseObject, this.baseOffset, row.sizeInBytes); | ||
| // update the sizeInBytes. | ||
| this.sizeInBytes = row.sizeInBytes; | ||
| assert sizeInBytes % 8 == 0 : "sizeInBytes (" + sizeInBytes + ") should be a multiple of 8"; | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -664,6 +667,7 @@ public void writeExternal(ObjectOutput out) throws IOException { | |
| public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException { | ||
| this.baseOffset = BYTE_ARRAY_OFFSET; | ||
| this.sizeInBytes = in.readInt(); | ||
| assert sizeInBytes % 8 == 0 : "sizeInBytes (" + sizeInBytes + ") should be a multiple of 8"; | ||
| this.numFields = in.readInt(); | ||
| this.bitSetWidthInBytes = calculateBitSetWidthInBytes(numFields); | ||
| this.baseObject = new byte[sizeInBytes]; | ||
|
|
@@ -682,6 +686,7 @@ public void write(Kryo kryo, Output out) { | |
| public void read(Kryo kryo, Input in) { | ||
| this.baseOffset = BYTE_ARRAY_OFFSET; | ||
| this.sizeInBytes = in.readInt(); | ||
| assert sizeInBytes % 8 == 0 : "sizeInBytes (" + sizeInBytes + ") should be a multiple of 8"; | ||
| this.numFields = in.readInt(); | ||
| this.bitSetWidthInBytes = calculateBitSetWidthInBytes(numFields); | ||
| this.baseObject = new byte[sizeInBytes]; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -350,20 +350,24 @@ private[state] class HDFSBackedStateStoreProvider extends StateStoreProvider wit | |
| throw new IOException( | ||
| s"Error reading delta file $fileToRead of $this: key size cannot be $keySize") | ||
| } else { | ||
| val keyRowBuffer = new Array[Byte](keySize) | ||
| // If key size in an existing file is not a multiple of 8, round it to multiple of 8 | ||
|
||
| val keyAllocationSize = ((keySize + 7) / 8) * 8 | ||
| val keyRowBuffer = new Array[Byte](keyAllocationSize) | ||
|
||
| ByteStreams.readFully(input, keyRowBuffer, 0, keySize) | ||
|
|
||
| val keyRow = new UnsafeRow(keySchema.fields.length) | ||
| keyRow.pointTo(keyRowBuffer, keySize) | ||
| keyRow.pointTo(keyRowBuffer, keyAllocationSize) | ||
|
|
||
| val valueSize = input.readInt() | ||
| if (valueSize < 0) { | ||
| map.remove(keyRow) | ||
| } else { | ||
| val valueRowBuffer = new Array[Byte](valueSize) | ||
| // If value size in an existing file is not a multiple of 8, round it to multiple of 8 | ||
| val valueAllocationSize = ((valueSize + 7) / 8) * 8 | ||
| val valueRowBuffer = new Array[Byte](valueAllocationSize) | ||
| ByteStreams.readFully(input, valueRowBuffer, 0, valueSize) | ||
| val valueRow = new UnsafeRow(valueSchema.fields.length) | ||
| valueRow.pointTo(valueRowBuffer, valueSize) | ||
| valueRow.pointTo(valueRowBuffer, valueAllocationSize) | ||
| map.put(keyRow, valueRow) | ||
| } | ||
| } | ||
|
|
@@ -413,21 +417,25 @@ private[state] class HDFSBackedStateStoreProvider extends StateStoreProvider wit | |
| throw new IOException( | ||
| s"Error reading snapshot file $fileToRead of $this: key size cannot be $keySize") | ||
| } else { | ||
| val keyRowBuffer = new Array[Byte](keySize) | ||
| // If key size in an existing file is not a multiple of 8, round it to multiple of 8 | ||
| val keyAllocationSize = ((keySize + 7) / 8) * 8 | ||
| val keyRowBuffer = new Array[Byte](keyAllocationSize) | ||
| ByteStreams.readFully(input, keyRowBuffer, 0, keySize) | ||
|
|
||
| val keyRow = new UnsafeRow(keySchema.fields.length) | ||
| keyRow.pointTo(keyRowBuffer, keySize) | ||
| keyRow.pointTo(keyRowBuffer, keyAllocationSize) | ||
|
|
||
| val valueSize = input.readInt() | ||
| if (valueSize < 0) { | ||
| throw new IOException( | ||
| s"Error reading snapshot file $fileToRead of $this: value size cannot be $valueSize") | ||
| } else { | ||
| val valueRowBuffer = new Array[Byte](valueSize) | ||
| // If value size in an existing file is not a multiple of 8, round it to multiple of 8 | ||
| val valueAllocationSize = ((valueSize + 7) / 8) * 8 | ||
|
||
| val valueRowBuffer = new Array[Byte](valueAllocationSize) | ||
| ByteStreams.readFully(input, valueRowBuffer, 0, valueSize) | ||
| val valueRow = new UnsafeRow(valueSchema.fields.length) | ||
| valueRow.pointTo(valueRowBuffer, valueSize) | ||
| valueRow.pointTo(valueRowBuffer, valueAllocationSize) | ||
| map.put(keyRow, valueRow) | ||
| } | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -479,6 +479,61 @@ class StreamSuite extends StreamTest { | |
| CheckAnswer((1, 2), (2, 2), (3, 2))) | ||
| } | ||
|
|
||
| testQuietly("store to and recover from a checkpoint") { | ||
|
||
| val checkpointDir = Utils.createTempDir(namePrefix = "stream.checkpoint").getCanonicalPath | ||
|
|
||
| def query(data: MemoryStream[Int], checkpointDir: String, queryName: String): | ||
| DataStreamWriter[Row] = { | ||
| data.toDF | ||
| .groupBy($"value") | ||
| .agg(count("*")) | ||
| .writeStream | ||
| .outputMode("complete") | ||
| .option("checkpointLocation", checkpointDir) | ||
| .format("memory") | ||
| .queryName(queryName) | ||
| } | ||
|
|
||
| withSQLConf( | ||
| SQLConf.SHUFFLE_PARTITIONS.key -> "10") { | ||
| var writeQuery: StreamingQuery = null | ||
| try { | ||
| val data = MemoryStream[Int] | ||
| writeQuery = query(data, checkpointDir, "write").start() | ||
|
|
||
| data.addData(1, 2, 3, 4) | ||
| writeQuery.processAllAvailable() | ||
| data.addData(3, 4, 5, 6) | ||
| writeQuery.processAllAvailable() | ||
| data.addData(5, 6, 7, 8) | ||
| writeQuery.processAllAvailable() | ||
| } finally { | ||
| assert(writeQuery != null) | ||
| writeQuery.stop() | ||
| } | ||
|
|
||
| var restartQuery: StreamingQuery = null | ||
| try { | ||
| val data = MemoryStream[Int] | ||
| data.addData(1, 2, 3, 4) | ||
| data.addData(3, 4, 5, 6) | ||
| data.addData(5, 6, 7, 8) | ||
|
|
||
| restartQuery = query(data, checkpointDir, "counts").start() | ||
| restartQuery.processAllAvailable() | ||
| data.addData(9) | ||
| restartQuery.processAllAvailable() | ||
|
|
||
| QueryTest.checkAnswer(spark.table("counts").toDF, | ||
| Row("1", 1) :: Row("2", 1) :: Row("3", 2) :: Row("4", 2) :: | ||
| Row("5", 2) :: Row("6", 2) :: Row("7", 1) :: Row("8", 1) :: Row("9", 1) :: Nil) | ||
| } finally { | ||
| assert(restartQuery != null) | ||
| restartQuery.stop() | ||
| } | ||
| } | ||
| } | ||
|
|
||
| testQuietly("recover from a Spark v2.1 checkpoint") { | ||
| var inputData: MemoryStream[Int] = null | ||
| var query: DataStreamWriter[Row] = null | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm wondering why we did this before. Was it a mistake?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have the same question.
@sameeragarwal had a similar question one year ago. However, there was no response from @ooq
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I recall it being intentional.
See discussion here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@ooq thank you for pointing out interesting discussion.
This discussion seems to make sense for page management. The question from @cloud-fan and me is whether
`valueRow` uses only `vlen`. I think that the `+4` is for page management.