[SPARK-25299] Use the shuffle writer plugin for the SortShuffleWriter. #532
Changes from all commits
85ed3fd
1209cc8
5d15c88
666b679
f4f3eb8
967e55f
core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala
| Original file line number | Diff line number | Diff line change |
|---|---|---|
@@ -26,10 +26,11 @@ import scala.collection.mutable.ArrayBuffer | |
| import com.google.common.io.ByteStreams | ||
|
|
||
| import org.apache.spark._ | ||
| import org.apache.spark.api.shuffle.{ShuffleMapOutputWriter, ShufflePartitionWriter} | ||
| import org.apache.spark.executor.ShuffleWriteMetrics | ||
| import org.apache.spark.internal.{config, Logging} | ||
| import org.apache.spark.serializer._ | ||
| import org.apache.spark.storage.{BlockId, DiskBlockObjectWriter} | ||
| import org.apache.spark.storage.{BlockId, DiskBlockObjectWriter, ShuffleBlockId} | ||
|
|
||
| /** | ||
| * Sorts and potentially merges a number of key-value pairs of type (K, V) to produce key-combiner | ||
|
|
@@ -674,11 +675,9 @@ private[spark] class ExternalSorter[K, V, C]( | |
| } | ||
|
|
||
| /** | ||
| * Write all the data added into this ExternalSorter into a file in the disk store. This is | ||
| * called by the SortShuffleWriter. | ||
| * | ||
| * @param blockId block ID to write to. The index file will be blockId.name + ".index". | ||
| * @return array of lengths, in bytes, of each partition of the file (used by map output tracker) | ||
| * TODO remove this, as this is only used by UnsafeRowSerializerSuite in the SQL project. | ||
| * We should figure out an alternative way to test that so that we can remove this otherwise | ||
| * unused code path. | ||
| */ | ||
| def writePartitionedFile( | ||
| blockId: BlockId, | ||
|
|
@@ -722,6 +721,123 @@ private[spark] class ExternalSorter[K, V, C]( | |
| lengths | ||
| } | ||
|
|
||
| private def writeEmptyPartition(mapOutputWriter: ShuffleMapOutputWriter): Unit = { | ||
| var partitionWriter: ShufflePartitionWriter = null | ||
| try { | ||
| partitionWriter = mapOutputWriter.getNextPartitionWriter | ||
| } finally { | ||
| if (partitionWriter != null) { | ||
| partitionWriter.close() | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Write all the data added into this ExternalSorter into a map output writer that pushes bytes | ||
| * to some arbitrary backing store. This is called by the SortShuffleWriter. | ||
| * | ||
| * @return array of lengths, in bytes, of each partition of the file (used by map output tracker) | ||
| */ | ||
| def writePartitionedMapOutput( | ||
| shuffleId: Int, mapId: Int, mapOutputWriter: ShuffleMapOutputWriter): Array[Long] = { | ||
| // Track location of each range in the map output | ||
| val lengths = new Array[Long](numPartitions) | ||
| var nextPartitionId = 0 | ||
| if (spills.isEmpty) { | ||
| // Case where we only have in-memory data | ||
| val collection = if (aggregator.isDefined) map else buffer | ||
| val it = collection.destructiveSortedWritablePartitionedIterator(comparator) | ||
| while (it.hasNext()) { | ||
| val partitionId = it.nextPartition() | ||
| // The contract for the plugin is that we will ask for a writer for every partition | ||
| // even if it's empty. However, the external sorter will return non-contiguous | ||
| // partition ids. So this loop "backfills" the empty partitions that form the gaps. | ||
|
|
||
| // The algorithm as a whole is correct because the partition ids are returned by the | ||
| // iterator in ascending order. | ||
| for (emptyPartition <- nextPartitionId until partitionId) { | ||
| writeEmptyPartition(mapOutputWriter) | ||
| } | ||
| var partitionWriter: ShufflePartitionWriter = null | ||
| var partitionPairsWriter: ShufflePartitionPairsWriter = null | ||
| try { | ||
| partitionWriter = mapOutputWriter.getNextPartitionWriter | ||
| val blockId = ShuffleBlockId(shuffleId, mapId, partitionId) | ||
| partitionPairsWriter = new ShufflePartitionPairsWriter( | ||
| partitionWriter, | ||
| serializerManager, | ||
| serInstance, | ||
| blockId, | ||
| context.taskMetrics().shuffleWriteMetrics) | ||
| while (it.hasNext && it.nextPartition() == partitionId) { | ||
| it.writeNext(partitionPairsWriter) | ||
| } | ||
| } finally { | ||
| if (partitionPairsWriter != null) { | ||
| partitionPairsWriter.close() | ||
| } | ||
| if (partitionWriter != null) { | ||
| partitionWriter.close() | ||
| } | ||
| } | ||
| if (partitionWriter != null) { | ||
| lengths(partitionId) = partitionWriter.getNumBytesWritten | ||
| } | ||
| nextPartitionId = partitionId + 1 | ||
| } | ||
| } else { | ||
| // We must perform merge-sort; get an iterator by partition and write everything directly. | ||
| for ((id, elements) <- this.partitionedIterator) { | ||
| // The contract for the plugin is that we will ask for a writer for every partition | ||
| // even if it's empty. However, the external sorter will return non-contiguous | ||
| // partition ids. So this loop "backfills" the empty partitions that form the gaps. | ||
|
|
||
| // The algorithm as a whole is correct because the partition ids are returned by the | ||
| // iterator in ascending order. | ||
| for (emptyPartition <- nextPartitionId until id) { | ||
| writeEmptyPartition(mapOutputWriter) | ||
| } | ||
| val blockId = ShuffleBlockId(shuffleId, mapId, id) | ||
| var partitionWriter: ShufflePartitionWriter = null | ||
| var partitionPairsWriter: ShufflePartitionPairsWriter = null | ||
| try { | ||
| partitionWriter = mapOutputWriter.getNextPartitionWriter | ||
| partitionPairsWriter = new ShufflePartitionPairsWriter( | ||
| partitionWriter, | ||
| serializerManager, | ||
| serInstance, | ||
| blockId, | ||
| context.taskMetrics().shuffleWriteMetrics) | ||
| if (elements.hasNext) { | ||
| for (elem <- elements) { | ||
| partitionPairsWriter.write(elem._1, elem._2) | ||
| } | ||
| } | ||
| } finally { | ||
| if (partitionPairsWriter != null) { | ||
| partitionPairsWriter.close() | ||
| } | ||
| } | ||
| if (partitionWriter != null) { | ||
| lengths(id) = partitionWriter.getNumBytesWritten | ||
| } | ||
| nextPartitionId = id + 1 | ||
| } | ||
| } | ||
|
|
||
| // The iterator may have stopped short of opening a writer for every partition. So fill in the | ||
| // remaining empty partitions. | ||
| for (emptyPartition <- nextPartitionId until numPartitions) { | ||
| writeEmptyPartition(mapOutputWriter) | ||
|
Review comment: Hmm wait, why do we need this? Shouldn't …

Author: It depends on the contract we want to present to other plugin writers. That is, do we make a contract that we open a writer for strictly every partition, even empty ones? Or do we say we open writers only for the first N partitions, where N is the last non-empty partition? My take is that we should have the contract that we always open a writer for every partition, empty or not, from 0 through numPartitions - 1. But, again, this shows the limitation of presenting an API that doesn't include the partition identifier explicitly when getting partition writers.

Review comment: Ohh, I see. Hmm, yeah, I'm OK keeping it like this then. It does show more consistency for plugin implementers (see the sketch after this file's diff).
||
| } | ||
|
|
||
| context.taskMetrics().incMemoryBytesSpilled(memoryBytesSpilled) | ||
| context.taskMetrics().incDiskBytesSpilled(diskBytesSpilled) | ||
| context.taskMetrics().incPeakExecutionMemory(peakMemoryUsedBytes) | ||
|
|
||
| lengths | ||
| } | ||
|
|
||
| def stop(): Unit = { | ||
| spills.foreach(s => s.file.delete()) | ||
| spills.clear() | ||
|
|
@@ -785,7 +901,7 @@ private[spark] class ExternalSorter[K, V, C]( | |
| val inMemoryIterator = new WritablePartitionedIterator { | ||
| private[this] var cur = if (upstream.hasNext) upstream.next() else null | ||
|
|
||
| def writeNext(writer: DiskBlockObjectWriter): Unit = { | ||
| def writeNext(writer: PairsWriter): Unit = { | ||
| writer.write(cur._1._2, cur._2) | ||
| cur = if (upstream.hasNext) upstream.next() else null | ||
| } | ||
|
|
||
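The backfill behavior in writePartitionedMapOutput, and the contract settled in the review thread above, are easier to see in isolation. The following self-contained sketch is illustrative only and is not code from this PR: records arrive grouped by ascending partition id, possibly with gaps, and the loop still asks for exactly one writer per partition, including the empty partitions before, between, and after the populated ones. Here openPartition stands in for ShuffleMapOutputWriter.getNextPartitionWriter and a ByteArrayOutputStream stands in for a partition writer's stream.

```scala
import java.io.ByteArrayOutputStream

object PartitionBackfillSketch {
  // recordsByPartition yields (partitionId, payload) pairs sorted by ascending
  // partition id, possibly skipping some partitions entirely. openPartition
  // stands in for ShuffleMapOutputWriter.getNextPartitionWriter: it must be
  // called exactly once per partition, in order, even for empty partitions.
  def writeAll(
      numPartitions: Int,
      recordsByPartition: Iterator[(Int, Array[Byte])],
      openPartition: () => ByteArrayOutputStream): Array[Long] = {
    val lengths = new Array[Long](numPartitions)
    var nextPartitionId = 0
    val records = recordsByPartition.buffered
    while (records.hasNext) {
      val partitionId = records.head._1
      // Backfill empty partitions between the last partition written and this one.
      (nextPartitionId until partitionId).foreach(_ => openPartition())
      val out = openPartition()
      while (records.hasNext && records.head._1 == partitionId) {
        out.write(records.next()._2)
      }
      lengths(partitionId) = out.size().toLong
      nextPartitionId = partitionId + 1
    }
    // The input may stop short of the last partition; open (empty) writers for
    // the rest so that every partition from 0 to numPartitions - 1 is covered.
    (nextPartitionId until numPartitions).foreach(_ => openPartition())
    lengths
  }

  def main(args: Array[String]): Unit = {
    var opened = 0
    val lengths = writeAll(
      numPartitions = 6,
      recordsByPartition = Iterator(1 -> Array[Byte](1, 2), 4 -> Array[Byte](3)),
      openPartition = () => { opened += 1; new ByteArrayOutputStream() })
    // Prints: writers opened: 6, lengths: 0,2,0,0,1,0
    println(s"writers opened: $opened, lengths: ${lengths.mkString(",")}")
  }
}
```

Even though only partitions 1 and 4 carry data, six writers are opened, which is exactly the consistency the review thread argues plugin implementers should be able to rely on.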
core/src/main/scala/org/apache/spark/util/collection/PairsWriter.scala (new file)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.util.collection | ||
|
|
||
| private[spark] trait PairsWriter { | ||
|
|
||
| def write(key: Any, value: Any): Unit | ||
| } |
core/src/main/scala/org/apache/spark/util/collection/ShufflePartitionPairsWriter.scala (new file)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,105 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.util.collection | ||
|
|
||
| import java.io.{Closeable, FilterOutputStream, OutputStream} | ||
|
|
||
| import org.apache.spark.api.shuffle.ShufflePartitionWriter | ||
| import org.apache.spark.serializer.{SerializationStream, SerializerInstance, SerializerManager} | ||
| import org.apache.spark.shuffle.ShuffleWriteMetricsReporter | ||
| import org.apache.spark.storage.BlockId | ||
|
|
||
| /** | ||
| * A key-value writer inspired by {@link DiskBlockObjectWriter} that pushes the bytes to an | ||
| * arbitrary partition writer instead of writing to local disk through the block manager. | ||
| */ | ||
| private[spark] class ShufflePartitionPairsWriter( | ||
| partitionWriter: ShufflePartitionWriter, | ||
| serializerManager: SerializerManager, | ||
| serializerInstance: SerializerInstance, | ||
| blockId: BlockId, | ||
| writeMetrics: ShuffleWriteMetricsReporter) | ||
| extends PairsWriter with Closeable { | ||
|
|
||
| private var isOpen = false | ||
| private var partitionStream: OutputStream = _ | ||
| private var wrappedStream: OutputStream = _ | ||
| private var objOut: SerializationStream = _ | ||
| private var numRecordsWritten = 0 | ||
| private var curNumBytesWritten = 0L | ||
|
|
||
| override def write(key: Any, value: Any): Unit = { | ||
| if (!isOpen) { | ||
| open() | ||
| isOpen = true | ||
| } | ||
| objOut.writeKey(key) | ||
| objOut.writeValue(value) | ||
| writeMetrics.incRecordsWritten(1) | ||
| } | ||
|
|
||
| private def open(): Unit = { | ||
| // The contract is that the partition writer is expected to close its own streams, but | ||
| // the compressor will only flush the stream when it is specifically closed. So we want to | ||
| // close objOut to flush the compressed bytes to the partition writer stream, but we don't want | ||
| // to close the partition output stream in the process. | ||
| partitionStream = new CloseShieldOutputStream(partitionWriter.toStream) | ||
| wrappedStream = serializerManager.wrapStream(blockId, partitionStream) | ||
| objOut = serializerInstance.serializeStream(wrappedStream) | ||
| } | ||
|
|
||
| override def close(): Unit = { | ||
| if (isOpen) { | ||
| // Closing objOut should propagate close to all inner layers | ||
| // We can't close wrappedStream explicitly because closing objOut and closing wrappedStream | ||
| // causes problems when closing compressed output streams twice. | ||
| objOut.close() | ||
| objOut = null | ||
| wrappedStream = null | ||
| partitionStream = null | ||
| partitionWriter.close() | ||
| isOpen = false | ||
| updateBytesWritten() | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Notify the writer that a record worth of bytes has been written with OutputStream#write. | ||
| */ | ||
| private def recordWritten(): Unit = { | ||
| numRecordsWritten += 1 | ||
| writeMetrics.incRecordsWritten(1) | ||
|
|
||
| if (numRecordsWritten % 16384 == 0) { | ||
| updateBytesWritten() | ||
| } | ||
| } | ||
|
|
||
| private def updateBytesWritten(): Unit = { | ||
| val numBytesWritten = partitionWriter.getNumBytesWritten | ||
| val bytesWrittenDiff = numBytesWritten - curNumBytesWritten | ||
| writeMetrics.incBytesWritten(bytesWrittenDiff) | ||
| curNumBytesWritten = numBytesWritten | ||
| } | ||
|
|
||
| private class CloseShieldOutputStream(delegate: OutputStream) | ||
| extends FilterOutputStream(delegate) { | ||
|
|
||
| override def close(): Unit = flush() | ||
| } | ||
| } |
Review comment: Would you need to call partitionWriter.toStream(), as in UnsafeShuffleWriter, to ensure that the outputFileStream is created and an empty file exists? It seems to be expected by UnsafeShuffleWriterSuite; I don't know if it is the same here.

Author: That shouldn't be necessary; the writer.close() should properly know what to do if a stream was never created. (A sketch of the close-shielding behavior in question follows below.)
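To make the close-ordering point above concrete, here is a small, self-contained sketch of the close-shielding pattern that ShufflePartitionPairsWriter.open() and close() rely on. It is illustrative only, not code from this PR: a GZIPOutputStream stands in for the wrapped serializer/compressor stack and a ByteArrayOutputStream stands in for ShufflePartitionWriter.toStream. Closing the outer stream forces the compressor to flush everything it has buffered, while the shield keeps the underlying partition stream open for its owner to close.

```scala
import java.io.{ByteArrayOutputStream, FilterOutputStream, OutputStream}
import java.util.zip.GZIPOutputStream

object CloseShieldSketch {
  // Same idea as the private CloseShieldOutputStream above: turn close() into
  // flush() so the compressor on top can be closed (forcing it to emit its
  // buffered bytes and trailer) without closing the stream underneath.
  class CloseShield(delegate: OutputStream) extends FilterOutputStream(delegate) {
    override def close(): Unit = flush()
  }

  def main(args: Array[String]): Unit = {
    val partitionStream = new ByteArrayOutputStream() // stand-in for ShufflePartitionWriter.toStream
    val compressed = new GZIPOutputStream(new CloseShield(partitionStream))

    compressed.write("some shuffle bytes".getBytes("UTF-8"))
    // Closing the compressor is what flushes its trailer into the partition
    // stream; skipping this close would leave the payload truncated.
    compressed.close()

    // The shield swallowed the close, so the partition stream is still open
    // (and now holds the complete compressed payload) for its owner to close.
    println(s"bytes in partition stream: ${partitionStream.size()}")
  }
}
```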