[SPARK-31801][API][SHUFFLE] Register map output metadata #28618
@@ -0,0 +1,37 @@
```java
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.shuffle.api.metadata;

/**
 * An implementation of shuffle output tracking that does not keep track of any shuffle state.
 */
public class NoOpShuffleOutputTracker implements ShuffleOutputTracker {

  @Override
  public void registerShuffle(int shuffleId) {}

  @Override
  public void unregisterShuffle(int shuffleId, boolean blocking) {}

  @Override
  public void registerMapOutput(
      int shuffleId, int mapIndex, long mapId, MapOutputMetadata mapOutputMetadata) {}

  @Override
  public void removeMapOutput(int shuffleId, int mapIndex, long mapId) {}
}
```
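For orientation, here is a hedged sketch of how a caller would drive a tracker through its lifecycle, exercised against the no-op implementation above. The example class, the numeric ids, and the `null` metadata placeholder are illustrative, not code from the patch:

```java
import org.apache.spark.shuffle.api.metadata.MapOutputMetadata;
import org.apache.spark.shuffle.api.metadata.NoOpShuffleOutputTracker;
import org.apache.spark.shuffle.api.metadata.ShuffleOutputTracker;

public final class TrackerLifecycleExample {
  public static void main(String[] args) {
    ShuffleOutputTracker tracker = new NoOpShuffleOutputTracker();

    // 1. A shuffle stage is registered before any of its map outputs arrive.
    tracker.registerShuffle(0);

    // 2. Each completed map task hands its output metadata to the tracker.
    //    The no-op tracker discards it; a real plugin would record it here.
    MapOutputMetadata metadata = null; // placeholder for a writer-provided value
    tracker.registerMapOutput(0, /* mapIndex */ 0, /* mapId */ 7L, metadata);

    // 3. A single map output can be discarded, e.g. after a fetch failure.
    tracker.removeMapOutput(0, /* mapIndex */ 0, /* mapId */ 7L);

    // 4. The whole shuffle is unregistered once no tasks will use it again.
    tracker.unregisterShuffle(0, /* blocking */ true);
  }
}
```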
@@ -0,0 +1,82 @@
```java
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.shuffle.api.metadata;

/**
 * :: Private ::
 *
 * A plugin that can monitor the storage of shuffle data from map tasks, and can provide
 * metadata to shuffle readers to aid their reading of shuffle blocks in reduce tasks.
```

> **Contributor:** Assume this is on the driver; might be nice just to mention.

```java
 * <p>
 * {@link MapOutputMetadata} instances provided from the plugin tree's implementation of
 * {@link org.apache.spark.shuffle.api.ShuffleMapOutputWriter} are sent to this module's map output
 * metadata registration method in {@link #registerMapOutput(int, int, long, MapOutputMetadata)}.
 * <p>
 * Implementations MUST be thread-safe. Spark will invoke methods in this module in parallel.
```
> **Contributor:** Add a note about what the locking semantics are when the methods are invoked?
>
> **Contributor (author):** I'm not sure what can be added. Can you give an example of guidance we could give to developers?
>
> **Contributor:** As an example, … Additionally, it also informs users about how to design their implementations; for example, whether an implementation can make RPC calls (or other blocking calls) from … (I just picked some API calls to illustrate; we would need to elaborate for each.)
>
> **Contributor (author):** Is it sufficient to simply say that all APIs need to have exclusive access from each other, at least locking on the shuffle id level?
>
> **Contributor:** …
>
> **Contributor (author):** Again, that's up to the implementation to handle, no?
>
> **Contributor:** I was not referring to how implementations manage their own state/locking (if they have additional state or coordination required, you are right, it has to be handled there; state managed across shuffles is an example for some custom implementation). I was referring to what guarantees and expectations implementations have from Spark. For example, as mentioned above, …
>
> **Contributor (author):** The whole point of just saying that implementations must be thread-safe is that implementations cannot make any guarantees about how the caller will invoke this API. Even if we could describe how the current implementation invokes these methods, that would not be a contract. I think saying that there is no guarantee about concurrent access to all methods suffices. But then that basically means "the implementation should be thread-safe". Is that reasonable, or do we need to be more granular than that?
```java
 * <p>
 * A singleton instance of this module is instantiated on the driver via
 * {@link ShuffleDriverComponents#shuffleOutputTracker()}.
 */
public interface ShuffleOutputTracker {

  /**
   * Called when a new shuffle stage is going to be run.
   *
   * @param shuffleId the unique identifier for the new shuffle stage
   */
  void registerShuffle(int shuffleId);
```
> **Contributor:** A callback to signal completion (or failure) of shuffle for a …
>
> **Contributor (author):** I'm hesitant to make significant API changes given how late this is in the review stage. Let's add it as a follow-up if there are concrete use cases that require it down the road.
>
> **Contributor:** Looking at the currently designed API, this is a missing gap.
>
> **Contributor (author):** Again, I don't want to add it here, since it would require further integration of such a new API call in the rest of the Spark codebase, which I am holding as out of scope for this patch.
>
> **Contributor:** This is a gap in the proposed API, which limits how effectively it can be leveraged for other shuffle use cases.
>
> **Contributor (author):** This API isn't complete as of this patch anyway, so I'd prefer that functionality be deferred; the complexity being added here is already pretty significant. Can we file a follow-up JIRA for it and go from there?
```java
  /**
   * Called when the shuffle with the given id is unregistered because it will no longer
   * be used by Spark tasks.
   *
   * @param shuffleId the unique identifier for the shuffle stage to be unregistered
   * @param blocking  whether this call should block until the shuffle state is removed
   */
  void unregisterShuffle(int shuffleId, boolean blocking);

  /**
   * Called when a map task completes, and the map output writer has provided metadata to be
   * persisted by this shuffle output tracker.
   *
   * @param shuffleId         the unique identifier for the shuffle stage that the map task
   *                          is a part of
   * @param mapIndex          the map index of the map task in its shuffle map stage - not
   *                          necessarily unique across multiple attempts of this task
   * @param mapId             the identifier for this map task, which is unique even across
   *                          multiple attempts at this task
   * @param mapOutputMetadata metadata about the map output data's storage returned by the
   *                          map task's writer
   */
  void registerMapOutput(
      int shuffleId, int mapIndex, long mapId, MapOutputMetadata mapOutputMetadata);

  /**
   * Called when the given map output is discarded, and will no longer be used in future
   * Spark shuffles.
   *
   * @param shuffleId the unique identifier for the shuffle stage that the map task is a
   *                  part of
   * @param mapIndex  the map index of the map task which is having its output discarded -
   *                  not necessarily unique across multiple attempts of this task
   * @param mapId     the identifier for the map task which is having its output discarded,
   *                  which is unique even across multiple attempts at this task
   */
  void removeMapOutput(int shuffleId, int mapIndex, long mapId);
}
```
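Since the javadoc requires thread safety without pinning down caller behavior, a concrete sketch may help. The following is a hedged illustration, not part of this patch: the `InMemoryShuffleOutputTracker` name and its internal map layout are assumptions. It uses `ConcurrentHashMap.compute`, which runs atomically per key, to get the shuffle-id-level exclusivity discussed in the review thread above without a global lock:

```java
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.spark.shuffle.api.metadata.MapOutputMetadata;
import org.apache.spark.shuffle.api.metadata.ShuffleOutputTracker;

// Illustrative only: keeps everything in driver memory. A real plugin would
// likely persist metadata to an external shuffle service or storage system.
public class InMemoryShuffleOutputTracker implements ShuffleOutputTracker {

  // shuffleId -> (mapIndex -> metadata of the latest registered attempt)
  private final ConcurrentHashMap<Integer, Map<Integer, MapOutputMetadata>> shuffles =
      new ConcurrentHashMap<>();

  @Override
  public void registerShuffle(int shuffleId) {
    shuffles.putIfAbsent(shuffleId, new ConcurrentHashMap<>());
  }

  @Override
  public void unregisterShuffle(int shuffleId, boolean blocking) {
    // Removal is synchronous here, so the blocking flag has nothing to wait on.
    shuffles.remove(shuffleId);
  }

  @Override
  public void registerMapOutput(
      int shuffleId, int mapIndex, long mapId, MapOutputMetadata mapOutputMetadata) {
    // compute() is atomic per key: concurrent registrations for the same
    // shuffle are serialized, while different shuffles proceed in parallel.
    shuffles.compute(shuffleId, (id, outputs) -> {
      Map<Integer, MapOutputMetadata> updated =
          outputs != null ? outputs : new ConcurrentHashMap<>();
      updated.put(mapIndex, mapOutputMetadata);
      return updated;
    });
  }

  @Override
  public void removeMapOutput(int shuffleId, int mapIndex, long mapId) {
    Map<Integer, MapOutputMetadata> outputs = shuffles.get(shuffleId);
    if (outputs != null) {
      // mapId is ignored in this sketch; a real tracker could check it to
      // avoid removing output from a newer attempt of the same map index.
      outputs.remove(mapIndex);
    }
  }
}
```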
```diff
@@ -30,6 +30,7 @@
 import scala.Product2;
 import scala.Tuple2;
 import scala.collection.Iterator;
+import scala.compat.java8.OptionConverters;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.io.Closeables;
@@ -39,17 +40,18 @@
 import org.apache.spark.Partitioner;
 import org.apache.spark.ShuffleDependency;
 import org.apache.spark.SparkConf;
+import org.apache.spark.scheduler.MapTaskResult;
 import org.apache.spark.shuffle.api.ShuffleExecutorComponents;
 import org.apache.spark.shuffle.api.ShuffleMapOutputWriter;
 import org.apache.spark.shuffle.api.ShufflePartitionWriter;
 import org.apache.spark.shuffle.api.WritableByteChannelWrapper;
 import org.apache.spark.internal.config.package$;
-import org.apache.spark.scheduler.MapStatus;
 import org.apache.spark.scheduler.MapStatus$;
 import org.apache.spark.serializer.Serializer;
 import org.apache.spark.serializer.SerializerInstance;
 import org.apache.spark.shuffle.ShuffleWriteMetricsReporter;
 import org.apache.spark.shuffle.ShuffleWriter;
+import org.apache.spark.shuffle.api.metadata.MapOutputCommitMessage;
 import org.apache.spark.storage.*;
 import org.apache.spark.util.Utils;
 
@@ -92,8 +94,8 @@ final class BypassMergeSortShuffleWriter<K, V> extends ShuffleWriter<K, V> {
   /** Array of file writers, one for each partition */
   private DiskBlockObjectWriter[] partitionWriters;
   private FileSegment[] partitionWriterSegments;
-  @Nullable private MapStatus mapStatus;
-  private long[] partitionLengths;
+  @Nullable private MapTaskResult taskResult;
+  private MapOutputCommitMessage mapOutputCommitMessage;
 
   /**
    * Are we in the process of stopping? Because map tasks can call stop() with success = true
@@ -130,9 +132,13 @@ public void write(Iterator<Product2<K, V>> records) throws IOException {
         .createMapOutputWriter(shuffleId, mapId, numPartitions);
     try {
       if (!records.hasNext()) {
-        partitionLengths = mapOutputWriter.commitAllPartitions().getPartitionLengths();
-        mapStatus = MapStatus$.MODULE$.apply(
-          blockManager.shuffleServerId(), partitionLengths, mapId);
+        mapOutputCommitMessage = mapOutputWriter.commitAllPartitions();
+        taskResult = new MapTaskResult(
+          MapStatus$.MODULE$.apply(
+            blockManager.shuffleServerId(),
+            mapOutputCommitMessage.getPartitionLengths(),
+            mapId),
+          OptionConverters.toScala(mapOutputCommitMessage.getMapOutputMetadata()));
         return;
       }
       final SerializerInstance serInstance = serializer.newInstance();
```
> **Contributor:** As these lines are repeating, you could extract them into a new helper method:
>
> ```java
> protected void setTaskResult(MapOutputCommitMessage mapOutputCommitMessage) {
>   taskResult = new MapTaskResult(
>     MapStatus$.MODULE$.apply(
>       blockManager.shuffleServerId(),
>       mapOutputCommitMessage.getPartitionLengths(),
>       mapId),
>     OptionConverters.toScala(mapOutputCommitMessage.getMapOutputMetadata()));
> }
> ```
>
> With the help of this new method, both call sites collapse to a single call.
>
> **Contributor (author):** Ack. Didn't address this in my latest patch but will get around to it.
```diff
@@ -164,9 +170,13 @@ public void write(Iterator<Product2<K, V>> records) throws IOException {
         }
       }
 
-      partitionLengths = writePartitionedData(mapOutputWriter);
-      mapStatus = MapStatus$.MODULE$.apply(
-        blockManager.shuffleServerId(), partitionLengths, mapId);
+      mapOutputCommitMessage = writePartitionedData(mapOutputWriter);
+      taskResult = new MapTaskResult(
+        MapStatus$.MODULE$.apply(
+          blockManager.shuffleServerId(),
+          mapOutputCommitMessage.getPartitionLengths(),
+          mapId),
+        OptionConverters.toScala(mapOutputCommitMessage.getMapOutputMetadata()));
     } catch (Exception e) {
       try {
         mapOutputWriter.abort(e);
@@ -179,16 +189,17 @@ public void write(Iterator<Product2<K, V>> records) throws IOException {
   }
 
   @VisibleForTesting
-  long[] getPartitionLengths() {
-    return partitionLengths;
+  MapOutputCommitMessage getMapOutputCommitMessage() {
+    return mapOutputCommitMessage;
   }
 
   /**
    * Concatenate all of the per-partition files into a single combined file.
    *
    * @return array of lengths, in bytes, of each partition of the file (used by map output tracker).
    */
-  private long[] writePartitionedData(ShuffleMapOutputWriter mapOutputWriter) throws IOException {
+  private MapOutputCommitMessage writePartitionedData(ShuffleMapOutputWriter mapOutputWriter)
+      throws IOException {
     // Track location of the partition starts in the output file
     if (partitionWriters != null) {
       final long writeStartTime = System.nanoTime();
@@ -219,7 +230,7 @@ private long[] writePartitionedData(ShuffleMapOutputWriter mapOutputWriter) thro
       }
       partitionWriters = null;
     }
-    return mapOutputWriter.commitAllPartitions().getPartitionLengths();
+    return mapOutputWriter.commitAllPartitions();
   }
 
   private void writePartitionedDataWithChannel(
@@ -259,16 +270,16 @@ private void writePartitionedDataWithStream(File file, ShufflePartitionWriter wr
   }
 
   @Override
-  public Option<MapStatus> stop(boolean success) {
+  public Option<MapTaskResult> stop(boolean success) {
     if (stopping) {
       return None$.empty();
     } else {
       stopping = true;
       if (success) {
-        if (mapStatus == null) {
+        if (taskResult == null) {
           throw new IllegalStateException("Cannot call stop(true) without having called write()");
         }
-        return Option.apply(mapStatus);
+        return Option.apply(taskResult);
       } else {
         // The map task failed, so delete our output data.
         if (partitionWriters != null) {
```
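The hunks above change only the writer side. Assuming the base `ShuffleWriter#stop` signature changes the same way (that part is not in this diff), a caller would now receive the bundled result roughly as follows; the `StopContractSketch` class and `finishTask` method are illustrative, not code from the patch:

```java
import scala.Option;

import org.apache.spark.scheduler.MapTaskResult;
import org.apache.spark.shuffle.ShuffleWriter;

public final class StopContractSketch {
  // Hypothetical caller: retrieves the MapTaskResult that now bundles the
  // MapStatus together with the optional plugin-provided MapOutputMetadata.
  static <K, V> MapTaskResult finishTask(ShuffleWriter<K, V> writer) {
    Option<MapTaskResult> result = writer.stop(/* success */ true);
    if (result.isEmpty()) {
      throw new IllegalStateException("Writer was already stopped");
    }
    return result.get();
  }
}
```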