
Commit 2825a13

Commit message: up-merging to the current master branch of the apache spark
2 parents: 8968b67 + 646e554


11,572 files changed: +253,338 -2,800 lines


NOTICE

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 Apache Spark
-Copyright 2013 The Apache Software Foundation.
+Copyright 2014 The Apache Software Foundation.
 
 This product includes software developed at
 The Apache Software Foundation (http://www.apache.org/).

assembly/pom.xml

Lines changed: 5 additions & 0 deletions
@@ -79,6 +79,11 @@
       <artifactId>spark-graphx_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-sql_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+    </dependency>
     <dependency>
       <groupId>net.sf.py4j</groupId>
       <artifactId>py4j</artifactId>

bin/compute-classpath.sh

Lines changed: 27 additions & 4 deletions
@@ -33,23 +33,43 @@ fi
 # Build up classpath
 CLASSPATH="$SPARK_CLASSPATH:$FWDIR/conf"
 
+# Support for interacting with Hive. Since hive pulls in a lot of dependencies that might break
+# existing Spark applications, it is not included in the standard spark assembly. Instead, we only
+# include it in the classpath if the user has explicitly requested it by running "sbt hive/assembly"
+# Hopefully we will find a way to avoid uber-jars entirely and deploy only the needed packages in
+# the future.
+if [ -f "$FWDIR"/sql/hive/target/scala-$SCALA_VERSION/spark-hive-assembly-*.jar ]; then
+  echo "Hive assembly found, including hive support. If this isn't desired run sbt hive/clean."
+
+  # Datanucleus jars do not work if only included in the uberjar as plugin.xml metadata is lost.
+  DATANUCLEUSJARS=$(JARS=("$FWDIR/lib_managed/jars"/datanucleus-*.jar); IFS=:; echo "${JARS[*]}")
+  CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
+
+  ASSEMBLY_DIR="$FWDIR/sql/hive/target/scala-$SCALA_VERSION/"
+else
+  ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION/"
+fi
+
 # First check if we have a dependencies jar. If so, include binary classes with the deps jar
-if [ -f "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*-deps.jar ]; then
+if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
   CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/graphx/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/streaming/target/scala-$SCALA_VERSION/classes"
+  CLASSPATH="$CLASSPATH:$FWDIR/sql/catalyst/target/scala-$SCALA_VERSION/classes"
+  CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
+  CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"
 
-  DEPS_ASSEMBLY_JAR=`ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*-deps.jar`
+  DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark*-assembly*hadoop*-deps.jar`
   CLASSPATH="$CLASSPATH:$DEPS_ASSEMBLY_JAR"
 else
   # Else use spark-assembly jar from either RELEASE or assembly directory
   if [ -f "$FWDIR/RELEASE" ]; then
-    ASSEMBLY_JAR=`ls "$FWDIR"/jars/spark-assembly*.jar`
+    ASSEMBLY_JAR=`ls "$FWDIR"/jars/spark*-assembly*.jar`
   else
-    ASSEMBLY_JAR=`ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*.jar`
+    ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark*-assembly*hadoop*.jar`
   fi
   CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
 fi
@@ -62,6 +82,9 @@ if [[ $SPARK_TESTING == 1 ]]; then
   CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SCALA_VERSION/test-classes"
   CLASSPATH="$CLASSPATH:$FWDIR/graphx/target/scala-$SCALA_VERSION/test-classes"
   CLASSPATH="$CLASSPATH:$FWDIR/streaming/target/scala-$SCALA_VERSION/test-classes"
+  CLASSPATH="$CLASSPATH:$FWDIR/sql/catalyst/target/scala-$SCALA_VERSION/test-classes"
+  CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/test-classes"
+  CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/test-classes"
 fi
 
 # Add hadoop conf dir if given -- otherwise FileSystem.*, etc fail !

core/src/main/scala/org/apache/spark/Aggregator.scala

Lines changed: 0 additions & 2 deletions
@@ -17,8 +17,6 @@
 
 package org.apache.spark
 
-import scala.{Option, deprecated}
-
 import org.apache.spark.util.collection.{AppendOnlyMap, ExternalAppendOnlyMap}
 
 /**

core/src/main/scala/org/apache/spark/CacheManager.scala

Lines changed: 45 additions & 31 deletions
@@ -20,11 +20,12 @@ package org.apache.spark
 import scala.collection.mutable.{ArrayBuffer, HashSet}
 
 import org.apache.spark.rdd.RDD
-import org.apache.spark.storage.{BlockManager, RDDBlockId, StorageLevel}
+import org.apache.spark.storage.{BlockId, BlockManager, BlockStatus, RDDBlockId, StorageLevel}
 
-/** Spark class responsible for passing RDDs split contents to the BlockManager and making
-  sure a node doesn't load two copies of an RDD at once.
-  */
+/**
+ * Spark class responsible for passing RDDs split contents to the BlockManager and making
+ * sure a node doesn't load two copies of an RDD at once.
+ */
 private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
 
   /** Keys of RDD splits that are being computed/loaded. */
@@ -49,11 +50,11 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
              try {loading.wait()} catch {case _ : Throwable =>}
            }
            logInfo("Finished waiting for %s".format(key))
-           // See whether someone else has successfully loaded it. The main way this would fail
-           // is for the RDD-level cache eviction policy if someone else has loaded the same RDD
-           // partition but we didn't want to make space for it. However, that case is unlikely
-           // because it's unlikely that two threads would work on the same RDD partition. One
-           // downside of the current code is that threads wait serially if this does happen.
+           /* See whether someone else has successfully loaded it. The main way this would fail
+            * is for the RDD-level cache eviction policy if someone else has loaded the same RDD
+            * partition but we didn't want to make space for it. However, that case is unlikely
+            * because it's unlikely that two threads would work on the same RDD partition. One
+            * downside of the current code is that threads wait serially if this does happen. */
            blockManager.get(key) match {
              case Some(values) =>
                return new InterruptibleIterator(context, values.asInstanceOf[Iterator[T]])
@@ -69,32 +70,45 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
       // If we got here, we have to load the split
       logInfo("Partition %s not found, computing it".format(key))
       val computedValues = rdd.computeOrReadCheckpoint(split, context)
+
       // Persist the result, so long as the task is not running locally
       if (context.runningLocally) { return computedValues }
-      if (storageLevel.useDisk && !storageLevel.useMemory) {
-        // In the case that this RDD is to be persisted using DISK_ONLY
-        // the iterator will be passed directly to the blockManager (rather then
-        // caching it to an ArrayBuffer first), then the resulting block data iterator
-        // will be passed back to the user. If the iterator generates a lot of data,
-        // this means that it doesn't all have to be held in memory at one time.
-        // This could also apply to MEMORY_ONLY_SER storage, but we need to make sure
-        // blocks aren't dropped by the block store before enabling that.
-        blockManager.put(key, computedValues, storageLevel, tellMaster = true)
-        return blockManager.get(key) match {
-          case Some(values) =>
-            return new InterruptibleIterator(context, values.asInstanceOf[Iterator[T]])
-          case None =>
-            logInfo("Failure to store %s".format(key))
-            throw new Exception("Block manager failed to return persisted valued")
+
+      // Keep track of blocks with updated statuses
+      var updatedBlocks = Seq[(BlockId, BlockStatus)]()
+      val returnValue: Iterator[T] = {
+        if (storageLevel.useDisk && !storageLevel.useMemory) {
+          /* In the case that this RDD is to be persisted using DISK_ONLY
+           * the iterator will be passed directly to the blockManager (rather then
+           * caching it to an ArrayBuffer first), then the resulting block data iterator
+           * will be passed back to the user. If the iterator generates a lot of data,
+           * this means that it doesn't all have to be held in memory at one time.
+           * This could also apply to MEMORY_ONLY_SER storage, but we need to make sure
+           * blocks aren't dropped by the block store before enabling that. */
+          updatedBlocks = blockManager.put(key, computedValues, storageLevel, tellMaster = true)
+          blockManager.get(key) match {
+            case Some(values) =>
+              new InterruptibleIterator(context, values.asInstanceOf[Iterator[T]])
+            case None =>
+              logInfo("Failure to store %s".format(key))
+              throw new Exception("Block manager failed to return persisted valued")
+          }
+        } else {
+          // In this case the RDD is cached to an array buffer. This will save the results
+          // if we're dealing with a 'one-time' iterator
+          val elements = new ArrayBuffer[Any]
+          elements ++= computedValues
+          updatedBlocks = blockManager.put(key, elements, storageLevel, tellMaster = true)
+          elements.iterator.asInstanceOf[Iterator[T]]
        }
-      } else {
-        // In this case the RDD is cached to an array buffer. This will save the results
-        // if we're dealing with a 'one-time' iterator
-        val elements = new ArrayBuffer[Any]
-        elements ++= computedValues
-        blockManager.put(key, elements, storageLevel, tellMaster = true)
-        return elements.iterator.asInstanceOf[Iterator[T]]
       }
+
+      // Update task metrics to include any blocks whose storage status is updated
+      val metrics = context.taskMetrics
+      metrics.updatedBlocks = Some(updatedBlocks)
+
+      returnValue
+
     } finally {
       loading.synchronized {
         loading.remove(key)
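For context, the behavioral change above is that blockManager.put now reports the statuses of the blocks it touched, and the cache manager records them in the task's metrics rather than discarding them. Below is a minimal, self-contained Scala sketch of that get-or-compute-and-record pattern; SimpleBlockStore, BlockStatus, and the printed status line are stand-ins invented for illustration, not Spark's real classes or API.

import scala.collection.mutable

// Hypothetical stand-ins for illustration only (not Spark's actual types).
case class BlockStatus(memSize: Long, diskSize: Long)

class SimpleBlockStore {
  private val store = mutable.Map[String, Seq[Any]]()

  // Returns the statuses of blocks whose storage changed, mirroring the idea
  // that put() now reports what it updated instead of returning Unit.
  def put(key: String, values: Seq[Any]): Seq[(String, BlockStatus)] = {
    store(key) = values
    Seq(key -> BlockStatus(memSize = values.size.toLong, diskSize = 0L))
  }

  def get(key: String): Option[Iterator[Any]] = store.get(key).map(_.iterator)
}

object CachePatternSketch {
  def getOrCompute[T](store: SimpleBlockStore, key: String)(compute: => Iterator[T]): Iterator[T] = {
    store.get(key) match {
      case Some(cached) =>
        // Cache hit: the computation is never evaluated.
        cached.asInstanceOf[Iterator[T]]
      case None =>
        // Buffer the one-time iterator so it can be both cached and returned.
        val elements = compute.toVector
        val updatedBlocks = store.put(key, elements)
        // In the real change, these statuses are attached to the task's metrics.
        println(s"updated blocks: $updatedBlocks")
        elements.iterator
    }
  }

  def main(args: Array[String]): Unit = {
    val store = new SimpleBlockStore
    val first = getOrCompute(store, "rdd_0_0") { Iterator(1, 2, 3) }            // computes and caches
    val second = getOrCompute[Int](store, "rdd_0_0") { sys.error("not recomputed") } // served from cache
    println(first.sum + second.sum)
  }
}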

core/src/main/scala/org/apache/spark/Dependency.scala

Lines changed: 4 additions & 2 deletions
@@ -18,6 +18,7 @@
 package org.apache.spark
 
 import org.apache.spark.rdd.RDD
+import org.apache.spark.serializer.Serializer
 
 /**
  * Base class for dependencies.
@@ -43,12 +44,13 @@ abstract class NarrowDependency[T](rdd: RDD[T]) extends Dependency(rdd) {
  * Represents a dependency on the output of a shuffle stage.
  * @param rdd the parent RDD
  * @param partitioner partitioner used to partition the shuffle output
- * @param serializerClass class name of the serializer to use
+ * @param serializer [[Serializer]] to use. If set to null, the default serializer, as specified
+ *                   by `spark.serializer` config option, will be used.
 */
 class ShuffleDependency[K, V](
     @transient rdd: RDD[_ <: Product2[K, V]],
     val partitioner: Partitioner,
-    val serializerClass: String = null)
+    val serializer: Serializer = null)
   extends Dependency(rdd.asInstanceOf[RDD[Product2[K, V]]]) {
 
   val shuffleId: Int = rdd.context.newShuffleId()
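The practical effect of this signature change is that callers hand the dependency an actual Serializer instance instead of a class-name string. A hedged sketch of what a caller might look like against the new signature shown above; the SparkContext setup and the choice of KryoSerializer are illustrative assumptions, not part of this diff.

import org.apache.spark.{HashPartitioner, ShuffleDependency, SparkConf, SparkContext}
import org.apache.spark.serializer.KryoSerializer

object ShuffleDependencyExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("shuffle-dep-sketch").setMaster("local[2]")
    val sc = new SparkContext(conf)

    // A pair RDD, satisfying the RDD[_ <: Product2[K, V]] bound.
    val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))

    // Before this change the last argument was a class-name String (serializerClass);
    // now it is a Serializer instance, or null to fall back to the default
    // configured via spark.serializer.
    val dep = new ShuffleDependency[String, Int](
      pairs,
      new HashPartitioner(2),
      new KryoSerializer(conf))

    println(s"shuffleId = ${dep.shuffleId}")
    sc.stop()
  }
}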

core/src/main/scala/org/apache/spark/MapOutputTracker.scala

Lines changed: 17 additions & 2 deletions
@@ -35,13 +35,28 @@ private[spark] case class GetMapOutputStatuses(shuffleId: Int)
   extends MapOutputTrackerMessage
 private[spark] case object StopMapOutputTracker extends MapOutputTrackerMessage
 
-private[spark] class MapOutputTrackerMasterActor(tracker: MapOutputTrackerMaster)
+private[spark] class MapOutputTrackerMasterActor(tracker: MapOutputTrackerMaster, conf: SparkConf)
   extends Actor with Logging {
+  val maxAkkaFrameSize = AkkaUtils.maxFrameSizeBytes(conf)
+
   def receive = {
     case GetMapOutputStatuses(shuffleId: Int) =>
       val hostPort = sender.path.address.hostPort
       logInfo("Asked to send map output locations for shuffle " + shuffleId + " to " + hostPort)
-      sender ! tracker.getSerializedMapOutputStatuses(shuffleId)
+      val mapOutputStatuses = tracker.getSerializedMapOutputStatuses(shuffleId)
+      val serializedSize = mapOutputStatuses.size
+      if (serializedSize > maxAkkaFrameSize) {
+        val msg = s"Map output statuses were $serializedSize bytes which " +
+          s"exceeds spark.akka.frameSize ($maxAkkaFrameSize bytes)."
+
+        /* For SPARK-1244 we'll opt for just logging an error and then throwing an exception.
+         * Note that on exception the actor will just restart. A bigger refactoring (SPARK-1239)
+         * will ultimately remove this entire code path. */
+        val exception = new SparkException(msg)
+        logError(msg, exception)
+        throw exception
+      }
+      sender ! mapOutputStatuses
 
     case StopMapOutputTracker =>
       logInfo("MapOutputTrackerActor stopped!")

core/src/main/scala/org/apache/spark/SecurityManager.scala

Lines changed: 4 additions & 4 deletions
@@ -18,13 +18,13 @@
 package org.apache.spark
 
 import java.net.{Authenticator, PasswordAuthentication}
-import org.apache.hadoop.io.Text
-import org.apache.hadoop.security.Credentials
-import org.apache.hadoop.security.UserGroupInformation
-import org.apache.spark.deploy.SparkHadoopUtil
 
 import scala.collection.mutable.ArrayBuffer
 
+import org.apache.hadoop.io.Text
+
+import org.apache.spark.deploy.SparkHadoopUtil
+
 /**
  * Spark class responsible for security.
  *

core/src/main/scala/org/apache/spark/ShuffleFetcher.scala

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ private[spark] abstract class ShuffleFetcher {
       shuffleId: Int,
       reduceId: Int,
       context: TaskContext,
-      serializer: Serializer = SparkEnv.get.serializerManager.default): Iterator[T]
+      serializer: Serializer = SparkEnv.get.serializer): Iterator[T]
 
   /** Stop the fetcher */
   def stop() {}
