
Commit f489fdc (merge of 2 parents: d25a86e + 0307db0)

Merge remote-tracking branch 'apache/master' into state-cleanup

Conflicts:
    core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
    core/src/main/scala/org/apache/spark/storage/BlockManager.scala
    core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala
    core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala

File tree: 135 files changed, +2472 −753 lines

Note: this is a large commit, so only a subset of the changed files is shown below.

.rat-excludes

Lines changed: 1 addition & 0 deletions
@@ -39,3 +39,4 @@ work
 .*\.q
 golden
 test.out/*
+.*iml

assembly/pom.xml

Lines changed: 11 additions & 1 deletion
@@ -163,6 +163,16 @@
       </dependency>
     </dependencies>
   </profile>
+  <profile>
+    <id>hive</id>
+    <dependencies>
+      <dependency>
+        <groupId>org.apache.spark</groupId>
+        <artifactId>spark-hive_${scala.binary.version}</artifactId>
+        <version>${project.version}</version>
+      </dependency>
+    </dependencies>
+  </profile>
   <profile>
     <id>spark-ganglia-lgpl</id>
     <dependencies>
@@ -208,7 +218,7 @@
       <plugin>
         <groupId>org.codehaus.mojo</groupId>
         <artifactId>buildnumber-maven-plugin</artifactId>
-        <version>1.1</version>
+        <version>1.2</version>
         <executions>
           <execution>
             <phase>validate</phase>

bin/compute-classpath.sh

Lines changed: 19 additions & 16 deletions
@@ -30,21 +30,7 @@ FWDIR="$(cd `dirname $0`/..; pwd)"
 # Build up classpath
 CLASSPATH="$SPARK_CLASSPATH:$FWDIR/conf"
 
-# Support for interacting with Hive. Since hive pulls in a lot of dependencies that might break
-# existing Spark applications, it is not included in the standard spark assembly. Instead, we only
-# include it in the classpath if the user has explicitly requested it by running "sbt hive/assembly"
-# Hopefully we will find a way to avoid uber-jars entirely and deploy only the needed packages in
-# the future.
-if [ -f "$FWDIR"/sql/hive/target/scala-$SCALA_VERSION/spark-hive-assembly-*.jar ]; then
-
-  # Datanucleus jars do not work if only included in the uberjar as plugin.xml metadata is lost.
-  DATANUCLEUSJARS=$(JARS=("$FWDIR/lib_managed/jars"/datanucleus-*.jar); IFS=:; echo "${JARS[*]}")
-  CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
-
-  ASSEMBLY_DIR="$FWDIR/sql/hive/target/scala-$SCALA_VERSION/"
-else
-  ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION/"
-fi
+ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION"
 
 # First check if we have a dependencies jar. If so, include binary classes with the deps jar
 if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
@@ -59,7 +45,7 @@ if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
   CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"
 
-  DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark*-assembly*hadoop*-deps.jar`
+  DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar`
   CLASSPATH="$CLASSPATH:$DEPS_ASSEMBLY_JAR"
 else
   # Else use spark-assembly jar from either RELEASE or assembly directory
@@ -71,6 +57,23 @@ else
   CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
 fi
 
+# When Hive support is needed, Datanucleus jars must be included on the classpath.
+# Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost.
+# Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is
+# built with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark
+# assembly is built for Hive, before actually populating the CLASSPATH with the jars.
+# Note that this check order is faster (by up to half a second) in the case where Hive is not used.
+num_datanucleus_jars=$(ls "$FWDIR"/lib_managed/jars/ | grep "datanucleus-.*\\.jar" | wc -l)
+if [ $num_datanucleus_jars -gt 0 ]; then
+  AN_ASSEMBLY_JAR=${ASSEMBLY_JAR:-$DEPS_ASSEMBLY_JAR}
+  num_hive_files=$(jar tvf "$AN_ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null | wc -l)
+  if [ $num_hive_files -gt 0 ]; then
+    echo "Spark assembly has been built with Hive, including Datanucleus jars on classpath" 1>&2
+    DATANUCLEUSJARS=$(echo "$FWDIR/lib_managed/jars"/datanucleus-*.jar | tr " " :)
+    CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
+  fi
+fi
+
 # Add test classes if we're running from SBT or Maven with SPARK_TESTING set to 1
 if [[ $SPARK_TESTING == 1 ]]; then
   CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/test-classes"

bin/load-spark-env.sh

Lines changed: 3 additions & 0 deletions
@@ -30,6 +30,9 @@ if [ -z "$SPARK_ENV_LOADED" ]; then
   use_conf_dir=${SPARK_CONF_DIR:-"$parent_dir/conf"}
 
   if [ -f "${use_conf_dir}/spark-env.sh" ]; then
+    # Promote all variable declarations to environment (exported) variables
+    set -a
     . "${use_conf_dir}/spark-env.sh"
+    set +a
   fi
 fi

bin/spark-class

Lines changed: 0 additions & 2 deletions
@@ -154,5 +154,3 @@ if [ "$SPARK_PRINT_LAUNCH_COMMAND" == "1" ]; then
 fi
 
 exec "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"
-
-

bin/spark-shell

Lines changed: 4 additions & 4 deletions
@@ -34,7 +34,7 @@ set -o posix
 FWDIR="$(cd `dirname $0`/..; pwd)"
 
 SPARK_REPL_OPTS="${SPARK_REPL_OPTS:-""}"
-DEFAULT_MASTER="local"
+DEFAULT_MASTER="local[*]"
 MASTER=${MASTER:-""}
 
 info_log=0
@@ -64,7 +64,7 @@ ${txtbld}OPTIONS${txtrst}:
        is followed by m for megabytes or g for gigabytes, e.g. "1g".
    -dm --driver-memory : The memory used by the Spark Shell, the number is followed
        by m for megabytes or g for gigabytes, e.g. "1g".
-   -m --master : A full string that describes the Spark Master, defaults to "local"
+   -m --master : A full string that describes the Spark Master, defaults to "local[*]"
        e.g. "spark://localhost:7077".
    --log-conf : Enables logging of the supplied SparkConf as INFO at start of the
        Spark Context.
@@ -127,7 +127,7 @@ function set_spark_log_conf(){
 
 function set_spark_master(){
   if ! [[ "$1" =~ $ARG_FLAG_PATTERN ]]; then
-    MASTER="$1"
+    export MASTER="$1"
   else
     out_error "wrong format for $2"
   fi
@@ -145,7 +145,7 @@ function resolve_spark_master(){
   fi
 
   if [ -z "$MASTER" ]; then
-    MASTER="$DEFAULT_MASTER"
+    export MASTER="$DEFAULT_MASTER"
   fi
 
 }

core/pom.xml

Lines changed: 47 additions & 2 deletions
@@ -117,12 +117,10 @@
     <dependency>
       <groupId>com.twitter</groupId>
       <artifactId>chill_${scala.binary.version}</artifactId>
-      <version>0.3.1</version>
     </dependency>
     <dependency>
       <groupId>com.twitter</groupId>
      <artifactId>chill-java</artifactId>
-      <version>0.3.1</version>
     </dependency>
     <dependency>
       <groupId>commons-net</groupId>
@@ -200,6 +198,53 @@
       <artifactId>derby</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.tachyonproject</groupId>
+      <artifactId>tachyon</artifactId>
+      <version>0.4.1-thrift</version>
+      <exclusions>
+        <exclusion>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-client</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.curator</groupId>
+          <artifactId>curator-recipes</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.eclipse.jetty</groupId>
+          <artifactId>jetty-jsp</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.eclipse.jetty</groupId>
+          <artifactId>jetty-webapp</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.eclipse.jetty</groupId>
+          <artifactId>jetty-server</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.eclipse.jetty</groupId>
+          <artifactId>jetty-servlet</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>junit</groupId>
+          <artifactId>junit</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.powermock</groupId>
+          <artifactId>powermock-module-junit4</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.powermock</groupId>
+          <artifactId>powermock-api-mockito</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.curator</groupId>
+          <artifactId>curator-test</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
     <dependency>
       <groupId>org.scalatest</groupId>
       <artifactId>scalatest_${scala.binary.version}</artifactId>

core/src/main/java/org/apache/spark/api/java/StorageLevels.java

Lines changed: 33 additions & 13 deletions
@@ -23,17 +23,18 @@
  * Expose some commonly useful storage level constants.
  */
 public class StorageLevels {
-  public static final StorageLevel NONE = create(false, false, false, 1);
-  public static final StorageLevel DISK_ONLY = create(true, false, false, 1);
-  public static final StorageLevel DISK_ONLY_2 = create(true, false, false, 2);
-  public static final StorageLevel MEMORY_ONLY = create(false, true, true, 1);
-  public static final StorageLevel MEMORY_ONLY_2 = create(false, true, true, 2);
-  public static final StorageLevel MEMORY_ONLY_SER = create(false, true, false, 1);
-  public static final StorageLevel MEMORY_ONLY_SER_2 = create(false, true, false, 2);
-  public static final StorageLevel MEMORY_AND_DISK = create(true, true, true, 1);
-  public static final StorageLevel MEMORY_AND_DISK_2 = create(true, true, true, 2);
-  public static final StorageLevel MEMORY_AND_DISK_SER = create(true, true, false, 1);
-  public static final StorageLevel MEMORY_AND_DISK_SER_2 = create(true, true, false, 2);
+  public static final StorageLevel NONE = create(false, false, false, false, 1);
+  public static final StorageLevel DISK_ONLY = create(true, false, false, false, 1);
+  public static final StorageLevel DISK_ONLY_2 = create(true, false, false, false, 2);
+  public static final StorageLevel MEMORY_ONLY = create(false, true, false, true, 1);
+  public static final StorageLevel MEMORY_ONLY_2 = create(false, true, false, true, 2);
+  public static final StorageLevel MEMORY_ONLY_SER = create(false, true, false, false, 1);
+  public static final StorageLevel MEMORY_ONLY_SER_2 = create(false, true, false, false, 2);
+  public static final StorageLevel MEMORY_AND_DISK = create(true, true, false, true, 1);
+  public static final StorageLevel MEMORY_AND_DISK_2 = create(true, true, false, true, 2);
+  public static final StorageLevel MEMORY_AND_DISK_SER = create(true, true, false, false, 1);
+  public static final StorageLevel MEMORY_AND_DISK_SER_2 = create(true, true, false, false, 2);
+  public static final StorageLevel OFF_HEAP = create(false, false, true, false, 1);
 
   /**
    * Create a new StorageLevel object.
@@ -42,7 +43,26 @@ public class StorageLevels {
    * @param deserialized saved as deserialized objects, if true
    * @param replication replication factor
    */
-  public static StorageLevel create(boolean useDisk, boolean useMemory, boolean deserialized, int replication) {
-    return StorageLevel.apply(useDisk, useMemory, deserialized, replication);
+  @Deprecated
+  public static StorageLevel create(boolean useDisk, boolean useMemory, boolean deserialized,
+      int replication) {
+    return StorageLevel.apply(useDisk, useMemory, false, deserialized, replication);
+  }
+
+  /**
+   * Create a new StorageLevel object.
+   * @param useDisk saved to disk, if true
+   * @param useMemory saved to memory, if true
+   * @param useOffHeap saved to Tachyon, if true
+   * @param deserialized saved as deserialized objects, if true
+   * @param replication replication factor
+   */
+  public static StorageLevel create(
+      boolean useDisk,
+      boolean useMemory,
+      boolean useOffHeap,
+      boolean deserialized,
+      int replication) {
+    return StorageLevel.apply(useDisk, useMemory, useOffHeap, deserialized, replication);
   }
 }
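
As a usage note (not part of the commit): a minimal Scala sketch of how the new off-heap level could be constructed, assuming the five-argument StorageLevel.apply that the Java factory above delegates to. The object and value names are illustrative only.

    import org.apache.spark.api.java.StorageLevels
    import org.apache.spark.storage.StorageLevel

    // Illustrative only: both calls request the same flags as the new OFF_HEAP constant,
    // i.e. (useDisk = false, useMemory = false, useOffHeap = true, deserialized = false,
    // replication = 1).
    object OffHeapLevelSketch {
      def main(args: Array[String]): Unit = {
        val viaJavaFactory = StorageLevels.create(false, false, true, false, 1)
        val viaScalaApply  = StorageLevel(false, false, true, false, 1)
        // An RDD could later be persisted off-heap with rdd.persist(viaScalaApply).
        println(viaJavaFactory)
        println(viaScalaApply)
      }
    }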

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 47 additions & 10 deletions
@@ -19,14 +19,13 @@ package org.apache.spark
 
 import java.io._
 import java.net.URI
-import java.util.{Properties, UUID}
 import java.util.concurrent.atomic.AtomicInteger
-
+import java.util.{Properties, UUID}
+import java.util.UUID.randomUUID
 import scala.collection.{Map, Set}
 import scala.collection.generic.Growable
 import scala.collection.mutable.{ArrayBuffer, HashMap}
 import scala.reflect.{ClassTag, classTag}
-
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, DoubleWritable, FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable}
@@ -37,6 +36,7 @@ import org.apache.mesos.MesosNativeLibrary
 
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.deploy.{LocalSparkCluster, SparkHadoopUtil}
+import org.apache.spark.input.WholeTextFileInputFormat
 import org.apache.spark.partial.{ApproximateEvaluator, PartialResult}
 import org.apache.spark.rdd._
 import org.apache.spark.scheduler._
@@ -129,6 +129,11 @@ class SparkContext(
   val master = conf.get("spark.master")
   val appName = conf.get("spark.app.name")
 
+  // Generate the random name for a temp folder in Tachyon
+  // Add a timestamp as the suffix here to make it more safe
+  val tachyonFolderName = "spark-" + randomUUID.toString()
+  conf.set("spark.tachyonStore.folderName", tachyonFolderName)
+
   val isLocal = (master == "local" || master.startsWith("local["))
 
   if (master == "yarn-client") System.setProperty("SPARK_YARN_MODE", "true")
@@ -378,6 +383,39 @@ class SparkContext(
       minSplits).map(pair => pair._2.toString)
   }
 
+  /**
+   * Read a directory of text files from HDFS, a local file system (available on all nodes), or any
+   * Hadoop-supported file system URI. Each file is read as a single record and returned in a
+   * key-value pair, where the key is the path of each file, the value is the content of each file.
+   *
+   * <p> For example, if you have the following files:
+   * {{{
+   *   hdfs://a-hdfs-path/part-00000
+   *   hdfs://a-hdfs-path/part-00001
+   *   ...
+   *   hdfs://a-hdfs-path/part-nnnnn
+   * }}}
+   *
+   * Do `val rdd = sparkContext.wholeTextFile("hdfs://a-hdfs-path")`,
+   *
+   * <p> then `rdd` contains
+   * {{{
+   *   (a-hdfs-path/part-00000, its content)
+   *   (a-hdfs-path/part-00001, its content)
+   *   ...
+   *   (a-hdfs-path/part-nnnnn, its content)
+   * }}}
+   *
+   * @note Small files are preferred, as each file will be loaded fully in memory.
+   */
+  def wholeTextFiles(path: String): RDD[(String, String)] = {
+    newAPIHadoopFile(
+      path,
+      classOf[WholeTextFileInputFormat],
+      classOf[String],
+      classOf[String])
+  }
+
   /**
    * Get an RDD for a Hadoop-readable dataset from a Hadoop JobConf given its InputFormat and other
    * necessary info (e.g. file name for a filesystem-based dataset, table name for HyperTable),
@@ -704,10 +742,6 @@ class SparkContext(
    */
  def getPersistentRDDs: Map[Int, RDD[_]] = persistentRdds.toMap
 
-  def getStageInfo: Map[Stage, StageInfo] = {
-    dagScheduler.stageToInfos
-  }
-
   /**
    * Return information about blocks stored in all of the slaves
    */
@@ -1262,8 +1296,8 @@ object SparkContext extends Logging {
 
   /** Creates a task scheduler based on a given master URL. Extracted for testing. */
   private def createTaskScheduler(sc: SparkContext, master: String): TaskScheduler = {
-    // Regular expression used for local[N] master format
-    val LOCAL_N_REGEX = """local\[([0-9]+)\]""".r
+    // Regular expression used for local[N] and local[*] master formats
+    val LOCAL_N_REGEX = """local\[([0-9\*]+)\]""".r
     // Regular expression for local[N, maxRetries], used in tests with failing tasks
     val LOCAL_N_FAILURES_REGEX = """local\[([0-9]+)\s*,\s*([0-9]+)\]""".r
     // Regular expression for simulating a Spark cluster of [N, cores, memory] locally
@@ -1286,8 +1320,11 @@ object SparkContext extends Logging {
         scheduler
 
       case LOCAL_N_REGEX(threads) =>
+        def localCpuCount = Runtime.getRuntime.availableProcessors()
+        // local[*] estimates the number of cores on the machine; local[N] uses exactly N threads.
+        val threadCount = if (threads == "*") localCpuCount else threads.toInt
         val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true)
-        val backend = new LocalBackend(scheduler, threads.toInt)
+        val backend = new LocalBackend(scheduler, threadCount)
         scheduler.initialize(backend)
         scheduler
 
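
Taken together, the SparkContext changes above add the wholeTextFiles API and let a local master run one thread per available core via "local[*]". Below is a small, hypothetical driver exercising both; the input path is the placeholder from the scaladoc above and the object/app names are made up.

    import org.apache.spark.{SparkConf, SparkContext}

    object WholeTextFilesDemo {
      def main(args: Array[String]): Unit = {
        // "local[*]" now matches LOCAL_N_REGEX and resolves to one thread per available core.
        val conf = new SparkConf().setMaster("local[*]").setAppName("WholeTextFilesDemo")
        val sc = new SparkContext(conf)

        // Each record is a (path, content) pair holding one whole file; prefer small files,
        // since every file is read fully into memory.
        val files = sc.wholeTextFiles("hdfs://a-hdfs-path")
        files.map { case (path, content) => (path, content.length) }
          .collect()
          .foreach(println)

        sc.stop()
      }
    }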