Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 22 additions & 6 deletions core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import java.text.DateFormat
import java.util.{Arrays, Comparator, Date, Locale}

import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.util.control.NonFatal

import com.google.common.primitives.Longs
Expand Down Expand Up @@ -143,14 +144,29 @@ class SparkHadoopUtil extends Logging {
* Returns a function that can be called to find Hadoop FileSystem bytes read. If
* getFSBytesReadOnThreadCallback is called from thread r at time t, the returned callback will
* return the bytes read on r since t.
*
* @return a callback reporting the Hadoop FileSystem bytes read since this method was called.
*/
private[spark] def getFSBytesReadOnThreadCallback(): () => Long = {
  // Sums the per-thread bytes-read counter of every registered Hadoop FileSystem. It is
  // re-evaluated on each invocation, so it always reflects the thread that calls it.
  val f = () => FileSystem.getAllStatistics.asScala.map(_.getThreadStatistics.getBytesRead).sum
  // Id of the thread that created the callback, and the bytes it had already read at that time.
  val baseline = (Thread.currentThread().getId, f())

  /**
   * This function may be called in both spawned child threads and parent task thread (in
   * PythonRDD), and Hadoop FileSystem uses thread local variables to track the statistics.
   * So we need a map to track the bytes read from the child threads and parent thread,
   * summing them together to get the bytes read of this task.
   */
  new Function0[Long] {
    // Latest bytes-read value observed for each thread id that has invoked the callback.
    private val bytesReadMap = new mutable.HashMap[Long, Long]()

    override def apply(): Long = {
      bytesReadMap.synchronized {
        // Record the calling thread's current counter, then total all recorded threads.
        bytesReadMap.put(Thread.currentThread().getId, f())
        bytesReadMap.map { case (k, v) =>
          // Only the creating thread has a baseline to subtract.
          v - (if (k == baseline._1) baseline._2 else 0)
        }.sum
      }
    }
  }
}

/**
Expand Down
17 changes: 10 additions & 7 deletions core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
Original file line number Diff line number Diff line change
Expand Up @@ -879,6 +879,15 @@ private[spark] object SparkSubmitUtils {
// Exposed for testing
var printStream = SparkSubmit.printStream

// Exposed for testing.
// Artifact-name prefixes of the Spark components that get a default Ivy exclusion rule; each
// entry is expanded into an exclusion pattern of the form "org.apache.spark:spark-<entry>*:*".
// We need to specify each component explicitly, otherwise we miss spark-streaming-kafka-0-8 and
// other spark-streaming utility components. The trailing underscore is there to differentiate
// between spark-streaming_2.1x and spark-streaming-kafka-0-8-assembly_2.1x.
val IVY_DEFAULT_EXCLUDES = Seq("catalyst_", "core_", "graphx_", "launcher_", "mllib_",
"mllib-local_", "network-common_", "network-shuffle_", "repl_", "sketch_", "sql_", "streaming_",
"tags_", "unsafe_")

/**
* Represents a Maven Coordinate
* @param groupId the groupId of the coordinate
Expand Down Expand Up @@ -1007,13 +1016,7 @@ private[spark] object SparkSubmitUtils {
// Add scala exclusion rule
md.addExcludeRule(createExclusion("*:scala-library:*", ivySettings, ivyConfName))

// We need to specify each component explicitly, otherwise we miss spark-streaming-kafka-0-8 and
// other spark-streaming utility components. Underscore is there to differentiate between
// spark-streaming_2.1x and spark-streaming-kafka-0-8-assembly_2.1x
val components = Seq("catalyst_", "core_", "graphx_", "hive_", "mllib_", "repl_",
"sql_", "streaming_", "yarn_", "network-common_", "network-shuffle_", "network-yarn_")

components.foreach { comp =>
IVY_DEFAULT_EXCLUDES.foreach { comp =>
md.addExcludeRule(createExclusion(s"org.apache.spark:spark-$comp*:*", ivySettings,
ivyConfName))
}
Expand Down
8 changes: 7 additions & 1 deletion core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,13 @@ class HadoopRDD[K, V](
null
}
// Register an on-task-completion callback to close the input stream.
context.addTaskCompletionListener{ context => closeIfNeeded() }
context.addTaskCompletionListener { context =>
// Update the bytes read before closing to make sure any lingering bytesRead statistics in
// this thread are correctly added.
updateBytesRead()
closeIfNeeded()
}

private val key: K = if (reader == null) null.asInstanceOf[K] else reader.createKey()
private val value: V = if (reader == null) null.asInstanceOf[V] else reader.createValue()

Expand Down
8 changes: 7 additions & 1 deletion core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,13 @@ class NewHadoopRDD[K, V](
}

// Register an on-task-completion callback to close the input stream.
context.addTaskCompletionListener(context => close())
context.addTaskCompletionListener { context =>
// Update bytesRead before closing to make sure any lingering bytesRead statistics in
// this thread are correctly added.
updateBytesRead()
close()
}

private var havePair = false
private var recordsSinceMetricsUpdate = 0

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -187,12 +187,9 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll {
}

test("neglects Spark and Spark's dependencies") {
val components = Seq("catalyst_", "core_", "graphx_", "hive_", "mllib_", "repl_",
"sql_", "streaming_", "yarn_", "network-common_", "network-shuffle_", "network-yarn_")

val coordinates =
components.map(comp => s"org.apache.spark:spark-${comp}2.10:1.2.0").mkString(",") +
",org.apache.spark:spark-core_fake:1.2.0"
val coordinates = SparkSubmitUtils.IVY_DEFAULT_EXCLUDES
.map(comp => s"org.apache.spark:spark-${comp}2.11:2.1.1")
.mkString(",") + ",org.apache.spark:spark-core_fake:1.2.0"

val path = SparkSubmitUtils.resolveMavenCoordinates(
coordinates,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ import org.scalatest.BeforeAndAfter

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}
import org.apache.spark.util.Utils
import org.apache.spark.util.{ThreadUtils, Utils}

class InputOutputMetricsSuite extends SparkFunSuite with SharedSparkContext
with BeforeAndAfter {
Expand Down Expand Up @@ -319,6 +319,35 @@ class InputOutputMetricsSuite extends SparkFunSuite with SharedSparkContext
}
assert(bytesRead >= tmpFile.length())
}

test("input metrics with old Hadoop API in different thread") {
  // Each partition's iterator is drained on a separately spawned thread, so the bytes are read
  // by a thread other than the one running the task.
  val bytesRead = runAndReturnBytesRead {
    val words = sc.textFile(tmpFilePath, 4).mapPartitions { lines =>
      val collected = new ArrayBuffer[String]()
      ThreadUtils.runInNewThread("testThread", false) {
        lines.foreach { line =>
          line.split(" ").foreach(word => collected.append(word))
        }
      }

      collected.iterator
    }
    words.count()
  }
  // The reported metric must cover at least the whole input file.
  assert(bytesRead >= tmpFile.length())
}

test("input metrics with new Hadoop API in different thread") {
  // Each partition's records are consumed on a separately spawned thread, so the bytes are read
  // by a thread other than the one running the task.
  val bytesRead = runAndReturnBytesRead {
    val words = sc
      .newAPIHadoopFile(tmpFilePath, classOf[NewTextInputFormat], classOf[LongWritable],
        classOf[Text])
      .mapPartitions { records =>
        val collected = new ArrayBuffer[String]()
        ThreadUtils.runInNewThread("testThread", false) {
          records.foreach { record =>
            record._2.toString.split(" ").foreach(word => collected.append(word))
          }
        }

        collected.iterator
      }
    words.count()
  }
  // The reported metric must cover at least the whole input file.
  assert(bytesRead >= tmpFile.length())
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.launcher;

import java.io.InputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectStreamClass;
import java.util.Arrays;
import java.util.List;

/**
 * An {@link ObjectInputStream} that refuses to resolve any class whose name does not start with
 * one of the package prefixes used by the launcher protocol. See SPARK-20922.
 */
class FilteredObjectInputStream extends ObjectInputStream {

  /** Class-name prefixes that may appear in the serialized stream. */
  private static final List<String> ALLOWED_PACKAGES = Arrays.asList(
    "org.apache.spark.launcher.",
    "java.lang.");

  FilteredObjectInputStream(InputStream is) throws IOException {
    super(is);
  }

  /**
   * Resolves {@code desc} through the parent implementation, but only when its class name starts
   * with one of the allowed prefixes.
   *
   * @throws IllegalArgumentException if the class name matches none of the allowed prefixes.
   */
  @Override
  protected Class<?> resolveClass(ObjectStreamClass desc)
      throws IOException, ClassNotFoundException {

    String className = desc.getName();
    for (String allowedPrefix : ALLOWED_PACKAGES) {
      if (className.startsWith(allowedPrefix)) {
        return super.resolveClass(desc);
      }
    }
    throw new IllegalArgumentException(
      String.format("Unexpected class in stream: %s", className));
  }

}
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import java.io.Closeable;
import java.io.EOFException;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.net.Socket;
import java.util.logging.Level;
Expand Down Expand Up @@ -53,7 +52,7 @@ abstract class LauncherConnection implements Closeable, Runnable {
@Override
public void run() {
try {
ObjectInputStream in = new ObjectInputStream(socket.getInputStream());
FilteredObjectInputStream in = new FilteredObjectInputStream(socket.getInputStream());
while (!closed) {
Message msg = (Message) in.readObject();
handle(msg);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,11 @@

import java.io.Closeable;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.net.InetAddress;
import java.net.Socket;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;
Expand Down Expand Up @@ -120,31 +123,7 @@ public void testTimeout() throws Exception {
Socket s = new Socket(InetAddress.getLoopbackAddress(),
LauncherServer.getServerInstance().getPort());
client = new TestClient(s);

// Try a few times since the client-side socket may not reflect the server-side close
// immediately.
boolean helloSent = false;
int maxTries = 10;
for (int i = 0; i < maxTries; i++) {
try {
if (!helloSent) {
client.send(new Hello(handle.getSecret(), "1.4.0"));
helloSent = true;
} else {
client.send(new SetAppId("appId"));
}
fail("Expected exception caused by connection timeout.");
} catch (IllegalStateException | IOException e) {
// Expected.
break;
} catch (AssertionError e) {
if (i < maxTries - 1) {
Thread.sleep(100);
} else {
throw new AssertionError("Test failed after " + maxTries + " attempts.", e);
}
}
}
waitForError(client, handle.getSecret());
} finally {
SparkLauncher.launcherConfig.remove(SparkLauncher.CHILD_CONNECTION_TIMEOUT);
kill(handle);
Expand Down Expand Up @@ -183,6 +162,25 @@ public void infoChanged(SparkAppHandle handle) {
}
}

@Test
public void testStreamFiltering() throws Exception {
  ChildProcAppHandle handle = LauncherServer.newAppHandle();
  TestClient client = null;
  try {
    Socket s = new Socket(InetAddress.getLoopbackAddress(),
      LauncherServer.getServerInstance().getPort());

    client = new TestClient(s);
    // Send a message the server is expected to refuse to deserialize, then wait until the
    // client observes an error on the connection.
    client.send(new EvilPayload());
    waitForError(client, handle.getSecret());
    // EVIL_BIT is only changed by EvilPayload's readObject hook, so it must still be 0.
    assertEquals(0, EvilPayload.EVIL_BIT);
  } finally {
    kill(handle);
    close(client);
    // client is still null if the socket or the TestClient could not be created; joining
    // unconditionally would throw a NullPointerException that hides the original failure.
    if (client != null) {
      client.clientThread.join();
    }
  }
}

private void kill(SparkAppHandle handle) {
if (handle != null) {
handle.kill();
Expand All @@ -199,6 +197,35 @@ private void close(Closeable c) {
}
}

/**
 * Repeatedly sends messages through the client until a send fails. Several attempts are made
 * because the client-side socket may not reflect the server-side close immediately.
 *
 * @param client connection used to send the messages.
 * @param secret secret placed in the initial Hello message.
 * @throws AssertionError if every attempt went through without an error.
 */
private void waitForError(TestClient client, String secret) throws Exception {
  final int maxTries = 10;
  boolean helloSent = false;
  int attempt = 0;
  while (attempt < maxTries) {
    try {
      // Hello is sent until it goes through once; afterwards SetAppId is used as the probe.
      if (helloSent) {
        client.send(new SetAppId("appId"));
      } else {
        client.send(new Hello(secret, "1.4.0"));
        helloSent = true;
      }
      fail("Expected error but message went through.");
    } catch (IllegalStateException | IOException e) {
      // Expected: the send failed, which is what the caller is waiting for.
      return;
    } catch (AssertionError e) {
      // The message went through; give up on the last attempt, otherwise pause and retry.
      if (attempt >= maxTries - 1) {
        throw new AssertionError("Test failed after " + maxTries + " attempts.", e);
      }
      Thread.sleep(100);
    }
    attempt++;
  }
}

private static class TestClient extends LauncherConnection {

final BlockingQueue<Message> inbound;
Expand All @@ -220,4 +247,19 @@ protected void handle(Message msg) throws IOException {

}

/**
 * Test-only message used by testStreamFiltering to check that the launcher server does not
 * deserialize messages containing classes outside the launcher protocol.
 */
private static class EvilPayload extends LauncherProtocol.Message {

// Set to 1 by readObject() below; testStreamFiltering asserts that it is still 0 after the
// payload has been sent.
static int EVIL_BIT = 0;

// This field should cause the launcher server to throw an error and not deserialize the
// message. Arrays.asList returns a java.util class, which is outside the "java.lang." and
// "org.apache.spark.launcher." prefixes accepted by FilteredObjectInputStream.
private List<String> notAllowedField = Arrays.asList("disallowed");

// Custom deserialization hook: flips EVIL_BIT after the default field read completes.
private void readObject(ObjectInputStream stream) throws IOException, ClassNotFoundException {
stream.defaultReadObject();
EVIL_BIT = 1;
}

}

}
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,13 @@ class CoordinateMatrix @Since("1.0.0") (
s"colsPerBlock needs to be greater than 0. colsPerBlock: $colsPerBlock")
val m = numRows()
val n = numCols()

// Since block matrices require an integer row and col index
require(math.ceil(m.toDouble / rowsPerBlock) <= Int.MaxValue,
"Number of rows divided by rowsPerBlock cannot exceed maximum integer.")
require(math.ceil(n.toDouble / colsPerBlock) <= Int.MaxValue,
"Number of cols divided by colsPerBlock cannot exceed maximum integer.")

val numRowBlocks = math.ceil(m.toDouble / rowsPerBlock).toInt
val numColBlocks = math.ceil(n.toDouble / colsPerBlock).toInt
val partitioner = GridPartitioner(numRowBlocks, numColBlocks, entries.partitions.length)
Expand Down
Loading