diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 9b7e95ce30acb..ca45c6f9b0a96 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -361,6 +361,7 @@ export("as.DataFrame",
"clearCache",
"createDataFrame",
"createExternalTable",
+ "createTable",
"currentDatabase",
"dropTempTable",
"dropTempView",
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 97786df4ae6a1..ec85f723c08c6 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -557,7 +557,7 @@ setMethod("insertInto",
jmode <- convertToJSaveMode(ifelse(overwrite, "overwrite", "append"))
write <- callJMethod(x@sdf, "write")
write <- callJMethod(write, "mode", jmode)
- callJMethod(write, "insertInto", tableName)
+ invisible(callJMethod(write, "insertInto", tableName))
})
#' Cache
@@ -2894,7 +2894,7 @@ setMethod("saveAsTable",
write <- callJMethod(write, "format", source)
write <- callJMethod(write, "mode", jmode)
write <- callJMethod(write, "options", options)
- callJMethod(write, "saveAsTable", tableName)
+ invisible(callJMethod(write, "saveAsTable", tableName))
})
#' summary
diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index a1edef7608fa1..c2a1e240ad395 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -544,12 +544,15 @@ sql <- function(x, ...) {
dispatchFunc("sql(sqlQuery)", x, ...)
}
-#' Create a SparkDataFrame from a SparkSQL Table
+#' Create a SparkDataFrame from a SparkSQL table or view
#'
-#' Returns the specified Table as a SparkDataFrame. The Table must have already been registered
-#' in the SparkSession.
+#' Returns the specified table or view as a SparkDataFrame. The table or view must already exist or
+#' have already been registered in the SparkSession.
#'
-#' @param tableName The SparkSQL Table to convert to a SparkDataFrame.
+#' @param tableName the qualified or unqualified name that designates a table or view. If a database
+#' is specified, it identifies the table/view from the database.
+#' Otherwise, it first attempts to find a temporary view with the given name
+#' and then match the table/view from the current database.
#' @return SparkDataFrame
#' @rdname tableToDF
#' @name tableToDF
diff --git a/R/pkg/R/catalog.R b/R/pkg/R/catalog.R
index 07a89f763cde1..e59a7024333ac 100644
--- a/R/pkg/R/catalog.R
+++ b/R/pkg/R/catalog.R
@@ -17,7 +17,7 @@
# catalog.R: SparkSession catalog functions
-#' Create an external table
+#' (Deprecated) Create an external table
#'
#' Creates an external table based on the dataset in a data source,
#' Returns a SparkDataFrame associated with the external table.
@@ -29,10 +29,11 @@
#' @param tableName a name of the table.
#' @param path the path of files to load.
#' @param source the name of external data source.
-#' @param schema the schema of the data for certain data source.
+#' @param schema the schema of the data required for some data sources.
#' @param ... additional argument(s) passed to the method.
#' @return A SparkDataFrame.
-#' @rdname createExternalTable
+#' @rdname createExternalTable-deprecated
+#' @seealso \link{createTable}
#' @export
#' @examples
#'\dontrun{
@@ -43,29 +44,70 @@
#' @method createExternalTable default
#' @note createExternalTable since 1.4.0
createExternalTable.default <- function(tableName, path = NULL, source = NULL, schema = NULL, ...) {
+ .Deprecated("createTable", old = "createExternalTable")
+ createTable(tableName, path, source, schema, ...)
+}
+
+createExternalTable <- function(x, ...) {
+ dispatchFunc("createExternalTable(tableName, path = NULL, source = NULL, ...)", x, ...)
+}
+
+#' Creates a table based on the dataset in a data source
+#'
+#' Creates a table based on the dataset in a data source. Returns a SparkDataFrame associated with
+#' the table.
+#'
+#' The data source is specified by the \code{source} and a set of options(...).
+#' If \code{source} is not specified, the default data source configured by
+#' "spark.sql.sources.default" will be used. When a \code{path} is specified, an external table is
+#' created from the data at the given path. Otherwise a managed table is created.
+#'
+#' @param tableName the qualified or unqualified name that designates a table. If no database
+#' identifier is provided, it refers to a table in the current database.
+#' @param path (optional) the path of files to load.
+#' @param source (optional) the name of the data source.
+#' @param schema (optional) the schema of the data required for some data sources.
+#' @param ... additional named parameters as options for the data source.
+#' @return A SparkDataFrame.
+#' @rdname createTable
+#' @seealso \link{createExternalTable}
+#' @export
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' df <- createTable("myjson", path = "path/to/json", source = "json", schema = schema)
+#'
+#' createTable("people", source = "json", schema = schema)
+#' insertInto(df, "people")
+#' }
+#' @name createTable
+#' @note createTable since 2.2.0
+createTable <- function(tableName, path = NULL, source = NULL, schema = NULL, ...) {
sparkSession <- getSparkSession()
options <- varargsToStrEnv(...)
if (!is.null(path)) {
options[["path"]] <- path
}
+ if (is.null(source)) {
+ source <- getDefaultSqlSource()  # no source given: use the session's default data source
+ }
catalog <- callJMethod(sparkSession, "catalog")
if (is.null(schema)) {
- sdf <- callJMethod(catalog, "createExternalTable", tableName, source, options)
+ sdf <- callJMethod(catalog, "createTable", tableName, source, options)
+ } else if (inherits(schema, "structType")) {  # scalar, class-aware test (safe inside `if`)
+ sdf <- callJMethod(catalog, "createTable", tableName, source, schema$jobj, options)
} else {
- sdf <- callJMethod(catalog, "createExternalTable", tableName, source, schema$jobj, options)
+ stop("schema must be a structType.")
}
dataFrame(sdf)
}
-createExternalTable <- function(x, ...) {
- dispatchFunc("createExternalTable(tableName, path = NULL, source = NULL, ...)", x, ...)
-}
-
#' Cache Table
#'
#' Caches the specified table in-memory.
#'
-#' @param tableName The name of the table being cached
+#' @param tableName the qualified or unqualified name that designates a table. If no database
+#' identifier is provided, it refers to a table in the current database.
#' @return SparkDataFrame
#' @rdname cacheTable
#' @export
@@ -94,7 +136,8 @@ cacheTable <- function(x, ...) {
#'
#' Removes the specified table from the in-memory cache.
#'
-#' @param tableName The name of the table being uncached
+#' @param tableName the qualified or unqualified name that designates a table. If no database
+#' identifier is provided, it refers to a table in the current database.
#' @return SparkDataFrame
#' @rdname uncacheTable
#' @export
@@ -162,6 +205,7 @@ clearCache <- function() {
#' @method dropTempTable default
#' @note dropTempTable since 1.4.0
dropTempTable.default <- function(tableName) {
+ .Deprecated("dropTempView", old = "dropTempTable")
if (class(tableName) != "character") {
stop("tableName must be a string.")
}
@@ -169,7 +213,6 @@ dropTempTable.default <- function(tableName) {
}
dropTempTable <- function(x, ...) {
- .Deprecated("dropTempView")
dispatchFunc("dropTempView(viewName)", x, ...)
}
@@ -178,7 +221,7 @@ dropTempTable <- function(x, ...) {
#' Drops the temporary view with the given view name in the catalog.
#' If the view has been cached before, then it will also be uncached.
#'
-#' @param viewName the name of the view to be dropped.
+#' @param viewName the name of the temporary view to be dropped.
#' @return TRUE if the view is dropped successfully, FALSE otherwise.
#' @rdname dropTempView
#' @name dropTempView
@@ -317,10 +360,10 @@ listDatabases <- function() {
dataFrame(callJMethod(callJMethod(catalog, "listDatabases"), "toDF"))
}
-#' Returns a list of tables in the specified database
+#' Returns a list of tables or views in the specified database
#'
-#' Returns a list of tables in the specified database.
-#' This includes all temporary tables.
+#' Returns a list of tables or views in the specified database.
+#' This includes all temporary views.
#'
#' @param databaseName (optional) name of the database
#' @return a SparkDataFrame of the list of tables.
@@ -349,11 +392,13 @@ listTables <- function(databaseName = NULL) {
dataFrame(callJMethod(jdst, "toDF"))
}
-#' Returns a list of columns for the given table in the specified database
+#' Returns a list of columns for the given table/view in the specified database
#'
-#' Returns a list of columns for the given table in the specified database.
+#' Returns a list of columns for the given table/view in the specified database.
#'
-#' @param tableName a name of the table.
+#' @param tableName the qualified or unqualified name that designates a table/view. If no database
+#' identifier is provided, it refers to a table/view in the current database.
+#' If \code{databaseName} parameter is specified, this must be an unqualified name.
#' @param databaseName (optional) name of the database
#' @return a SparkDataFrame of the list of column descriptions.
#' @rdname listColumns
@@ -409,12 +454,13 @@ listFunctions <- function(databaseName = NULL) {
dataFrame(callJMethod(jdst, "toDF"))
}
-#' Recover all the partitions in the directory of a table and update the catalog
+#' Recovers all the partitions in the directory of a table and updates the catalog
#'
-#' Recover all the partitions in the directory of a table and update the catalog. The name should
-#' reference a partitioned table, and not a temporary view.
+#' Recovers all the partitions in the directory of a table and updates the catalog. The name should
+#' reference a partitioned table, and not a view.
#'
-#' @param tableName a name of the table.
+#' @param tableName the qualified or unqualified name that designates a table. If no database
+#' identifier is provided, it refers to a table in the current database.
#' @rdname recoverPartitions
#' @name recoverPartitions
#' @export
@@ -430,17 +476,18 @@ recoverPartitions <- function(tableName) {
invisible(handledCallJMethod(catalog, "recoverPartitions", tableName))
}
-#' Invalidate and refresh all the cached metadata of the given table
+#' Invalidates and refreshes all the cached data and metadata of the given table
#'
-#' Invalidate and refresh all the cached metadata of the given table. For performance reasons,
-#' Spark SQL or the external data source library it uses might cache certain metadata about a
-#' table, such as the location of blocks. When those change outside of Spark SQL, users should
+#' Invalidates and refreshes all the cached data and metadata of the given table. For performance
+#' reasons, Spark SQL or the external data source library it uses might cache certain metadata about
+#' a table, such as the location of blocks. When those change outside of Spark SQL, users should
#' call this function to invalidate the cache.
#'
#' If this table is cached as an InMemoryRelation, drop the original cached version and make the
#' new version cached lazily.
#'
-#' @param tableName a name of the table.
+#' @param tableName the qualified or unqualified name that designates a table. If no database
+#' identifier is provided, it refers to a table in the current database.
#' @rdname refreshTable
#' @name refreshTable
#' @export
@@ -456,11 +503,11 @@ refreshTable <- function(tableName) {
invisible(handledCallJMethod(catalog, "refreshTable", tableName))
}
-#' Invalidate and refresh all the cached data and metadata for SparkDataFrame containing path
+#' Invalidates and refreshes all the cached data and metadata for SparkDataFrame containing path
#'
-#' Invalidate and refresh all the cached data (and the associated metadata) for any SparkDataFrame
-#' that contains the given data source path. Path matching is by prefix, i.e. "/" would invalidate
-#' everything that is cached.
+#' Invalidates and refreshes all the cached data (and the associated metadata) for any
+#' SparkDataFrame that contains the given data source path. Path matching is by prefix, i.e. "/"
+#' would invalidate everything that is cached.
#'
#' @param path the path of the data source.
#' @rdname refreshByPath
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index ad06711a79a78..58cf24256a94f 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -281,7 +281,7 @@ test_that("create DataFrame from RDD", {
setHiveContext(sc)
sql("CREATE TABLE people (name string, age double, height float)")
df <- read.df(jsonPathNa, "json", schema)
- invisible(insertInto(df, "people"))
+ insertInto(df, "people")
expect_equal(collect(sql("SELECT age from people WHERE name = 'Bob'"))$age,
c(16))
expect_equal(collect(sql("SELECT height from people WHERE name ='Bob'"))$height,
@@ -1268,7 +1268,16 @@ test_that("column calculation", {
test_that("test HiveContext", {
setHiveContext(sc)
- df <- createExternalTable("json", jsonPath, "json")
+
+ schema <- structType(structField("name", "string"), structField("age", "integer"),
+ structField("height", "float"))
+ createTable("people", source = "json", schema = schema)
+ df <- read.df(jsonPathNa, "json", schema)
+ insertInto(df, "people")
+ expect_equal(collect(sql("SELECT age from people WHERE name = 'Bob'"))$age, c(16))
+ sql("DROP TABLE people")
+
+ df <- createTable("json", jsonPath, "json")
expect_is(df, "SparkDataFrame")
expect_equal(count(df), 3)
df2 <- sql("select * from json")
@@ -1276,25 +1285,26 @@ test_that("test HiveContext", {
expect_equal(count(df2), 3)
jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- invisible(saveAsTable(df, "json2", "json", "append", path = jsonPath2))
+ saveAsTable(df, "json2", "json", "append", path = jsonPath2)
df3 <- sql("select * from json2")
expect_is(df3, "SparkDataFrame")
expect_equal(count(df3), 3)
unlink(jsonPath2)
hivetestDataPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- invisible(saveAsTable(df, "hivetestbl", path = hivetestDataPath))
+ saveAsTable(df, "hivetestbl", path = hivetestDataPath)
df4 <- sql("select * from hivetestbl")
expect_is(df4, "SparkDataFrame")
expect_equal(count(df4), 3)
unlink(hivetestDataPath)
parquetDataPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- invisible(saveAsTable(df, "parquetest", "parquet", mode = "overwrite", path = parquetDataPath))
+ saveAsTable(df, "parquetest", "parquet", mode = "overwrite", path = parquetDataPath)
df5 <- sql("select * from parquetest")
expect_is(df5, "SparkDataFrame")
expect_equal(count(df5), 3)
unlink(parquetDataPath)
+
unsetHiveContext()
})
diff --git a/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html b/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html
index 4e83d6d564986..5c91304e49fd7 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html
+++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html
@@ -24,7 +24,15 @@
{
+
++
++
++
++
@@ -65,6 +94,11 @@ private[ui] class ExecutorsPage(
}
private[spark] object ExecutorsPage {
+ private val ON_HEAP_MEMORY_TOOLTIP = "Memory used / total available memory for on heap " +
+ "storage of data like RDD partitions cached in memory."
+ private val OFF_HEAP_MEMORY_TOOLTIP = "Memory used / total available memory for off heap " +
+ "storage of data like RDD partitions cached in memory."
+
/** Represent an executor's info as a map given a storage status index */
def getExecInfo(
listener: ExecutorsListener,
@@ -80,6 +114,10 @@ private[spark] object ExecutorsPage {
val rddBlocks = status.numBlocks
val memUsed = status.memUsed
val maxMem = status.maxMem
+ val onHeapMemUsed = status.onHeapMemUsed
+ val offHeapMemUsed = status.offHeapMemUsed
+ val maxOnHeapMem = status.maxOnHeapMem
+ val maxOffHeapMem = status.maxOffHeapMem
val diskUsed = status.diskUsed
val taskSummary = listener.executorToTaskSummary.getOrElse(execId, ExecutorTaskSummary(execId))
@@ -103,7 +141,11 @@ private[spark] object ExecutorsPage {
taskSummary.shuffleWrite,
taskSummary.isBlacklisted,
maxMem,
- taskSummary.executorLogs
+ taskSummary.executorLogs,
+ onHeapMemUsed,
+ offHeapMemUsed,
+ maxOnHeapMem,
+ maxOffHeapMem
)
}
}
diff --git a/core/src/main/scala/org/apache/spark/ui/storage/RDDPage.scala b/core/src/main/scala/org/apache/spark/ui/storage/RDDPage.scala
index 227e940c9c50c..a1a0c729b9240 100644
--- a/core/src/main/scala/org/apache/spark/ui/storage/RDDPage.scala
+++ b/core/src/main/scala/org/apache/spark/ui/storage/RDDPage.scala
@@ -147,7 +147,8 @@ private[ui] class RDDPage(parent: StorageTab) extends WebUIPage("rdd") {
/** Header fields for the worker table */
private def workerHeader = Seq(
"Host",
- "Memory Usage",
+ "On Heap Memory Usage",
+ "Off Heap Memory Usage",
"Disk Usage")
/** Render an HTML row representing a worker */
@@ -155,8 +156,12 @@ private[ui] class RDDPage(parent: StorageTab) extends WebUIPage("rdd") {
| {worker.address} |
- {Utils.bytesToString(worker.memoryUsed)}
- ({Utils.bytesToString(worker.memoryRemaining)} Remaining)
+ {Utils.bytesToString(worker.onHeapMemoryUsed.getOrElse(0L))}
+ ({Utils.bytesToString(worker.onHeapMemoryRemaining.getOrElse(0L))} Remaining)
+ |
+
+ {Utils.bytesToString(worker.offHeapMemoryUsed.getOrElse(0L))}
+ ({Utils.bytesToString(worker.offHeapMemoryRemaining.getOrElse(0L))} Remaining)
|
{Utils.bytesToString(worker.diskUsed)} |
diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
index 1d2cb7acefa33..8296c4294242c 100644
--- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
+++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
@@ -182,7 +182,9 @@ private[spark] object JsonProtocol {
("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.blockManagerAdded) ~
("Block Manager ID" -> blockManagerId) ~
("Maximum Memory" -> blockManagerAdded.maxMem) ~
- ("Timestamp" -> blockManagerAdded.time)
+ ("Timestamp" -> blockManagerAdded.time) ~
+ ("Maximum Onheap Memory" -> blockManagerAdded.maxOnHeapMem) ~
+ ("Maximum Offheap Memory" -> blockManagerAdded.maxOffHeapMem)
}
def blockManagerRemovedToJson(blockManagerRemoved: SparkListenerBlockManagerRemoved): JValue = {
@@ -612,7 +614,9 @@ private[spark] object JsonProtocol {
val blockManagerId = blockManagerIdFromJson(json \ "Block Manager ID")
val maxMem = (json \ "Maximum Memory").extract[Long]
val time = Utils.jsonOption(json \ "Timestamp").map(_.extract[Long]).getOrElse(-1L)
- SparkListenerBlockManagerAdded(time, blockManagerId, maxMem)
+ val maxOnHeapMem = Utils.jsonOption(json \ "Maximum Onheap Memory").map(_.extract[Long])
+ val maxOffHeapMem = Utils.jsonOption(json \ "Maximum Offheap Memory").map(_.extract[Long])
+ SparkListenerBlockManagerAdded(time, blockManagerId, maxMem, maxOnHeapMem, maxOffHeapMem)
}
def blockManagerRemovedFromJson(json: JValue): SparkListenerBlockManagerRemoved = {
diff --git a/core/src/test/resources/HistoryServerExpectations/executor_memory_usage_expectation.json b/core/src/test/resources/HistoryServerExpectations/executor_memory_usage_expectation.json
new file mode 100644
index 0000000000000..e732af2663503
--- /dev/null
+++ b/core/src/test/resources/HistoryServerExpectations/executor_memory_usage_expectation.json
@@ -0,0 +1,139 @@
+[ {
+ "id" : "2",
+ "hostPort" : "172.22.0.167:51487",
+ "isActive" : true,
+ "rddBlocks" : 0,
+ "memoryUsed" : 0,
+ "diskUsed" : 0,
+ "totalCores" : 4,
+ "maxTasks" : 4,
+ "activeTasks" : 0,
+ "failedTasks" : 4,
+ "completedTasks" : 0,
+ "totalTasks" : 4,
+ "totalDuration" : 2537,
+ "totalGCTime" : 88,
+ "totalInputBytes" : 0,
+ "totalShuffleRead" : 0,
+ "totalShuffleWrite" : 0,
+ "isBlacklisted" : true,
+ "maxMemory" : 908381388,
+ "executorLogs" : {
+ "stdout" : "http://172.22.0.167:51469/logPage/?appId=app-20161116163331-0000&executorId=2&logType=stdout",
+ "stderr" : "http://172.22.0.167:51469/logPage/?appId=app-20161116163331-0000&executorId=2&logType=stderr"
+ },
+ "onHeapMemoryUsed" : 0,
+ "offHeapMemoryUsed" : 0,
+ "maxOnHeapMemory" : 384093388,
+ "maxOffHeapMemory" : 524288000
+}, {
+ "id" : "driver",
+ "hostPort" : "172.22.0.167:51475",
+ "isActive" : true,
+ "rddBlocks" : 0,
+ "memoryUsed" : 0,
+ "diskUsed" : 0,
+ "totalCores" : 0,
+ "maxTasks" : 0,
+ "activeTasks" : 0,
+ "failedTasks" : 0,
+ "completedTasks" : 0,
+ "totalTasks" : 0,
+ "totalDuration" : 0,
+ "totalGCTime" : 0,
+ "totalInputBytes" : 0,
+ "totalShuffleRead" : 0,
+ "totalShuffleWrite" : 0,
+ "isBlacklisted" : true,
+ "maxMemory" : 908381388,
+ "executorLogs" : { },
+ "onHeapMemoryUsed" : 0,
+ "offHeapMemoryUsed" : 0,
+ "maxOnHeapMemory" : 384093388,
+ "maxOffHeapMemory" : 524288000
+}, {
+ "id" : "1",
+ "hostPort" : "172.22.0.167:51490",
+ "isActive" : true,
+ "rddBlocks" : 0,
+ "memoryUsed" : 0,
+ "diskUsed" : 0,
+ "totalCores" : 4,
+ "maxTasks" : 4,
+ "activeTasks" : 0,
+ "failedTasks" : 0,
+ "completedTasks" : 4,
+ "totalTasks" : 4,
+ "totalDuration" : 3152,
+ "totalGCTime" : 68,
+ "totalInputBytes" : 0,
+ "totalShuffleRead" : 0,
+ "totalShuffleWrite" : 0,
+ "isBlacklisted" : true,
+ "maxMemory" : 908381388,
+ "executorLogs" : {
+ "stdout" : "http://172.22.0.167:51467/logPage/?appId=app-20161116163331-0000&executorId=1&logType=stdout",
+ "stderr" : "http://172.22.0.167:51467/logPage/?appId=app-20161116163331-0000&executorId=1&logType=stderr"
+ },
+
+ "onHeapMemoryUsed" : 0,
+ "offHeapMemoryUsed" : 0,
+ "maxOnHeapMemory" : 384093388,
+ "maxOffHeapMemory" : 524288000
+}, {
+ "id" : "0",
+ "hostPort" : "172.22.0.167:51491",
+ "isActive" : true,
+ "rddBlocks" : 0,
+ "memoryUsed" : 0,
+ "diskUsed" : 0,
+ "totalCores" : 4,
+ "maxTasks" : 4,
+ "activeTasks" : 0,
+ "failedTasks" : 4,
+ "completedTasks" : 0,
+ "totalTasks" : 4,
+ "totalDuration" : 2551,
+ "totalGCTime" : 116,
+ "totalInputBytes" : 0,
+ "totalShuffleRead" : 0,
+ "totalShuffleWrite" : 0,
+ "isBlacklisted" : true,
+ "maxMemory" : 908381388,
+ "executorLogs" : {
+ "stdout" : "http://172.22.0.167:51465/logPage/?appId=app-20161116163331-0000&executorId=0&logType=stdout",
+ "stderr" : "http://172.22.0.167:51465/logPage/?appId=app-20161116163331-0000&executorId=0&logType=stderr"
+ },
+ "onHeapMemoryUsed" : 0,
+ "offHeapMemoryUsed" : 0,
+ "maxOnHeapMemory" : 384093388,
+ "maxOffHeapMemory" : 524288000
+}, {
+ "id" : "3",
+ "hostPort" : "172.22.0.167:51485",
+ "isActive" : true,
+ "rddBlocks" : 0,
+ "memoryUsed" : 0,
+ "diskUsed" : 0,
+ "totalCores" : 4,
+ "maxTasks" : 4,
+ "activeTasks" : 0,
+ "failedTasks" : 0,
+ "completedTasks" : 12,
+ "totalTasks" : 12,
+ "totalDuration" : 2453,
+ "totalGCTime" : 72,
+ "totalInputBytes" : 0,
+ "totalShuffleRead" : 0,
+ "totalShuffleWrite" : 0,
+ "isBlacklisted" : true,
+ "maxMemory" : 908381388,
+ "executorLogs" : {
+ "stdout" : "http://172.22.0.167:51466/logPage/?appId=app-20161116163331-0000&executorId=3&logType=stdout",
+ "stderr" : "http://172.22.0.167:51466/logPage/?appId=app-20161116163331-0000&executorId=3&logType=stderr"
+ },
+ "onHeapMemoryUsed" : 0,
+ "offHeapMemoryUsed" : 0,
+ "maxOnHeapMemory" : 384093388,
+ "maxOffHeapMemory" : 524288000
+} ]
diff --git a/core/src/test/resources/HistoryServerExpectations/executor_node_blacklisting_expectation.json b/core/src/test/resources/HistoryServerExpectations/executor_node_blacklisting_expectation.json
index 5914a1c2c4b6d..e732af2663503 100644
--- a/core/src/test/resources/HistoryServerExpectations/executor_node_blacklisting_expectation.json
+++ b/core/src/test/resources/HistoryServerExpectations/executor_node_blacklisting_expectation.json
@@ -17,11 +17,15 @@
"totalShuffleRead" : 0,
"totalShuffleWrite" : 0,
"isBlacklisted" : true,
- "maxMemory" : 384093388,
+ "maxMemory" : 908381388,
"executorLogs" : {
"stdout" : "http://172.22.0.167:51469/logPage/?appId=app-20161116163331-0000&executorId=2&logType=stdout",
"stderr" : "http://172.22.0.167:51469/logPage/?appId=app-20161116163331-0000&executorId=2&logType=stderr"
- }
+ },
+ "onHeapMemoryUsed" : 0,
+ "offHeapMemoryUsed" : 0,
+ "maxOnHeapMemory" : 384093388,
+ "maxOffHeapMemory" : 524288000
}, {
"id" : "driver",
"hostPort" : "172.22.0.167:51475",
@@ -41,8 +45,12 @@
"totalShuffleRead" : 0,
"totalShuffleWrite" : 0,
"isBlacklisted" : true,
- "maxMemory" : 384093388,
- "executorLogs" : { }
+ "maxMemory" : 908381388,
+ "executorLogs" : { },
+ "onHeapMemoryUsed" : 0,
+ "offHeapMemoryUsed" : 0,
+ "maxOnHeapMemory" : 384093388,
+ "maxOffHeapMemory" : 524288000
}, {
"id" : "1",
"hostPort" : "172.22.0.167:51490",
@@ -62,11 +70,16 @@
"totalShuffleRead" : 0,
"totalShuffleWrite" : 0,
"isBlacklisted" : true,
- "maxMemory" : 384093388,
+ "maxMemory" : 908381388,
"executorLogs" : {
"stdout" : "http://172.22.0.167:51467/logPage/?appId=app-20161116163331-0000&executorId=1&logType=stdout",
"stderr" : "http://172.22.0.167:51467/logPage/?appId=app-20161116163331-0000&executorId=1&logType=stderr"
- }
+ },
+
+ "onHeapMemoryUsed" : 0,
+ "offHeapMemoryUsed" : 0,
+ "maxOnHeapMemory" : 384093388,
+ "maxOffHeapMemory" : 524288000
}, {
"id" : "0",
"hostPort" : "172.22.0.167:51491",
@@ -86,11 +99,15 @@
"totalShuffleRead" : 0,
"totalShuffleWrite" : 0,
"isBlacklisted" : true,
- "maxMemory" : 384093388,
+ "maxMemory" : 908381388,
"executorLogs" : {
"stdout" : "http://172.22.0.167:51465/logPage/?appId=app-20161116163331-0000&executorId=0&logType=stdout",
"stderr" : "http://172.22.0.167:51465/logPage/?appId=app-20161116163331-0000&executorId=0&logType=stderr"
- }
+ },
+ "onHeapMemoryUsed" : 0,
+ "offHeapMemoryUsed" : 0,
+ "maxOnHeapMemory" : 384093388,
+ "maxOffHeapMemory" : 524288000
}, {
"id" : "3",
"hostPort" : "172.22.0.167:51485",
@@ -110,9 +127,13 @@
"totalShuffleRead" : 0,
"totalShuffleWrite" : 0,
"isBlacklisted" : true,
- "maxMemory" : 384093388,
+ "maxMemory" : 908381388,
"executorLogs" : {
"stdout" : "http://172.22.0.167:51466/logPage/?appId=app-20161116163331-0000&executorId=3&logType=stdout",
"stderr" : "http://172.22.0.167:51466/logPage/?appId=app-20161116163331-0000&executorId=3&logType=stderr"
- }
+ },
+ "onHeapMemoryUsed" : 0,
+ "offHeapMemoryUsed" : 0,
+ "maxOnHeapMemory" : 384093388,
+ "maxOffHeapMemory" : 524288000
} ]
diff --git a/core/src/test/resources/spark-events/app-20161116163331-0000 b/core/src/test/resources/spark-events/app-20161116163331-0000
index 7566c9fc0a20b..57cfc5b973129 100755
--- a/core/src/test/resources/spark-events/app-20161116163331-0000
+++ b/core/src/test/resources/spark-events/app-20161116163331-0000
@@ -1,15 +1,15 @@
{"Event":"SparkListenerLogStart","Spark Version":"2.1.0-SNAPSHOT"}
-{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"driver","Host":"172.22.0.167","Port":51475},"Maximum Memory":384093388,"Timestamp":1479335611477}
+{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"driver","Host":"172.22.0.167","Port":51475},"Maximum Memory":908381388,"Timestamp":1479335611477,"Maximum Onheap Memory":384093388,"Maximum Offheap Memory":524288000}
{"Event":"SparkListenerEnvironmentUpdate","JVM Information":{"Java Home":"/Library/Java/JavaVirtualMachines/jdk1.8.0_92.jdk/Contents/Home/jre","Java Version":"1.8.0_92 (Oracle Corporation)","Scala Version":"version 2.11.8"},"Spark Properties":{"spark.blacklist.task.maxTaskAttemptsPerExecutor":"3","spark.blacklist.enabled":"TRUE","spark.driver.host":"172.22.0.167","spark.blacklist.task.maxTaskAttemptsPerNode":"3","spark.eventLog.enabled":"TRUE","spark.driver.port":"51459","spark.repl.class.uri":"spark://172.22.0.167:51459/classes","spark.jars":"","spark.repl.class.outputDir":"/private/var/folders/l4/d46wlzj16593f3d812vk49tw0000gp/T/spark-1cbc97d0-7fe6-4c9f-8c2c-f6fe51ee3cf2/repl-39929169-ac4c-4c6d-b116-f648e4dd62ed","spark.app.name":"Spark shell","spark.blacklist.stage.maxFailedExecutorsPerNode":"3","spark.scheduler.mode":"FIFO","spark.eventLog.overwrite":"TRUE","spark.blacklist.stage.maxFailedTasksPerExecutor":"3","spark.executor.id":"driver","spark.blacklist.application.maxFailedExecutorsPerNode":"2","spark.submit.deployMode":"client","spark.master":"local-cluster[4,4,1024]","spark.home":"/Users/Jose/IdeaProjects/spark","spark.eventLog.dir":"/Users/jose/logs","spark.sql.catalogImplementation":"in-memory","spark.eventLog.compress":"FALSE","spark.blacklist.application.maxFailedTasksPerExecutor":"1","spark.blacklist.timeout":"1000000","spark.app.id":"app-20161116163331-0000","spark.task.maxFailures":"4"},"System Properties":{"java.io.tmpdir":"/var/folders/l4/d46wlzj16593f3d812vk49tw0000gp/T/","line.separator":"\n","path.separator":":","sun.management.compiler":"HotSpot 64-Bit Tiered Compilers","SPARK_SUBMIT":"true","sun.cpu.endian":"little","java.specification.version":"1.8","java.vm.specification.name":"Java Virtual Machine Specification","java.vendor":"Oracle 
Corporation","java.vm.specification.version":"1.8","user.home":"/Users/Jose","file.encoding.pkg":"sun.io","sun.nio.ch.bugLevel":"","ftp.nonProxyHosts":"local|*.local|169.254/16|*.169.254/16","sun.arch.data.model":"64","sun.boot.library.path":"/Library/Java/JavaVirtualMachines/jdk1.8.0_92.jdk/Contents/Home/jre/lib","user.dir":"/Users/Jose/IdeaProjects/spark","java.library.path":"/Users/Jose/Library/Java/Extensions:/Library/Java/Extensions:/Network/Library/Java/Extensions:/System/Library/Java/Extensions:/usr/lib/java:.","sun.cpu.isalist":"","os.arch":"x86_64","java.vm.version":"25.92-b14","java.endorsed.dirs":"/Library/Java/JavaVirtualMachines/jdk1.8.0_92.jdk/Contents/Home/jre/lib/endorsed","java.runtime.version":"1.8.0_92-b14","java.vm.info":"mixed mode","java.ext.dirs":"/Users/Jose/Library/Java/Extensions:/Library/Java/JavaVirtualMachines/jdk1.8.0_92.jdk/Contents/Home/jre/lib/ext:/Library/Java/Extensions:/Network/Library/Java/Extensions:/System/Library/Java/Extensions:/usr/lib/java","java.runtime.name":"Java(TM) SE Runtime Environment","file.separator":"/","io.netty.maxDirectMemory":"0","java.class.version":"52.0","scala.usejavacp":"true","java.specification.name":"Java Platform API Specification","sun.boot.class.path":"/Library/Java/JavaVirtualMachines/jdk1.8.0_92.jdk/Contents/Home/jre/lib/resources.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_92.jdk/Contents/Home/jre/lib/rt.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_92.jdk/Contents/Home/jre/lib/sunrsasign.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_92.jdk/Contents/Home/jre/lib/jsse.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_92.jdk/Contents/Home/jre/lib/jce.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_92.jdk/Contents/Home/jre/lib/charsets.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_92.jdk/Contents/Home/jre/lib/jfr.jar:/Library/Java/JavaVirtualMachines/jdk1.8.0_92.jdk/Contents/Home/jre/classes","file.encoding":"UTF-8","user.timezone":"America/Chicago","java.specification.vendor":"Oracle 
Corporation","sun.java.launcher":"SUN_STANDARD","os.version":"10.11.6","sun.os.patch.level":"unknown","gopherProxySet":"false","java.vm.specification.vendor":"Oracle Corporation","user.country":"US","sun.jnu.encoding":"UTF-8","http.nonProxyHosts":"local|*.local|169.254/16|*.169.254/16","user.language":"en","socksNonProxyHosts":"local|*.local|169.254/16|*.169.254/16","java.vendor.url":"http://java.oracle.com/","java.awt.printerjob":"sun.lwawt.macosx.CPrinterJob","java.awt.graphicsenv":"sun.awt.CGraphicsEnvironment","awt.toolkit":"sun.lwawt.macosx.LWCToolkit","os.name":"Mac OS X","java.vm.vendor":"Oracle Corporation","java.vendor.url.bug":"http://bugreport.sun.com/bugreport/","user.name":"jose","java.vm.name":"Java HotSpot(TM) 64-Bit Server VM","sun.java.command":"org.apache.spark.deploy.SparkSubmit --master local-cluster[4,4,1024] --conf spark.blacklist.enabled=TRUE --conf spark.blacklist.timeout=1000000 --conf spark.blacklist.application.maxFailedTasksPerExecutor=1 --conf spark.eventLog.overwrite=TRUE --conf spark.blacklist.task.maxTaskAttemptsPerNode=3 --conf spark.blacklist.stage.maxFailedTasksPerExecutor=3 --conf spark.blacklist.task.maxTaskAttemptsPerExecutor=3 --conf spark.eventLog.compress=FALSE --conf spark.blacklist.stage.maxFailedExecutorsPerNode=3 --conf spark.eventLog.enabled=TRUE --conf spark.eventLog.dir=/Users/jose/logs --conf spark.blacklist.application.maxFailedExecutorsPerNode=2 --conf spark.task.maxFailures=4 --class org.apache.spark.repl.Main --name Spark shell spark-shell -i /Users/Jose/dev/jose-utils/blacklist/test-blacklist.scala","java.home":"/Library/Java/JavaVirtualMachines/jdk1.8.0_92.jdk/Contents/Home/jre","java.version":"1.8.0_92","sun.io.unicode.encoding":"UnicodeBig"},"Classpath Entries":{"/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/avro-mapred-1.7.7-hadoop2.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/hadoop-mapreduce-client-core-2.2.0.jar":"System 
Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jetty-servlet-9.2.16.v20160414.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/parquet-column-1.8.1.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/snappy-java-1.1.2.6.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/oro-2.0.8.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/arpack_combined_all-0.1.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/pmml-schema-1.2.15.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/spark-assembly_2.11-2.1.0-SNAPSHOT.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/javassist-3.18.1-GA.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/spark-tags_2.11-2.1.0-SNAPSHOT.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/spark-launcher_2.11-2.1.0-SNAPSHOT.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/commons-math3-3.4.1.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/hk2-api-2.4.0-b34.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/scala-xml_2.11-1.0.4.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/objenesis-2.1.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/spire-macros_2.11-0.7.4.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/scala-reflect-2.11.8.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/spark-mllib-local_2.11-2.1.0-SNAPSHOT.jar":"System 
Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/spark-mllib_2.11-2.1.0-SNAPSHOT.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jersey-server-2.22.2.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/core/target/scala-2.11/classes/":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jackson-mapper-asl-1.9.13.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jackson-module-scala_2.11-2.6.5.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/curator-framework-2.4.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/javax.inject-1.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/curator-client-2.4.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jackson-core-asl-1.9.13.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/common/network-common/target/scala-2.11/classes/":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/zookeeper-3.4.5.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/hadoop-auth-2.2.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/repl/target/scala-2.11/classes/":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jul-to-slf4j-1.7.16.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jersey-media-jaxb-2.22.2.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jetty-io-9.2.16.v20160414.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/RoaringBitmap-0.5.11.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/javax.ws.rs-api-2.0.1.jar":"System 
Classpath","/Users/Jose/IdeaProjects/spark/sql/catalyst/target/scala-2.11/classes/":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/spark-unsafe_2.11-2.1.0-SNAPSHOT.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/spark-repl_2.11-2.1.0-SNAPSHOT.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jetty-continuation-9.2.16.v20160414.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/hadoop-yarn-client-2.2.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/sql/hive-thriftserver/target/scala-2.11/classes":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/hadoop-annotations-2.2.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/metrics-graphite-3.1.2.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/hadoop-yarn-api-2.2.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jersey-container-servlet-core-2.22.2.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/streaming/target/scala-2.11/classes/":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/commons-net-3.1.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jetty-proxy-9.2.16.v20160414.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/spark-catalyst_2.11-2.1.0-SNAPSHOT.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/lz4-1.3.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/commons-crypto-1.0.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/common/network-yarn/target/scala-2.11/classes":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/javax.annotation-api-1.2.jar":"System 
Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/spark-sql_2.11-2.1.0-SNAPSHOT.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/guava-14.0.1.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/javax.servlet-api-3.1.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/commons-collections-3.2.1.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/conf/":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/unused-1.0.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/aopalliance-1.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/parquet-encoding-1.8.1.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/common/tags/target/scala-2.11/classes/":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/json4s-jackson_2.11-3.2.11.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/commons-cli-1.2.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/hadoop-yarn-server-common-2.2.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/cglib-2.2.1-v20090111.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/pyrolite-4.13.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/scala-library-2.11.8.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/scala-parser-combinators_2.11-1.0.4.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jetty-util-6.1.26.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/py4j-0.10.4.jar":"System 
Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/commons-configuration-1.6.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/core-1.1.2.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/core/target/jars/*":"System Classpath","/Users/Jose/IdeaProjects/spark/common/network-shuffle/target/scala-2.11/classes/":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/parquet-format-2.3.0-incubating.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/kryo-shaded-3.0.3.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/sql/core/target/scala-2.11/classes/":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/chill-java-0.8.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jackson-annotations-2.6.5.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/parquet-hadoop-1.8.1.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/sql/hive/target/scala-2.11/classes/":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/avro-ipc-1.7.7.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/xz-1.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/parquet-jackson-1.8.1.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/aopalliance-repackaged-2.4.0-b34.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jersey-common-2.22.2.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/log4j-1.2.17.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/metrics-core-3.1.2.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jetty-util-9.2.16.v20160414.jar":"System 
Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/scalap-2.11.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/osgi-resource-locator-1.0.1.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/commons-beanutils-1.7.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/commons-compress-1.4.1.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jcl-over-slf4j-1.7.16.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/yarn/target/scala-2.11/classes":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jetty-plus-9.2.16.v20160414.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/protobuf-java-2.5.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/common/unsafe/target/scala-2.11/classes/":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jackson-module-paranamer-2.6.5.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/leveldbjni-all-1.8.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jackson-core-2.6.5.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/slf4j-api-1.7.16.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/compress-lzf-1.0.3.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/stream-2.7.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/hadoop-mapreduce-client-shuffle-2.2.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/commons-codec-1.10.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/hadoop-yarn-common-2.2.0.jar":"System 
Classpath","/Users/Jose/IdeaProjects/spark/common/sketch/target/scala-2.11/classes/":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/breeze_2.11-0.12.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/hadoop-mapreduce-client-common-2.2.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/spark-core_2.11-2.1.0-SNAPSHOT.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jersey-container-servlet-2.22.2.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/spark-network-shuffle_2.11-2.1.0-SNAPSHOT.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/commons-lang-2.5.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/ivy-2.4.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/hadoop-common-2.2.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/commons-math-2.1.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/hadoop-hdfs-2.2.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/scala-compiler-2.11.8.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/metrics-jvm-3.1.2.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/commons-lang3-3.5.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jsr305-1.3.9.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/minlog-1.3.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/netty-3.8.0.Final.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jetty-webapp-9.2.16.v20160414.jar":"System 
Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/json4s-ast_2.11-3.2.11.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/xbean-asm5-shaded-4.4.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/commons-io-2.1.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/slf4j-log4j12-1.7.16.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/hk2-locator-2.4.0-b34.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/shapeless_2.11-2.0.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/spark-network-common_2.11-2.1.0-SNAPSHOT.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jetty-xml-9.2.16.v20160414.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/commons-httpclient-3.1.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/javax.inject-2.4.0-b34.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/mllib/target/scala-2.11/classes/":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/scalatest_2.11-2.2.6.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/hk2-utils-2.4.0-b34.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jetty-client-9.2.16.v20160414.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jersey-guava-2.22.2.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jetty-jndi-9.2.16.v20160414.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/graphx/target/scala-2.11/classes/":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/hadoop-mapreduce-client-app-2.2.0.jar":"System 
Classpath","/Users/Jose/IdeaProjects/spark/examples/target/scala-2.11/classes/":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/xmlenc-0.52.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jets3t-0.7.1.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/curator-recipes-2.4.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/opencsv-2.3.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jtransforms-2.4.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/antlr4-runtime-4.5.3.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/chill_2.11-0.8.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/commons-digester-1.8.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/univocity-parsers-2.2.1.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jline-2.12.1.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/spark-streaming_2.11-2.1.0-SNAPSHOT.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/launcher/target/scala-2.11/classes/":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/breeze-macros_2.11-0.12.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jersey-client-2.22.2.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jackson-databind-2.6.5.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jetty-servlets-9.2.16.v20160414.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/paranamer-2.6.jar":"System 
Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jetty-security-9.2.16.v20160414.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/avro-ipc-1.7.7-tests.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/avro-1.7.7.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/spire_2.11-0.7.4.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/hadoop-client-2.2.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/metrics-json-3.1.2.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/commons-beanutils-core-1.8.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/validation-api-1.1.0.Final.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/spark-graphx_2.11-2.1.0-SNAPSHOT.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/netty-all-4.0.41.Final.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/janino-3.0.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/json4s-core_2.11-3.2.11.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/commons-compiler-3.0.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/guice-3.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jetty-server-9.2.16.v20160414.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/jetty-http-9.2.16.v20160414.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/parquet-common-1.8.1.jar":"System 
Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/hadoop-mapreduce-client-jobclient-2.2.0.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/spark-sketch_2.11-2.1.0-SNAPSHOT.jar":"System Classpath","/Users/Jose/IdeaProjects/spark/assembly/target/scala-2.11/jars/pmml-model-1.2.15.jar":"System Classpath"}}
{"Event":"SparkListenerApplicationStart","App Name":"Spark shell","App ID":"app-20161116163331-0000","Timestamp":1479335609916,"User":"jose"}
{"Event":"SparkListenerExecutorAdded","Timestamp":1479335615320,"Executor ID":"3","Executor Info":{"Host":"172.22.0.167","Total Cores":4,"Log Urls":{"stdout":"http://172.22.0.167:51466/logPage/?appId=app-20161116163331-0000&executorId=3&logType=stdout","stderr":"http://172.22.0.167:51466/logPage/?appId=app-20161116163331-0000&executorId=3&logType=stderr"}}}
-{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"3","Host":"172.22.0.167","Port":51485},"Maximum Memory":384093388,"Timestamp":1479335615387}
+{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"3","Host":"172.22.0.167","Port":51485},"Maximum Memory":908381388,"Timestamp":1479335615387,"Maximum Onheap Memory":384093388,"Maximum Offheap Memory":524288000}
{"Event":"SparkListenerExecutorAdded","Timestamp":1479335615393,"Executor ID":"2","Executor Info":{"Host":"172.22.0.167","Total Cores":4,"Log Urls":{"stdout":"http://172.22.0.167:51469/logPage/?appId=app-20161116163331-0000&executorId=2&logType=stdout","stderr":"http://172.22.0.167:51469/logPage/?appId=app-20161116163331-0000&executorId=2&logType=stderr"}}}
{"Event":"SparkListenerExecutorAdded","Timestamp":1479335615443,"Executor ID":"1","Executor Info":{"Host":"172.22.0.167","Total Cores":4,"Log Urls":{"stdout":"http://172.22.0.167:51467/logPage/?appId=app-20161116163331-0000&executorId=1&logType=stdout","stderr":"http://172.22.0.167:51467/logPage/?appId=app-20161116163331-0000&executorId=1&logType=stderr"}}}
-{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"2","Host":"172.22.0.167","Port":51487},"Maximum Memory":384093388,"Timestamp":1479335615448}
+{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"2","Host":"172.22.0.167","Port":51487},"Maximum Memory":908381388,"Timestamp":1479335615448,"Maximum Onheap Memory":384093388,"Maximum Offheap Memory":524288000}
{"Event":"SparkListenerExecutorAdded","Timestamp":1479335615462,"Executor ID":"0","Executor Info":{"Host":"172.22.0.167","Total Cores":4,"Log Urls":{"stdout":"http://172.22.0.167:51465/logPage/?appId=app-20161116163331-0000&executorId=0&logType=stdout","stderr":"http://172.22.0.167:51465/logPage/?appId=app-20161116163331-0000&executorId=0&logType=stderr"}}}
-{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"1","Host":"172.22.0.167","Port":51490},"Maximum Memory":384093388,"Timestamp":1479335615496}
-{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"0","Host":"172.22.0.167","Port":51491},"Maximum Memory":384093388,"Timestamp":1479335615515}
+{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"1","Host":"172.22.0.167","Port":51490},"Maximum Memory":908381388,"Timestamp":1479335615496,"Maximum Onheap Memory":384093388,"Maximum Offheap Memory":524288000}
+{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"0","Host":"172.22.0.167","Port":51491},"Maximum Memory":908381388,"Timestamp":1479335615515,"Maximum Onheap Memory":384093388,"Maximum Offheap Memory":524288000}
{"Event":"SparkListenerJobStart","Job ID":0,"Submission Time":1479335616467,"Stage Infos":[{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"count at
:26","Number of Tasks":16,"RDD Info":[{"RDD ID":1,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"1\",\"name\":\"map\"}","Callsite":"map at :26","Parent IDs":[0],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":16,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":0,"Name":"ParallelCollectionRDD","Scope":"{\"id\":\"0\",\"name\":\"parallelize\"}","Callsite":"parallelize at :26","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":16,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:1135)\n$line16.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:26)\n$line16.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:31)\n$line16.$read$$iw$$iw$$iw$$iw$$iw$$iw.(:33)\n$line16.$read$$iw$$iw$$iw$$iw$$iw.(:35)\n$line16.$read$$iw$$iw$$iw$$iw.(:37)\n$line16.$read$$iw$$iw$$iw.(:39)\n$line16.$read$$iw$$iw.(:41)\n$line16.$read$$iw.(:43)\n$line16.$read.(:45)\n$line16.$read$.(:49)\n$line16.$read$.()\n$line16.$eval$.$print$lzycompute(:7)\n$line16.$eval$.$print(:6)\n$line16.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\nscala.tools.nsc.interpreter.IMain$ReadEvalPrint.call(IMain.scala:786)","Accumulables":[]}],"Stage IDs":[0],"Properties":{}}
{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"count at :26","Number of Tasks":16,"RDD Info":[{"RDD ID":1,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"1\",\"name\":\"map\"}","Callsite":"map at :26","Parent IDs":[0],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":16,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":0,"Name":"ParallelCollectionRDD","Scope":"{\"id\":\"0\",\"name\":\"parallelize\"}","Callsite":"parallelize at :26","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Number of Partitions":16,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.rdd.RDD.count(RDD.scala:1135)\n$line16.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:26)\n$line16.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.(:31)\n$line16.$read$$iw$$iw$$iw$$iw$$iw$$iw.(:33)\n$line16.$read$$iw$$iw$$iw$$iw$$iw.(:35)\n$line16.$read$$iw$$iw$$iw$$iw.(:37)\n$line16.$read$$iw$$iw$$iw.(:39)\n$line16.$read$$iw$$iw.(:41)\n$line16.$read$$iw.(:43)\n$line16.$read.(:45)\n$line16.$read$.(:49)\n$line16.$read$.()\n$line16.$eval$.$print$lzycompute(:7)\n$line16.$eval$.$print(:6)\n$line16.$eval.$print()\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\nscala.tools.nsc.interpreter.IMain$ReadEvalPrint.call(IMain.scala:786)","Accumulables":[]},"Properties":{}}
{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":0,"Index":0,"Attempt":0,"Launch Time":1479335616657,"Executor ID":"1","Host":"172.22.0.167","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}}
diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
index 2c947556dfd30..735f4454e299e 100644
--- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
@@ -572,7 +572,13 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu
// first attempt will hang
if (!SparkContextSuite.isTaskStarted) {
SparkContextSuite.isTaskStarted = true
- Thread.sleep(9999999)
+ try {
+ Thread.sleep(9999999)
+ } catch {
+ case t: Throwable =>
+ // SPARK-20217: should not fail the stage if the task throws a non-interrupted exception
+ throw new RuntimeException("killed")
+ }
}
// second attempt succeeds immediately
}
diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala
index dcf83cb530a91..764156c3edc41 100644
--- a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala
@@ -153,7 +153,8 @@ class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers
"rdd list storage json" -> "applications/local-1422981780767/storage/rdd",
"executor node blacklisting" -> "applications/app-20161116163331-0000/executors",
- "executor node blacklisting unblacklisting" -> "applications/app-20161115172038-0000/executors"
+ "executor node blacklisting unblacklisting" -> "applications/app-20161115172038-0000/executors",
+ "executor memory usage" -> "applications/app-20161116163331-0000/executors"
// Todo: enable this test when logging the even of onBlockUpdated. See: SPARK-13845
// "one rdd storage json" -> "applications/local-1422981780767/storage/rdd/0"
)
diff --git a/core/src/test/scala/org/apache/spark/storage/StorageSuite.scala b/core/src/test/scala/org/apache/spark/storage/StorageSuite.scala
index e5733aebf607c..da198f946fd64 100644
--- a/core/src/test/scala/org/apache/spark/storage/StorageSuite.scala
+++ b/core/src/test/scala/org/apache/spark/storage/StorageSuite.scala
@@ -27,7 +27,7 @@ class StorageSuite extends SparkFunSuite {
// For testing add, update, and remove (for non-RDD blocks)
private def storageStatus1: StorageStatus = {
- val status = new StorageStatus(BlockManagerId("big", "dog", 1), 1000L)
+ val status = new StorageStatus(BlockManagerId("big", "dog", 1), 1000L, Some(1000L), Some(0L))
assert(status.blocks.isEmpty)
assert(status.rddBlocks.isEmpty)
assert(status.memUsed === 0L)
@@ -74,7 +74,7 @@ class StorageSuite extends SparkFunSuite {
// For testing add, update, remove, get, and contains etc. for both RDD and non-RDD blocks
private def storageStatus2: StorageStatus = {
- val status = new StorageStatus(BlockManagerId("big", "dog", 1), 1000L)
+ val status = new StorageStatus(BlockManagerId("big", "dog", 1), 1000L, Some(1000L), Some(0L))
assert(status.rddBlocks.isEmpty)
status.addBlock(TestBlockId("dan"), BlockStatus(memAndDisk, 10L, 20L))
status.addBlock(TestBlockId("man"), BlockStatus(memAndDisk, 10L, 20L))
@@ -252,9 +252,9 @@ class StorageSuite extends SparkFunSuite {
// For testing StorageUtils.updateRddInfo and StorageUtils.getRddBlockLocations
private def stockStorageStatuses: Seq[StorageStatus] = {
- val status1 = new StorageStatus(BlockManagerId("big", "dog", 1), 1000L)
- val status2 = new StorageStatus(BlockManagerId("fat", "duck", 2), 2000L)
- val status3 = new StorageStatus(BlockManagerId("fat", "cat", 3), 3000L)
+ val status1 = new StorageStatus(BlockManagerId("big", "dog", 1), 1000L, Some(1000L), Some(0L))
+ val status2 = new StorageStatus(BlockManagerId("fat", "duck", 2), 2000L, Some(2000L), Some(0L))
+ val status3 = new StorageStatus(BlockManagerId("fat", "cat", 3), 3000L, Some(3000L), Some(0L))
status1.addBlock(RDDBlockId(0, 0), BlockStatus(memAndDisk, 1L, 2L))
status1.addBlock(RDDBlockId(0, 1), BlockStatus(memAndDisk, 1L, 2L))
status2.addBlock(RDDBlockId(0, 2), BlockStatus(memAndDisk, 1L, 2L))
@@ -332,4 +332,81 @@ class StorageSuite extends SparkFunSuite {
assert(blockLocations1(RDDBlockId(1, 2)) === Seq("cat:3"))
}
+ private val offheap = StorageLevel.OFF_HEAP
+ // For testing add, update, remove, get, and contains etc. for both RDD and non-RDD onheap
+ // and offheap blocks
+ private def storageStatus3: StorageStatus = {
+ val status = new StorageStatus(BlockManagerId("big", "dog", 1), 2000L, Some(1000L), Some(1000L))
+ assert(status.rddBlocks.isEmpty)
+ status.addBlock(TestBlockId("dan"), BlockStatus(memAndDisk, 10L, 20L))
+ status.addBlock(TestBlockId("man"), BlockStatus(offheap, 10L, 0L))
+ status.addBlock(RDDBlockId(0, 0), BlockStatus(offheap, 10L, 0L))
+ status.addBlock(RDDBlockId(1, 1), BlockStatus(offheap, 100L, 0L))
+ status.addBlock(RDDBlockId(2, 2), BlockStatus(memAndDisk, 10L, 20L))
+ status.addBlock(RDDBlockId(2, 3), BlockStatus(memAndDisk, 10L, 20L))
+ status.addBlock(RDDBlockId(2, 4), BlockStatus(memAndDisk, 10L, 40L))
+ status
+ }
+
+ test("storage memUsed, diskUsed with on-heap and off-heap blocks") {
+ val status = storageStatus3
+ def actualMemUsed: Long = status.blocks.values.map(_.memSize).sum
+ def actualDiskUsed: Long = status.blocks.values.map(_.diskSize).sum
+
+ def actualOnHeapMemUsed: Long =
+ status.blocks.values.filter(!_.storageLevel.useOffHeap).map(_.memSize).sum
+ def actualOffHeapMemUsed: Long =
+ status.blocks.values.filter(_.storageLevel.useOffHeap).map(_.memSize).sum
+
+ assert(status.maxMem === status.maxOnHeapMem.get + status.maxOffHeapMem.get)
+
+ assert(status.memUsed === actualMemUsed)
+ assert(status.diskUsed === actualDiskUsed)
+ assert(status.onHeapMemUsed.get === actualOnHeapMemUsed)
+ assert(status.offHeapMemUsed.get === actualOffHeapMemUsed)
+
+ assert(status.memRemaining === status.maxMem - actualMemUsed)
+ assert(status.onHeapMemRemaining.get === status.maxOnHeapMem.get - actualOnHeapMemUsed)
+ assert(status.offHeapMemRemaining.get === status.maxOffHeapMem.get - actualOffHeapMemUsed)
+
+ status.addBlock(TestBlockId("wire"), BlockStatus(memAndDisk, 400L, 500L))
+ status.addBlock(RDDBlockId(25, 25), BlockStatus(memAndDisk, 40L, 50L))
+ assert(status.memUsed === actualMemUsed)
+ assert(status.diskUsed === actualDiskUsed)
+
+ status.updateBlock(TestBlockId("dan"), BlockStatus(memAndDisk, 4L, 5L))
+ status.updateBlock(RDDBlockId(0, 0), BlockStatus(offheap, 4L, 0L))
+ status.updateBlock(RDDBlockId(1, 1), BlockStatus(offheap, 4L, 0L))
+ assert(status.memUsed === actualMemUsed)
+ assert(status.diskUsed === actualDiskUsed)
+ assert(status.onHeapMemUsed.get === actualOnHeapMemUsed)
+ assert(status.offHeapMemUsed.get === actualOffHeapMemUsed)
+
+ status.removeBlock(TestBlockId("fire"))
+ status.removeBlock(TestBlockId("man"))
+ status.removeBlock(RDDBlockId(2, 2))
+ status.removeBlock(RDDBlockId(2, 3))
+ assert(status.memUsed === actualMemUsed)
+ assert(status.diskUsed === actualDiskUsed)
+ }
+
+ private def storageStatus4: StorageStatus = {
+ val status = new StorageStatus(BlockManagerId("big", "dog", 1), 2000L, None, None)
+ status
+ }
+ test("old SparkListenerBlockManagerAdded event compatible") {
+ // This scenario only happens when replaying an old event log. In this scenario there are
+ // no block add or remove events replayed, so only the total amount of memory is valid.
+ val status = storageStatus4
+ assert(status.maxMem === status.maxMemory)
+
+ assert(status.memUsed === 0L)
+ assert(status.diskUsed === 0L)
+ assert(status.onHeapMemUsed === None)
+ assert(status.offHeapMemUsed === None)
+
+ assert(status.memRemaining === status.maxMem)
+ assert(status.onHeapMemRemaining === None)
+ assert(status.offHeapMemRemaining === None)
+ }
}
diff --git a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala
index 4228373036425..f4c561c737794 100644
--- a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala
@@ -39,7 +39,7 @@ import org.apache.spark.LocalSparkContext._
import org.apache.spark.api.java.StorageLevels
import org.apache.spark.deploy.history.HistoryServerSuite
import org.apache.spark.shuffle.FetchFailedException
-import org.apache.spark.status.api.v1.{JacksonMessageWriter, StageStatus}
+import org.apache.spark.status.api.v1.{JacksonMessageWriter, RDDDataDistribution, StageStatus}
private[spark] class SparkUICssErrorHandler extends DefaultCssErrorHandler {
@@ -103,6 +103,7 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B
.set("spark.ui.enabled", "true")
.set("spark.ui.port", "0")
.set("spark.ui.killEnabled", killEnabled.toString)
+ .set("spark.memory.offHeap.size", "64m")
val sc = new SparkContext(conf)
assert(sc.ui.isDefined)
sc
@@ -151,6 +152,39 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B
val updatedRddJson = getJson(ui, "storage/rdd/0")
(updatedRddJson \ "storageLevel").extract[String] should be (
StorageLevels.MEMORY_ONLY.description)
+
+ val dataDistributions0 =
+ (updatedRddJson \ "dataDistribution").extract[Seq[RDDDataDistribution]]
+ dataDistributions0.length should be (1)
+ val dist0 = dataDistributions0.head
+
+ dist0.onHeapMemoryUsed should not be (None)
+ dist0.memoryUsed should be (dist0.onHeapMemoryUsed.get)
+ dist0.onHeapMemoryRemaining should not be (None)
+ dist0.offHeapMemoryRemaining should not be (None)
+ dist0.memoryRemaining should be (
+ dist0.onHeapMemoryRemaining.get + dist0.offHeapMemoryRemaining.get)
+ dist0.onHeapMemoryUsed should not be (Some(0L))
+ dist0.offHeapMemoryUsed should be (Some(0L))
+
+ rdd.unpersist()
+ rdd.persist(StorageLevels.OFF_HEAP).count()
+ val updatedStorageJson1 = getJson(ui, "storage/rdd")
+ updatedStorageJson1.children.length should be (1)
+ val updatedRddJson1 = getJson(ui, "storage/rdd/0")
+ val dataDistributions1 =
+ (updatedRddJson1 \ "dataDistribution").extract[Seq[RDDDataDistribution]]
+ dataDistributions1.length should be (1)
+ val dist1 = dataDistributions1.head
+
+ dist1.offHeapMemoryUsed should not be (None)
+ dist1.memoryUsed should be (dist1.offHeapMemoryUsed.get)
+ dist1.onHeapMemoryRemaining should not be (None)
+ dist1.offHeapMemoryRemaining should not be (None)
+ dist1.memoryRemaining should be (
+ dist1.onHeapMemoryRemaining.get + dist1.offHeapMemoryRemaining.get)
+ dist1.onHeapMemoryUsed should be (Some(0L))
+ dist1.offHeapMemoryUsed should not be (Some(0L))
}
}
diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md
index 8d5ad12cb85be..ef01cfe4b92cd 100644
--- a/docs/running-on-mesos.md
+++ b/docs/running-on-mesos.md
@@ -367,6 +367,15 @@ See the [configuration page](configuration.html) for information on Spark config
[host_path:]container_path[:ro|:rw]
+
+ spark.mesos.task.labels |
+ (none) |
+
+ Set the Mesos labels to add to each task. Labels are free-form key-value pairs.
+ The key and value of each pair should be separated by a colon, and multiple pairs by commas.
+ Ex. key:value,key2:value2.
+ |
+
spark.mesos.executor.home |
driver side SPARK_HOME |
diff --git a/examples/src/main/java/org/apache/spark/examples/sql/hive/JavaSparkHiveExample.java b/examples/src/main/java/org/apache/spark/examples/sql/hive/JavaSparkHiveExample.java
index 47638565b1663..575a463e8725f 100644
--- a/examples/src/main/java/org/apache/spark/examples/sql/hive/JavaSparkHiveExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/sql/hive/JavaSparkHiveExample.java
@@ -89,7 +89,7 @@ public static void main(String[] args) {
// The results of SQL queries are themselves DataFrames and support all normal functions.
Dataset sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key");
- // The items in DaraFrames are of type Row, which lets you to access each column by ordinal.
+ // The items in DataFrames are of type Row, which lets you access each column by ordinal.
Dataset stringsDS = sqlDF.map(
(MapFunction) row -> "Key: " + row.get(0) + ", Value: " + row.get(1),
Encoders.STRING());
diff --git a/examples/src/main/python/sql/hive.py b/examples/src/main/python/sql/hive.py
index 1f175d725800f..1f83a6fb48b97 100644
--- a/examples/src/main/python/sql/hive.py
+++ b/examples/src/main/python/sql/hive.py
@@ -68,7 +68,7 @@
# The results of SQL queries are themselves DataFrames and support all normal functions.
sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key")
- # The items in DaraFrames are of type Row, which allows you to access each column by ordinal.
+ # The items in DataFrames are of type Row, which allows you to access each column by ordinal.
stringsDS = sqlDF.rdd.map(lambda row: "Key: %d, Value: %s" % (row.key, row.value))
for record in stringsDS.collect():
print(record)
diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala b/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala
index 3de26364b5288..e5f75d53edc86 100644
--- a/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala
@@ -76,7 +76,7 @@ object SparkHiveExample {
// The results of SQL queries are themselves DataFrames and support all normal functions.
val sqlDF = sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key")
- // The items in DaraFrames are of type Row, which allows you to access each column by ordinal.
+ // The items in DataFrames are of type Row, which allows you to access each column by ordinal.
val stringsDS = sqlDF.map {
case Row(key: Int, value: String) => s"Key: $key, Value: $value"
}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
index ce834f1d17e0d..ab4c235209289 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
@@ -140,7 +140,7 @@ class RandomForestClassifier @Since("1.4.0") (
.map(_.asInstanceOf[DecisionTreeClassificationModel])
val numFeatures = oldDataset.first().features.size
- val m = new RandomForestClassificationModel(trees, numFeatures, numClasses)
+ val m = new RandomForestClassificationModel(uid, trees, numFeatures, numClasses)
instr.logSuccess(m)
m
}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
index 2f524a8c5784d..a58da50fad972 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
@@ -131,7 +131,7 @@ class RandomForestRegressor @Since("1.4.0") (@Since("1.4.0") override val uid: S
.map(_.asInstanceOf[DecisionTreeRegressionModel])
val numFeatures = oldDataset.first().features.size
- val m = new RandomForestRegressionModel(trees, numFeatures)
+ val m = new RandomForestRegressionModel(uid, trees, numFeatures)
instr.logSuccess(m)
m
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala
index dafc6c200f95f..4cdbf845ae4f5 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala
@@ -79,7 +79,7 @@ class PipelineSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
.setStages(Array(estimator0, transformer1, estimator2, transformer3))
val pipelineModel = pipeline.fit(dataset0)
- MLTestingUtils.checkCopy(pipelineModel)
+ MLTestingUtils.checkCopyAndUids(pipeline, pipelineModel)
assert(pipelineModel.stages.length === 4)
assert(pipelineModel.stages(0).eq(model0))
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
index 964fcfbdd87a2..918ab27e2730b 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
@@ -249,8 +249,7 @@ class DecisionTreeClassifierSuite
val newData: DataFrame = TreeTests.setMetadata(rdd, categoricalFeatures, numClasses)
val newTree = dt.fit(newData)
- // copied model must have the same parent.
- MLTestingUtils.checkCopy(newTree)
+ MLTestingUtils.checkCopyAndUids(dt, newTree)
val predictions = newTree.transform(newData)
.select(newTree.getPredictionCol, newTree.getRawPredictionCol, newTree.getProbabilityCol)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
index 0cddb37281b39..1f79e0d4e6228 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
@@ -97,8 +97,7 @@ class GBTClassifierSuite extends SparkFunSuite with MLlibTestSparkContext
assert(model.getProbabilityCol === "probability")
assert(model.hasParent)
- // copied model must have the same parent.
- MLTestingUtils.checkCopy(model)
+ MLTestingUtils.checkCopyAndUids(gbt, model)
}
test("setThreshold, getThreshold") {
@@ -261,8 +260,7 @@ class GBTClassifierSuite extends SparkFunSuite with MLlibTestSparkContext
.setSeed(123)
val model = gbt.fit(df)
- // copied model must have the same parent.
- MLTestingUtils.checkCopy(model)
+ MLTestingUtils.checkCopyAndUids(gbt, model)
sc.checkpointDir = None
Utils.deleteRecursively(tempDir)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala
index c763a4cef1afd..2f87afc23fe7e 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala
@@ -124,8 +124,7 @@ class LinearSVCSuite extends SparkFunSuite with MLlibTestSparkContext with Defau
assert(model.hasParent)
assert(model.numFeatures === 2)
- // copied model must have the same parent.
- MLTestingUtils.checkCopy(model)
+ MLTestingUtils.checkCopyAndUids(lsvc, model)
}
test("linear svc doesn't fit intercept when fitIntercept is off") {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index f0648d0936a12..c858b9bbfc256 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -142,8 +142,7 @@ class LogisticRegressionSuite
assert(model.intercept !== 0.0)
assert(model.hasParent)
- // copied model must have the same parent.
- MLTestingUtils.checkCopy(model)
+ MLTestingUtils.checkCopyAndUids(lr, model)
assert(model.hasSummary)
val copiedModel = model.copy(ParamMap.empty)
assert(copiedModel.hasSummary)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala
index 7700099caac37..ce54c3df4f3f6 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala
@@ -74,8 +74,8 @@ class MultilayerPerceptronClassifierSuite
.setMaxIter(100)
.setSolver("l-bfgs")
val model = trainer.fit(dataset)
- MLTestingUtils.checkCopy(model)
val result = model.transform(dataset)
+ MLTestingUtils.checkCopyAndUids(trainer, model)
val predictionAndLabels = result.select("prediction", "label").collect()
predictionAndLabels.foreach { case Row(p: Double, l: Double) =>
assert(p == l)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
index d41c5b533dedf..b56f8e19ca53c 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
@@ -149,6 +149,7 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext with Defa
validateModelFit(pi, theta, model)
assert(model.hasParent)
+ MLTestingUtils.checkCopyAndUids(nb, model)
val validationDataset =
generateNaiveBayesInput(piArray, thetaArray, nPoints, 17, "multinomial").toDF()
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
index aacb7921b835f..c02e38ad64e3e 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
@@ -76,8 +76,7 @@ class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext with Defau
assert(ova.getPredictionCol === "prediction")
val ovaModel = ova.fit(dataset)
- // copied model must have the same parent.
- MLTestingUtils.checkCopy(ovaModel)
+ MLTestingUtils.checkCopyAndUids(ova, ovaModel)
assert(ovaModel.models.length === numClasses)
val transformedDataset = ovaModel.transform(dataset)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
index c3003cec73b41..ca2954d2f32c4 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
@@ -141,8 +141,7 @@ class RandomForestClassifierSuite
val df: DataFrame = TreeTests.setMetadata(rdd, categoricalFeatures, numClasses)
val model = rf.fit(df)
- // copied model must have the same parent.
- MLTestingUtils.checkCopy(model)
+ MLTestingUtils.checkCopyAndUids(rf, model)
val predictions = model.transform(df)
.select(rf.getPredictionCol, rf.getRawPredictionCol, rf.getProbabilityCol)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
index 200a892f6c694..fa7471fa2d658 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
@@ -47,8 +47,7 @@ class BisectingKMeansSuite
assert(bkm.getMinDivisibleClusterSize === 1.0)
val model = bkm.setMaxIter(1).fit(dataset)
- // copied model must have the same parent
- MLTestingUtils.checkCopy(model)
+ MLTestingUtils.checkCopyAndUids(bkm, model)
assert(model.hasSummary)
val copiedModel = model.copy(ParamMap.empty)
assert(copiedModel.hasSummary)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index 61da897b666f4..08b800b7e4183 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -77,8 +77,7 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
assert(gm.getTol === 0.01)
val model = gm.setMaxIter(1).fit(dataset)
- // copied model must have the same parent
- MLTestingUtils.checkCopy(model)
+ MLTestingUtils.checkCopyAndUids(gm, model)
assert(model.hasSummary)
val copiedModel = model.copy(ParamMap.empty)
assert(copiedModel.hasSummary)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
index ca05b9c389f65..119fe1dead9a9 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
@@ -52,8 +52,7 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultR
assert(kmeans.getTol === 1e-4)
val model = kmeans.setMaxIter(1).fit(dataset)
- // copied model must have the same parent
- MLTestingUtils.checkCopy(model)
+ MLTestingUtils.checkCopyAndUids(kmeans, model)
assert(model.hasSummary)
val copiedModel = model.copy(ParamMap.empty)
assert(copiedModel.hasSummary)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala
index 75aa0be61a3ed..b4fe63a89f871 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala
@@ -176,7 +176,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultRead
val lda = new LDA().setK(k).setSeed(1).setOptimizer("online").setMaxIter(2)
val model = lda.fit(dataset)
- MLTestingUtils.checkCopy(model)
+ MLTestingUtils.checkCopyAndUids(lda, model)
assert(model.isInstanceOf[LocalLDAModel])
assert(model.vocabSize === vocabSize)
@@ -221,7 +221,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultRead
val lda = new LDA().setK(k).setSeed(1).setOptimizer("em").setMaxIter(2)
val model_ = lda.fit(dataset)
- MLTestingUtils.checkCopy(model_)
+ MLTestingUtils.checkCopyAndUids(lda, model_)
assert(model_.isInstanceOf[DistributedLDAModel])
val model = model_.asInstanceOf[DistributedLDAModel]
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSHSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSHSuite.scala
index cc81da5c66e6d..7175c721bff36 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSHSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSHSuite.scala
@@ -94,7 +94,8 @@ class BucketedRandomProjectionLSHSuite
unitVectors.foreach { v: Vector =>
assert(Vectors.norm(v, 2.0) ~== 1.0 absTol 1e-14)
}
- MLTestingUtils.checkCopy(brpModel)
+
+ MLTestingUtils.checkCopyAndUids(brp, brpModel)
}
test("BucketedRandomProjectionLSH: test of LSH property") {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala
index d6925da97d57e..c83909c4498f2 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala
@@ -119,7 +119,8 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext
test("Test Chi-Square selector: numTopFeatures") {
val selector = new ChiSqSelector()
.setOutputCol("filtered").setSelectorType("numTopFeatures").setNumTopFeatures(1)
- ChiSqSelectorSuite.testSelector(selector, dataset)
+ val model = ChiSqSelectorSuite.testSelector(selector, dataset)
+ MLTestingUtils.checkCopyAndUids(selector, model)
}
test("Test Chi-Square selector: percentile") {
@@ -166,11 +167,13 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext
object ChiSqSelectorSuite {
- private def testSelector(selector: ChiSqSelector, dataset: Dataset[_]): Unit = {
- selector.fit(dataset).transform(dataset).select("filtered", "topFeature").collect()
+ private def testSelector(selector: ChiSqSelector, dataset: Dataset[_]): ChiSqSelectorModel = {
+ val selectorModel = selector.fit(dataset)
+ selectorModel.transform(dataset).select("filtered", "topFeature").collect()
.foreach { case Row(vec1: Vector, vec2: Vector) =>
assert(vec1 ~== vec2 absTol 1e-1)
}
+ selectorModel
}
/**
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
index 69d3033bb2189..f213145f1ba0a 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
@@ -19,7 +19,7 @@ package org.apache.spark.ml.feature
import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamsSuite
-import org.apache.spark.ml.util.DefaultReadWriteTest
+import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Row
@@ -68,10 +68,11 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
val cv = new CountVectorizer()
.setInputCol("words")
.setOutputCol("features")
- .fit(df)
- assert(cv.vocabulary.toSet === Set("a", "b", "c", "d", "e"))
+ val cvm = cv.fit(df)
+ MLTestingUtils.checkCopyAndUids(cv, cvm)
+ assert(cvm.vocabulary.toSet === Set("a", "b", "c", "d", "e"))
- cv.transform(df).select("features", "expected").collect().foreach {
+ cvm.transform(df).select("features", "expected").collect().foreach {
case Row(features: Vector, expected: Vector) =>
assert(features ~== expected absTol 1e-14)
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/IDFSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/IDFSuite.scala
index 5325d95526a50..005edf73d29be 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/IDFSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/IDFSuite.scala
@@ -20,7 +20,7 @@ package org.apache.spark.ml.feature
import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.ml.param.ParamsSuite
-import org.apache.spark.ml.util.DefaultReadWriteTest
+import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.feature.{IDFModel => OldIDFModel}
import org.apache.spark.mllib.linalg.VectorImplicits._
@@ -65,10 +65,12 @@ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultRead
val df = data.zip(expected).toSeq.toDF("features", "expected")
- val idfModel = new IDF()
+ val idfEst = new IDF()
.setInputCol("features")
.setOutputCol("idfValue")
- .fit(df)
+ val idfModel = idfEst.fit(df)
+
+ MLTestingUtils.checkCopyAndUids(idfEst, idfModel)
idfModel.transform(df).select("idfValue", "expected").collect().foreach {
case Row(x: Vector, y: Vector) =>
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala
index a9b559f7ba648..dd4dd62b8cfe9 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala
@@ -18,7 +18,7 @@
package org.apache.spark.ml.feature
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
-import org.apache.spark.ml.util.SchemaUtils
+import org.apache.spark.ml.util.{MLTestingUtils, SchemaUtils}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DataTypes
@@ -58,6 +58,8 @@ private[ml] object LSHTest {
val outputCol = model.getOutputCol
val transformedData = model.transform(dataset)
+ MLTestingUtils.checkCopyAndUids(lsh, model)
+
// Check output column type
SchemaUtils.checkColumnType(
transformedData.schema, model.getOutputCol, DataTypes.createArrayType(new VectorUDT))
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/MaxAbsScalerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/MaxAbsScalerSuite.scala
index a12174493b867..918da4f9388d4 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/MaxAbsScalerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/MaxAbsScalerSuite.scala
@@ -50,8 +50,7 @@ class MaxAbsScalerSuite extends SparkFunSuite with MLlibTestSparkContext with De
assert(vector1.equals(vector2), s"MaxAbsScaler ut error: $vector2 should be $vector1")
}
- // copied model must have the same parent.
- MLTestingUtils.checkCopy(model)
+ MLTestingUtils.checkCopyAndUids(scaler, model)
}
test("MaxAbsScaler read/write") {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashLSHSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashLSHSuite.scala
index 0ddf097a6eb22..96df68dbdf053 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashLSHSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashLSHSuite.scala
@@ -63,7 +63,7 @@ class MinHashLSHSuite extends SparkFunSuite with MLlibTestSparkContext with Defa
.setOutputCol("values")
val model = mh.fit(dataset)
assert(mh.uid === model.uid)
- MLTestingUtils.checkCopy(model)
+ MLTestingUtils.checkCopyAndUids(mh, model)
}
test("hashFunction") {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/MinMaxScalerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/MinMaxScalerSuite.scala
index b79eeb2d75ef0..51db74eb739ca 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/MinMaxScalerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/MinMaxScalerSuite.scala
@@ -53,8 +53,7 @@ class MinMaxScalerSuite extends SparkFunSuite with MLlibTestSparkContext with De
assert(vector1.equals(vector2), "Transformed vector is different with expected.")
}
- // copied model must have the same parent.
- MLTestingUtils.checkCopy(model)
+ MLTestingUtils.checkCopyAndUids(scaler, model)
}
test("MinMaxScaler arguments max must be larger than min") {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/PCASuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/PCASuite.scala
index a60e87590f060..3067a52a4df76 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/PCASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/PCASuite.scala
@@ -58,12 +58,12 @@ class PCASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultRead
.setInputCol("features")
.setOutputCol("pca_features")
.setK(3)
- .fit(df)
- // copied model must have the same parent.
- MLTestingUtils.checkCopy(pca)
+ val pcaModel = pca.fit(df)
- pca.transform(df).select("pca_features", "expected").collect().foreach {
+ MLTestingUtils.checkCopyAndUids(pca, pcaModel)
+
+ pcaModel.transform(df).select("pca_features", "expected").collect().foreach {
case Row(x: Vector, y: Vector) =>
assert(x ~== y absTol 1e-5, "Transformed vector is different with expected vector.")
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
index 5cfd59e6b88a2..fbebd75d70ac5 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
@@ -37,7 +37,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
val formula = new RFormula().setFormula("id ~ v1 + v2")
val original = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")
val model = formula.fit(original)
- MLTestingUtils.checkCopy(model)
+ MLTestingUtils.checkCopyAndUids(formula, model)
val result = model.transform(original)
val resultSchema = model.transformSchema(original.schema)
val expected = Seq(
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala
index a928f93633011..350ba44baa1eb 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StandardScalerSuite.scala
@@ -20,7 +20,7 @@ package org.apache.spark.ml.feature
import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamsSuite
-import org.apache.spark.ml.util.DefaultReadWriteTest
+import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}
@@ -77,10 +77,11 @@ class StandardScalerSuite extends SparkFunSuite with MLlibTestSparkContext
test("Standardization with default parameter") {
val df0 = data.zip(resWithStd).toSeq.toDF("features", "expected")
- val standardScaler0 = new StandardScaler()
+ val standardScalerEst0 = new StandardScaler()
.setInputCol("features")
.setOutputCol("standardized_features")
- .fit(df0)
+ val standardScaler0 = standardScalerEst0.fit(df0)
+ MLTestingUtils.checkCopyAndUids(standardScalerEst0, standardScaler0)
assertResult(standardScaler0.transform(df0))
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
index 8d9042b31e033..5634d4210f478 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
@@ -45,12 +45,11 @@ class StringIndexerSuite
val indexer = new StringIndexer()
.setInputCol("label")
.setOutputCol("labelIndex")
- .fit(df)
+ val indexerModel = indexer.fit(df)
- // copied model must have the same parent.
- MLTestingUtils.checkCopy(indexer)
+ MLTestingUtils.checkCopyAndUids(indexer, indexerModel)
- val transformed = indexer.transform(df)
+ val transformed = indexerModel.transform(df)
val attr = Attribute.fromStructField(transformed.schema("labelIndex"))
.asInstanceOf[NominalAttribute]
assert(attr.values.get === Array("a", "c", "b"))
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala
index b28ce2ab45b45..f2cca8aa82e85 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala
@@ -114,8 +114,7 @@ class VectorIndexerSuite extends SparkFunSuite with MLlibTestSparkContext
val vectorIndexer = getIndexer
val model = vectorIndexer.fit(densePoints1) // vectors of length 3
- // copied model must have the same parent.
- MLTestingUtils.checkCopy(model)
+ MLTestingUtils.checkCopyAndUids(vectorIndexer, model)
model.transform(densePoints1) // should work
model.transform(sparsePoints1) // should work
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
index 2043a16c15f1a..a6a1c2b4f32bd 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
@@ -57,15 +57,14 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
val docDF = doc.zip(expected).toDF("text", "expected")
- val model = new Word2Vec()
+ val w2v = new Word2Vec()
.setVectorSize(3)
.setInputCol("text")
.setOutputCol("result")
.setSeed(42L)
- .fit(docDF)
+ val model = w2v.fit(docDF)
- // copied model must have the same parent.
- MLTestingUtils.checkCopy(model)
+ MLTestingUtils.checkCopyAndUids(w2v, model)
// These expectations are just magic values, characterizing the current
// behavior. The test needs to be updated to be more general, see SPARK-11502
diff --git a/mllib/src/test/scala/org/apache/spark/ml/fpm/FPGrowthSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/fpm/FPGrowthSuite.scala
index 6bec057511cd1..6806cb03bc42b 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/fpm/FPGrowthSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/fpm/FPGrowthSuite.scala
@@ -17,9 +17,10 @@
package org.apache.spark.ml.fpm
import org.apache.spark.SparkFunSuite
+import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.util.MLlibTestSparkContext
-import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
+import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
@@ -121,7 +122,9 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
.setMinConfidence(0.5678)
assert(fpGrowth.getMinSupport === 0.4567)
assert(model.getMinConfidence === 0.5678)
- MLTestingUtils.checkCopy(model)
+ MLTestingUtils.checkCopyAndUids(fpGrowth, model)
+ ParamsSuite.checkParams(fpGrowth)
+ ParamsSuite.checkParams(model)
}
test("read/write") {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala
index a177ed13bf8ef..7574af3d77ea8 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala
@@ -409,8 +409,7 @@ class ALSSuite
logInfo(s"Test RMSE is $rmse.")
assert(rmse < targetRMSE)
- // copied model must have the same parent.
- MLTestingUtils.checkCopy(model)
+ MLTestingUtils.checkCopyAndUids(als, model)
}
test("exact rank-1 matrix") {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala
index 708185a0943df..fb39e50a83552 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala
@@ -83,8 +83,7 @@ class AFTSurvivalRegressionSuite
.setQuantilesCol("quantiles")
.fit(datasetUnivariate)
- // copied model must have the same parent.
- MLTestingUtils.checkCopy(model)
+ MLTestingUtils.checkCopyAndUids(aftr, model)
model.transform(datasetUnivariate)
.select("label", "prediction", "quantiles")
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala
index 0e91284d03d98..642f266891b57 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala
@@ -69,11 +69,12 @@ class DecisionTreeRegressorSuite
test("copied model must have the same parent") {
val categoricalFeatures = Map(0 -> 2, 1 -> 2)
val df = TreeTests.setMetadata(categoricalDataPointsRDD, categoricalFeatures, numClasses = 0)
- val model = new DecisionTreeRegressor()
+ val dtr = new DecisionTreeRegressor()
.setImpurity("variance")
.setMaxDepth(2)
- .setMaxBins(8).fit(df)
- MLTestingUtils.checkCopy(model)
+ .setMaxBins(8)
+ val model = dtr.fit(df)
+ MLTestingUtils.checkCopyAndUids(dtr, model)
}
test("predictVariance") {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
index 03c2f97797bce..2da25f7e0100a 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
@@ -90,8 +90,7 @@ class GBTRegressorSuite extends SparkFunSuite with MLlibTestSparkContext
.setMaxIter(2)
val model = gbt.fit(df)
- // copied model must have the same parent.
- MLTestingUtils.checkCopy(model)
+ MLTestingUtils.checkCopyAndUids(gbt, model)
val preds = model.transform(df)
val predictions = preds.select("prediction").rdd.map(_.getDouble(0))
// Checks based on SPARK-8736 (to ensure it is not doing classification)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
index 401911763fa3b..f7c7c001a36af 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
@@ -197,8 +197,7 @@ class GeneralizedLinearRegressionSuite
val model = glr.setFamily("gaussian").setLink("identity")
.fit(datasetGaussianIdentity)
- // copied model must have the same parent.
- MLTestingUtils.checkCopy(model)
+ MLTestingUtils.checkCopyAndUids(glr, model)
assert(model.hasSummary)
val copiedModel = model.copy(ParamMap.empty)
assert(copiedModel.hasSummary)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala
index f41a3601b1fa8..180f5f7ce5ab2 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala
@@ -93,8 +93,7 @@ class IsotonicRegressionSuite
val model = ir.fit(dataset)
- // copied model must have the same parent.
- MLTestingUtils.checkCopy(model)
+ MLTestingUtils.checkCopyAndUids(ir, model)
model.transform(dataset)
.select("label", "features", "prediction", "weight")
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index c6a267b7283d8..e7bd4eb9e0adf 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -148,8 +148,7 @@ class LinearRegressionSuite
assert(lir.getSolver == "auto")
val model = lir.fit(datasetWithDenseFeature)
- // copied model must have the same parent.
- MLTestingUtils.checkCopy(model)
+ MLTestingUtils.checkCopyAndUids(lir, model)
assert(model.hasSummary)
val copiedModel = model.copy(ParamMap.empty)
assert(copiedModel.hasSummary)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala
index 3bf0445ebd3dd..8b8e8a655f47b 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala
@@ -90,6 +90,8 @@ class RandomForestRegressorSuite extends SparkFunSuite with MLlibTestSparkContex
val model = rf.fit(df)
+ MLTestingUtils.checkCopyAndUids(rf, model)
+
val importances = model.featureImportances
val mostImportantFeature = importances.argmax
assert(mostImportantFeature === 1)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
index 7116265474f22..2b4e6b53e4f81 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
@@ -58,8 +58,7 @@ class CrossValidatorSuite
.setNumFolds(3)
val cvModel = cv.fit(dataset)
- // copied model must have the same paren.
- MLTestingUtils.checkCopy(cvModel)
+ MLTestingUtils.checkCopyAndUids(cv, cvModel)
val parent = cvModel.bestModel.parent.asInstanceOf[LogisticRegression]
assert(parent.getRegParam === 0.001)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
index 4463a9b6e543a..a34f930aa11c4 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
@@ -45,18 +45,18 @@ class TrainValidationSplitSuite
.addGrid(lr.maxIter, Array(0, 10))
.build()
val eval = new BinaryClassificationEvaluator
- val cv = new TrainValidationSplit()
+ val tvs = new TrainValidationSplit()
.setEstimator(lr)
.setEstimatorParamMaps(lrParamMaps)
.setEvaluator(eval)
.setTrainRatio(0.5)
.setSeed(42L)
- val cvModel = cv.fit(dataset)
- val parent = cvModel.bestModel.parent.asInstanceOf[LogisticRegression]
- assert(cv.getTrainRatio === 0.5)
+ val tvsModel = tvs.fit(dataset)
+ val parent = tvsModel.bestModel.parent.asInstanceOf[LogisticRegression]
+ assert(tvs.getTrainRatio === 0.5)
assert(parent.getRegParam === 0.001)
assert(parent.getMaxIter === 10)
- assert(cvModel.validationMetrics.length === lrParamMaps.length)
+ assert(tvsModel.validationMetrics.length === lrParamMaps.length)
}
test("train validation with linear regression") {
@@ -71,28 +71,27 @@ class TrainValidationSplitSuite
.addGrid(trainer.maxIter, Array(0, 10))
.build()
val eval = new RegressionEvaluator()
- val cv = new TrainValidationSplit()
+ val tvs = new TrainValidationSplit()
.setEstimator(trainer)
.setEstimatorParamMaps(lrParamMaps)
.setEvaluator(eval)
.setTrainRatio(0.5)
.setSeed(42L)
- val cvModel = cv.fit(dataset)
+ val tvsModel = tvs.fit(dataset)
- // copied model must have the same paren.
- MLTestingUtils.checkCopy(cvModel)
+ MLTestingUtils.checkCopyAndUids(tvs, tvsModel)
- val parent = cvModel.bestModel.parent.asInstanceOf[LinearRegression]
+ val parent = tvsModel.bestModel.parent.asInstanceOf[LinearRegression]
assert(parent.getRegParam === 0.001)
assert(parent.getMaxIter === 10)
- assert(cvModel.validationMetrics.length === lrParamMaps.length)
+ assert(tvsModel.validationMetrics.length === lrParamMaps.length)
eval.setMetricName("r2")
- val cvModel2 = cv.fit(dataset)
- val parent2 = cvModel2.bestModel.parent.asInstanceOf[LinearRegression]
+ val tvsModel2 = tvs.fit(dataset)
+ val parent2 = tvsModel2.bestModel.parent.asInstanceOf[LinearRegression]
assert(parent2.getRegParam === 0.001)
assert(parent2.getMaxIter === 10)
- assert(cvModel2.validationMetrics.length === lrParamMaps.length)
+ assert(tvsModel2.validationMetrics.length === lrParamMaps.length)
}
test("transformSchema should check estimatorParamMaps") {
@@ -104,17 +103,17 @@ class TrainValidationSplitSuite
.addGrid(est.inputCol, Array("input1", "input2"))
.build()
- val cv = new TrainValidationSplit()
+ val tvs = new TrainValidationSplit()
.setEstimator(est)
.setEstimatorParamMaps(paramMaps)
.setEvaluator(eval)
.setTrainRatio(0.5)
- cv.transformSchema(new StructType()) // This should pass.
+ tvs.transformSchema(new StructType()) // This should pass.
val invalidParamMaps = paramMaps :+ ParamMap(est.inputCol -> "")
- cv.setEstimatorParamMaps(invalidParamMaps)
+ tvs.setEstimatorParamMaps(invalidParamMaps)
intercept[IllegalArgumentException] {
- cv.transformSchema(new StructType())
+ tvs.transformSchema(new StructType())
}
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala b/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala
index 578f31c8e7dba..bef79e634f75f 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala
@@ -31,11 +31,20 @@ import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
object MLTestingUtils extends SparkFunSuite {
- def checkCopy(model: Model[_]): Unit = {
+
+ /**
+ * Checks that `model` has the same uid as `estimator`, and that a copy of `model` made
+ * with an empty ParamMap keeps the same parent.
+ */
+ def checkCopyAndUids[T <: Estimator[_]](estimator: T, model: Model[_]): Unit = {
+ assert(estimator.uid === model.uid, "Model uid does not match parent estimator")
+
+ // copied model must have the same parent
val copied = model.copy(ParamMap.empty)
.asInstanceOf[Model[_]]
- assert(copied.parent.uid == model.parent.uid)
- assert(copied.parent == model.parent)
+ // use `===`, as in the uid check above, so a failure reports both operands
+ assert(copied.parent === model.parent)
+ assert(copied.parent.uid === model.parent.uid)
}
def checkNumericTypes[M <: Model[M], T <: Estimator[M]](
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 2e3f9f2d0f3ac..feae76a087dec 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -100,7 +100,16 @@ object MimaExcludes {
ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toDenseMatrix"),
ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.toSparseMatrix"),
ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Matrix.getSizeInBytes")
- )
+ ) ++ Seq(
+ // [SPARK-17019] Expose on-heap and off-heap memory usage in various places
+ ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.scheduler.SparkListenerBlockManagerAdded.copy"),
+ ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.scheduler.SparkListenerBlockManagerAdded.this"),
+ ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.scheduler.SparkListenerBlockManagerAdded$"),
+ ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.scheduler.SparkListenerBlockManagerAdded.apply"),
+ ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.storage.StorageStatus.this"),
+ ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.storage.StorageStatus.this"),
+ ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.status.api.v1.RDDDataDistribution.this")
+ )
// Exclude rules for 2.1.x
lazy val v21excludes = v20excludes ++ {
diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py
index b765343251965..ad1b487676fa7 100644
--- a/python/pyspark/ml/linalg/__init__.py
+++ b/python/pyspark/ml/linalg/__init__.py
@@ -72,7 +72,10 @@ def _convert_to_vector(l):
return DenseVector(l)
elif _have_scipy and scipy.sparse.issparse(l):
assert l.shape[1] == 1, "Expected column vector"
+ # Make sure the converted csc_matrix has sorted indices.
csc = l.tocsc()
+ if not csc.has_sorted_indices:
+ csc.sort_indices()
return SparseVector(l.shape[0], csc.indices, csc.data)
else:
raise TypeError("Cannot convert type %s into Vector" % type(l))
diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py
index 031f22c02098e..7b24b3c74a9fa 100644
--- a/python/pyspark/mllib/linalg/__init__.py
+++ b/python/pyspark/mllib/linalg/__init__.py
@@ -74,7 +74,10 @@ def _convert_to_vector(l):
return DenseVector(l)
elif _have_scipy and scipy.sparse.issparse(l):
assert l.shape[1] == 1, "Expected column vector"
+ # Make sure the converted csc_matrix has sorted indices.
csc = l.tocsc()
+ if not csc.has_sorted_indices:
+ csc.sort_indices()
return SparseVector(l.shape[0], csc.indices, csc.data)
else:
raise TypeError("Cannot convert type %s into Vector" % type(l))
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index c519883cdd73b..523b3f1113317 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -853,6 +853,17 @@ def serialize(l):
self.assertEqual(sv, serialize(lil.tocsr()))
self.assertEqual(sv, serialize(lil.todok()))
+ def test_convert_to_vector(self):
+ from scipy.sparse import csc_matrix
+ # Single-column CSC matrix whose row indices are deliberately out of order
+ col_ptr = array([0, 2])
+ row_idx = array([3, 1])
+ values = array([2.0, 1.0])
+ unsorted = csc_matrix((values, row_idx, col_ptr))
+ self.assertFalse(unsorted.has_sorted_indices)
+ expected = SparseVector(4, {1: 1, 3: 2})
+ self.assertEqual(expected, _convert_to_vector(unsorted))
+
def test_dot(self):
from scipy.sparse import lil_matrix
lil = lil_matrix((4, 1))
diff --git a/python/pyspark/sql/catalog.py b/python/pyspark/sql/catalog.py
index 253a750629170..41e68a45a6159 100644
--- a/python/pyspark/sql/catalog.py
+++ b/python/pyspark/sql/catalog.py
@@ -72,10 +72,10 @@ def listDatabases(self):
@ignore_unicode_prefix
@since(2.0)
def listTables(self, dbName=None):
- """Returns a list of tables in the specified database.
+ """Returns a list of tables/views in the specified database.
If no database is specified, the current database is used.
- This includes all temporary tables.
+ This includes all temporary views.
"""
if dbName is None:
dbName = self.currentDatabase()
@@ -115,7 +115,7 @@ def listFunctions(self, dbName=None):
@ignore_unicode_prefix
@since(2.0)
def listColumns(self, tableName, dbName=None):
- """Returns a list of columns for the given table in the specified database.
+ """Returns a list of columns for the given table/view in the specified database.
If no database is specified, the current database is used.
@@ -161,14 +161,15 @@ def createExternalTable(self, tableName, path=None, source=None, schema=None, **
def createTable(self, tableName, path=None, source=None, schema=None, **options):
"""Creates a table based on the dataset in a data source.
- It returns the DataFrame associated with the external table.
+ It returns the DataFrame associated with the table.
The data source is specified by the ``source`` and a set of ``options``.
If ``source`` is not specified, the default data source configured by
- ``spark.sql.sources.default`` will be used.
+ ``spark.sql.sources.default`` will be used. When ``path`` is specified, an external table is
+ created from the data at the given path. Otherwise a managed table is created.
Optionally, a schema can be provided as the schema of the returned :class:`DataFrame` and
- created external table.
+ created table.
:return: :class:`DataFrame`
"""
@@ -276,14 +277,24 @@ def clearCache(self):
@since(2.0)
def refreshTable(self, tableName):
- """Invalidate and refresh all the cached metadata of the given table."""
+ """Invalidates and refreshes all the cached data and metadata of the given table."""
self._jcatalog.refreshTable(tableName)
@since('2.1.1')
def recoverPartitions(self, tableName):
- """Recover all the partitions of the given table and update the catalog."""
+ """Recovers all the partitions of the given table and updates the catalog.
+
+ Only works with a partitioned table, and not a view.
+ """
self._jcatalog.recoverPartitions(tableName)
+ @since('2.2.0')
+ def refreshByPath(self, path):
+ """Invalidates and refreshes all the cached data (and the associated metadata) for any
+ DataFrame that contains the given data source path.
+ """
+ self._jcatalog.refreshByPath(path)
+
def _reset(self):
"""(Internal use only) Drop all existing databases (except "default"), tables,
partitions and functions, and set the current database to "default".
diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py
index c22f4b87e1a78..fdb7abbad4e5f 100644
--- a/python/pyspark/sql/context.py
+++ b/python/pyspark/sql/context.py
@@ -385,7 +385,7 @@ def sql(self, sqlQuery):
@since(1.0)
def table(self, tableName):
- """Returns the specified table as a :class:`DataFrame`.
+ """Returns the specified table or view as a :class:`DataFrame`.
:return: :class:`DataFrame`
diff --git a/python/pyspark/version.py b/python/pyspark/version.py
index 08a301695fda7..41bf8c269b795 100644
--- a/python/pyspark/version.py
+++ b/python/pyspark/version.py
@@ -16,4 +16,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-__version__ = "2.1.0.dev0"
+__version__ = "2.2.0.dev0"
diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala
index 5bdc2a2b840e3..2a36ec4fa8112 100644
--- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala
+++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala
@@ -67,6 +67,8 @@ private[spark] class MesosCoarseGrainedSchedulerBackend(
private val maxGpus = conf.getInt("spark.mesos.gpus.max", 0)
+ private val taskLabels = conf.get("spark.mesos.task.labels", "")
+
private[this] val shutdownTimeoutMS =
conf.getTimeAsMs("spark.mesos.coarse.shutdownTimeout", "10s")
.ensuring(_ >= 0, "spark.mesos.coarse.shutdownTimeout must be >= 0")
@@ -408,6 +410,13 @@ private[spark] class MesosCoarseGrainedSchedulerBackend(
taskBuilder.addAllResources(resourcesToUse.asJava)
taskBuilder.setContainer(MesosSchedulerBackendUtil.containerInfo(sc.conf))
+ val labelsBuilder = taskBuilder.getLabelsBuilder
+ val labels = buildMesosLabels().asJava
+
+ labelsBuilder.addAllLabels(labels)
+
+ taskBuilder.setLabels(labelsBuilder)
+
tasks(offer.getId) ::= taskBuilder.build()
remainingResources(offerId) = resourcesLeft.asJava
totalCoresAcquired += taskCPUs
@@ -422,6 +431,28 @@ private[spark] class MesosCoarseGrainedSchedulerBackend(
tasks.toMap
}
+ /**
+ * Builds the Mesos labels for a task from `taskLabels`, a comma-separated list of
+ * `key:value` pairs. Empty entries (such as the unset default) are skipped silently;
+ * any other entry that is not exactly one key and one value is logged and dropped.
+ */
+ private def buildMesosLabels(): List[Label] = {
+ // "".split(",") yields Array(""), so empty entries are dropped up front; otherwise the
+ // unset default would log a parse warning every time a task is built.
+ taskLabels.split(",").filter(_.nonEmpty).flatMap(label =>
+ label.split(":") match {
+ case Array(key, value) =>
+ Some(Label.newBuilder()
+ .setKey(key)
+ .setValue(value)
+ .build())
+ case _ =>
+ logWarning(s"Unable to parse $label into a key:value label for the task.")
+ None
+ }
+ ).toList
+ }
+
/** Extracts task needed resources from a list of available resources. */
private def partitionTaskResources(
resources: JList[Resource],
diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala
index eb83926ae4102..c040f05d93b3a 100644
--- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala
+++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala
@@ -475,6 +475,52 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite
assert(launchedTasks.head.getName == "test-mesos-dynamic-alloc 0")
}
+ test("mesos sets configurable labels on tasks") {
+ val taskLabelsString = "mesos:test,label:test"
+ setBackend(Map(
+ "spark.mesos.task.labels" -> taskLabelsString
+ ))
+
+ // Expected labels, in the same order as they appear in the configuration string
+ val taskLabels = Protos.Labels.newBuilder()
+ .addLabels(Protos.Label.newBuilder()
+ .setKey("mesos").setValue("test").build())
+ .addLabels(Protos.Label.newBuilder()
+ .setKey("label").setValue("test").build())
+ .build()
+
+ val offers = List(Resources(backend.executorMemory(sc), 1))
+ offerResources(offers)
+ val launchedTasks = verifyTaskLaunched(driver, "o1")
+
+ val labels = launchedTasks.head.getLabels
+
+ assert(labels.equals(taskLabels))
+ }
+
+ test("mesos ignored invalid labels and sets configurable labels on tasks") {
+ val taskLabelsString = "mesos:test,label:test,incorrect:label:here"
+ setBackend(Map(
+ "spark.mesos.task.labels" -> taskLabelsString
+ ))
+
+ // Expected labels: the malformed three-part entry must not produce a label
+ val taskLabels = Protos.Labels.newBuilder()
+ .addLabels(Protos.Label.newBuilder()
+ .setKey("mesos").setValue("test").build())
+ .addLabels(Protos.Label.newBuilder()
+ .setKey("label").setValue("test").build())
+ .build()
+
+ val offers = List(Resources(backend.executorMemory(sc), 1))
+ offerResources(offers)
+ val launchedTasks = verifyTaskLaunched(driver, "o1")
+
+ val labels = launchedTasks.head.getLabels
+
+ assert(labels.equals(taskLabels))
+ }
+
test("mesos supports spark.mesos.network.name") {
setBackend(Map(
"spark.mesos.network.name" -> "test-network-name"
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/StarSchemaDetection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/StarSchemaDetection.scala
new file mode 100644
index 0000000000000..91cb004eaec46
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/StarSchemaDetection.scala
@@ -0,0 +1,351 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import scala.annotation.tailrec
+
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.planning.PhysicalOperation
+import org.apache.spark.sql.catalyst.plans._
+import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.internal.SQLConf
+
+/**
+ * Encapsulates star-schema detection logic.
+ */
+case class StarSchemaDetection(conf: SQLConf) extends PredicateHelper {
+
+ /**
+ * Star schema consists of one or more fact tables referencing a number of dimension
+ * tables. In general, star-schema joins are detected using the following conditions:
+ * 1. Informational RI constraints (reliable detection)
+ * + Dimension contains a primary key that is being joined to the fact table.
+ * + Fact table contains foreign keys referencing multiple dimension tables.
+ * 2. Cardinality based heuristics
+ * + Usually, the table with the highest cardinality is the fact table.
+ * + Table being joined with the most number of tables is the fact table.
+ *
+ * To detect star joins, the algorithm uses a combination of the above two conditions.
+ * The fact table is chosen based on the cardinality heuristics, and the dimension
+ * tables are chosen based on the RI constraints. A star join will consist of the largest
+ * fact table joined with the dimension tables on their primary keys. To detect that a
+ * column is a primary key, the algorithm uses table and column statistics.
+ *
+ * The algorithm currently returns only the star join with the largest fact table.
+ * Choosing the largest fact table on the driving arm to avoid large inners is in
+ * general a good heuristic. This restriction will be lifted to observe multiple
+ * star joins.
+ *
+ * The highlights of the algorithm are the following:
+ *
+ * Given a set of joined tables/plans, the algorithm first verifies if they are eligible
+ * for star join detection. An eligible plan is a base table access with valid statistics.
+ * A base table access represents Project or Filter operators above a LeafNode. Conservatively,
+ * the algorithm only considers base table access as part of a star join since they provide
+ * reliable statistics. This restriction can be lifted with the CBO enablement by default.
+ *
+ * If some of the plans are not base table access, or statistics are not available, the algorithm
+ * returns an empty star join plan since, in the absence of statistics, it cannot make
+ * good planning decisions. Otherwise, the algorithm finds the table with the largest cardinality
+ * (number of rows), which is assumed to be a fact table.
+ *
+ * Next, it computes the set of dimension tables for the current fact table. A dimension table
+ * is assumed to be in a RI relationship with a fact table. To infer column uniqueness,
+ * the algorithm compares the number of distinct values with the total number of rows in the
+ * table. If their relative difference is within certain limits (i.e. ndvMaxError * 2, adjusted
+ * based on 1TB TPC-DS data), the column is assumed to be unique.
+ */
+  def findStarJoins(
+      input: Seq[LogicalPlan],
+      conditions: Seq[Expression]): Seq[LogicalPlan] = {
+
+    val emptyStarJoinPlan = Seq.empty[LogicalPlan]
+
+    if (!conf.starSchemaDetection || input.size < 2) {
+      emptyStarJoinPlan
+    } else {
+      // Find if the input plans are eligible for star join detection.
+      // An eligible plan is a base table access with valid statistics.
+      val foundEligibleJoin = input.forall {
+        case PhysicalOperation(_, _, t: LeafNode) if t.stats(conf).rowCount.isDefined => true
+        case _ => false
+      }
+
+      if (!foundEligibleJoin) {
+        // Some plans don't have stats or are complex plans. Conservatively,
+        // return an empty star join. This restriction can be lifted
+        // once statistics are propagated in the plan.
+        emptyStarJoinPlan
+      } else {
+        // Find the fact table using cardinality based heuristics i.e.
+        // the table with the largest number of rows.
+        val sortedFactTables = input.map { plan =>
+          TableAccessCardinality(plan, getTableAccessCardinality(plan))
+        }.collect { case t @ TableAccessCardinality(_, Some(_)) =>
+          t
+        }.sortBy(_.size)(implicitly[Ordering[Option[BigInt]]].reverse)
+
+        // The `::` patterns below only match a List, so normalize the Seq first.
+        sortedFactTables.toList match {
+          case Nil =>
+            emptyStarJoinPlan
+          case table1 :: table2 :: _
+            if table2.size.get.toDouble > conf.starSchemaFTRatio * table1.size.get.toDouble =>
+            // If the top largest tables have comparable number of rows, return an empty star plan.
+            // This restriction will be lifted when the algorithm is generalized
+            // to return multiple star plans.
+            emptyStarJoinPlan
+          case TableAccessCardinality(factTable, _) :: rest =>
+            // Find the fact table joins.
+            val allFactJoins = rest.collect { case TableAccessCardinality(plan, _)
+              if findJoinConditions(factTable, plan, conditions).nonEmpty =>
+              plan
+            }
+
+            // Find the corresponding join conditions.
+            val allFactJoinCond = allFactJoins.flatMap { plan =>
+              findJoinConditions(factTable, plan, conditions)
+            }
+
+            // Verify if the join columns have valid statistics.
+            // Allow any relational comparison between the tables. Later
+            // we will heuristically choose a subset of equi-join
+            // tables.
+            val areStatsAvailable = allFactJoins.forall { dimTable =>
+              allFactJoinCond.exists {
+                case BinaryComparison(lhs: AttributeReference, rhs: AttributeReference) =>
+                  val dimCol = if (dimTable.outputSet.contains(lhs)) lhs else rhs
+                  val factCol = if (factTable.outputSet.contains(lhs)) lhs else rhs
+                  hasStatistics(dimCol, dimTable) && hasStatistics(factCol, factTable)
+                case _ => false
+              }
+            }
+
+            if (!areStatsAvailable) {
+              emptyStarJoinPlan
+            } else {
+              // Find the subset of dimension tables. A dimension table is assumed to be in a
+              // RI relationship with the fact table. Only consider equi-joins
+              // between a fact and a dimension table to avoid expanding joins.
+              val eligibleDimPlans = allFactJoins.filter { dimTable =>
+                allFactJoinCond.exists {
+                  case Equality(lhs: AttributeReference, rhs: AttributeReference) =>
+                    val dimCol = if (dimTable.outputSet.contains(lhs)) lhs else rhs
+                    isUnique(dimCol, dimTable)
+                  case _ => false
+                }
+              }
+
+              if (eligibleDimPlans.size < 2) {
+                // An eligible star join was not found since the join is not
+                // an RI join, or the star join is an expanding join.
+                // Also, a star would involve more than one dimension table.
+                emptyStarJoinPlan
+              } else {
+                factTable +: eligibleDimPlans
+              }
+            }
+        }
+      }
+    }
+  }
+
+ /**
+ * Determines if a column referenced by a base table access is a primary key.
+ * A column is a PK if it is not nullable and has unique values.
+ * To determine if a column has unique values in the absence of informational
+ * RI constraints, the number of distinct values is compared to the total
+ * number of rows in the table. If their relative difference
+ * is within the expected limits (i.e. 2 * spark.sql.statistics.ndv.maxError based
+ * on TPC-DS data results), the column is assumed to have unique values.
+ */
+  private def isUnique(
+      column: Attribute,
+      plan: LogicalPlan): Boolean = plan match {
+    case PhysicalOperation(_, _, t: LeafNode) =>
+      val leafCol = findLeafNodeCol(column, plan)
+      leafCol match {
+        case Some(col) if t.outputSet.contains(col) =>
+          val stats = t.stats(conf)
+          stats.rowCount match {
+            case Some(rowCount) if rowCount >= 0 =>
+              stats.attributeStats.get(col) match {
+                // A column that contains nulls cannot be a primary key.
+                case Some(colStats) if colStats.nullCount > 0 => false
+                case Some(colStats) =>
+                  val distinctCount = colStats.distinctCount
+                  val relDiff = math.abs((distinctCount.toDouble / rowCount.toDouble) - 1.0d)
+                  // ndvMaxErr adjusted based on TPCDS 1TB data results
+                  relDiff <= conf.ndvMaxError * 2
+                // Without column statistics, uniqueness cannot be inferred.
+                case None => false
+              }
+            // Wildcard rather than `None`: a failed guard must yield false, not a MatchError.
+            case _ => false
+          }
+        // Wildcard for the same reason: covers both None and a failed guard.
+        case _ => false
+      }
+    case _ => false
+  }
+
+ /**
+ * Given a column over a base table access, it returns
+ * the leaf node column from which the input column is derived.
+ */
+  @tailrec
+  private def findLeafNodeCol(
+      column: Attribute,
+      plan: LogicalPlan): Option[Attribute] = plan match {
+    case access @ PhysicalOperation(_, _, _: LeafNode) =>
+      access match {
+        case leaf: LeafNode if leaf.outputSet.contains(column) => Option(column)
+        case project: Project =>
+          project.outputSet.find(_.semanticEquals(column)) match {
+            case Some(matched) => findLeafNodeCol(matched, project.child)
+            case None => None
+          }
+        case filter: Filter => findLeafNodeCol(column, filter.child)
+        case _ => None
+      }
+    case _ => None
+  }
+
+ /**
+ * Checks if a column has statistics.
+ * The column is assumed to be over a base table access.
+ */
+  private def hasStatistics(
+      column: Attribute,
+      plan: LogicalPlan): Boolean = plan match {
+    case PhysicalOperation(_, _, t: LeafNode) =>
+      findLeafNodeCol(column, plan) match {
+        case Some(col) if t.outputSet.contains(col) =>
+          val stats = t.stats(conf)
+          stats.attributeStats.nonEmpty && stats.attributeStats.contains(col)
+        // Wildcard rather than `None`: a failed guard must yield false, not a MatchError.
+        case _ => false
+      }
+    case _ => false
+  }
+
+ /**
+ * Returns the join predicates between two input plans. It only
+ * considers basic comparison operators.
+ */
+  @inline
+  private def findJoinConditions(
+      plan1: LogicalPlan,
+      plan2: LogicalPlan,
+      conditions: Seq[Expression]): Seq[Expression] = {
+    val allRefs = plan1.outputSet ++ plan2.outputSet
+    conditions.filter {
+      // Keep comparisons over both plans' output that neither plan can evaluate alone.
+      case cmp @ BinaryComparison(_, _) =>
+        !canEvaluate(cmp, plan1) && !canEvaluate(cmp, plan2) && cmp.references.subsetOf(allRefs)
+      case _ => false
+    }
+  }
+
+ /**
+ * Checks if a star join is a selective join. A star join is assumed
+ * to be selective if there are local predicates on the dimension
+ * tables.
+ */
+  private def isSelectiveStarJoin(
+      dimTables: Seq[LogicalPlan],
+      conditions: Seq[Expression]): Boolean = dimTables.exists {
+    case dimPlan @ PhysicalOperation(_, pushedFilters, _: LeafNode) =>
+      // IsNotNull predicates are skipped in both checks until predicate selectivity is
+      // available: in most cases the Optimizer introduces them artificially to enforce
+      // nullability constraints.
+      def countsAsFilter(pred: Expression): Boolean = !pred.isInstanceOf[IsNotNull]
+
+      // Is there a condition that can be evaluated on this dimension table alone?
+      val hasLocalPredicate =
+        conditions.exists(cond => countsAsFilter(cond) && canEvaluate(cond, dimPlan))
+      // Was any predicate pushed down into the base table access itself?
+      val hasPushedDownPredicate = pushedFilters.exists(countsAsFilter)
+      hasLocalPredicate || hasPushedDownPredicate
+    case _ => false
+  }
+
+  /**
+   * Helper case class pairing a plan with its row count; `size` is None when unknown.
+   */
+  private case class TableAccessCardinality(plan: LogicalPlan, size: Option[BigInt])
+
+ /**
+ * Returns the cardinality of a base table access. A base table access represents
+ * a LeafNode, or Project or Filter operators above a LeafNode.
+ */
+  private def getTableAccessCardinality(
+      input: LogicalPlan): Option[BigInt] = input match {
+    case PhysicalOperation(_, _, leaf: LeafNode) if leaf.stats(conf).rowCount.isDefined =>
+      // With CBO enabled, prefer the row count reported for the whole access plan
+      // (`input`). Fall back to the leaf node's own row count when CBO is disabled
+      // or when `input` does not report a row count.
+      val planRowCount = if (conf.cboEnabled) input.stats(conf).rowCount else None
+      planRowCount.orElse(leaf.stats(conf).rowCount)
+    case _ => None
+  }
+
+ /**
+ * Reorders a star join based on heuristics. It is called from ReorderJoin if CBO is disabled.
+ * 1) Finds the star join with the largest fact table.
+   * 2) Places the fact table on the driving arm of the left-deep tree.
+   *    This plan avoids large table access on the inner, and thus favors hash joins.
+ * 3) Applies the most selective dimensions early in the plan to reduce the amount of
+ * data flow.
+ */
+  def reorderStarJoins(
+      input: Seq[(LogicalPlan, InnerLike)],
+      conditions: Seq[Expression]): Seq[(LogicalPlan, InnerLike)] = {
+    assert(input.size >= 2)
+
+    val noReordering = Seq.empty[(LogicalPlan, InnerLike)]
+
+    // Only the plans joined with a plain Inner join take part in star detection.
+    val innerJoinPlans = input.collect { case (plan, Inner) => plan }
+
+    // findStarJoins currently returns at most one star join, the one with the
+    // largest fact table, laid out as the fact table followed by its dimensions.
+    findStarJoins(innerJoinPlans, conditions) match {
+      // Only consider selective joins. This case is detected by observing local
+      // predicates on the dimension tables. In a star schema relationship, the join
+      // between the fact and the dimension table is a FK-PK join. Heuristically, a
+      // selective dimension may reduce the result of a join.
+      case factTable +: dimTables if isSelectiveStarJoin(dimTables, conditions) =>
+        // Order the dimension tables by ascending cardinality. The sort is stable,
+        // so dimensions with equal cardinality keep their relative order.
+        val orderedDimTables = dimTables
+          .map(dim => (dim, getTableAccessCardinality(dim)))
+          .sortBy { case (_, rowCount) => rowCount }
+          .map { case (dim, _) => dim }
+
+        // The fact table comes first (the driving arm), followed by the ordered
+        // dimension tables; every entry of the result is tagged as an Inner join.
+        (factTable +: orderedDimTables).map(plan => (plan, Inner))
+
+      // Either no star join was found (empty result), or the star join found
+      // is not selective: report that there is nothing to reorder.
+      case _ =>
+        noReordering
+    }
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala
index 250dd07a16eb4..c3ab58744953d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala
@@ -20,338 +20,12 @@ package org.apache.spark.sql.catalyst.optimizer
import scala.annotation.tailrec
import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.planning.{ExtractFiltersAndInnerJoins, PhysicalOperation}
+import org.apache.spark.sql.catalyst.planning.ExtractFiltersAndInnerJoins
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules._
import org.apache.spark.sql.internal.SQLConf
-/**
- * Encapsulates star-schema join detection.
- */
-case class StarSchemaDetection(conf: SQLConf) extends PredicateHelper {
-
- /**
- * Star schema consists of one or more fact tables referencing a number of dimension
- * tables. In general, star-schema joins are detected using the following conditions:
- * 1. Informational RI constraints (reliable detection)
- * + Dimension contains a primary key that is being joined to the fact table.
- * + Fact table contains foreign keys referencing multiple dimension tables.
- * 2. Cardinality based heuristics
- * + Usually, the table with the highest cardinality is the fact table.
- * + Table being joined with the most number of tables is the fact table.
- *
- * To detect star joins, the algorithm uses a combination of the above two conditions.
- * The fact table is chosen based on the cardinality heuristics, and the dimension
- * tables are chosen based on the RI constraints. A star join will consist of the largest
- * fact table joined with the dimension tables on their primary keys. To detect that a
- * column is a primary key, the algorithm uses table and column statistics.
- *
- * Since Catalyst only supports left-deep tree plans, the algorithm currently returns only
- * the star join with the largest fact table. Choosing the largest fact table on the
- * driving arm to avoid large inners is in general a good heuristic. This restriction can
- * be lifted with support for bushy tree plans.
- *
- * The highlights of the algorithm are the following:
- *
- * Given a set of joined tables/plans, the algorithm first verifies if they are eligible
- * for star join detection. An eligible plan is a base table access with valid statistics.
- * A base table access represents Project or Filter operators above a LeafNode. Conservatively,
- * the algorithm only considers base table access as part of a star join since they provide
- * reliable statistics.
- *
- * If some of the plans are not base table access, or statistics are not available, the algorithm
- * returns an empty star join plan since, in the absence of statistics, it cannot make
- * good planning decisions. Otherwise, the algorithm finds the table with the largest cardinality
- * (number of rows), which is assumed to be a fact table.
- *
- * Next, it computes the set of dimension tables for the current fact table. A dimension table
- * is assumed to be in a RI relationship with a fact table. To infer column uniqueness,
- * the algorithm compares the number of distinct values with the total number of rows in the
- * table. If their relative difference is within certain limits (i.e. ndvMaxError * 2, adjusted
- * based on 1TB TPC-DS data), the column is assumed to be unique.
- */
- def findStarJoins(
- input: Seq[LogicalPlan],
- conditions: Seq[Expression]): Seq[Seq[LogicalPlan]] = {
-
- val emptyStarJoinPlan = Seq.empty[Seq[LogicalPlan]]
-
- if (!conf.starSchemaDetection || input.size < 2) {
- emptyStarJoinPlan
- } else {
- // Find if the input plans are eligible for star join detection.
- // An eligible plan is a base table access with valid statistics.
- val foundEligibleJoin = input.forall {
- case PhysicalOperation(_, _, t: LeafNode) if t.stats(conf).rowCount.isDefined => true
- case _ => false
- }
-
- if (!foundEligibleJoin) {
- // Some plans don't have stats or are complex plans. Conservatively,
- // return an empty star join. This restriction can be lifted
- // once statistics are propagated in the plan.
- emptyStarJoinPlan
- } else {
- // Find the fact table using cardinality based heuristics i.e.
- // the table with the largest number of rows.
- val sortedFactTables = input.map { plan =>
- TableAccessCardinality(plan, getTableAccessCardinality(plan))
- }.collect { case t @ TableAccessCardinality(_, Some(_)) =>
- t
- }.sortBy(_.size)(implicitly[Ordering[Option[BigInt]]].reverse)
-
- sortedFactTables match {
- case Nil =>
- emptyStarJoinPlan
- case table1 :: table2 :: _
- if table2.size.get.toDouble > conf.starSchemaFTRatio * table1.size.get.toDouble =>
- // If the top largest tables have comparable number of rows, return an empty star plan.
- // This restriction will be lifted when the algorithm is generalized
- // to return multiple star plans.
- emptyStarJoinPlan
- case TableAccessCardinality(factTable, _) :: rest =>
- // Find the fact table joins.
- val allFactJoins = rest.collect { case TableAccessCardinality(plan, _)
- if findJoinConditions(factTable, plan, conditions).nonEmpty =>
- plan
- }
-
- // Find the corresponding join conditions.
- val allFactJoinCond = allFactJoins.flatMap { plan =>
- val joinCond = findJoinConditions(factTable, plan, conditions)
- joinCond
- }
-
- // Verify if the join columns have valid statistics.
- // Allow any relational comparison between the tables. Later
- // we will heuristically choose a subset of equi-join
- // tables.
- val areStatsAvailable = allFactJoins.forall { dimTable =>
- allFactJoinCond.exists {
- case BinaryComparison(lhs: AttributeReference, rhs: AttributeReference) =>
- val dimCol = if (dimTable.outputSet.contains(lhs)) lhs else rhs
- val factCol = if (factTable.outputSet.contains(lhs)) lhs else rhs
- hasStatistics(dimCol, dimTable) && hasStatistics(factCol, factTable)
- case _ => false
- }
- }
-
- if (!areStatsAvailable) {
- emptyStarJoinPlan
- } else {
- // Find the subset of dimension tables. A dimension table is assumed to be in a
- // RI relationship with the fact table. Only consider equi-joins
- // between a fact and a dimension table to avoid expanding joins.
- val eligibleDimPlans = allFactJoins.filter { dimTable =>
- allFactJoinCond.exists {
- case cond @ Equality(lhs: AttributeReference, rhs: AttributeReference) =>
- val dimCol = if (dimTable.outputSet.contains(lhs)) lhs else rhs
- isUnique(dimCol, dimTable)
- case _ => false
- }
- }
-
- if (eligibleDimPlans.isEmpty) {
- // An eligible star join was not found because the join is not
- // an RI join, or the star join is an expanding join.
- emptyStarJoinPlan
- } else {
- Seq(factTable +: eligibleDimPlans)
- }
- }
- }
- }
- }
- }
-
- /**
- * Reorders a star join based on heuristics:
- * 1) Finds the star join with the largest fact table and places it on the driving
- * arm of the left-deep tree. This plan avoids large table access on the inner, and
- * thus favor hash joins.
- * 2) Applies the most selective dimensions early in the plan to reduce the amount of
- * data flow.
- */
- def reorderStarJoins(
- input: Seq[(LogicalPlan, InnerLike)],
- conditions: Seq[Expression]): Seq[(LogicalPlan, InnerLike)] = {
- assert(input.size >= 2)
-
- val emptyStarJoinPlan = Seq.empty[(LogicalPlan, InnerLike)]
-
- // Find the eligible star plans. Currently, it only returns
- // the star join with the largest fact table.
- val eligibleJoins = input.collect{ case (plan, Inner) => plan }
- val starPlans = findStarJoins(eligibleJoins, conditions)
-
- if (starPlans.isEmpty) {
- emptyStarJoinPlan
- } else {
- val starPlan = starPlans.head
- val (factTable, dimTables) = (starPlan.head, starPlan.tail)
-
- // Only consider selective joins. This case is detected by observing local predicates
- // on the dimension tables. In a star schema relationship, the join between the fact and the
- // dimension table is a FK-PK join. Heuristically, a selective dimension may reduce
- // the result of a join.
- // Also, conservatively assume that a fact table is joined with more than one dimension.
- if (dimTables.size >= 2 && isSelectiveStarJoin(dimTables, conditions)) {
- val reorderDimTables = dimTables.map { plan =>
- TableAccessCardinality(plan, getTableAccessCardinality(plan))
- }.sortBy(_.size).map {
- case TableAccessCardinality(p1, _) => p1
- }
-
- val reorderStarPlan = factTable +: reorderDimTables
- reorderStarPlan.map(plan => (plan, Inner))
- } else {
- emptyStarJoinPlan
- }
- }
- }
-
- /**
- * Determines if a column referenced by a base table access is a primary key.
- * A column is a PK if it is not nullable and has unique values.
- * To determine if a column has unique values in the absence of informational
- * RI constraints, the number of distinct values is compared to the total
- * number of rows in the table. If their relative difference
- * is within the expected limits (i.e. 2 * spark.sql.statistics.ndv.maxError based
- * on TPCDS data results), the column is assumed to have unique values.
- */
- private def isUnique(
- column: Attribute,
- plan: LogicalPlan): Boolean = plan match {
- case PhysicalOperation(_, _, t: LeafNode) =>
- val leafCol = findLeafNodeCol(column, plan)
- leafCol match {
- case Some(col) if t.outputSet.contains(col) =>
- val stats = t.stats(conf)
- stats.rowCount match {
- case Some(rowCount) if rowCount >= 0 =>
- if (stats.attributeStats.nonEmpty && stats.attributeStats.contains(col)) {
- val colStats = stats.attributeStats.get(col)
- if (colStats.get.nullCount > 0) {
- false
- } else {
- val distinctCount = colStats.get.distinctCount
- val relDiff = math.abs((distinctCount.toDouble / rowCount.toDouble) - 1.0d)
- // ndvMaxErr adjusted based on TPCDS 1TB data results
- relDiff <= conf.ndvMaxError * 2
- }
- } else {
- false
- }
- case None => false
- }
- case None => false
- }
- case _ => false
- }
-
- /**
- * Given a column over a base table access, it returns
- * the leaf node column from which the input column is derived.
- */
- @tailrec
- private def findLeafNodeCol(
- column: Attribute,
- plan: LogicalPlan): Option[Attribute] = plan match {
- case pl @ PhysicalOperation(_, _, _: LeafNode) =>
- pl match {
- case t: LeafNode if t.outputSet.contains(column) =>
- Option(column)
- case p: Project if p.outputSet.exists(_.semanticEquals(column)) =>
- val col = p.outputSet.find(_.semanticEquals(column)).get
- findLeafNodeCol(col, p.child)
- case f: Filter =>
- findLeafNodeCol(column, f.child)
- case _ => None
- }
- case _ => None
- }
-
- /**
- * Checks if a column has statistics.
- * The column is assumed to be over a base table access.
- */
- private def hasStatistics(
- column: Attribute,
- plan: LogicalPlan): Boolean = plan match {
- case PhysicalOperation(_, _, t: LeafNode) =>
- val leafCol = findLeafNodeCol(column, plan)
- leafCol match {
- case Some(col) if t.outputSet.contains(col) =>
- val stats = t.stats(conf)
- stats.attributeStats.nonEmpty && stats.attributeStats.contains(col)
- case None => false
- }
- case _ => false
- }
-
- /**
- * Returns the join predicates between two input plans. It only
- * considers basic comparison operators.
- */
- @inline
- private def findJoinConditions(
- plan1: LogicalPlan,
- plan2: LogicalPlan,
- conditions: Seq[Expression]): Seq[Expression] = {
- val refs = plan1.outputSet ++ plan2.outputSet
- conditions.filter {
- case BinaryComparison(_, _) => true
- case _ => false
- }.filterNot(canEvaluate(_, plan1))
- .filterNot(canEvaluate(_, plan2))
- .filter(_.references.subsetOf(refs))
- }
-
- /**
- * Checks if a star join is a selective join. A star join is assumed
- * to be selective if there are local predicates on the dimension
- * tables.
- */
- private def isSelectiveStarJoin(
- dimTables: Seq[LogicalPlan],
- conditions: Seq[Expression]): Boolean = dimTables.exists {
- case plan @ PhysicalOperation(_, p, _: LeafNode) =>
- // Checks if any condition applies to the dimension tables.
- // Exclude the IsNotNull predicates until predicate selectivity is available.
- // In most cases, this predicate is artificially introduced by the Optimizer
- // to enforce nullability constraints.
- val localPredicates = conditions.filterNot(_.isInstanceOf[IsNotNull])
- .exists(canEvaluate(_, plan))
-
- // Checks if there are any predicates pushed down to the base table access.
- val pushedDownPredicates = p.nonEmpty && !p.forall(_.isInstanceOf[IsNotNull])
-
- localPredicates || pushedDownPredicates
- case _ => false
- }
-
- /**
- * Helper case class to hold (plan, rowCount) pairs.
- */
- private case class TableAccessCardinality(plan: LogicalPlan, size: Option[BigInt])
-
- /**
- * Returns the cardinality of a base table access. A base table access represents
- * a LeafNode, or Project or Filter operators above a LeafNode.
- */
- private def getTableAccessCardinality(
- input: LogicalPlan): Option[BigInt] = input match {
- case PhysicalOperation(_, cond, t: LeafNode) if t.stats(conf).rowCount.isDefined =>
- if (conf.cboEnabled && input.stats(conf).rowCount.isDefined) {
- Option(input.stats(conf).rowCount.get)
- } else {
- Option(t.stats(conf).rowCount.get)
- }
- case _ => None
- }
-}
-
/**
* Reorder the joins and push all the conditions into join, so that the bottom ones have at least
* one condition.
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/StarJoinReorderSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/StarJoinReorderSuite.scala
index 003ce49eaf8e6..605c01b7220d1 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/StarJoinReorderSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/StarJoinReorderSuite.scala
@@ -206,7 +206,7 @@ class StarJoinReorderSuite extends PlanTest with StatsEstimationTestBase {
// and d3_fk1 = s3_pk1
//
// Default join reordering: d1, f1, d2, d3, s3
- // Star join reordering: f1, d1, d3, d2,, d3
+ // Star join reordering: f1, d1, d3, d2, s3
val query =
d1.join(f1).join(d2).join(s3).join(d3)
@@ -242,7 +242,7 @@ class StarJoinReorderSuite extends PlanTest with StatsEstimationTestBase {
// and d3_fk1 = s3_pk1
//
// Default join reordering: d1, f1, d2, d3, s3
- // Star join reordering: f1, d1, d3, d2, d3
+ // Star join reordering: f1, d1, d3, d2, s3
val query =
d1.join(f1).join(d2).join(s3).join(d3)
.where((nameToAttr("f1_fk1") === nameToAttr("d1_pk1")) &&
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala
index 137b0cbc84f8f..074952ff7900a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala
@@ -283,7 +283,7 @@ abstract class Catalog {
/**
* :: Experimental ::
- * Creates a table from the given path based on a data source and a set of options.
+ * Creates a table based on the dataset in a data source and a set of options.
* Then, returns the corresponding DataFrame.
*
* @param tableName is either a qualified or unqualified name that designates a table.
@@ -321,7 +321,7 @@ abstract class Catalog {
/**
* :: Experimental ::
* (Scala-specific)
- * Creates a table from the given path based on a data source and a set of options.
+ * Creates a table based on the dataset in a data source and a set of options.
* Then, returns the corresponding DataFrame.
*
* @param tableName is either a qualified or unqualified name that designates a table.
@@ -357,7 +357,7 @@ abstract class Catalog {
/**
* :: Experimental ::
- * Create a table from the given path based on a data source, a schema and a set of options.
+ * Create a table based on the dataset in a data source, a schema and a set of options.
* Then, returns the corresponding DataFrame.
*
* @param tableName is either a qualified or unqualified name that designates a table.
@@ -397,7 +397,7 @@ abstract class Catalog {
/**
* :: Experimental ::
* (Scala-specific)
- * Create a table from the given path based on a data source, a schema and a set of options.
+ * Create a table based on the dataset in a data source, a schema and a set of options.
* Then, returns the corresponding DataFrame.
*
* @param tableName is either a qualified or unqualified name that designates a table.
@@ -447,6 +447,7 @@ abstract class Catalog {
/**
* Recovers all the partitions in the directory of a table and update the catalog.
+ * Only works with a partitioned table, and not a view.
*
* @param tableName is either a qualified or unqualified name that designates a table.
* If no database identifier is provided, it refers to a table in the
@@ -493,10 +494,10 @@ abstract class Catalog {
def clearCache(): Unit
/**
- * Invalidates and refreshes all the cached metadata of the given table. For performance reasons,
- * Spark SQL or the external data source library it uses might cache certain metadata about a
- * table, such as the location of blocks. When those change outside of Spark SQL, users should
- * call this function to invalidate the cache.
+ * Invalidates and refreshes all the cached data and metadata of the given table. For performance
+ * reasons, Spark SQL or the external data source library it uses might cache certain metadata
+ * about a table, such as the location of blocks. When those change outside of Spark SQL, users
+ * should call this function to invalidate the cache.
*
* If this table is cached as an InMemoryRelation, drop the original cached version and make the
* new version cached lazily.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
index 5d1c35aba529a..aebb663df5c92 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
@@ -141,7 +141,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
}
/**
- * Returns a list of columns for the given table temporary view.
+ * Returns a list of columns for the given table/view or temporary view.
*/
@throws[AnalysisException]("table does not exist")
override def listColumns(tableName: String): Dataset[Column] = {
@@ -150,7 +150,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
}
/**
- * Returns a list of columns for the given table in the specified database.
+ * Returns a list of columns for the given table/view or temporary view in the specified database.
*/
@throws[AnalysisException]("database or table does not exist")
override def listColumns(dbName: String, tableName: String): Dataset[Column] = {
@@ -273,7 +273,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
/**
* :: Experimental ::
- * Creates a table from the given path based on a data source and returns the corresponding
+ * Creates a table from the given path and returns the corresponding
* DataFrame.
*
* @group ddl_ops
@@ -287,7 +287,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
/**
* :: Experimental ::
* (Scala-specific)
- * Creates a table from the given path based on a data source and a set of options.
+ * Creates a table based on the dataset in a data source and a set of options.
* Then, returns the corresponding DataFrame.
*
* @group ddl_ops
@@ -304,7 +304,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
/**
* :: Experimental ::
* (Scala-specific)
- * Creates a table from the given path based on a data source, a schema and a set of options.
+ * Creates a table based on the dataset in a data source, a schema and a set of options.
* Then, returns the corresponding DataFrame.
*
* @group ddl_ops
@@ -367,6 +367,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
/**
* Recovers all the partitions in the directory of a table and update the catalog.
+ * Only works with a partitioned table, and not a temporary view.
*
* @param tableName is either a qualified or unqualified name that designates a table.
* If no database identifier is provided, it refers to a table in the
@@ -431,8 +432,12 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
}
/**
- * Refreshes the cache entry for a table or view, if any. For Hive metastore table, the metadata
- * is refreshed. For data source tables, the schema will not be inferred and refreshed.
+ * Invalidates and refreshes all the cached data and metadata of the given table or view.
+ * For Hive metastore tables, the metadata is refreshed. For data source tables, the schema will
+ * not be inferred and refreshed.
+ *
+ * If this table is cached as an InMemoryRelation, drop the original cached version and make the
+ * new version cached lazily.
*
* @group cachemgmt
* @since 2.0.0
@@ -456,7 +461,8 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
/**
* Refreshes the cache entry and the associated metadata for all Dataset (if any), that contain
- * the given data source path.
+ * the given data source path. Path matching is by prefix, i.e. "/" would invalidate
+ * everything that is cached.
*
* @group cachemgmt
* @since 2.0.0