diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 6d46c3190626..29a8a0062873 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.0 +Version: 2.3.1 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 3219c6f0cc47..c51eb0f39c4b 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -179,6 +179,7 @@ exportMethods("arrange", "with", "withColumn", "withColumnRenamed", + "withWatermark", "write.df", "write.jdbc", "write.json", diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index fe238f6dd4eb..29f3e986eaab 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2853,7 +2853,7 @@ setMethod("intersect", #' except #' #' Return a new SparkDataFrame containing rows in this SparkDataFrame -#' but not in another SparkDataFrame. This is equivalent to \code{EXCEPT} in SQL. +#' but not in another SparkDataFrame. This is equivalent to \code{EXCEPT DISTINCT} in SQL. #' #' @param x a SparkDataFrame. #' @param y a SparkDataFrame. @@ -3054,10 +3054,10 @@ setMethod("describe", #' \item stddev #' \item min #' \item max -#' \item arbitrary approximate percentiles specified as a percentage (eg, "75%") +#' \item arbitrary approximate percentiles specified as a percentage (eg, "75\%") #' } #' If no statistics are given, this function computes count, mean, stddev, min, -#' approximate quartiles (percentiles at 25%, 50%, and 75%), and max. +#' approximate quartiles (percentiles at 25\%, 50\%, and 75\%), and max. #' This function is meant for exploratory data analysis, as we make no guarantee about the #' backward compatibility of the schema of the resulting Dataset. If you want to #' programmatically compute summary statistics, use the \code{agg} function instead. @@ -3661,7 +3661,8 @@ setMethod("getNumPartitions", #' isStreaming #' #' Returns TRUE if this SparkDataFrame contains one or more sources that continuously return data -#' as it arrives. +#' as it arrives. A dataset that reads data from a streaming source must be executed as a +#' \code{StreamingQuery} using \code{write.stream}. #' #' @param x A SparkDataFrame #' @return TRUE if this SparkDataFrame is from a streaming source @@ -3707,7 +3708,17 @@ setMethod("isStreaming", #' @param df a streaming SparkDataFrame. #' @param source a name for external data source. #' @param outputMode one of 'append', 'complete', 'update'. -#' @param ... additional argument(s) passed to the method. +#' @param partitionBy a name or a list of names of columns to partition the output by on the file +#' system. If specified, the output is laid out on the file system similar to Hive's +#' partitioning scheme. +#' @param trigger.processingTime a processing time interval as a string, e.g. '5 seconds', +#' '1 minute'. This is a trigger that runs a query periodically based on the processing +#' time. If value is '0 seconds', the query will run as fast as possible, this is the +#' default. Only one trigger can be set. +#' @param trigger.once a logical, must be set to \code{TRUE}. This is a trigger that processes only +#' one batch of data in a streaming query then terminates the query. Only one trigger can be +#' set. +#' @param ... additional external data source specific named options. 
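# Minimal usage sketch of the new write.stream() arguments documented above
# (partitionBy, trigger.processingTime, trigger.once). The JSON source path,
# output/checkpoint paths, and the "year"/"month" partition columns are
# illustrative assumptions, not values taken from this change.
library(SparkR)
sparkR.session()
schema <- structType(structField("value", "string"),
                     structField("year", "integer"),
                     structField("month", "integer"))
events <- read.stream("json", path = "/tmp/events", schema = schema)
q <- write.stream(events, "parquet", path = "/tmp/events-out",
                  checkpointLocation = "/tmp/events-cp",
                  partitionBy = c("year", "month"),
                  trigger.processingTime = "30 seconds")
# trigger.once = TRUE would instead process a single batch and stop;
# supplying both triggers is rejected with "Multiple triggers not allowed."
stopQuery(q)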
#' #' @family SparkDataFrame functions #' @seealso \link{read.stream} @@ -3725,7 +3736,8 @@ setMethod("isStreaming", #' # console #' q <- write.stream(wordCounts, "console", outputMode = "complete") #' # text stream -#' q <- write.stream(df, "text", path = "/home/user/out", checkpointLocation = "/home/user/cp") +#' q <- write.stream(df, "text", path = "/home/user/out", checkpointLocation = "/home/user/cp" +#' partitionBy = c("year", "month"), trigger.processingTime = "30 seconds") #' # memory stream #' q <- write.stream(wordCounts, "memory", queryName = "outs", outputMode = "complete") #' head(sql("SELECT * from outs")) @@ -3737,7 +3749,8 @@ setMethod("isStreaming", #' @note experimental setMethod("write.stream", signature(df = "SparkDataFrame"), - function(df, source = NULL, outputMode = NULL, ...) { + function(df, source = NULL, outputMode = NULL, partitionBy = NULL, + trigger.processingTime = NULL, trigger.once = NULL, ...) { if (!is.null(source) && !is.character(source)) { stop("source should be character, NULL or omitted. It is the data source specified ", "in 'spark.sql.sources.default' configuration by default.") @@ -3748,12 +3761,43 @@ setMethod("write.stream", if (is.null(source)) { source <- getDefaultSqlSource() } + cols <- NULL + if (!is.null(partitionBy)) { + if (!all(sapply(partitionBy, function(c) { is.character(c) }))) { + stop("All partitionBy column names should be characters.") + } + cols <- as.list(partitionBy) + } + jtrigger <- NULL + if (!is.null(trigger.processingTime) && !is.na(trigger.processingTime)) { + if (!is.null(trigger.once)) { + stop("Multiple triggers not allowed.") + } + interval <- as.character(trigger.processingTime) + if (nchar(interval) == 0) { + stop("Value for trigger.processingTime must be a non-empty string.") + } + jtrigger <- handledCallJStatic("org.apache.spark.sql.streaming.Trigger", + "ProcessingTime", + interval) + } else if (!is.null(trigger.once) && !is.na(trigger.once)) { + if (!is.logical(trigger.once) || !trigger.once) { + stop("Value for trigger.once must be TRUE.") + } + jtrigger <- callJStatic("org.apache.spark.sql.streaming.Trigger", "Once") + } options <- varargsToStrEnv(...) write <- handledCallJMethod(df@sdf, "writeStream") write <- callJMethod(write, "format", source) if (!is.null(outputMode)) { write <- callJMethod(write, "outputMode", outputMode) } + if (!is.null(cols)) { + write <- callJMethod(write, "partitionBy", cols) + } + if (!is.null(jtrigger)) { + write <- callJMethod(write, "trigger", jtrigger) + } write <- callJMethod(write, "options", options) ssq <- handledCallJMethod(write, "start") streamingQuery(ssq) @@ -3967,3 +4011,47 @@ setMethod("broadcast", sdf <- callJStatic("org.apache.spark.sql.functions", "broadcast", x@sdf) dataFrame(sdf) }) + +#' withWatermark +#' +#' Defines an event time watermark for this streaming SparkDataFrame. A watermark tracks a point in +#' time before which we assume no more late data is going to arrive. +#' +#' Spark will use this watermark for several purposes: +#' \itemize{ +#' \item To know when a given time window aggregation can be finalized and thus can be emitted +#' when using output modes that do not allow updates. +#' \item To minimize the amount of state that we need to keep for on-going aggregations. +#' } +#' The current watermark is computed by looking at the \code{MAX(eventTime)} seen across +#' all of the partitions in the query minus a user specified \code{delayThreshold}. 
Due to the cost +#' of coordinating this value across partitions, the actual watermark used is only guaranteed +#' to be at least \code{delayThreshold} behind the actual event time. In some cases we may still +#' process records that arrive more than \code{delayThreshold} late. +#' +#' @param x a streaming SparkDataFrame +#' @param eventTime a string specifying the name of the Column that contains the event time of the +#' row. +#' @param delayThreshold a string specifying the minimum delay to wait to data to arrive late, +#' relative to the latest record that has been processed in the form of an +#' interval (e.g. "1 minute" or "5 hours"). NOTE: This should not be negative. +#' @return a SparkDataFrame. +#' @aliases withWatermark,SparkDataFrame,character,character-method +#' @family SparkDataFrame functions +#' @rdname withWatermark +#' @name withWatermark +#' @export +#' @examples +#' \dontrun{ +#' sparkR.session() +#' schema <- structType(structField("time", "timestamp"), structField("value", "double")) +#' df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1) +#' df <- withWatermark(df, "time", "10 minutes") +#' } +#' @note withWatermark since 2.3.0 +setMethod("withWatermark", + signature(x = "SparkDataFrame", eventTime = "character", delayThreshold = "character"), + function(x, eventTime, delayThreshold) { + sdf <- callJMethod(x@sdf, "withWatermark", eventTime, delayThreshold) + dataFrame(sdf) + }) diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 3b7f71bbbffb..9d0a2d5e074e 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -727,7 +727,9 @@ read.jdbc <- function(url, tableName, #' @param schema The data schema defined in structType or a DDL-formatted string, this is #' required for file-based streaming data source #' @param ... additional external data source specific named options, for instance \code{path} for -#' file-based streaming data source +#' file-based streaming data source. \code{timeZone} to indicate a timezone to be used to +#' parse timestamps in the JSON/CSV data sources or partition values; If it isn't set, it +#' uses the default value, session local timezone. #' @return SparkDataFrame #' @rdname read.stream #' @name read.stream diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 5369c32544e5..e0dde3339fab 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -799,6 +799,12 @@ setGeneric("withColumn", function(x, colName, col) { standardGeneric("withColumn setGeneric("withColumnRenamed", function(x, existingCol, newCol) { standardGeneric("withColumnRenamed") }) +#' @rdname withWatermark +#' @export +setGeneric("withWatermark", function(x, eventTime, delayThreshold) { + standardGeneric("withWatermark") +}) + #' @rdname write.df #' @export setGeneric("write.df", function(df, path = NULL, ...) { standardGeneric("write.df") }) diff --git a/R/pkg/R/mllib_recommendation.R b/R/pkg/R/mllib_recommendation.R index fa794249085d..5441c4a4022a 100644 --- a/R/pkg/R/mllib_recommendation.R +++ b/R/pkg/R/mllib_recommendation.R @@ -48,6 +48,8 @@ setClass("ALSModel", representation(jobj = "jobj")) #' @param numUserBlocks number of user blocks used to parallelize computation (> 0). #' @param numItemBlocks number of item blocks used to parallelize computation (> 0). #' @param checkpointInterval number of checkpoint intervals (>= 1) or disable checkpoint (-1). +#' Note: this setting will be ignored if the checkpoint directory is not +#' set. #' @param ... additional argument(s) passed to the method. 
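# Sketch of the checkpointInterval note above, with an illustrative checkpoint
# directory and a toy ratings SparkDataFrame (both are assumptions for the
# example): checkpointInterval only has an effect once a checkpoint directory
# has been set for the session.
library(SparkR)
sparkR.session()
setCheckpointDir("/tmp/spark-checkpoints")  # omit this and checkpointInterval is ignored
ratings <- createDataFrame(data.frame(user = c(0, 0, 1, 1),
                                      item = c(0, 1, 0, 2),
                                      rating = c(4.0, 2.0, 3.0, 5.0)))
model <- spark.als(ratings, "rating", "user", "item",
                   rank = 10, maxIter = 5, checkpointInterval = 2)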
#' @return \code{spark.als} returns a fitted ALS model. #' @rdname spark.als diff --git a/R/pkg/R/mllib_tree.R b/R/pkg/R/mllib_tree.R index 89a58bf0aada..4e5ddf22ee16 100644 --- a/R/pkg/R/mllib_tree.R +++ b/R/pkg/R/mllib_tree.R @@ -161,6 +161,8 @@ print.summary.decisionTree <- function(x) { #' >= 1. #' @param minInfoGain Minimum information gain for a split to be considered at a tree node. #' @param checkpointInterval Param for set checkpoint interval (>= 1) or disable checkpoint (-1). +#' Note: this setting will be ignored if the checkpoint directory is not +#' set. #' @param maxMemoryInMB Maximum memory in MB allocated to histogram aggregation. #' @param cacheNodeIds If FALSE, the algorithm will pass trees to executors to match instances with #' nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching @@ -382,6 +384,8 @@ setMethod("write.ml", signature(object = "GBTClassificationModel", path = "chara #' @param minInstancesPerNode Minimum number of instances each child must have after split. #' @param minInfoGain Minimum information gain for a split to be considered at a tree node. #' @param checkpointInterval Param for set checkpoint interval (>= 1) or disable checkpoint (-1). +#' Note: this setting will be ignored if the checkpoint directory is not +#' set. #' @param maxMemoryInMB Maximum memory in MB allocated to histogram aggregation. #' @param cacheNodeIds If FALSE, the algorithm will pass trees to executors to match instances with #' nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching @@ -595,6 +599,8 @@ setMethod("write.ml", signature(object = "RandomForestClassificationModel", path #' @param minInstancesPerNode Minimum number of instances each child must have after split. #' @param minInfoGain Minimum information gain for a split to be considered at a tree node. #' @param checkpointInterval Param for set checkpoint interval (>= 1) or disable checkpoint (-1). +#' Note: this setting will be ignored if the checkpoint directory is not +#' set. #' @param maxMemoryInMB Maximum memory in MB allocated to histogram aggregation. #' @param cacheNodeIds If FALSE, the algorithm will pass trees to executors to match instances with #' nodes. If TRUE, the algorithm will cache node IDs for each instance. 
Caching diff --git a/R/pkg/tests/fulltests/test_streaming.R b/R/pkg/tests/fulltests/test_streaming.R index 54f40bbd5f51..a354d50c6b54 100644 --- a/R/pkg/tests/fulltests/test_streaming.R +++ b/R/pkg/tests/fulltests/test_streaming.R @@ -172,6 +172,113 @@ test_that("Terminated by error", { stopQuery(q) }) +test_that("PartitionBy", { + parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet") + checkpointPath <- tempfile(pattern = "sparkr-test", fileext = ".checkpoint") + textPath <- tempfile(pattern = "sparkr-test", fileext = ".text") + df <- read.df(jsonPath, "json", stringSchema) + write.df(df, parquetPath, "parquet", "overwrite") + + df <- read.stream(path = parquetPath, schema = stringSchema) + + expect_error(write.stream(df, "json", path = textPath, checkpointLocation = "append", + partitionBy = c(1, 2)), + "All partitionBy column names should be characters") + + q <- write.stream(df, "json", path = textPath, checkpointLocation = "append", + partitionBy = "name") + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + + dirs <- list.files(textPath) + expect_equal(length(dirs[substring(dirs, 1, nchar("name=")) == "name="]), 3) + + unlink(checkpointPath) + unlink(textPath) + unlink(parquetPath) +}) + +test_that("Watermark", { + parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet") + schema <- structType(structField("value", "string")) + t <- Sys.time() + df <- as.DataFrame(lapply(list(t), as.character), schema) + write.df(df, parquetPath, "parquet", "append") + df <- read.stream(path = parquetPath, schema = "value STRING") + df <- withColumn(df, "eventTime", cast(df$value, "timestamp")) + df <- withWatermark(df, "eventTime", "10 seconds") + counts <- count(group_by(df, "eventTime")) + q <- write.stream(counts, "memory", queryName = "times", outputMode = "append") + + # first events + df <- as.DataFrame(lapply(list(t + 1, t, t + 2), as.character), schema) + write.df(df, parquetPath, "parquet", "append") + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + + # advance watermark to 15 + df <- as.DataFrame(lapply(list(t + 25), as.character), schema) + write.df(df, parquetPath, "parquet", "append") + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + + # old events, should be dropped + df <- as.DataFrame(lapply(list(t), as.character), schema) + write.df(df, parquetPath, "parquet", "append") + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + + # evict events less than previous watermark + df <- as.DataFrame(lapply(list(t + 25), as.character), schema) + write.df(df, parquetPath, "parquet", "append") + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + + times <- collect(sql("SELECT * FROM times")) + # looks like write timing can affect the first bucket; but it should be t + expect_equal(times[order(times$eventTime),][1, 2], 2) + + stopQuery(q) + unlink(parquetPath) +}) + +test_that("Trigger", { + parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet") + schema <- structType(structField("value", "string")) + df <- as.DataFrame(lapply(list(Sys.time()), as.character), schema) + write.df(df, parquetPath, "parquet", "append") + df <- read.stream(path = parquetPath, schema = "value STRING") + + expect_error(write.stream(df, "memory", queryName = "times", outputMode = "append", + trigger.processingTime = "", trigger.once = ""), "Multiple triggers not allowed.") + + expect_error(write.stream(df, "memory", queryName = "times", outputMode = 
"append", + trigger.processingTime = ""), + "Value for trigger.processingTime must be a non-empty string.") + + expect_error(write.stream(df, "memory", queryName = "times", outputMode = "append", + trigger.processingTime = "invalid"), "illegal argument") + + expect_error(write.stream(df, "memory", queryName = "times", outputMode = "append", + trigger.once = ""), "Value for trigger.once must be TRUE.") + + expect_error(write.stream(df, "memory", queryName = "times", outputMode = "append", + trigger.once = FALSE), "Value for trigger.once must be TRUE.") + + q <- write.stream(df, "memory", queryName = "times", outputMode = "append", trigger.once = TRUE) + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + df <- as.DataFrame(lapply(list(Sys.time()), as.character), schema) + write.df(df, parquetPath, "parquet", "append") + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + + expect_equal(nrow(collect(sql("SELECT * FROM times"))), 1) + + stopQuery(q) + unlink(parquetPath) +}) + unlink(jsonPath) unlink(jsonPathNa) diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 2e662424b25f..feca617c2554 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -1042,7 +1042,7 @@ unlink(modelPath) ## Structured Streaming -SparkR supports the Structured Streaming API (experimental). +SparkR supports the Structured Streaming API. You can check the Structured Streaming Programming Guide for [an introduction](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#programming-model) to its programming model and basic concepts. diff --git a/assembly/pom.xml b/assembly/pom.xml index b3b4239771bc..5c5a8e9b8232 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../pom.xml diff --git a/bin/docker-image-tool.sh b/bin/docker-image-tool.sh new file mode 100755 index 000000000000..071406336d1b --- /dev/null +++ b/bin/docker-image-tool.sh @@ -0,0 +1,145 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This script builds and pushes docker images when run from a release of Spark +# with Kubernetes support. + +function error { + echo "$@" 1>&2 + exit 1 +} + +if [ -z "${SPARK_HOME}" ]; then + SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" +fi +. "${SPARK_HOME}/bin/load-spark-env.sh" + +function image_ref { + local image="$1" + local add_repo="${2:-1}" + if [ $add_repo = 1 ] && [ -n "$REPO" ]; then + image="$REPO/$image" + fi + if [ -n "$TAG" ]; then + image="$image:$TAG" + fi + echo "$image" +} + +function build { + local BUILD_ARGS + local IMG_PATH + + if [ ! 
-f "$SPARK_HOME/RELEASE" ]; then + # Set image build arguments accordingly if this is a source repo and not a distribution archive. + IMG_PATH=resource-managers/kubernetes/docker/src/main/dockerfiles + BUILD_ARGS=( + --build-arg + img_path=$IMG_PATH + --build-arg + spark_jars=assembly/target/scala-$SPARK_SCALA_VERSION/jars + ) + else + # Not passed as an argument to docker, but used to validate the Spark directory. + IMG_PATH="kubernetes/dockerfiles" + fi + + if [ ! -d "$IMG_PATH" ]; then + error "Cannot find docker image. This script must be run from a runnable distribution of Apache Spark." + fi + + docker build "${BUILD_ARGS[@]}" \ + -t $(image_ref spark) \ + -f "$IMG_PATH/spark/Dockerfile" . +} + +function push { + docker push "$(image_ref spark)" +} + +function usage { + cat </dev/null; then + error "Cannot find minikube." + fi + eval $(minikube docker-env) + ;; + esac +done + +case "${@: -1}" in + build) + build + ;; + push) + if [ -z "$REPO" ]; then + usage + exit 1 + fi + push + ;; + *) + usage + exit 1 + ;; +esac diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index cf93d41cd77c..2a625da682cd 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java index 4f9e10ca2006..0e491efac918 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java @@ -83,6 +83,7 @@ public LevelDB(File path, KVStoreSerializer serializer) throws Exception { if (versionData != null) { long version = serializer.deserializeLong(versionData); if (version != STORE_VERSION) { + close(); throw new UnsupportedStoreVersionException(); } } else { diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 18cbdadd224a..adb189043909 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/network-common/src/main/java/org/apache/spark/network/buffer/FileSegmentManagedBuffer.java b/common/network-common/src/main/java/org/apache/spark/network/buffer/FileSegmentManagedBuffer.java index c20fab83c346..8b8f9892847c 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/buffer/FileSegmentManagedBuffer.java +++ b/common/network-common/src/main/java/org/apache/spark/network/buffer/FileSegmentManagedBuffer.java @@ -24,6 +24,7 @@ import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; +import java.nio.file.StandardOpenOption; import com.google.common.base.Objects; import com.google.common.io.ByteStreams; @@ -132,7 +133,7 @@ public Object convertToNetty() throws IOException { if (conf.lazyFileDescriptor()) { return new DefaultFileRegion(file, offset, length); } else { - FileChannel fileChannel = new FileInputStream(file).getChannel(); + FileChannel fileChannel = FileChannel.open(file.toPath(), StandardOpenOption.READ); return new DefaultFileRegion(fileChannel, offset, length); } } diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 9968480ab765..4cdcfa2a4da9 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 
2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleIndexInformation.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleIndexInformation.java index eacf485344b7..386738ece51a 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleIndexInformation.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleIndexInformation.java @@ -19,10 +19,10 @@ import java.io.DataInputStream; import java.io.File; -import java.io.FileInputStream; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.LongBuffer; +import java.nio.file.Files; /** * Keeps the index information for a particular map output @@ -39,7 +39,7 @@ public ShuffleIndexInformation(File indexFile) throws IOException { offsets = buffer.asLongBuffer(); DataInputStream dis = null; try { - dis = new DataInputStream(new FileInputStream(indexFile)); + dis = new DataInputStream(Files.newInputStream(indexFile.toPath())); dis.readFully(buffer.array()); } finally { if (dis != null) { diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index ec2db6e5bb88..494137f8e9ec 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 2d59c71cc375..32bebf5eeca9 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index f7e586ee777e..8dc4ddeea3ec 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index a3772a262008..5613f5e81384 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java index cc9cc429643a..a9603c1aba05 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java @@ -31,8 +31,7 @@ public class HeapMemoryAllocator implements MemoryAllocator { @GuardedBy("this") - private final Map>> bufferPoolsBySize = - new HashMap<>(); + private final Map>> bufferPoolsBySize = new HashMap<>(); private static final int POOLING_THRESHOLD_BYTES = 1024 * 1024; @@ -49,13 +48,14 @@ private boolean shouldPool(long size) { public MemoryBlock allocate(long size) throws OutOfMemoryError { if (shouldPool(size)) { synchronized (this) { - final LinkedList> pool = bufferPoolsBySize.get(size); + final LinkedList> pool = bufferPoolsBySize.get(size); if (pool != null) { while (!pool.isEmpty()) { - final WeakReference blockReference = pool.pop(); - final MemoryBlock memory = blockReference.get(); - if (memory != null) { - assert (memory.size() == size); + final WeakReference arrayReference = pool.pop(); + final long[] array = arrayReference.get(); + if (array != null) { + assert (array.length * 8L >= size); + MemoryBlock memory = new MemoryBlock(array, 
Platform.LONG_ARRAY_OFFSET, size); if (MemoryAllocator.MEMORY_DEBUG_FILL_ENABLED) { memory.fill(MemoryAllocator.MEMORY_DEBUG_FILL_CLEAN_VALUE); } @@ -76,18 +76,36 @@ public MemoryBlock allocate(long size) throws OutOfMemoryError { @Override public void free(MemoryBlock memory) { + assert (memory.obj != null) : + "baseObject was null; are you trying to use the on-heap allocator to free off-heap memory?"; + assert (memory.pageNumber != MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER) : + "page has already been freed"; + assert ((memory.pageNumber == MemoryBlock.NO_PAGE_NUMBER) + || (memory.pageNumber == MemoryBlock.FREED_IN_TMM_PAGE_NUMBER)) : + "TMM-allocated pages must first be freed via TMM.freePage(), not directly in allocator " + + "free()"; + final long size = memory.size(); if (MemoryAllocator.MEMORY_DEBUG_FILL_ENABLED) { memory.fill(MemoryAllocator.MEMORY_DEBUG_FILL_FREED_VALUE); } + + // Mark the page as freed (so we can detect double-frees). + memory.pageNumber = MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER; + + // As an additional layer of defense against use-after-free bugs, we mutate the + // MemoryBlock to null out its reference to the long[] array. + long[] array = (long[]) memory.obj; + memory.setObjAndOffset(null, 0); + if (shouldPool(size)) { synchronized (this) { - LinkedList> pool = bufferPoolsBySize.get(size); + LinkedList> pool = bufferPoolsBySize.get(size); if (pool == null) { pool = new LinkedList<>(); bufferPoolsBySize.put(size, pool); } - pool.add(new WeakReference<>(memory)); + pool.add(new WeakReference<>(array)); } } else { // Do nothing diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java index cd1d378bc147..c333857358d3 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java @@ -26,6 +26,25 @@ */ public class MemoryBlock extends MemoryLocation { + /** Special `pageNumber` value for pages which were not allocated by TaskMemoryManagers */ + public static final int NO_PAGE_NUMBER = -1; + + /** + * Special `pageNumber` value for marking pages that have been freed in the TaskMemoryManager. + * We set `pageNumber` to this value in TaskMemoryManager.freePage() so that MemoryAllocator + * can detect if pages which were allocated by TaskMemoryManager have been freed in the TMM + * before being passed to MemoryAllocator.free() (it is an error to allocate a page in + * TaskMemoryManager and then directly free it in a MemoryAllocator without going through + * the TMM freePage() call). + */ + public static final int FREED_IN_TMM_PAGE_NUMBER = -2; + + /** + * Special `pageNumber` value for pages that have been freed by the MemoryAllocator. This allows + * us to detect double-frees. + */ + public static final int FREED_IN_ALLOCATOR_PAGE_NUMBER = -3; + private final long length; /** @@ -33,7 +52,7 @@ public class MemoryBlock extends MemoryLocation { * TaskMemoryManager. This field is public so that it can be modified by the TaskMemoryManager, * which lives in a different package. 
*/ - public int pageNumber = -1; + public int pageNumber = NO_PAGE_NUMBER; public MemoryBlock(@Nullable Object obj, long offset, long length) { super(obj, offset); diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/UnsafeMemoryAllocator.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/UnsafeMemoryAllocator.java index 55bcdf1ed7b0..4368fb615ba1 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/UnsafeMemoryAllocator.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/UnsafeMemoryAllocator.java @@ -38,9 +38,20 @@ public MemoryBlock allocate(long size) throws OutOfMemoryError { public void free(MemoryBlock memory) { assert (memory.obj == null) : "baseObject not null; are you trying to use the off-heap allocator to free on-heap memory?"; + assert (memory.pageNumber != MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER) : + "page has already been freed"; + assert ((memory.pageNumber == MemoryBlock.NO_PAGE_NUMBER) + || (memory.pageNumber == MemoryBlock.FREED_IN_TMM_PAGE_NUMBER)) : + "TMM-allocated pages must be freed via TMM.freePage(), not directly in allocator free()"; + if (MemoryAllocator.MEMORY_DEBUG_FILL_ENABLED) { memory.fill(MemoryAllocator.MEMORY_DEBUG_FILL_FREED_VALUE); } Platform.freeMemory(memory.offset); + // As an additional layer of defense against use-after-free bugs, we mutate the + // MemoryBlock to reset its pointer. + memory.offset = 0; + // Mark the page as freed (so we can detect double-frees). + memory.pageNumber = MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER; } } diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java index 4b141339ec81..62854837b05e 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java @@ -62,6 +62,52 @@ public void overlappingCopyMemory() { } } + @Test + public void onHeapMemoryAllocatorPoolingReUsesLongArrays() { + MemoryBlock block1 = MemoryAllocator.HEAP.allocate(1024 * 1024); + Object baseObject1 = block1.getBaseObject(); + MemoryAllocator.HEAP.free(block1); + MemoryBlock block2 = MemoryAllocator.HEAP.allocate(1024 * 1024); + Object baseObject2 = block2.getBaseObject(); + Assert.assertSame(baseObject1, baseObject2); + MemoryAllocator.HEAP.free(block2); + } + + @Test + public void freeingOnHeapMemoryBlockResetsBaseObjectAndOffset() { + MemoryBlock block = MemoryAllocator.HEAP.allocate(1024); + Assert.assertNotNull(block.getBaseObject()); + MemoryAllocator.HEAP.free(block); + Assert.assertNull(block.getBaseObject()); + Assert.assertEquals(0, block.getBaseOffset()); + Assert.assertEquals(MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER, block.pageNumber); + } + + @Test + public void freeingOffHeapMemoryBlockResetsOffset() { + MemoryBlock block = MemoryAllocator.UNSAFE.allocate(1024); + Assert.assertNull(block.getBaseObject()); + Assert.assertNotEquals(0, block.getBaseOffset()); + MemoryAllocator.UNSAFE.free(block); + Assert.assertNull(block.getBaseObject()); + Assert.assertEquals(0, block.getBaseOffset()); + Assert.assertEquals(MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER, block.pageNumber); + } + + @Test(expected = AssertionError.class) + public void onHeapMemoryAllocatorThrowsAssertionErrorOnDoubleFree() { + MemoryBlock block = MemoryAllocator.HEAP.allocate(1024); + MemoryAllocator.HEAP.free(block); + MemoryAllocator.HEAP.free(block); + } + + @Test(expected = 
AssertionError.class) + public void offHeapMemoryAllocatorThrowsAssertionErrorOnDoubleFree() { + MemoryBlock block = MemoryAllocator.UNSAFE.allocate(1024); + MemoryAllocator.UNSAFE.free(block); + MemoryAllocator.UNSAFE.free(block); + } + @Test public void memoryDebugFillEnabledInTest() { Assert.assertTrue(MemoryAllocator.MEMORY_DEBUG_FILL_ENABLED); @@ -71,9 +117,11 @@ public void memoryDebugFillEnabledInTest() { MemoryAllocator.MEMORY_DEBUG_FILL_CLEAN_VALUE); MemoryBlock onheap1 = MemoryAllocator.HEAP.allocate(1024 * 1024); + Object onheap1BaseObject = onheap1.getBaseObject(); + long onheap1BaseOffset = onheap1.getBaseOffset(); MemoryAllocator.HEAP.free(onheap1); Assert.assertEquals( - Platform.getByte(onheap1.getBaseObject(), onheap1.getBaseOffset()), + Platform.getByte(onheap1BaseObject, onheap1BaseOffset), MemoryAllocator.MEMORY_DEBUG_FILL_FREED_VALUE); MemoryBlock onheap2 = MemoryAllocator.HEAP.allocate(1024 * 1024); Assert.assertEquals( diff --git a/core/pom.xml b/core/pom.xml index 0a5bd958fc9c..c34c9df39063 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../pom.xml diff --git a/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java b/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java index e8d3730daa7a..632d71806221 100644 --- a/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java +++ b/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java @@ -321,8 +321,12 @@ public MemoryBlock allocatePage(long size, MemoryConsumer consumer) { * Free a block of memory allocated via {@link TaskMemoryManager#allocatePage}. */ public void freePage(MemoryBlock page, MemoryConsumer consumer) { - assert (page.pageNumber != -1) : + assert (page.pageNumber != MemoryBlock.NO_PAGE_NUMBER) : "Called freePage() on memory that wasn't allocated with allocatePage()"; + assert (page.pageNumber != MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER) : + "Called freePage() on a memory block that has already been freed"; + assert (page.pageNumber != MemoryBlock.FREED_IN_TMM_PAGE_NUMBER) : + "Called freePage() on a memory block that has already been freed"; assert(allocatedPages.get(page.pageNumber)); pageTable[page.pageNumber] = null; synchronized (this) { @@ -332,6 +336,10 @@ public void freePage(MemoryBlock page, MemoryConsumer consumer) { logger.trace("Freed page number {} ({} bytes)", page.pageNumber, page.size()); } long pageSize = page.size(); + // Clear the page number before passing the block to the MemoryAllocator's free(). + // Doing this allows the MemoryAllocator to detect when a TaskMemoryManager-managed + // page has been inappropriately directly freed without calling TMM.freePage(). 
+ page.pageNumber = MemoryBlock.FREED_IN_TMM_PAGE_NUMBER; memoryManager.tungstenMemoryAllocator().free(page); releaseExecutionMemory(pageSize, consumer); } @@ -358,7 +366,7 @@ public long encodePageNumberAndOffset(MemoryBlock page, long offsetInPage) { @VisibleForTesting public static long encodePageNumberAndOffset(int pageNumber, long offsetInPage) { - assert (pageNumber != -1) : "encodePageNumberAndOffset called with invalid page"; + assert (pageNumber >= 0) : "encodePageNumberAndOffset called with invalid page"; return (((long) pageNumber) << OFFSET_BITS) | (offsetInPage & MASK_LONG_LOWER_51_BITS); } @@ -424,6 +432,7 @@ public long cleanUpAllAllocatedMemory() { for (MemoryBlock page : pageTable) { if (page != null) { logger.debug("unreleased page: " + page + " in task " + taskAttemptId); + page.pageNumber = MemoryBlock.FREED_IN_TMM_PAGE_NUMBER; memoryManager.tungstenMemoryAllocator().free(page); } } diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala index 2e00dc8b49dd..6c59038f2a6c 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala @@ -195,8 +195,11 @@ private[spark] class ExecutorAllocationManager( throw new SparkException( "spark.dynamicAllocation.sustainedSchedulerBacklogTimeout must be > 0!") } - if (executorIdleTimeoutS <= 0) { - throw new SparkException("spark.dynamicAllocation.executorIdleTimeout must be > 0!") + if (executorIdleTimeoutS < 0) { + throw new SparkException("spark.dynamicAllocation.executorIdleTimeout must be >= 0!") + } + if (cachedExecutorIdleTimeoutS < 0) { + throw new SparkException("spark.dynamicAllocation.cachedExecutorIdleTimeout must be >= 0!") } // Require external shuffle service for dynamic allocation // Otherwise, we may lose shuffle files when killing executors diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index d77303e6fdf8..f53b2bed74c6 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -640,9 +640,9 @@ private[spark] object SparkConf extends Logging { translation = s => s"${s.toLong * 10}s")), "spark.reducer.maxSizeInFlight" -> Seq( AlternateConfig("spark.reducer.maxMbInFlight", "1.4")), - "spark.kryoserializer.buffer" -> - Seq(AlternateConfig("spark.kryoserializer.buffer.mb", "1.4", - translation = s => s"${(s.toDouble * 1000).toInt}k")), + "spark.kryoserializer.buffer" -> Seq( + AlternateConfig("spark.kryoserializer.buffer.mb", "1.4", + translation = s => s"${(s.toDouble * 1000).toInt}k")), "spark.kryoserializer.buffer.max" -> Seq( AlternateConfig("spark.kryoserializer.buffer.max.mb", "1.4")), "spark.shuffle.file.buffer" -> Seq( diff --git a/core/src/main/scala/org/apache/spark/TaskContext.scala b/core/src/main/scala/org/apache/spark/TaskContext.scala index 0b87cd503d4f..69739745aa6c 100644 --- a/core/src/main/scala/org/apache/spark/TaskContext.scala +++ b/core/src/main/scala/org/apache/spark/TaskContext.scala @@ -66,7 +66,7 @@ object TaskContext { * An empty task context that does not represent an actual task. This is only used in tests. 
*/ private[spark] def empty(): TaskContextImpl = { - new TaskContextImpl(0, 0, 0, 0, null, new Properties, null) + new TaskContextImpl(0, 0, 0, 0, 0, null, new Properties, null) } } @@ -150,6 +150,13 @@ abstract class TaskContext extends Serializable { */ def stageId(): Int + /** + * How many times the stage that this task belongs to has been attempted. The first stage attempt + * will be assigned stageAttemptNumber = 0, and subsequent attempts will have increasing attempt + * numbers. + */ + def stageAttemptNumber(): Int + /** * The ID of the RDD partition that is computed by this task. */ diff --git a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala index 01d8973e1bb0..cccd3ea457ba 100644 --- a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala +++ b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala @@ -41,8 +41,9 @@ import org.apache.spark.util._ * `TaskMetrics` & `MetricsSystem` objects are not thread safe. */ private[spark] class TaskContextImpl( - val stageId: Int, - val partitionId: Int, + override val stageId: Int, + override val stageAttemptNumber: Int, + override val partitionId: Int, override val taskAttemptId: Long, override val attemptNumber: Int, override val taskMemoryManager: TaskMemoryManager, diff --git a/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala b/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala index e88988fe03b2..8d7a4a353a79 100644 --- a/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala +++ b/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala @@ -21,6 +21,8 @@ import java.util.concurrent.atomic.AtomicLong import scala.reflect.ClassTag +import org.apache.commons.collections.map.{AbstractReferenceMap, ReferenceMap} + import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.internal.Logging @@ -52,6 +54,10 @@ private[spark] class BroadcastManager( private val nextBroadcastId = new AtomicLong(0) + private[broadcast] val cachedValues = { + new ReferenceMap(AbstractReferenceMap.HARD, AbstractReferenceMap.WEAK) + } + def newBroadcast[T: ClassTag](value_ : T, isLocal: Boolean): Broadcast[T] = { broadcastFactory.newBroadcast[T](value_, isLocal, nextBroadcastId.getAndIncrement()) } diff --git a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala index 7aecd3c9668e..e125095cf477 100644 --- a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala +++ b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala @@ -206,36 +206,50 @@ private[spark] class TorrentBroadcast[T: ClassTag](obj: T, id: Long) private def readBroadcastBlock(): T = Utils.tryOrIOException { TorrentBroadcast.synchronized { - setConf(SparkEnv.get.conf) - val blockManager = SparkEnv.get.blockManager - blockManager.getLocalValues(broadcastId) match { - case Some(blockResult) => - if (blockResult.data.hasNext) { - val x = blockResult.data.next().asInstanceOf[T] - releaseLock(broadcastId) - x - } else { - throw new SparkException(s"Failed to get locally stored broadcast data: $broadcastId") - } - case None => - logInfo("Started reading broadcast variable " + id) - val startTimeMs = System.currentTimeMillis() - val blocks = readBlocks() - logInfo("Reading broadcast variable " + id + " took" + Utils.getUsedTimeMs(startTimeMs)) - - try { - val obj = TorrentBroadcast.unBlockifyObject[T]( - 
blocks.map(_.toInputStream()), SparkEnv.get.serializer, compressionCodec) - // Store the merged copy in BlockManager so other tasks on this executor don't - // need to re-fetch it. - val storageLevel = StorageLevel.MEMORY_AND_DISK - if (!blockManager.putSingle(broadcastId, obj, storageLevel, tellMaster = false)) { - throw new SparkException(s"Failed to store $broadcastId in BlockManager") + val broadcastCache = SparkEnv.get.broadcastManager.cachedValues + + Option(broadcastCache.get(broadcastId)).map(_.asInstanceOf[T]).getOrElse { + setConf(SparkEnv.get.conf) + val blockManager = SparkEnv.get.blockManager + blockManager.getLocalValues(broadcastId) match { + case Some(blockResult) => + if (blockResult.data.hasNext) { + val x = blockResult.data.next().asInstanceOf[T] + releaseLock(broadcastId) + + if (x != null) { + broadcastCache.put(broadcastId, x) + } + + x + } else { + throw new SparkException(s"Failed to get locally stored broadcast data: $broadcastId") } - obj - } finally { - blocks.foreach(_.dispose()) - } + case None => + logInfo("Started reading broadcast variable " + id) + val startTimeMs = System.currentTimeMillis() + val blocks = readBlocks() + logInfo("Reading broadcast variable " + id + " took" + Utils.getUsedTimeMs(startTimeMs)) + + try { + val obj = TorrentBroadcast.unBlockifyObject[T]( + blocks.map(_.toInputStream()), SparkEnv.get.serializer, compressionCodec) + // Store the merged copy in BlockManager so other tasks on this executor don't + // need to re-fetch it. + val storageLevel = StorageLevel.MEMORY_AND_DISK + if (!blockManager.putSingle(broadcastId, obj, storageLevel, tellMaster = false)) { + throw new SparkException(s"Failed to store $broadcastId in BlockManager") + } + + if (obj != null) { + broadcastCache.put(broadcastId, obj) + } + + obj + } finally { + blocks.foreach(_.dispose()) + } + } } } } diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index cbe1f2c3e08a..1e381965c52b 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -584,10 +584,11 @@ object SparkSubmit extends CommandLineUtils with Logging { confKey = "spark.executor.memory"), OptionAssigner(args.totalExecutorCores, STANDALONE | MESOS | KUBERNETES, ALL_DEPLOY_MODES, confKey = "spark.cores.max"), - OptionAssigner(args.files, LOCAL | STANDALONE | MESOS, ALL_DEPLOY_MODES, + OptionAssigner(args.files, LOCAL | STANDALONE | MESOS | KUBERNETES, ALL_DEPLOY_MODES, confKey = "spark.files"), OptionAssigner(args.jars, LOCAL, CLIENT, confKey = "spark.jars"), - OptionAssigner(args.jars, STANDALONE | MESOS, ALL_DEPLOY_MODES, confKey = "spark.jars"), + OptionAssigner(args.jars, STANDALONE | MESOS | KUBERNETES, ALL_DEPLOY_MODES, + confKey = "spark.jars"), OptionAssigner(args.driverMemory, STANDALONE | MESOS | YARN | KUBERNETES, CLUSTER, confKey = "spark.driver.memory"), OptionAssigner(args.driverCores, STANDALONE | MESOS | YARN | KUBERNETES, CLUSTER, diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala index 75484f5c9f30..0ec4afad0308 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala @@ -28,6 +28,7 @@ import org.eclipse.jetty.servlet.{ServletContextHandler, ServletHolder} import org.apache.spark.{SecurityManager, SparkConf} 
import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.deploy.history.config.HISTORY_SERVER_UI_PORT import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.status.api.v1.{ApiRootResource, ApplicationInfo, UIRoot} @@ -276,7 +277,7 @@ object HistoryServer extends Logging { .newInstance(conf) .asInstanceOf[ApplicationHistoryProvider] - val port = conf.getInt("spark.history.ui.port", 18080) + val port = conf.get(HISTORY_SERVER_UI_PORT) val server = new HistoryServer(conf, provider, securityManager, port) server.bind() diff --git a/core/src/main/scala/org/apache/spark/deploy/history/config.scala b/core/src/main/scala/org/apache/spark/deploy/history/config.scala index 22b6d49d8e2a..efdbf672bb52 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/config.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/config.scala @@ -44,4 +44,9 @@ private[spark] object config { .bytesConf(ByteUnit.BYTE) .createWithDefaultString("10g") + val HISTORY_SERVER_UI_PORT = ConfigBuilder("spark.history.ui.port") + .doc("Web UI port to bind Spark History Server") + .intConf + .createWithDefault(18080) + } diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index eb12ddf96131..bbfcfbaa7363 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -38,10 +38,13 @@ package object config { ConfigBuilder("spark.driver.userClassPathFirst").booleanConf.createWithDefault(false) private[spark] val DRIVER_MEMORY = ConfigBuilder("spark.driver.memory") + .doc("Amount of memory to use for the driver process, in MiB unless otherwise specified.") .bytesConf(ByteUnit.MiB) .createWithDefaultString("1g") private[spark] val DRIVER_MEMORY_OVERHEAD = ConfigBuilder("spark.driver.memoryOverhead") + .doc("The amount of off-heap memory to be allocated per driver in cluster mode, " + + "in MiB unless otherwise specified.") .bytesConf(ByteUnit.MiB) .createOptional @@ -62,6 +65,7 @@ package object config { .createWithDefault(false) private[spark] val EVENT_LOG_OUTPUT_BUFFER_SIZE = ConfigBuilder("spark.eventLog.buffer.kb") + .doc("Buffer size to use when writing to output streams, in KiB unless otherwise specified.") .bytesConf(ByteUnit.KiB) .createWithDefaultString("100k") @@ -81,10 +85,13 @@ package object config { ConfigBuilder("spark.executor.userClassPathFirst").booleanConf.createWithDefault(false) private[spark] val EXECUTOR_MEMORY = ConfigBuilder("spark.executor.memory") + .doc("Amount of memory to use per executor process, in MiB unless otherwise specified.") .bytesConf(ByteUnit.MiB) .createWithDefaultString("1g") private[spark] val EXECUTOR_MEMORY_OVERHEAD = ConfigBuilder("spark.executor.memoryOverhead") + .doc("The amount of off-heap memory to be allocated per executor in cluster mode, " + + "in MiB unless otherwise specified.") .bytesConf(ByteUnit.MiB) .createOptional @@ -353,7 +360,7 @@ package object config { private[spark] val BUFFER_WRITE_CHUNK_SIZE = ConfigBuilder("spark.buffer.write.chunkSize") .internal() - .doc("The chunk size during writing out the bytes of ChunkedByteBuffer.") + .doc("The chunk size in bytes during writing out the bytes of ChunkedByteBuffer.") .bytesConf(ByteUnit.BYTE) .checkValue(_ <= Int.MaxValue, "The chunk size during writing out the bytes of" + " ChunkedByteBuffer should not larger than Int.MaxValue.") @@ -368,9 +375,9 @@ 
package object config { private[spark] val SHUFFLE_ACCURATE_BLOCK_THRESHOLD = ConfigBuilder("spark.shuffle.accurateBlockThreshold") - .doc("When we compress the size of shuffle blocks in HighlyCompressedMapStatus, we will " + - "record the size accurately if it's above this config. This helps to prevent OOM by " + - "avoiding underestimating shuffle block size when fetch shuffle blocks.") + .doc("Threshold in bytes above which the size of shuffle blocks in " + + "HighlyCompressedMapStatus is accurately recorded. This helps to prevent OOM " + + "by avoiding underestimating shuffle block size when fetch shuffle blocks.") .bytesConf(ByteUnit.BYTE) .createWithDefault(100 * 1024 * 1024) @@ -389,23 +396,23 @@ package object config { private[spark] val REDUCER_MAX_BLOCKS_IN_FLIGHT_PER_ADDRESS = ConfigBuilder("spark.reducer.maxBlocksInFlightPerAddress") - .doc("This configuration limits the number of remote blocks being fetched per reduce task" + - " from a given host port. When a large number of blocks are being requested from a given" + - " address in a single fetch or simultaneously, this could crash the serving executor or" + - " Node Manager. This is especially useful to reduce the load on the Node Manager when" + - " external shuffle is enabled. You can mitigate the issue by setting it to a lower value.") + .doc("This configuration limits the number of remote blocks being fetched per reduce task " + + "from a given host port. When a large number of blocks are being requested from a given " + + "address in a single fetch or simultaneously, this could crash the serving executor or " + + "Node Manager. This is especially useful to reduce the load on the Node Manager when " + + "external shuffle is enabled. You can mitigate the issue by setting it to a lower value.") .intConf .checkValue(_ > 0, "The max no. of blocks in flight cannot be non-positive.") .createWithDefault(Int.MaxValue) private[spark] val MAX_REMOTE_BLOCK_SIZE_FETCH_TO_MEM = ConfigBuilder("spark.maxRemoteBlockSizeFetchToMem") - .doc("Remote block will be fetched to disk when size of the block is " + - "above this threshold. This is to avoid a giant request takes too much memory. We can " + - "enable this config by setting a specific value(e.g. 200m). Note this configuration will " + - "affect both shuffle fetch and block manager remote block fetch. For users who " + - "enabled external shuffle service, this feature can only be worked when external shuffle" + - " service is newer than Spark 2.2.") + .doc("Remote block will be fetched to disk when size of the block is above this threshold " + + "in bytes. This is to avoid a giant request takes too much memory. We can enable this " + + "config by setting a specific value(e.g. 200m). Note this configuration will affect " + + "both shuffle fetch and block manager remote block fetch. For users who enabled " + + "external shuffle service, this feature can only be worked when external shuffle" + + "service is newer than Spark 2.2.") .bytesConf(ByteUnit.BYTE) .createWithDefault(Long.MaxValue) @@ -419,9 +426,9 @@ package object config { private[spark] val SHUFFLE_FILE_BUFFER_SIZE = ConfigBuilder("spark.shuffle.file.buffer") - .doc("Size of the in-memory buffer for each shuffle file output stream. " + - "These buffers reduce the number of disk seeks and system calls made " + - "in creating intermediate shuffle files.") + .doc("Size of the in-memory buffer for each shuffle file output stream, in KiB unless " + + "otherwise specified. 
These buffers reduce the number of disk seeks and system calls " + + "made in creating intermediate shuffle files.") .bytesConf(ByteUnit.KiB) .checkValue(v => v > 0 && v <= Int.MaxValue / 1024, s"The file buffer size must be greater than 0 and less than ${Int.MaxValue / 1024}.") @@ -430,7 +437,7 @@ package object config { private[spark] val SHUFFLE_UNSAFE_FILE_OUTPUT_BUFFER_SIZE = ConfigBuilder("spark.shuffle.unsafe.file.output.buffer") .doc("The file system for this buffer size after each partition " + - "is written in unsafe shuffle writer.") + "is written in unsafe shuffle writer. In KiB unless otherwise specified.") .bytesConf(ByteUnit.KiB) .checkValue(v => v > 0 && v <= Int.MaxValue / 1024, s"The buffer size must be greater than 0 and less than ${Int.MaxValue / 1024}.") @@ -438,7 +445,7 @@ package object config { private[spark] val SHUFFLE_DISK_WRITE_BUFFER_SIZE = ConfigBuilder("spark.shuffle.spill.diskWriteBufferSize") - .doc("The buffer size to use when writing the sorted records to an on-disk file.") + .doc("The buffer size, in bytes, to use when writing the sorted records to an on-disk file.") .bytesConf(ByteUnit.BYTE) .checkValue(v => v > 0 && v <= Int.MaxValue, s"The buffer size must be greater than 0 and less than ${Int.MaxValue}.") diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 50f51e1af453..6d0059b6a027 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -28,8 +28,9 @@ import org.apache.spark.util.Utils * * 1. Implementations must be serializable, as the committer instance instantiated on the driver * will be used for tasks on executors. - * 2. Implementations should have a constructor with 2 arguments: - * (jobId: String, path: String) + * 2. Implementations should have a constructor with 2 or 3 arguments: + * (jobId: String, path: String) or + * (jobId: String, path: String, dynamicPartitionOverwrite: Boolean) * 3. A committer should not be reused across multiple Spark jobs. * * The proper call sequence is: @@ -139,10 +140,22 @@ object FileCommitProtocol { /** * Instantiates a FileCommitProtocol using the given className. */ - def instantiate(className: String, jobId: String, outputPath: String) - : FileCommitProtocol = { + def instantiate( + className: String, + jobId: String, + outputPath: String, + dynamicPartitionOverwrite: Boolean = false): FileCommitProtocol = { val clazz = Utils.classForName(className).asInstanceOf[Class[FileCommitProtocol]] - val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String]) - ctor.newInstance(jobId, outputPath) + // First try the constructor with arguments (jobId: String, outputPath: String, + // dynamicPartitionOverwrite: Boolean). + // If that doesn't exist, try the one with (jobId: string, outputPath: String). 
+ try { + val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String], classOf[Boolean]) + ctor.newInstance(jobId, outputPath, dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) + } catch { + case _: NoSuchMethodException => + val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String]) + ctor.newInstance(jobId, outputPath) + } } } diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 95c99d29c3a9..6d20ef1f98a3 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -39,8 +39,19 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil * * @param jobId the job's or stage's id * @param path the job's output path, or null if committer acts as a noop + * @param dynamicPartitionOverwrite If true, Spark will overwrite partition directories at runtime + * dynamically, i.e., we first write files under a staging + * directory with partition path, e.g. + * /path/to/staging/a=1/b=1/xxx.parquet. When committing the job, + * we first clean up the corresponding partition directories at + * destination path, e.g. /path/to/destination/a=1/b=1, and move + * files from staging directory to the corresponding partition + * directories under destination path. */ -class HadoopMapReduceCommitProtocol(jobId: String, path: String) +class HadoopMapReduceCommitProtocol( + jobId: String, + path: String, + dynamicPartitionOverwrite: Boolean = false) extends FileCommitProtocol with Serializable with Logging { import FileCommitProtocol._ @@ -67,9 +78,17 @@ class HadoopMapReduceCommitProtocol(jobId: String, path: String) @transient private var addedAbsPathFiles: mutable.Map[String, String] = null /** - * The staging directory for all files committed with absolute output paths. + * Tracks partitions with default path that have new files written into them by this task, + * e.g. a=1/b=2. Files under these partitions will be saved into staging directory and moved to + * destination directory at the end, if `dynamicPartitionOverwrite` is true. */ - private def absPathStagingDir: Path = new Path(path, "_temporary-" + jobId) + @transient private var partitionPaths: mutable.Set[String] = null + + /** + * The staging directory of this write job. Spark uses it to deal with files with absolute output + * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true. + */ + private def stagingDir = new Path(path, ".spark-staging-" + jobId) protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val format = context.getOutputFormatClass.newInstance() @@ -85,11 +104,16 @@ class HadoopMapReduceCommitProtocol(jobId: String, path: String) taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = { val filename = getFilename(taskContext, ext) - val stagingDir: String = committer match { + val stagingDir: Path = committer match { + case _ if dynamicPartitionOverwrite => + assert(dir.isDefined, + "The dataset to be written must be partitioned when dynamicPartitionOverwrite is true.") + partitionPaths += dir.get + this.stagingDir // For FileOutputCommitter it has its own staging path called "work path". 
case f: FileOutputCommitter => - Option(f.getWorkPath).map(_.toString).getOrElse(path) - case _ => path + new Path(Option(f.getWorkPath).map(_.toString).getOrElse(path)) + case _ => new Path(path) } dir.map { d => @@ -106,8 +130,7 @@ class HadoopMapReduceCommitProtocol(jobId: String, path: String) // Include a UUID here to prevent file collisions for one task writing to different dirs. // In principle we could include hash(absoluteDir) instead but this is simpler. - val tmpOutputPath = new Path( - absPathStagingDir, UUID.randomUUID().toString() + "-" + filename).toString + val tmpOutputPath = new Path(stagingDir, UUID.randomUUID().toString() + "-" + filename).toString addedAbsPathFiles(tmpOutputPath) = absOutputPath tmpOutputPath @@ -141,23 +164,42 @@ class HadoopMapReduceCommitProtocol(jobId: String, path: String) override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = { committer.commitJob(jobContext) - val filesToMove = taskCommits.map(_.obj.asInstanceOf[Map[String, String]]) - .foldLeft(Map[String, String]())(_ ++ _) - logDebug(s"Committing files staged for absolute locations $filesToMove") + if (hasValidPath) { - val fs = absPathStagingDir.getFileSystem(jobContext.getConfiguration) + val (allAbsPathFiles, allPartitionPaths) = + taskCommits.map(_.obj.asInstanceOf[(Map[String, String], Set[String])]).unzip + val fs = stagingDir.getFileSystem(jobContext.getConfiguration) + + val filesToMove = allAbsPathFiles.foldLeft(Map[String, String]())(_ ++ _) + logDebug(s"Committing files staged for absolute locations $filesToMove") + if (dynamicPartitionOverwrite) { + val absPartitionPaths = filesToMove.values.map(new Path(_).getParent).toSet + logDebug(s"Clean up absolute partition directories for overwriting: $absPartitionPaths") + absPartitionPaths.foreach(fs.delete(_, true)) + } for ((src, dst) <- filesToMove) { fs.rename(new Path(src), new Path(dst)) } - fs.delete(absPathStagingDir, true) + + if (dynamicPartitionOverwrite) { + val partitionPaths = allPartitionPaths.foldLeft(Set[String]())(_ ++ _) + logDebug(s"Clean up default partition directories for overwriting: $partitionPaths") + for (part <- partitionPaths) { + val finalPartPath = new Path(path, part) + fs.delete(finalPartPath, true) + fs.rename(new Path(stagingDir, part), finalPartPath) + } + } + + fs.delete(stagingDir, true) } } override def abortJob(jobContext: JobContext): Unit = { committer.abortJob(jobContext, JobStatus.State.FAILED) if (hasValidPath) { - val fs = absPathStagingDir.getFileSystem(jobContext.getConfiguration) - fs.delete(absPathStagingDir, true) + val fs = stagingDir.getFileSystem(jobContext.getConfiguration) + fs.delete(stagingDir, true) } } @@ -165,13 +207,14 @@ class HadoopMapReduceCommitProtocol(jobId: String, path: String) committer = setupCommitter(taskContext) committer.setupTask(taskContext) addedAbsPathFiles = mutable.Map[String, String]() + partitionPaths = mutable.Set[String]() } override def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage = { val attemptId = taskContext.getTaskAttemptID SparkHadoopMapRedUtil.commitTask( committer, taskContext, attemptId.getJobID.getId, attemptId.getTaskID.getId) - new TaskCommitMessage(addedAbsPathFiles.toMap) + new TaskCommitMessage(addedAbsPathFiles.toMap -> partitionPaths.toSet) } override def abortTask(taskContext: TaskAttemptContext): Unit = { diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 8798dfc92536..7859781e9822 100644 --- 
a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -150,7 +150,7 @@ abstract class RDD[T: ClassTag]( val id: Int = sc.newRddId() /** A friendly name for this RDD */ - @transient var name: String = null + @transient var name: String = _ /** Assign a name to this RDD */ def setName(_name: String): this.type = { @@ -224,8 +224,8 @@ abstract class RDD[T: ClassTag]( // Our dependencies and partitions will be gotten by calling subclass's methods below, and will // be overwritten when we're checkpointed - private var dependencies_ : Seq[Dependency[_]] = null - @transient private var partitions_ : Array[Partition] = null + private var dependencies_ : Seq[Dependency[_]] = _ + @transient private var partitions_ : Array[Partition] = _ /** An Option holding our checkpoint RDD, if we are checkpointed */ private def checkpointRDD: Option[CheckpointRDD[T]] = checkpointData.flatMap(_.checkpointRDD) @@ -297,7 +297,7 @@ abstract class RDD[T: ClassTag]( private[spark] def getNarrowAncestors: Seq[RDD[_]] = { val ancestors = new mutable.HashSet[RDD[_]] - def visit(rdd: RDD[_]) { + def visit(rdd: RDD[_]): Unit = { val narrowDependencies = rdd.dependencies.filter(_.isInstanceOf[NarrowDependency[_]]) val narrowParents = narrowDependencies.map(_.rdd) val narrowParentsNotVisited = narrowParents.filterNot(ancestors.contains) @@ -449,7 +449,7 @@ abstract class RDD[T: ClassTag]( if (shuffle) { /** Distributes elements evenly across output partitions, starting from a random partition. */ val distributePartition = (index: Int, items: Iterator[T]) => { - var position = (new Random(hashing.byteswap32(index))).nextInt(numPartitions) + var position = new Random(hashing.byteswap32(index)).nextInt(numPartitions) items.map { t => // Note that the hash code of the key will just be the key itself. The HashPartitioner // will mod it with the number of total partitions. @@ -951,7 +951,7 @@ abstract class RDD[T: ClassTag]( def collectPartition(p: Int): Array[T] = { sc.runJob(this, (iter: Iterator[T]) => iter.toArray, Seq(p)).head } - (0 until partitions.length).iterator.flatMap(i => collectPartition(i)) + partitions.indices.iterator.flatMap(i => collectPartition(i)) } /** @@ -1338,6 +1338,7 @@ abstract class RDD[T: ClassTag]( // The number of partitions to try in this iteration. It is ok for this number to be // greater than totalParts because we actually cap it at totalParts in runJob. var numPartsToTry = 1L + val left = num - buf.size if (partsScanned > 0) { // If we didn't find any rows after the previous iteration, quadruple and retry. 
// Otherwise, interpolate the number of partitions we need to try, but overestimate @@ -1345,13 +1346,12 @@ abstract class RDD[T: ClassTag]( if (buf.isEmpty) { numPartsToTry = partsScanned * scaleUpFactor } else { - // the left side of max is >=1 whenever partsScanned >= 2 - numPartsToTry = Math.max((1.5 * num * partsScanned / buf.size).toInt - partsScanned, 1) + // As left > 0, numPartsToTry is always >= 1 + numPartsToTry = Math.ceil(1.5 * left * partsScanned / buf.size).toInt numPartsToTry = Math.min(numPartsToTry, partsScanned * scaleUpFactor) } } - val left = num - buf.size val p = partsScanned.until(math.min(partsScanned + numPartsToTry, totalParts).toInt) val res = sc.runJob(this, (it: Iterator[T]) => it.take(left).toArray, p) @@ -1677,8 +1677,7 @@ abstract class RDD[T: ClassTag]( // an RDD and its parent in every batch, in which case the parent may never be checkpointed // and its lineage never truncated, leading to OOMs in the long run (SPARK-6847). private val checkpointAllMarkedAncestors = - Option(sc.getLocalProperty(RDD.CHECKPOINT_ALL_MARKED_ANCESTORS)) - .map(_.toBoolean).getOrElse(false) + Option(sc.getLocalProperty(RDD.CHECKPOINT_ALL_MARKED_ANCESTORS)).exists(_.toBoolean) /** Returns the first parent RDD */ protected[spark] def firstParent[U: ClassTag]: RDD[U] = { @@ -1686,7 +1685,7 @@ abstract class RDD[T: ClassTag]( } /** Returns the jth parent RDD: e.g. rdd.parent[T](0) is equivalent to rdd.firstParent[T] */ - protected[spark] def parent[U: ClassTag](j: Int) = { + protected[spark] def parent[U: ClassTag](j: Int): RDD[U] = { dependencies(j).rdd.asInstanceOf[RDD[U]] } @@ -1754,7 +1753,7 @@ abstract class RDD[T: ClassTag]( * collected. Subclasses of RDD may override this method for implementing their own cleaning * logic. See [[org.apache.spark.rdd.UnionRDD]] for an example. 
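A worked example (made-up numbers) of the estimation tweak in the take() hunk above: the next scan is now sized by the rows still missing (`left`) rather than the full `num`, and is still capped by the scale-up factor.

```scala
// take(100) has collected 10 rows after scanning 4 partitions; 90 rows are still needed.
object TakeEstimateSketch {
  def main(args: Array[String]): Unit = {
    val num = 100                 // rows requested by take()
    val bufSize = 10              // rows collected so far
    val partsScanned = 4L         // partitions scanned so far
    val scaleUpFactor = 4L        // growth cap between iterations

    val left = num - bufSize
    var numPartsToTry = math.ceil(1.5 * left * partsScanned / bufSize).toLong
    numPartsToTry = math.min(numPartsToTry, partsScanned * scaleUpFactor)

    println(s"left=$left, scanning $numPartsToTry more partitions next") // left=90, 16 partitions
  }
}
```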
*/ - protected def clearDependencies() { + protected def clearDependencies(): Unit = { dependencies_ = null } @@ -1790,7 +1789,7 @@ abstract class RDD[T: ClassTag]( val lastDepStrings = debugString(lastDep.rdd, prefix, lastDep.isInstanceOf[ShuffleDependency[_, _, _]], true) - (frontDepStrings ++ lastDepStrings) + frontDepStrings ++ lastDepStrings } } // The first RDD in the dependency stack has no parents, so no need for a +- diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala index f951591e02a5..a2936d6ad539 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala @@ -332,16 +332,14 @@ private[netty] class NettyRpcEnv( val pipe = Pipe.open() val source = new FileDownloadChannel(pipe.source()) - try { + Utils.tryWithSafeFinallyAndFailureCallbacks(block = { val client = downloadClient(parsedUri.getHost(), parsedUri.getPort()) val callback = new FileDownloadCallback(pipe.sink(), source, client) client.stream(parsedUri.getPath(), callback) - } catch { - case e: Exception => - pipe.sink().close() - source.close() - throw e - } + })(catchBlock = { + pipe.sink().close() + source.close() + }) source } @@ -370,24 +368,33 @@ private[netty] class NettyRpcEnv( fileDownloadFactory.createClient(host, port) } - private class FileDownloadChannel(source: ReadableByteChannel) extends ReadableByteChannel { + private class FileDownloadChannel(source: Pipe.SourceChannel) extends ReadableByteChannel { @volatile private var error: Throwable = _ def setError(e: Throwable): Unit = { + // This setError callback is invoked by internal RPC threads in order to propagate remote + // exceptions to application-level threads which are reading from this channel. When an + // RPC error occurs, the RPC system will call setError() and then will close the + // Pipe.SinkChannel corresponding to the other end of the `source` pipe. Closing of the pipe + // sink will cause `source.read()` operations to return EOF, unblocking the application-level + // reading thread. Thus there is no need to actually call `source.close()` here in the + // onError() callback and, in fact, calling it here would be dangerous because the close() + // would be asynchronous with respect to the read() call and could trigger race-conditions + // that lead to data corruption. See the PR for SPARK-22982 for more details on this topic. error = e - source.close() } override def read(dst: ByteBuffer): Int = { Try(source.read(dst)) match { + // See the documentation above in setError(): if an RPC error has occurred then setError() + // will be called to propagate the RPC error and then `source`'s corresponding + // Pipe.SinkChannel will be closed, unblocking this read. 
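The long comment in the download channel above describes a propagation pattern rather than an API. A self-contained sketch of that pattern (not the Spark class) looks roughly like this: the RPC thread records the error and closes the pipe sink, the blocked read returns EOF, and read() then surfaces the recorded remote error instead of any local EOF or close noise.

```scala
import java.nio.ByteBuffer
import java.nio.channels.{Pipe, ReadableByteChannel}
import scala.util.{Failure, Success, Try}

// Sketch of the error-propagation pattern described above (not the Spark class).
class ErrorPropagatingChannel(source: Pipe.SourceChannel) extends ReadableByteChannel {
  @volatile private var error: Throwable = _

  def setError(e: Throwable): Unit = {
    // Only record the error; the caller closes the sink, which unblocks read() with EOF.
    error = e
  }

  override def read(dst: ByteBuffer): Int = Try(source.read(dst)) match {
    case _ if error != null => throw error   // prefer the remote error over EOF/close noise
    case Success(bytesRead) => bytesRead
    case Failure(readErr)   => throw readErr
  }

  override def isOpen: Boolean = source.isOpen
  override def close(): Unit = source.close()
}

// Usage sketch: simulate a remote failure and observe the propagated error.
object ErrorPropagatingChannelDemo {
  def main(args: Array[String]): Unit = {
    val pipe = Pipe.open()
    val ch = new ErrorPropagatingChannel(pipe.source())
    ch.setError(new RuntimeException("remote RPC failed"))
    pipe.sink().close()                      // what the RPC layer does after setError()
    try ch.read(ByteBuffer.allocate(16)) catch {
      case e: RuntimeException => println(s"propagated: ${e.getMessage}")
    }
  }
}
```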
In that case, we want to propagate + // the remote RPC exception (and not any exceptions triggered by the pipe close, such as + // ChannelClosedException), hence this `error != null` check: + case _ if error != null => throw error case Success(bytesRead) => bytesRead - case Failure(readErr) => - if (error != null) { - throw error - } else { - throw readErr - } + case Failure(readErr) => throw readErr } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index c2498d4808e9..199937b8c27a 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -815,7 +815,8 @@ class DAGScheduler( private[scheduler] def handleBeginEvent(task: Task[_], taskInfo: TaskInfo) { // Note that there is a chance that this task is launched after the stage is cancelled. // In that case, we wouldn't have the stage anymore in stageIdToStage. - val stageAttemptId = stageIdToStage.get(task.stageId).map(_.latestInfo.attemptId).getOrElse(-1) + val stageAttemptId = + stageIdToStage.get(task.stageId).map(_.latestInfo.attemptNumber).getOrElse(-1) listenerBus.post(SparkListenerTaskStart(task.stageId, stageAttemptId, taskInfo)) } @@ -1050,7 +1051,7 @@ class DAGScheduler( val locs = taskIdToLocations(id) val part = stage.rdd.partitions(id) stage.pendingPartitions += id - new ShuffleMapTask(stage.id, stage.latestInfo.attemptId, + new ShuffleMapTask(stage.id, stage.latestInfo.attemptNumber, taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId), Option(sc.applicationId), sc.applicationAttemptId) } @@ -1060,7 +1061,7 @@ class DAGScheduler( val p: Int = stage.partitions(id) val part = stage.rdd.partitions(p) val locs = taskIdToLocations(id) - new ResultTask(stage.id, stage.latestInfo.attemptId, + new ResultTask(stage.id, stage.latestInfo.attemptNumber, taskBinary, part, locs, id, properties, serializedTaskMetrics, Option(jobId), Option(sc.applicationId), sc.applicationAttemptId) } @@ -1076,7 +1077,7 @@ class DAGScheduler( logInfo(s"Submitting ${tasks.size} missing tasks from $stage (${stage.rdd}) (first 15 " + s"tasks are for partitions ${tasks.take(15).map(_.partitionId)})") taskScheduler.submitTasks(new TaskSet( - tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties)) + tasks.toArray, stage.id, stage.latestInfo.attemptNumber, jobId, properties)) } else { // Because we posted SparkListenerStageSubmitted earlier, we should mark // the stage as completed here in case there are no tasks to run @@ -1245,7 +1246,7 @@ class DAGScheduler( val status = event.result.asInstanceOf[MapStatus] val execId = status.location.executorId logDebug("ShuffleMapTask finished on " + execId) - if (stageIdToStage(task.stageId).latestInfo.attemptId == task.stageAttemptId) { + if (stageIdToStage(task.stageId).latestInfo.attemptNumber == task.stageAttemptId) { // This task was for the currently running attempt of the stage. 
Since the task // completed successfully from the perspective of the TaskSetManager, mark it as // no longer pending (the TaskSetManager may consider the task complete even @@ -1324,10 +1325,10 @@ class DAGScheduler( val failedStage = stageIdToStage(task.stageId) val mapStage = shuffleIdToMapStage(shuffleId) - if (failedStage.latestInfo.attemptId != task.stageAttemptId) { + if (failedStage.latestInfo.attemptNumber != task.stageAttemptId) { logInfo(s"Ignoring fetch failure from $task as it's from $failedStage attempt" + s" ${task.stageAttemptId} and there is a more recent attempt for that stage " + - s"(attempt ID ${failedStage.latestInfo.attemptId}) running") + s"(attempt ${failedStage.latestInfo.attemptNumber}) running") } else { // It is likely that we receive multiple FetchFailed for a single stage (because we have // multiple tasks running concurrently on different executors). In that case, it is diff --git a/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala index 23121402b102..ba6387a8f08a 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala @@ -62,6 +62,9 @@ private[spark] class LiveListenerBus(conf: SparkConf) { private val queues = new CopyOnWriteArrayList[AsyncEventQueue]() + // Visible for testing. + @volatile private[scheduler] var queuedEvents = new mutable.ListBuffer[SparkListenerEvent]() + /** Add a listener to queue shared by all non-internal listeners. */ def addToSharedQueue(listener: SparkListenerInterface): Unit = { addToQueue(listener, SHARED_QUEUE) @@ -125,13 +128,39 @@ private[spark] class LiveListenerBus(conf: SparkConf) { /** Post an event to all queues. */ def post(event: SparkListenerEvent): Unit = { - if (!stopped.get()) { - metrics.numEventsPosted.inc() - val it = queues.iterator() - while (it.hasNext()) { - it.next().post(event) + if (stopped.get()) { + return + } + + metrics.numEventsPosted.inc() + + // If the event buffer is null, it means the bus has been started and we can avoid + // synchronization and post events directly to the queues. This should be the most + // common case during the life of the bus. + if (queuedEvents == null) { + postToQueues(event) + return + } + + // Otherwise, need to synchronize to check whether the bus is started, to make sure the thread + // calling start() picks up the new event. + synchronized { + if (!started.get()) { + queuedEvents += event + return } } + + // If the bus was already started when the check above was made, just post directly to the + // queues. 
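A stripped-down, standalone sketch of the buffering scheme above (not the Spark class, and without the started/stopped bookkeeping the real bus has): events posted before start() go into a buffer under the lock, start() replays them into the queues, and once the buffer is nulled out the post path no longer needs to synchronize.

```scala
import scala.collection.mutable.ListBuffer

// Minimal "buffer until start, then drain" bus; queues are stand-ins for AsyncEventQueue.
class TinyBus[E] {
  private val queues = ListBuffer[E => Unit]()
  @volatile private var queuedEvents = new ListBuffer[E]()

  def addQueue(q: E => Unit): Unit = synchronized { queues += q }

  def post(event: E): Unit = {
    if (queuedEvents == null) { queues.foreach(_(event)); return }  // started: lock-free path
    synchronized {
      if (queuedEvents != null) { queuedEvents += event; return }   // not started: buffer it
    }
    queues.foreach(_(event))                                        // raced with start(): deliver
  }

  def start(): Unit = synchronized {
    queuedEvents.foreach(e => queues.foreach(_(e)))                 // drain the backlog
    queuedEvents = null
  }
}

object TinyBusDemo {
  def main(args: Array[String]): Unit = {
    val bus = new TinyBus[String]
    bus.addQueue(e => println(s"queue got: $e"))
    bus.post("posted before start")   // buffered
    bus.start()                       // replays the buffered event
    bus.post("posted after start")    // delivered immediately
  }
}
```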
+ postToQueues(event) + } + + private def postToQueues(event: SparkListenerEvent): Unit = { + val it = queues.iterator() + while (it.hasNext()) { + it.next().post(event) + } } /** @@ -149,7 +178,11 @@ private[spark] class LiveListenerBus(conf: SparkConf) { } this.sparkContext = sc - queues.asScala.foreach(_.start(sc)) + queues.asScala.foreach { q => + q.start(sc) + queuedEvents.foreach(q.post) + } + queuedEvents = null metricsSystem.registerSource(metrics) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala index c513ed36d168..903e25b7986f 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala @@ -30,7 +30,7 @@ import org.apache.spark.storage.RDDInfo @DeveloperApi class StageInfo( val stageId: Int, - val attemptId: Int, + @deprecated("Use attemptNumber instead", "2.3.0") val attemptId: Int, val name: String, val numTasks: Int, val rddInfos: Seq[RDDInfo], @@ -56,6 +56,8 @@ class StageInfo( completionTime = Some(System.currentTimeMillis) } + def attemptNumber(): Int = attemptId + private[spark] def getStatusString: String = { if (completionTime.isDefined) { if (failureReason.isDefined) { diff --git a/core/src/main/scala/org/apache/spark/scheduler/StatsReportListener.scala b/core/src/main/scala/org/apache/spark/scheduler/StatsReportListener.scala index 3c8cab7504c1..3c7af4f6146f 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/StatsReportListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/StatsReportListener.scala @@ -79,7 +79,7 @@ class StatsReportListener extends SparkListener with Logging { x => info.completionTime.getOrElse(System.currentTimeMillis()) - x ).getOrElse("-") - s"Stage(${info.stageId}, ${info.attemptId}); Name: '${info.name}'; " + + s"Stage(${info.stageId}, ${info.attemptNumber}); Name: '${info.name}'; " + s"Status: ${info.getStatusString}$failureReason; numTasks: ${info.numTasks}; " + s"Took: $timeTaken msec" } diff --git a/core/src/main/scala/org/apache/spark/scheduler/Task.scala b/core/src/main/scala/org/apache/spark/scheduler/Task.scala index 7767ef1803a0..f536fc2a5f0a 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Task.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Task.scala @@ -79,6 +79,7 @@ private[spark] abstract class Task[T]( SparkEnv.get.blockManager.registerTask(taskAttemptId) context = new TaskContextImpl( stageId, + stageAttemptId, // stageAttemptId and stageAttemptNumber are semantically equal partitionId, taskAttemptId, attemptNumber, diff --git a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala index 15540485170d..266ee42e39cc 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala @@ -18,8 +18,8 @@ package org.apache.spark.shuffle import java.io._ - -import com.google.common.io.ByteStreams +import java.nio.channels.Channels +import java.nio.file.Files import org.apache.spark.{SparkConf, SparkEnv} import org.apache.spark.internal.Logging @@ -196,11 +196,24 @@ private[spark] class IndexShuffleBlockResolver( // find out the consolidated file, then the offset within that from our index val indexFile = getIndexFile(blockId.shuffleId, blockId.mapId) - val in = new DataInputStream(new FileInputStream(indexFile)) + 
// SPARK-22982: if this FileInputStream's position is seeked forward by another piece of code + // which is incorrectly using our file descriptor then this code will fetch the wrong offsets + // (which may cause a reducer to be sent a different reducer's data). The explicit position + // checks added here were a useful debugging aid during SPARK-22982 and may help prevent this + // class of issue from re-occurring in the future which is why they are left here even though + // SPARK-22982 is fixed. + val channel = Files.newByteChannel(indexFile.toPath) + channel.position(blockId.reduceId * 8) + val in = new DataInputStream(Channels.newInputStream(channel)) try { - ByteStreams.skipFully(in, blockId.reduceId * 8) val offset = in.readLong() val nextOffset = in.readLong() + val actualPosition = channel.position() + val expectedPosition = blockId.reduceId * 8 + 16 + if (actualPosition != expectedPosition) { + throw new Exception(s"SPARK-22982: Incorrect channel position after index file reads: " + + s"expected $expectedPosition but actual position was $actualPosition.") + } new FileSegmentManagedBuffer( transportConf, getDataFile(blockId.shuffleId, blockId.mapId), diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala index 487a782e865e..b4edcf23abc0 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala @@ -377,6 +377,10 @@ private[spark] class AppStatusListener( Option(liveStages.get((event.stageId, event.stageAttemptId))).foreach { stage => stage.activeTasks += 1 stage.firstLaunchTime = math.min(stage.firstLaunchTime, event.taskInfo.launchTime) + + val locality = event.taskInfo.taskLocality.toString() + val count = stage.localitySummary.getOrElse(locality, 0L) + 1L + stage.localitySummary = stage.localitySummary ++ Map(locality -> count) maybeUpdate(stage, now) stage.jobs.foreach { job => @@ -433,7 +437,7 @@ private[spark] class AppStatusListener( } task.errorMessage = errorMessage val delta = task.updateMetrics(event.taskMetrics) - update(task, now) + update(task, now, last = true) delta }.orNull @@ -450,7 +454,7 @@ private[spark] class AppStatusListener( Option(liveStages.get((event.stageId, event.stageAttemptId))).foreach { stage => if (metricsDelta != null) { - stage.metrics.update(metricsDelta) + stage.metrics = LiveEntityHelpers.addMetrics(stage.metrics, metricsDelta) } stage.activeTasks -= 1 stage.completedTasks += completedDelta @@ -486,7 +490,7 @@ private[spark] class AppStatusListener( esummary.failedTasks += failedDelta esummary.killedTasks += killedDelta if (metricsDelta != null) { - esummary.metrics.update(metricsDelta) + esummary.metrics = LiveEntityHelpers.addMetrics(esummary.metrics, metricsDelta) } maybeUpdate(esummary, now) @@ -529,7 +533,8 @@ private[spark] class AppStatusListener( } override def onStageCompleted(event: SparkListenerStageCompleted): Unit = { - val maybeStage = Option(liveStages.remove((event.stageInfo.stageId, event.stageInfo.attemptId))) + val maybeStage = + Option(liveStages.remove((event.stageInfo.stageId, event.stageInfo.attemptNumber))) maybeStage.foreach { stage => val now = System.nanoTime() stage.info = event.stageInfo @@ -603,11 +608,11 @@ private[spark] class AppStatusListener( maybeUpdate(task, now) Option(liveStages.get((sid, sAttempt))).foreach { stage => - stage.metrics.update(delta) + stage.metrics = LiveEntityHelpers.addMetrics(stage.metrics, delta) 
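Stepping back to the IndexShuffleBlockResolver change earlier in this hunk: the index file is just a sequence of big-endian longs, entry i holding the start offset of reducer i's data, and the position assertion catches a file descriptor that some other code path has moved. A self-contained sketch with a throw-away index file:

```scala
import java.io.DataInputStream
import java.nio.ByteBuffer
import java.nio.channels.Channels
import java.nio.file.Files

// Reducer i's segment is [offset(i), offset(i + 1)); the position check mirrors the
// defensive assertion added for SPARK-22982.
object IndexLookupSketch {
  def main(args: Array[String]): Unit = {
    val indexFile = Files.createTempFile("shuffle", ".index")
    val offsets = Array(0L, 100L, 250L, 400L)            // offsets for 3 reducers
    val buf = ByteBuffer.allocate(offsets.length * 8)
    offsets.foreach(buf.putLong)
    Files.write(indexFile, buf.array())

    val reduceId = 1
    val channel = Files.newByteChannel(indexFile)
    channel.position(reduceId * 8L)
    val in = new DataInputStream(Channels.newInputStream(channel))
    try {
      val offset = in.readLong()
      val nextOffset = in.readLong()
      val expected = reduceId * 8L + 16
      require(channel.position() == expected, s"unexpected position ${channel.position()}")
      println(s"reducer $reduceId segment: [$offset, $nextOffset)") // [100, 250)
    } finally {
      in.close()
    }
  }
}
```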
maybeUpdate(stage, now) val esummary = stage.executorSummary(event.execId) - esummary.metrics.update(delta) + esummary.metrics = LiveEntityHelpers.addMetrics(esummary.metrics, delta) maybeUpdate(esummary, now) } } @@ -689,7 +694,7 @@ private[spark] class AppStatusListener( // can update the executor information too. liveRDDs.get(block.rddId).foreach { rdd => if (updatedStorageLevel.isDefined) { - rdd.storageLevel = updatedStorageLevel.get + rdd.setStorageLevel(updatedStorageLevel.get) } val partition = rdd.partition(block.name) @@ -785,7 +790,7 @@ private[spark] class AppStatusListener( } private def getOrCreateStage(info: StageInfo): LiveStage = { - val stage = liveStages.computeIfAbsent((info.stageId, info.attemptId), + val stage = liveStages.computeIfAbsent((info.stageId, info.attemptNumber), new Function[(Int, Int), LiveStage]() { override def apply(key: (Int, Int)): LiveStage = new LiveStage() }) @@ -813,7 +818,7 @@ private[spark] class AppStatusListener( /** Update a live entity only if it hasn't been updated in the last configured period. */ private def maybeUpdate(entity: LiveEntity, now: Long): Unit = { - if (liveUpdatePeriodNs >= 0 && now - entity.lastWriteTime > liveUpdatePeriodNs) { + if (live && liveUpdatePeriodNs >= 0 && now - entity.lastWriteTime > liveUpdatePeriodNs) { update(entity, now) } } @@ -864,7 +869,7 @@ private[spark] class AppStatusListener( } stages.foreach { s => - val key = s.id + val key = Array(s.info.stageId, s.info.attemptId) kvstore.delete(s.getClass(), key) val execSummaries = kvstore.view(classOf[ExecutorStageSummaryWrapper]) @@ -884,15 +889,15 @@ private[spark] class AppStatusListener( .asScala tasks.foreach { t => - kvstore.delete(t.getClass(), t.info.taskId) + kvstore.delete(t.getClass(), t.taskId) } // Check whether there are remaining attempts for the same stage. If there aren't, then // also delete the RDD graph data. val remainingAttempts = kvstore.view(classOf[StageDataWrapper]) .index("stageId") - .first(s.stageId) - .last(s.stageId) + .first(s.info.stageId) + .last(s.info.stageId) .closeableIterator() val hasMoreAttempts = try { @@ -904,23 +909,25 @@ private[spark] class AppStatusListener( } if (!hasMoreAttempts) { - kvstore.delete(classOf[RDDOperationGraphWrapper], s.stageId) + kvstore.delete(classOf[RDDOperationGraphWrapper], s.info.stageId) } + + cleanupCachedQuantiles(key) } } private def cleanupTasks(stage: LiveStage): Unit = { val countToDelete = calculateNumberToRemove(stage.savedTasks.get(), maxTasksPerStage).toInt if (countToDelete > 0) { - val stageKey = Array(stage.info.stageId, stage.info.attemptId) + val stageKey = Array(stage.info.stageId, stage.info.attemptNumber) val view = kvstore.view(classOf[TaskDataWrapper]).index("stage").first(stageKey) .last(stageKey) // Try to delete finished tasks only. val toDelete = KVUtils.viewToSeq(view, countToDelete) { t => - !live || t.info.status != TaskState.RUNNING.toString() + !live || t.status != TaskState.RUNNING.toString() } - toDelete.foreach { t => kvstore.delete(t.getClass(), t.info.taskId) } + toDelete.foreach { t => kvstore.delete(t.getClass(), t.taskId) } stage.savedTasks.addAndGet(-toDelete.size) // If there are more running tasks than the configured limit, delete running tasks. 
This @@ -929,13 +936,34 @@ private[spark] class AppStatusListener( val remaining = countToDelete - toDelete.size if (remaining > 0) { val runningTasksToDelete = view.max(remaining).iterator().asScala.toList - runningTasksToDelete.foreach { t => kvstore.delete(t.getClass(), t.info.taskId) } + runningTasksToDelete.foreach { t => kvstore.delete(t.getClass(), t.taskId) } stage.savedTasks.addAndGet(-remaining) } + + // On live applications, cleanup any cached quantiles for the stage. This makes sure that + // quantiles will be recalculated after tasks are replaced with newer ones. + // + // This is not needed in the SHS since caching only happens after the event logs are + // completely processed. + if (live) { + cleanupCachedQuantiles(stageKey) + } } stage.cleaning = false } + private def cleanupCachedQuantiles(stageKey: Array[Int]): Unit = { + val cachedQuantiles = kvstore.view(classOf[CachedQuantile]) + .index("stage") + .first(stageKey) + .last(stageKey) + .asScala + .toList + cachedQuantiles.foreach { q => + kvstore.delete(q.getClass(), q.id) + } + } + /** * Remove at least (retainedSize / 10) items to reduce friction. Because tracking may be done * asynchronously, this method may return 0 in case enough items have been deleted already. diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala index 5a942f528401..efc28538a33d 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala @@ -24,7 +24,7 @@ import scala.collection.JavaConverters._ import org.apache.spark.{JobExecutionStatus, SparkConf} import org.apache.spark.status.api.v1 import org.apache.spark.ui.scope._ -import org.apache.spark.util.Distribution +import org.apache.spark.util.{Distribution, Utils} import org.apache.spark.util.kvstore.{InMemoryStore, KVStore} /** @@ -98,7 +98,11 @@ private[spark] class AppStatusStore( val it = store.view(classOf[StageDataWrapper]).index("stageId").reverse().first(stageId) .closeableIterator() try { - it.next().info + if (it.hasNext()) { + it.next().info + } else { + throw new NoSuchElementException(s"No stage with id $stageId") + } } finally { it.close() } @@ -110,107 +114,238 @@ private[spark] class AppStatusStore( if (details) stageWithDetails(stage) else stage } + def taskCount(stageId: Int, stageAttemptId: Int): Long = { + store.count(classOf[TaskDataWrapper], "stage", Array(stageId, stageAttemptId)) + } + + def localitySummary(stageId: Int, stageAttemptId: Int): Map[String, Long] = { + store.read(classOf[StageDataWrapper], Array(stageId, stageAttemptId)).locality + } + + /** + * Calculates a summary of the task metrics for the given stage attempt, returning the + * requested quantiles for the recorded metrics. + * + * This method can be expensive if the requested quantiles are not cached; the method + * will only cache certain quantiles (every 0.05 step), so it's recommended to stick to + * those to avoid expensive scans of all task data. 
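A hedged usage sketch of the new taskSummary signature described above: it now returns an Option (None when no tasks with metrics are stored), and the caching only helps when the requested quantiles sit on 0.05 steps. AppStatusStore is package-private, so this helper assumes it is compiled inside org.apache.spark.status purely for illustration.

```scala
package org.apache.spark.status

// Illustrative helper only: fetch the 95th-percentile executor run time for a stage
// attempt, using a quantile the store is able to cache.
object TaskSummarySketch {
  def p95RunTime(store: AppStatusStore, stageId: Int, attemptId: Int): Option[Double] = {
    store.taskSummary(stageId, attemptId, Array(0.95))    // one cacheable quantile
      .map(_.executorRunTime.head)                        // one value per requested quantile
  }
}
```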
+ */ def taskSummary( stageId: Int, stageAttemptId: Int, - quantiles: Array[Double]): v1.TaskMetricDistributions = { - - val stage = Array(stageId, stageAttemptId) - - val rawMetrics = store.view(classOf[TaskDataWrapper]) - .index("stage") - .first(stage) - .last(stage) - .asScala - .flatMap(_.info.taskMetrics) - .toList - .view - - def metricQuantiles(f: v1.TaskMetrics => Double): IndexedSeq[Double] = - Distribution(rawMetrics.map { d => f(d) }).get.getQuantiles(quantiles) - - // We need to do a lot of similar munging to nested metrics here. For each one, - // we want (a) extract the values for nested metrics (b) make a distribution for each metric - // (c) shove the distribution into the right field in our return type and (d) only return - // a result if the option is defined for any of the tasks. MetricHelper is a little util - // to make it a little easier to deal w/ all of the nested options. Mostly it lets us just - // implement one "build" method, which just builds the quantiles for each field. - - val inputMetrics = - new MetricHelper[v1.InputMetrics, v1.InputMetricDistributions](rawMetrics, quantiles) { - def getSubmetrics(raw: v1.TaskMetrics): v1.InputMetrics = raw.inputMetrics - - def build: v1.InputMetricDistributions = new v1.InputMetricDistributions( - bytesRead = submetricQuantiles(_.bytesRead), - recordsRead = submetricQuantiles(_.recordsRead) - ) - }.build - - val outputMetrics = - new MetricHelper[v1.OutputMetrics, v1.OutputMetricDistributions](rawMetrics, quantiles) { - def getSubmetrics(raw: v1.TaskMetrics): v1.OutputMetrics = raw.outputMetrics - - def build: v1.OutputMetricDistributions = new v1.OutputMetricDistributions( - bytesWritten = submetricQuantiles(_.bytesWritten), - recordsWritten = submetricQuantiles(_.recordsWritten) - ) - }.build - - val shuffleReadMetrics = - new MetricHelper[v1.ShuffleReadMetrics, v1.ShuffleReadMetricDistributions](rawMetrics, - quantiles) { - def getSubmetrics(raw: v1.TaskMetrics): v1.ShuffleReadMetrics = - raw.shuffleReadMetrics - - def build: v1.ShuffleReadMetricDistributions = new v1.ShuffleReadMetricDistributions( - readBytes = submetricQuantiles { s => s.localBytesRead + s.remoteBytesRead }, - readRecords = submetricQuantiles(_.recordsRead), - remoteBytesRead = submetricQuantiles(_.remoteBytesRead), - remoteBytesReadToDisk = submetricQuantiles(_.remoteBytesReadToDisk), - remoteBlocksFetched = submetricQuantiles(_.remoteBlocksFetched), - localBlocksFetched = submetricQuantiles(_.localBlocksFetched), - totalBlocksFetched = submetricQuantiles { s => - s.localBlocksFetched + s.remoteBlocksFetched - }, - fetchWaitTime = submetricQuantiles(_.fetchWaitTime) - ) - }.build - - val shuffleWriteMetrics = - new MetricHelper[v1.ShuffleWriteMetrics, v1.ShuffleWriteMetricDistributions](rawMetrics, - quantiles) { - def getSubmetrics(raw: v1.TaskMetrics): v1.ShuffleWriteMetrics = - raw.shuffleWriteMetrics - - def build: v1.ShuffleWriteMetricDistributions = new v1.ShuffleWriteMetricDistributions( - writeBytes = submetricQuantiles(_.bytesWritten), - writeRecords = submetricQuantiles(_.recordsWritten), - writeTime = submetricQuantiles(_.writeTime) - ) - }.build - - new v1.TaskMetricDistributions( + unsortedQuantiles: Array[Double]): Option[v1.TaskMetricDistributions] = { + val stageKey = Array(stageId, stageAttemptId) + val quantiles = unsortedQuantiles.sorted + + // We don't know how many tasks remain in the store that actually have metrics. So scan one + // metric and count how many valid tasks there are. 
Use skip() instead of next() since it's + // cheaper for disk stores (avoids deserialization). + val count = { + Utils.tryWithResource( + store.view(classOf[TaskDataWrapper]) + .parent(stageKey) + .index(TaskIndexNames.EXEC_RUN_TIME) + .first(0L) + .closeableIterator() + ) { it => + var _count = 0L + while (it.hasNext()) { + _count += 1 + it.skip(1) + } + _count + } + } + + if (count <= 0) { + return None + } + + // Find out which quantiles are already cached. The data in the store must match the expected + // task count to be considered, otherwise it will be re-scanned and overwritten. + val cachedQuantiles = quantiles.filter(shouldCacheQuantile).flatMap { q => + val qkey = Array(stageId, stageAttemptId, quantileToString(q)) + asOption(store.read(classOf[CachedQuantile], qkey)).filter(_.taskCount == count) + } + + // If there are no missing quantiles, return the data. Otherwise, just compute everything + // to make the code simpler. + if (cachedQuantiles.size == quantiles.size) { + def toValues(fn: CachedQuantile => Double): IndexedSeq[Double] = cachedQuantiles.map(fn) + + val distributions = new v1.TaskMetricDistributions( + quantiles = quantiles, + executorDeserializeTime = toValues(_.executorDeserializeTime), + executorDeserializeCpuTime = toValues(_.executorDeserializeCpuTime), + executorRunTime = toValues(_.executorRunTime), + executorCpuTime = toValues(_.executorCpuTime), + resultSize = toValues(_.resultSize), + jvmGcTime = toValues(_.jvmGcTime), + resultSerializationTime = toValues(_.resultSerializationTime), + gettingResultTime = toValues(_.gettingResultTime), + schedulerDelay = toValues(_.schedulerDelay), + peakExecutionMemory = toValues(_.peakExecutionMemory), + memoryBytesSpilled = toValues(_.memoryBytesSpilled), + diskBytesSpilled = toValues(_.diskBytesSpilled), + inputMetrics = new v1.InputMetricDistributions( + toValues(_.bytesRead), + toValues(_.recordsRead)), + outputMetrics = new v1.OutputMetricDistributions( + toValues(_.bytesWritten), + toValues(_.recordsWritten)), + shuffleReadMetrics = new v1.ShuffleReadMetricDistributions( + toValues(_.shuffleReadBytes), + toValues(_.shuffleRecordsRead), + toValues(_.shuffleRemoteBlocksFetched), + toValues(_.shuffleLocalBlocksFetched), + toValues(_.shuffleFetchWaitTime), + toValues(_.shuffleRemoteBytesRead), + toValues(_.shuffleRemoteBytesReadToDisk), + toValues(_.shuffleTotalBlocksFetched)), + shuffleWriteMetrics = new v1.ShuffleWriteMetricDistributions( + toValues(_.shuffleWriteBytes), + toValues(_.shuffleWriteRecords), + toValues(_.shuffleWriteTime))) + + return Some(distributions) + } + + // Compute quantiles by scanning the tasks in the store. This is not really stable for live + // stages (e.g. the number of recorded tasks may change while this code is running), but should + // stabilize once the stage finishes. It's also slow, especially with disk stores. 
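A worked example (made-up task count) of the index arithmetic used just below: each requested quantile maps to a position in the metric-sorted task list, and the scan only moves forward, skipping the gap between consecutive positions.

```scala
object QuantileIndexSketch {
  def main(args: Array[String]): Unit = {
    val count = 200L                                    // tasks with metrics in the store
    val quantiles = Array(0.25, 0.5, 0.75)
    val indices = quantiles.map { q => math.min((q * count).toLong, count - 1) }
    println(indices.mkString(", "))                     // 50, 100, 150

    // With a metric already sorted ascending, those positions are the reported values.
    val sortedRunTimes = (1L to count).map(_ * 10)      // fake run times: 10, 20, ..., 2000
    println(indices.map(i => sortedRunTimes(i.toInt)).mkString(", "))  // 510, 1010, 1510
  }
}
```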
+ val indices = quantiles.map { q => math.min((q * count).toLong, count - 1) } + + def scanTasks(index: String)(fn: TaskDataWrapper => Long): IndexedSeq[Double] = { + Utils.tryWithResource( + store.view(classOf[TaskDataWrapper]) + .parent(stageKey) + .index(index) + .first(0L) + .closeableIterator() + ) { it => + var last = Double.NaN + var currentIdx = -1L + indices.map { idx => + if (idx == currentIdx) { + last + } else { + val diff = idx - currentIdx + currentIdx = idx + if (it.skip(diff - 1)) { + last = fn(it.next()).toDouble + last + } else { + Double.NaN + } + } + }.toIndexedSeq + } + } + + val computedQuantiles = new v1.TaskMetricDistributions( quantiles = quantiles, - executorDeserializeTime = metricQuantiles(_.executorDeserializeTime), - executorDeserializeCpuTime = metricQuantiles(_.executorDeserializeCpuTime), - executorRunTime = metricQuantiles(_.executorRunTime), - executorCpuTime = metricQuantiles(_.executorCpuTime), - resultSize = metricQuantiles(_.resultSize), - jvmGcTime = metricQuantiles(_.jvmGcTime), - resultSerializationTime = metricQuantiles(_.resultSerializationTime), - memoryBytesSpilled = metricQuantiles(_.memoryBytesSpilled), - diskBytesSpilled = metricQuantiles(_.diskBytesSpilled), - inputMetrics = inputMetrics, - outputMetrics = outputMetrics, - shuffleReadMetrics = shuffleReadMetrics, - shuffleWriteMetrics = shuffleWriteMetrics - ) + executorDeserializeTime = scanTasks(TaskIndexNames.DESER_TIME) { t => + t.executorDeserializeTime + }, + executorDeserializeCpuTime = scanTasks(TaskIndexNames.DESER_CPU_TIME) { t => + t.executorDeserializeCpuTime + }, + executorRunTime = scanTasks(TaskIndexNames.EXEC_RUN_TIME) { t => t.executorRunTime }, + executorCpuTime = scanTasks(TaskIndexNames.EXEC_CPU_TIME) { t => t.executorCpuTime }, + resultSize = scanTasks(TaskIndexNames.RESULT_SIZE) { t => t.resultSize }, + jvmGcTime = scanTasks(TaskIndexNames.GC_TIME) { t => t.jvmGcTime }, + resultSerializationTime = scanTasks(TaskIndexNames.SER_TIME) { t => + t.resultSerializationTime + }, + gettingResultTime = scanTasks(TaskIndexNames.GETTING_RESULT_TIME) { t => + t.gettingResultTime + }, + schedulerDelay = scanTasks(TaskIndexNames.SCHEDULER_DELAY) { t => t.schedulerDelay }, + peakExecutionMemory = scanTasks(TaskIndexNames.PEAK_MEM) { t => t.peakExecutionMemory }, + memoryBytesSpilled = scanTasks(TaskIndexNames.MEM_SPILL) { t => t.memoryBytesSpilled }, + diskBytesSpilled = scanTasks(TaskIndexNames.DISK_SPILL) { t => t.diskBytesSpilled }, + inputMetrics = new v1.InputMetricDistributions( + scanTasks(TaskIndexNames.INPUT_SIZE) { t => t.inputBytesRead }, + scanTasks(TaskIndexNames.INPUT_RECORDS) { t => t.inputRecordsRead }), + outputMetrics = new v1.OutputMetricDistributions( + scanTasks(TaskIndexNames.OUTPUT_SIZE) { t => t.outputBytesWritten }, + scanTasks(TaskIndexNames.OUTPUT_RECORDS) { t => t.outputRecordsWritten }), + shuffleReadMetrics = new v1.ShuffleReadMetricDistributions( + scanTasks(TaskIndexNames.SHUFFLE_TOTAL_READS) { m => + m.shuffleLocalBytesRead + m.shuffleRemoteBytesRead + }, + scanTasks(TaskIndexNames.SHUFFLE_READ_RECORDS) { t => t.shuffleRecordsRead }, + scanTasks(TaskIndexNames.SHUFFLE_REMOTE_BLOCKS) { t => t.shuffleRemoteBlocksFetched }, + scanTasks(TaskIndexNames.SHUFFLE_LOCAL_BLOCKS) { t => t.shuffleLocalBlocksFetched }, + scanTasks(TaskIndexNames.SHUFFLE_READ_TIME) { t => t.shuffleFetchWaitTime }, + scanTasks(TaskIndexNames.SHUFFLE_REMOTE_READS) { t => t.shuffleRemoteBytesRead }, + scanTasks(TaskIndexNames.SHUFFLE_REMOTE_READS_TO_DISK) { t => + 
t.shuffleRemoteBytesReadToDisk + }, + scanTasks(TaskIndexNames.SHUFFLE_TOTAL_BLOCKS) { m => + m.shuffleLocalBlocksFetched + m.shuffleRemoteBlocksFetched + }), + shuffleWriteMetrics = new v1.ShuffleWriteMetricDistributions( + scanTasks(TaskIndexNames.SHUFFLE_WRITE_SIZE) { t => t.shuffleBytesWritten }, + scanTasks(TaskIndexNames.SHUFFLE_WRITE_RECORDS) { t => t.shuffleRecordsWritten }, + scanTasks(TaskIndexNames.SHUFFLE_WRITE_TIME) { t => t.shuffleWriteTime })) + + // Go through the computed quantiles and cache the values that match the caching criteria. + computedQuantiles.quantiles.zipWithIndex + .filter { case (q, _) => quantiles.contains(q) && shouldCacheQuantile(q) } + .foreach { case (q, idx) => + val cached = new CachedQuantile(stageId, stageAttemptId, quantileToString(q), count, + executorDeserializeTime = computedQuantiles.executorDeserializeTime(idx), + executorDeserializeCpuTime = computedQuantiles.executorDeserializeCpuTime(idx), + executorRunTime = computedQuantiles.executorRunTime(idx), + executorCpuTime = computedQuantiles.executorCpuTime(idx), + resultSize = computedQuantiles.resultSize(idx), + jvmGcTime = computedQuantiles.jvmGcTime(idx), + resultSerializationTime = computedQuantiles.resultSerializationTime(idx), + gettingResultTime = computedQuantiles.gettingResultTime(idx), + schedulerDelay = computedQuantiles.schedulerDelay(idx), + peakExecutionMemory = computedQuantiles.peakExecutionMemory(idx), + memoryBytesSpilled = computedQuantiles.memoryBytesSpilled(idx), + diskBytesSpilled = computedQuantiles.diskBytesSpilled(idx), + + bytesRead = computedQuantiles.inputMetrics.bytesRead(idx), + recordsRead = computedQuantiles.inputMetrics.recordsRead(idx), + + bytesWritten = computedQuantiles.outputMetrics.bytesWritten(idx), + recordsWritten = computedQuantiles.outputMetrics.recordsWritten(idx), + + shuffleReadBytes = computedQuantiles.shuffleReadMetrics.readBytes(idx), + shuffleRecordsRead = computedQuantiles.shuffleReadMetrics.readRecords(idx), + shuffleRemoteBlocksFetched = + computedQuantiles.shuffleReadMetrics.remoteBlocksFetched(idx), + shuffleLocalBlocksFetched = computedQuantiles.shuffleReadMetrics.localBlocksFetched(idx), + shuffleFetchWaitTime = computedQuantiles.shuffleReadMetrics.fetchWaitTime(idx), + shuffleRemoteBytesRead = computedQuantiles.shuffleReadMetrics.remoteBytesRead(idx), + shuffleRemoteBytesReadToDisk = + computedQuantiles.shuffleReadMetrics.remoteBytesReadToDisk(idx), + shuffleTotalBlocksFetched = computedQuantiles.shuffleReadMetrics.totalBlocksFetched(idx), + + shuffleWriteBytes = computedQuantiles.shuffleWriteMetrics.writeBytes(idx), + shuffleWriteRecords = computedQuantiles.shuffleWriteMetrics.writeRecords(idx), + shuffleWriteTime = computedQuantiles.shuffleWriteMetrics.writeTime(idx)) + store.write(cached) + } + + Some(computedQuantiles) } + /** + * Whether to cache information about a specific metric quantile. We cache quantiles at every 0.05 + * step, which covers the default values used both in the API and in the stages page. 
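A quick check of the caching rule defined just below: only quantiles that land on a 0.05 step are cached, and the cache key stores the percentage as a string.

```scala
object QuantileCacheRuleSketch {
  // Same one-line rules as below, repeated here so the example runs on its own.
  private def shouldCacheQuantile(q: Double): Boolean = (math.round(q * 100) % 5) == 0
  private def quantileToString(q: Double): String = math.round(q * 100).toString

  def main(args: Array[String]): Unit = {
    Seq(0.25, 0.5, 0.75, 0.95, 0.13, 0.99).foreach { q =>
      println(s"q=$q cached=${shouldCacheQuantile(q)} key=${quantileToString(q)}")
    }
    // 0.25, 0.5, 0.75 and 0.95 are cached (keys 25, 50, 75, 95); 0.13 and 0.99 are not.
  }
}
```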
+ */ + private def shouldCacheQuantile(q: Double): Boolean = (math.round(q * 100) % 5) == 0 + + private def quantileToString(q: Double): String = math.round(q * 100).toString + def taskList(stageId: Int, stageAttemptId: Int, maxTasks: Int): Seq[v1.TaskData] = { val stageKey = Array(stageId, stageAttemptId) store.view(classOf[TaskDataWrapper]).index("stage").first(stageKey).last(stageKey).reverse() - .max(maxTasks).asScala.map(_.info).toSeq.reverse + .max(maxTasks).asScala.map(_.toApi).toSeq.reverse } def taskList( @@ -219,18 +354,43 @@ private[spark] class AppStatusStore( offset: Int, length: Int, sortBy: v1.TaskSorting): Seq[v1.TaskData] = { + val (indexName, ascending) = sortBy match { + case v1.TaskSorting.ID => + (None, true) + case v1.TaskSorting.INCREASING_RUNTIME => + (Some(TaskIndexNames.EXEC_RUN_TIME), true) + case v1.TaskSorting.DECREASING_RUNTIME => + (Some(TaskIndexNames.EXEC_RUN_TIME), false) + } + taskList(stageId, stageAttemptId, offset, length, indexName, ascending) + } + + def taskList( + stageId: Int, + stageAttemptId: Int, + offset: Int, + length: Int, + sortBy: Option[String], + ascending: Boolean): Seq[v1.TaskData] = { val stageKey = Array(stageId, stageAttemptId) val base = store.view(classOf[TaskDataWrapper]) val indexed = sortBy match { - case v1.TaskSorting.ID => + case Some(index) => + base.index(index).parent(stageKey) + + case _ => + // Sort by ID, which is the "stage" index. base.index("stage").first(stageKey).last(stageKey) - case v1.TaskSorting.INCREASING_RUNTIME => - base.index("runtime").first(stageKey ++ Array(-1L)).last(stageKey ++ Array(Long.MaxValue)) - case v1.TaskSorting.DECREASING_RUNTIME => - base.index("runtime").first(stageKey ++ Array(Long.MaxValue)).last(stageKey ++ Array(-1L)) - .reverse() } - indexed.skip(offset).max(length).asScala.map(_.info).toSeq + + val ordered = if (ascending) indexed else indexed.reverse() + ordered.skip(offset).max(length).asScala.map(_.toApi).toSeq + } + + def executorSummary(stageId: Int, attemptId: Int): Map[String, v1.ExecutorStageSummary] = { + val stageKey = Array(stageId, attemptId) + store.view(classOf[ExecutorStageSummaryWrapper]).index("stage").first(stageKey).last(stageKey) + .asScala.map { exec => (exec.executorId -> exec.info) }.toMap } def rddList(cachedOnly: Boolean = true): Seq[v1.RDDStorageInfo] = { @@ -256,12 +416,6 @@ private[spark] class AppStatusStore( .map { t => (t.taskId, t) } .toMap - val stageKey = Array(stage.stageId, stage.attemptId) - val execs = store.view(classOf[ExecutorStageSummaryWrapper]).index("stage").first(stageKey) - .last(stageKey).closeableIterator().asScala - .map { exec => (exec.executorId -> exec.info) } - .toMap - new v1.StageData( stage.status, stage.stageId, @@ -295,7 +449,7 @@ private[spark] class AppStatusStore( stage.rddIds, stage.accumulatorUpdates, Some(tasks), - Some(execs), + Some(executorSummary(stage.stageId, stage.attemptId)), stage.killedTasksSummary) } @@ -352,22 +506,3 @@ private[spark] object AppStatusStore { } } - -/** - * Helper for getting distributions from nested metric types. 
- */ -private abstract class MetricHelper[I, O]( - rawMetrics: Seq[v1.TaskMetrics], - quantiles: Array[Double]) { - - def getSubmetrics(raw: v1.TaskMetrics): I - - def build: O - - val data: Seq[I] = rawMetrics.map(getSubmetrics) - - /** applies the given function to all input metrics, and returns the quantiles */ - def submetricQuantiles(f: I => Double): IndexedSeq[Double] = { - Distribution(data.map { d => f(d) }).get.getQuantiles(quantiles) - } -} diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusUtils.scala b/core/src/main/scala/org/apache/spark/status/AppStatusUtils.scala new file mode 100644 index 000000000000..341bd4e0cd01 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/AppStatusUtils.scala @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.status + +import org.apache.spark.status.api.v1.{TaskData, TaskMetrics} + +private[spark] object AppStatusUtils { + + def schedulerDelay(task: TaskData): Long = { + if (task.taskMetrics.isDefined && task.duration.isDefined) { + val m = task.taskMetrics.get + schedulerDelay(task.launchTime.getTime(), fetchStart(task), task.duration.get, + m.executorDeserializeTime, m.resultSerializationTime, m.executorRunTime) + } else { + 0L + } + } + + def gettingResultTime(task: TaskData): Long = { + gettingResultTime(task.launchTime.getTime(), fetchStart(task), task.duration.getOrElse(-1L)) + } + + def schedulerDelay( + launchTime: Long, + fetchStart: Long, + duration: Long, + deserializeTime: Long, + serializeTime: Long, + runTime: Long): Long = { + math.max(0, duration - runTime - deserializeTime - serializeTime - + gettingResultTime(launchTime, fetchStart, duration)) + } + + def gettingResultTime(launchTime: Long, fetchStart: Long, duration: Long): Long = { + if (fetchStart > 0) { + if (duration > 0) { + launchTime + duration - fetchStart + } else { + System.currentTimeMillis() - fetchStart + } + } else { + 0L + } + } + + private def fetchStart(task: TaskData): Long = { + if (task.resultFetchStart.isDefined) { + task.resultFetchStart.get.getTime() + } else { + -1 + } + } +} diff --git a/core/src/main/scala/org/apache/spark/status/LiveEntity.scala b/core/src/main/scala/org/apache/spark/status/LiveEntity.scala index 52e83f250d34..4295e664e131 100644 --- a/core/src/main/scala/org/apache/spark/status/LiveEntity.scala +++ b/core/src/main/scala/org/apache/spark/status/LiveEntity.scala @@ -22,6 +22,8 @@ import java.util.concurrent.atomic.AtomicInteger import scala.collection.mutable.HashMap +import com.google.common.collect.Interners + import org.apache.spark.JobExecutionStatus import org.apache.spark.executor.TaskMetrics import org.apache.spark.scheduler.{AccumulableInfo, StageInfo, TaskInfo} @@ -119,7 +121,9 @@ private class LiveTask( 
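A worked example of the delay accounting in the new AppStatusUtils above, with made-up timings in milliseconds; the two helpers are copied here so the numbers can be checked standalone.

```scala
object SchedulerDelaySketch {
  def gettingResultTime(launchTime: Long, fetchStart: Long, duration: Long): Long =
    if (fetchStart > 0) {
      if (duration > 0) launchTime + duration - fetchStart
      else System.currentTimeMillis() - fetchStart
    } else 0L

  def schedulerDelay(launchTime: Long, fetchStart: Long, duration: Long,
      deserializeTime: Long, serializeTime: Long, runTime: Long): Long =
    math.max(0, duration - runTime - deserializeTime - serializeTime -
      gettingResultTime(launchTime, fetchStart, duration))

  def main(args: Array[String]): Unit = {
    val launch = 1000L
    val duration = 600L
    val fetchStart = 1550L
    val runTime = 480L
    val deser = 40L
    val ser = 10L
    println(gettingResultTime(launch, fetchStart, duration))                 // 1000 + 600 - 1550 = 50
    println(schedulerDelay(launch, fetchStart, duration, deser, ser, runTime))
    // 600 - 480 - 40 - 10 - 50 = 20
  }
}
```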
import LiveEntityHelpers._ - private var recordedMetrics: v1.TaskMetrics = null + // The task metrics use a special value when no metrics have been reported. The special value is + // checked when calculating indexed values when writing to the store (see [[TaskDataWrapper]]). + private var metrics: v1.TaskMetrics = createMetrics(default = -1L) var errorMessage: Option[String] = None @@ -129,8 +133,8 @@ private class LiveTask( */ def updateMetrics(metrics: TaskMetrics): v1.TaskMetrics = { if (metrics != null) { - val old = recordedMetrics - recordedMetrics = new v1.TaskMetrics( + val old = this.metrics + val newMetrics = createMetrics( metrics.executorDeserializeTime, metrics.executorDeserializeCpuTime, metrics.executorRunTime, @@ -141,73 +145,35 @@ private class LiveTask( metrics.memoryBytesSpilled, metrics.diskBytesSpilled, metrics.peakExecutionMemory, - new v1.InputMetrics( - metrics.inputMetrics.bytesRead, - metrics.inputMetrics.recordsRead), - new v1.OutputMetrics( - metrics.outputMetrics.bytesWritten, - metrics.outputMetrics.recordsWritten), - new v1.ShuffleReadMetrics( - metrics.shuffleReadMetrics.remoteBlocksFetched, - metrics.shuffleReadMetrics.localBlocksFetched, - metrics.shuffleReadMetrics.fetchWaitTime, - metrics.shuffleReadMetrics.remoteBytesRead, - metrics.shuffleReadMetrics.remoteBytesReadToDisk, - metrics.shuffleReadMetrics.localBytesRead, - metrics.shuffleReadMetrics.recordsRead), - new v1.ShuffleWriteMetrics( - metrics.shuffleWriteMetrics.bytesWritten, - metrics.shuffleWriteMetrics.writeTime, - metrics.shuffleWriteMetrics.recordsWritten)) - if (old != null) calculateMetricsDelta(recordedMetrics, old) else recordedMetrics + metrics.inputMetrics.bytesRead, + metrics.inputMetrics.recordsRead, + metrics.outputMetrics.bytesWritten, + metrics.outputMetrics.recordsWritten, + metrics.shuffleReadMetrics.remoteBlocksFetched, + metrics.shuffleReadMetrics.localBlocksFetched, + metrics.shuffleReadMetrics.fetchWaitTime, + metrics.shuffleReadMetrics.remoteBytesRead, + metrics.shuffleReadMetrics.remoteBytesReadToDisk, + metrics.shuffleReadMetrics.localBytesRead, + metrics.shuffleReadMetrics.recordsRead, + metrics.shuffleWriteMetrics.bytesWritten, + metrics.shuffleWriteMetrics.writeTime, + metrics.shuffleWriteMetrics.recordsWritten) + + this.metrics = newMetrics + + // Only calculate the delta if the old metrics contain valid information, otherwise + // the new metrics are the delta. + if (old.executorDeserializeTime >= 0L) { + subtractMetrics(newMetrics, old) + } else { + newMetrics + } } else { null } } - /** - * Return a new TaskMetrics object containing the delta of the various fields of the given - * metrics objects. This is currently targeted at updating stage data, so it does not - * necessarily calculate deltas for all the fields. 
- */ - private def calculateMetricsDelta( - metrics: v1.TaskMetrics, - old: v1.TaskMetrics): v1.TaskMetrics = { - val shuffleWriteDelta = new v1.ShuffleWriteMetrics( - metrics.shuffleWriteMetrics.bytesWritten - old.shuffleWriteMetrics.bytesWritten, - 0L, - metrics.shuffleWriteMetrics.recordsWritten - old.shuffleWriteMetrics.recordsWritten) - - val shuffleReadDelta = new v1.ShuffleReadMetrics( - 0L, 0L, 0L, - metrics.shuffleReadMetrics.remoteBytesRead - old.shuffleReadMetrics.remoteBytesRead, - metrics.shuffleReadMetrics.remoteBytesReadToDisk - - old.shuffleReadMetrics.remoteBytesReadToDisk, - metrics.shuffleReadMetrics.localBytesRead - old.shuffleReadMetrics.localBytesRead, - metrics.shuffleReadMetrics.recordsRead - old.shuffleReadMetrics.recordsRead) - - val inputDelta = new v1.InputMetrics( - metrics.inputMetrics.bytesRead - old.inputMetrics.bytesRead, - metrics.inputMetrics.recordsRead - old.inputMetrics.recordsRead) - - val outputDelta = new v1.OutputMetrics( - metrics.outputMetrics.bytesWritten - old.outputMetrics.bytesWritten, - metrics.outputMetrics.recordsWritten - old.outputMetrics.recordsWritten) - - new v1.TaskMetrics( - 0L, 0L, - metrics.executorRunTime - old.executorRunTime, - metrics.executorCpuTime - old.executorCpuTime, - 0L, 0L, 0L, - metrics.memoryBytesSpilled - old.memoryBytesSpilled, - metrics.diskBytesSpilled - old.diskBytesSpilled, - 0L, - inputDelta, - outputDelta, - shuffleReadDelta, - shuffleWriteDelta) - } - override protected def doUpdate(): Any = { val duration = if (info.finished) { info.duration @@ -215,22 +181,48 @@ private class LiveTask( info.timeRunning(lastUpdateTime.getOrElse(System.currentTimeMillis())) } - val task = new v1.TaskData( + new TaskDataWrapper( info.taskId, info.index, info.attemptNumber, - new Date(info.launchTime), - if (info.gettingResult) Some(new Date(info.gettingResultTime)) else None, - Some(duration), - info.executorId, - info.host, - info.status, - info.taskLocality.toString(), + info.launchTime, + if (info.gettingResult) info.gettingResultTime else -1L, + duration, + weakIntern(info.executorId), + weakIntern(info.host), + weakIntern(info.status), + weakIntern(info.taskLocality.toString()), info.speculative, newAccumulatorInfos(info.accumulables), errorMessage, - Option(recordedMetrics)) - new TaskDataWrapper(task, stageId, stageAttemptId) + + metrics.executorDeserializeTime, + metrics.executorDeserializeCpuTime, + metrics.executorRunTime, + metrics.executorCpuTime, + metrics.resultSize, + metrics.jvmGcTime, + metrics.resultSerializationTime, + metrics.memoryBytesSpilled, + metrics.diskBytesSpilled, + metrics.peakExecutionMemory, + metrics.inputMetrics.bytesRead, + metrics.inputMetrics.recordsRead, + metrics.outputMetrics.bytesWritten, + metrics.outputMetrics.recordsWritten, + metrics.shuffleReadMetrics.remoteBlocksFetched, + metrics.shuffleReadMetrics.localBlocksFetched, + metrics.shuffleReadMetrics.fetchWaitTime, + metrics.shuffleReadMetrics.remoteBytesRead, + metrics.shuffleReadMetrics.remoteBytesReadToDisk, + metrics.shuffleReadMetrics.localBytesRead, + metrics.shuffleReadMetrics.recordsRead, + metrics.shuffleWriteMetrics.bytesWritten, + metrics.shuffleWriteMetrics.writeTime, + metrics.shuffleWriteMetrics.recordsWritten, + + stageId, + stageAttemptId) } } @@ -313,50 +305,19 @@ private class LiveExecutor(val executorId: String, _addTime: Long) extends LiveE } -/** Metrics tracked per stage (both total and per executor). 
*/ -private class MetricsTracker { - var executorRunTime = 0L - var executorCpuTime = 0L - var inputBytes = 0L - var inputRecords = 0L - var outputBytes = 0L - var outputRecords = 0L - var shuffleReadBytes = 0L - var shuffleReadRecords = 0L - var shuffleWriteBytes = 0L - var shuffleWriteRecords = 0L - var memoryBytesSpilled = 0L - var diskBytesSpilled = 0L - - def update(delta: v1.TaskMetrics): Unit = { - executorRunTime += delta.executorRunTime - executorCpuTime += delta.executorCpuTime - inputBytes += delta.inputMetrics.bytesRead - inputRecords += delta.inputMetrics.recordsRead - outputBytes += delta.outputMetrics.bytesWritten - outputRecords += delta.outputMetrics.recordsWritten - shuffleReadBytes += delta.shuffleReadMetrics.localBytesRead + - delta.shuffleReadMetrics.remoteBytesRead - shuffleReadRecords += delta.shuffleReadMetrics.recordsRead - shuffleWriteBytes += delta.shuffleWriteMetrics.bytesWritten - shuffleWriteRecords += delta.shuffleWriteMetrics.recordsWritten - memoryBytesSpilled += delta.memoryBytesSpilled - diskBytesSpilled += delta.diskBytesSpilled - } - -} - private class LiveExecutorStageSummary( stageId: Int, attemptId: Int, executorId: String) extends LiveEntity { + import LiveEntityHelpers._ + var taskTime = 0L var succeededTasks = 0 var failedTasks = 0 var killedTasks = 0 - val metrics = new MetricsTracker() + var metrics = createMetrics(default = 0L) override protected def doUpdate(): Any = { val info = new v1.ExecutorStageSummary( @@ -364,14 +325,14 @@ private class LiveExecutorStageSummary( failedTasks, succeededTasks, killedTasks, - metrics.inputBytes, - metrics.inputRecords, - metrics.outputBytes, - metrics.outputRecords, - metrics.shuffleReadBytes, - metrics.shuffleReadRecords, - metrics.shuffleWriteBytes, - metrics.shuffleWriteRecords, + metrics.inputMetrics.bytesRead, + metrics.inputMetrics.recordsRead, + metrics.outputMetrics.bytesWritten, + metrics.outputMetrics.recordsWritten, + metrics.shuffleReadMetrics.remoteBytesRead + metrics.shuffleReadMetrics.localBytesRead, + metrics.shuffleReadMetrics.recordsRead, + metrics.shuffleWriteMetrics.bytesWritten, + metrics.shuffleWriteMetrics.recordsWritten, metrics.memoryBytesSpilled, metrics.diskBytesSpilled) new ExecutorStageSummaryWrapper(stageId, attemptId, executorId, info) @@ -402,7 +363,9 @@ private class LiveStage extends LiveEntity { var firstLaunchTime = Long.MaxValue - val metrics = new MetricsTracker() + var localitySummary: Map[String, Long] = Map() + + var metrics = createMetrics(default = 0L) val executorSummaries = new HashMap[String, LiveExecutorStageSummary]() @@ -412,14 +375,14 @@ private class LiveStage extends LiveEntity { def executorSummary(executorId: String): LiveExecutorStageSummary = { executorSummaries.getOrElseUpdate(executorId, - new LiveExecutorStageSummary(info.stageId, info.attemptId, executorId)) + new LiveExecutorStageSummary(info.stageId, info.attemptNumber, executorId)) } def toApi(): v1.StageData = { new v1.StageData( status, info.stageId, - info.attemptId, + info.attemptNumber, info.numTasks, activeTasks, @@ -435,14 +398,14 @@ private class LiveStage extends LiveEntity { info.completionTime.map(new Date(_)), info.failureReason, - metrics.inputBytes, - metrics.inputRecords, - metrics.outputBytes, - metrics.outputRecords, - metrics.shuffleReadBytes, - metrics.shuffleReadRecords, - metrics.shuffleWriteBytes, - metrics.shuffleWriteRecords, + metrics.inputMetrics.bytesRead, + metrics.inputMetrics.recordsRead, + metrics.outputMetrics.bytesWritten, + metrics.outputMetrics.recordsWritten, 
+ metrics.shuffleReadMetrics.localBytesRead + metrics.shuffleReadMetrics.remoteBytesRead, + metrics.shuffleReadMetrics.recordsRead, + metrics.shuffleWriteMetrics.bytesWritten, + metrics.shuffleWriteMetrics.recordsWritten, metrics.memoryBytesSpilled, metrics.diskBytesSpilled, @@ -459,13 +422,15 @@ private class LiveStage extends LiveEntity { } override protected def doUpdate(): Any = { - new StageDataWrapper(toApi(), jobIds) + new StageDataWrapper(toApi(), jobIds, localitySummary) } } private class LiveRDDPartition(val blockName: String) { + import LiveEntityHelpers._ + // Pointers used by RDDPartitionSeq. @volatile var prev: LiveRDDPartition = null @volatile var next: LiveRDDPartition = null @@ -485,7 +450,7 @@ private class LiveRDDPartition(val blockName: String) { diskUsed: Long): Unit = { value = new v1.RDDPartitionInfo( blockName, - storageLevel, + weakIntern(storageLevel), memoryUsed, diskUsed, executors) @@ -495,6 +460,8 @@ private class LiveRDDPartition(val blockName: String) { private class LiveRDDDistribution(exec: LiveExecutor) { + import LiveEntityHelpers._ + val executorId = exec.executorId var memoryUsed = 0L var diskUsed = 0L @@ -508,7 +475,7 @@ private class LiveRDDDistribution(exec: LiveExecutor) { def toApi(): v1.RDDDataDistribution = { if (lastUpdate == null) { lastUpdate = new v1.RDDDataDistribution( - exec.hostPort, + weakIntern(exec.hostPort), memoryUsed, exec.maxMemory - exec.memoryUsed, diskUsed, @@ -524,7 +491,9 @@ private class LiveRDDDistribution(exec: LiveExecutor) { private class LiveRDD(val info: RDDInfo) extends LiveEntity { - var storageLevel: String = info.storageLevel.description + import LiveEntityHelpers._ + + var storageLevel: String = weakIntern(info.storageLevel.description) var memoryUsed = 0L var diskUsed = 0L @@ -533,6 +502,10 @@ private class LiveRDD(val info: RDDInfo) extends LiveEntity { private val distributions = new HashMap[String, LiveRDDDistribution]() + def setStorageLevel(level: String): Unit = { + this.storageLevel = weakIntern(level) + } + def partition(blockName: String): LiveRDDPartition = { partitions.getOrElseUpdate(blockName, { val part = new LiveRDDPartition(blockName) @@ -593,6 +566,9 @@ private class SchedulerPool(name: String) extends LiveEntity { private object LiveEntityHelpers { + private val stringInterner = Interners.newWeakInterner[String]() + + def newAccumulatorInfos(accums: Iterable[AccumulableInfo]): Seq[v1.AccumulableInfo] = { accums .filter { acc => @@ -604,13 +580,119 @@ private object LiveEntityHelpers { .map { acc => new v1.AccumulableInfo( acc.id, - acc.name.orNull, + acc.name.map(weakIntern).orNull, acc.update.map(_.toString()), acc.value.map(_.toString()).orNull) } .toSeq } + /** String interning to reduce the memory usage. 
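// Illustrative sketch (not part of this patch): the weak-interning pattern that
// LiveEntityHelpers.weakIntern uses via Guava. Repeated strings such as host:port and
// storage-level descriptions are deduplicated through a weak interner, so identical values
// share one String instance while remaining garbage-collectable once nothing references them.
import com.google.common.collect.Interners

object InternExample {
  private val interner = Interners.newWeakInterner[String]()

  def weakIntern(s: String): String = interner.intern(s)

  def main(args: Array[String]): Unit = {
    val a = weakIntern(new String("MEMORY_ONLY"))
    val b = weakIntern(new String("MEMORY_ONLY"))
    // Both calls return the same canonical instance.
    assert(a eq b)
  }
}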
*/ + def weakIntern(s: String): String = { + stringInterner.intern(s) + } + + // scalastyle:off argcount + def createMetrics( + executorDeserializeTime: Long, + executorDeserializeCpuTime: Long, + executorRunTime: Long, + executorCpuTime: Long, + resultSize: Long, + jvmGcTime: Long, + resultSerializationTime: Long, + memoryBytesSpilled: Long, + diskBytesSpilled: Long, + peakExecutionMemory: Long, + inputBytesRead: Long, + inputRecordsRead: Long, + outputBytesWritten: Long, + outputRecordsWritten: Long, + shuffleRemoteBlocksFetched: Long, + shuffleLocalBlocksFetched: Long, + shuffleFetchWaitTime: Long, + shuffleRemoteBytesRead: Long, + shuffleRemoteBytesReadToDisk: Long, + shuffleLocalBytesRead: Long, + shuffleRecordsRead: Long, + shuffleBytesWritten: Long, + shuffleWriteTime: Long, + shuffleRecordsWritten: Long): v1.TaskMetrics = { + new v1.TaskMetrics( + executorDeserializeTime, + executorDeserializeCpuTime, + executorRunTime, + executorCpuTime, + resultSize, + jvmGcTime, + resultSerializationTime, + memoryBytesSpilled, + diskBytesSpilled, + peakExecutionMemory, + new v1.InputMetrics( + inputBytesRead, + inputRecordsRead), + new v1.OutputMetrics( + outputBytesWritten, + outputRecordsWritten), + new v1.ShuffleReadMetrics( + shuffleRemoteBlocksFetched, + shuffleLocalBlocksFetched, + shuffleFetchWaitTime, + shuffleRemoteBytesRead, + shuffleRemoteBytesReadToDisk, + shuffleLocalBytesRead, + shuffleRecordsRead), + new v1.ShuffleWriteMetrics( + shuffleBytesWritten, + shuffleWriteTime, + shuffleRecordsWritten)) + } + // scalastyle:on argcount + + def createMetrics(default: Long): v1.TaskMetrics = { + createMetrics(default, default, default, default, default, default, default, default, + default, default, default, default, default, default, default, default, + default, default, default, default, default, default, default, default) + } + + /** Add m2 values to m1. */ + def addMetrics(m1: v1.TaskMetrics, m2: v1.TaskMetrics): v1.TaskMetrics = addMetrics(m1, m2, 1) + + /** Subtract m2 values from m1. 
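// Illustrative sketch (not part of this patch): the single-helper design used by
// addMetrics/subtractMetrics, where both operations share one implementation and differ
// only by a +1/-1 multiplier. The simplified Metrics case class below is a hypothetical
// stand-in for v1.TaskMetrics.
case class Metrics(runTime: Long, bytesRead: Long)

object MetricsMathExample {
  private def combine(m1: Metrics, m2: Metrics, mult: Int): Metrics =
    Metrics(m1.runTime + m2.runTime * mult, m1.bytesRead + m2.bytesRead * mult)

  def add(m1: Metrics, m2: Metrics): Metrics = combine(m1, m2, 1)
  def subtract(m1: Metrics, m2: Metrics): Metrics = combine(m1, m2, -1)

  def main(args: Array[String]): Unit = {
    val total = add(Metrics(100L, 4096L), Metrics(50L, 1024L))   // Metrics(150, 5120)
    val delta = subtract(total, Metrics(100L, 4096L))            // Metrics(50, 1024)
    println(s"$total $delta")
  }
}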
*/ + def subtractMetrics(m1: v1.TaskMetrics, m2: v1.TaskMetrics): v1.TaskMetrics = { + addMetrics(m1, m2, -1) + } + + private def addMetrics(m1: v1.TaskMetrics, m2: v1.TaskMetrics, mult: Int): v1.TaskMetrics = { + createMetrics( + m1.executorDeserializeTime + m2.executorDeserializeTime * mult, + m1.executorDeserializeCpuTime + m2.executorDeserializeCpuTime * mult, + m1.executorRunTime + m2.executorRunTime * mult, + m1.executorCpuTime + m2.executorCpuTime * mult, + m1.resultSize + m2.resultSize * mult, + m1.jvmGcTime + m2.jvmGcTime * mult, + m1.resultSerializationTime + m2.resultSerializationTime * mult, + m1.memoryBytesSpilled + m2.memoryBytesSpilled * mult, + m1.diskBytesSpilled + m2.diskBytesSpilled * mult, + m1.peakExecutionMemory + m2.peakExecutionMemory * mult, + m1.inputMetrics.bytesRead + m2.inputMetrics.bytesRead * mult, + m1.inputMetrics.recordsRead + m2.inputMetrics.recordsRead * mult, + m1.outputMetrics.bytesWritten + m2.outputMetrics.bytesWritten * mult, + m1.outputMetrics.recordsWritten + m2.outputMetrics.recordsWritten * mult, + m1.shuffleReadMetrics.remoteBlocksFetched + m2.shuffleReadMetrics.remoteBlocksFetched * mult, + m1.shuffleReadMetrics.localBlocksFetched + m2.shuffleReadMetrics.localBlocksFetched * mult, + m1.shuffleReadMetrics.fetchWaitTime + m2.shuffleReadMetrics.fetchWaitTime * mult, + m1.shuffleReadMetrics.remoteBytesRead + m2.shuffleReadMetrics.remoteBytesRead * mult, + m1.shuffleReadMetrics.remoteBytesReadToDisk + + m2.shuffleReadMetrics.remoteBytesReadToDisk * mult, + m1.shuffleReadMetrics.localBytesRead + m2.shuffleReadMetrics.localBytesRead * mult, + m1.shuffleReadMetrics.recordsRead + m2.shuffleReadMetrics.recordsRead * mult, + m1.shuffleWriteMetrics.bytesWritten + m2.shuffleWriteMetrics.bytesWritten * mult, + m1.shuffleWriteMetrics.writeTime + m2.shuffleWriteMetrics.writeTime * mult, + m1.shuffleWriteMetrics.recordsWritten + m2.shuffleWriteMetrics.recordsWritten * mult) + } + } /** diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala index 3b879545b3d2..96249e4bfd5f 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala @@ -87,7 +87,8 @@ private[v1] class StagesResource extends BaseAppResource { } } - ui.store.taskSummary(stageId, stageAttemptId, quantiles) + ui.store.taskSummary(stageId, stageAttemptId, quantiles).getOrElse( + throw new NotFoundException(s"No tasks reported metrics for $stageId / $stageAttemptId yet.")) } @GET diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/api.scala b/core/src/main/scala/org/apache/spark/status/api/v1/api.scala index 45eaf935fb08..7d8e4de3c8ef 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/api.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/api.scala @@ -261,6 +261,9 @@ class TaskMetricDistributions private[spark]( val resultSize: IndexedSeq[Double], val jvmGcTime: IndexedSeq[Double], val resultSerializationTime: IndexedSeq[Double], + val gettingResultTime: IndexedSeq[Double], + val schedulerDelay: IndexedSeq[Double], + val peakExecutionMemory: IndexedSeq[Double], val memoryBytesSpilled: IndexedSeq[Double], val diskBytesSpilled: IndexedSeq[Double], diff --git a/core/src/main/scala/org/apache/spark/status/storeTypes.scala b/core/src/main/scala/org/apache/spark/status/storeTypes.scala index 1cfd30df4909..c9cb996a55fc 100644 --- 
a/core/src/main/scala/org/apache/spark/status/storeTypes.scala +++ b/core/src/main/scala/org/apache/spark/status/storeTypes.scala @@ -17,9 +17,11 @@ package org.apache.spark.status -import java.lang.{Integer => JInteger, Long => JLong} +import java.lang.{Long => JLong} +import java.util.Date import com.fasterxml.jackson.annotation.JsonIgnore +import com.fasterxml.jackson.databind.annotation.JsonDeserialize import org.apache.spark.status.KVUtils._ import org.apache.spark.status.api.v1._ @@ -49,10 +51,10 @@ private[spark] class ApplicationEnvironmentInfoWrapper(val info: ApplicationEnvi private[spark] class ExecutorSummaryWrapper(val info: ExecutorSummary) { @JsonIgnore @KVIndex - private[this] val id: String = info.id + private def id: String = info.id @JsonIgnore @KVIndex("active") - private[this] val active: Boolean = info.isActive + private def active: Boolean = info.isActive @JsonIgnore @KVIndex("host") val host: String = info.hostPort.split(":")(0) @@ -69,51 +71,271 @@ private[spark] class JobDataWrapper( val skippedStages: Set[Int]) { @JsonIgnore @KVIndex - private[this] val id: Int = info.jobId + private def id: Int = info.jobId } private[spark] class StageDataWrapper( val info: StageData, - val jobIds: Set[Int]) { + val jobIds: Set[Int], + @JsonDeserialize(contentAs = classOf[JLong]) + val locality: Map[String, Long]) { @JsonIgnore @KVIndex - def id: Array[Int] = Array(info.stageId, info.attemptId) + private[this] val id: Array[Int] = Array(info.stageId, info.attemptId) @JsonIgnore @KVIndex("stageId") - def stageId: Int = info.stageId + private def stageId: Int = info.stageId + @JsonIgnore @KVIndex("active") + private def active: Boolean = info.status == StageStatus.ACTIVE + +} + +/** + * Tasks have a lot of indices that are used in a few different places. This object keeps logical + * names for these indices, mapped to short strings to save space when using a disk store. + */ +private[spark] object TaskIndexNames { + final val ACCUMULATORS = "acc" + final val ATTEMPT = "att" + final val DESER_CPU_TIME = "dct" + final val DESER_TIME = "des" + final val DISK_SPILL = "dbs" + final val DURATION = "dur" + final val ERROR = "err" + final val EXECUTOR = "exe" + final val EXEC_CPU_TIME = "ect" + final val EXEC_RUN_TIME = "ert" + final val GC_TIME = "gc" + final val GETTING_RESULT_TIME = "grt" + final val INPUT_RECORDS = "ir" + final val INPUT_SIZE = "is" + final val LAUNCH_TIME = "lt" + final val LOCALITY = "loc" + final val MEM_SPILL = "mbs" + final val OUTPUT_RECORDS = "or" + final val OUTPUT_SIZE = "os" + final val PEAK_MEM = "pem" + final val RESULT_SIZE = "rs" + final val SCHEDULER_DELAY = "dly" + final val SER_TIME = "rst" + final val SHUFFLE_LOCAL_BLOCKS = "slbl" + final val SHUFFLE_READ_RECORDS = "srr" + final val SHUFFLE_READ_TIME = "srt" + final val SHUFFLE_REMOTE_BLOCKS = "srbl" + final val SHUFFLE_REMOTE_READS = "srby" + final val SHUFFLE_REMOTE_READS_TO_DISK = "srbd" + final val SHUFFLE_TOTAL_READS = "stby" + final val SHUFFLE_TOTAL_BLOCKS = "stbl" + final val SHUFFLE_WRITE_RECORDS = "swr" + final val SHUFFLE_WRITE_SIZE = "sws" + final val SHUFFLE_WRITE_TIME = "swt" + final val STAGE = "stage" + final val STATUS = "sta" + final val TASK_INDEX = "idx" } /** - * The task information is always indexed with the stage ID, since that is how the UI and API - * consume it. That means every indexed value has the stage ID and attempt ID included, aside - * from the actual data being indexed. 
+ * Unlike other data types, the task data wrapper does not keep a reference to the API's TaskData. + * That is to save memory, since for large applications there can be a large number of these + * elements (by default up to 100,000 per stage), and every bit of wasted memory adds up. + * + * It also contains many secondary indices, which are used to sort data efficiently in the UI at the + * expense of storage space (and slower write times). */ private[spark] class TaskDataWrapper( - val info: TaskData, + // Storing this as an object actually saves memory; it's also used as the key in the in-memory + // store, so in that case you'd save the extra copy of the value here. + @KVIndexParam + val taskId: JLong, + @KVIndexParam(value = TaskIndexNames.TASK_INDEX, parent = TaskIndexNames.STAGE) + val index: Int, + @KVIndexParam(value = TaskIndexNames.ATTEMPT, parent = TaskIndexNames.STAGE) + val attempt: Int, + @KVIndexParam(value = TaskIndexNames.LAUNCH_TIME, parent = TaskIndexNames.STAGE) + val launchTime: Long, + val resultFetchStart: Long, + @KVIndexParam(value = TaskIndexNames.DURATION, parent = TaskIndexNames.STAGE) + val duration: Long, + @KVIndexParam(value = TaskIndexNames.EXECUTOR, parent = TaskIndexNames.STAGE) + val executorId: String, + val host: String, + @KVIndexParam(value = TaskIndexNames.STATUS, parent = TaskIndexNames.STAGE) + val status: String, + @KVIndexParam(value = TaskIndexNames.LOCALITY, parent = TaskIndexNames.STAGE) + val taskLocality: String, + val speculative: Boolean, + val accumulatorUpdates: Seq[AccumulableInfo], + val errorMessage: Option[String], + + // The following is an exploded view of a TaskMetrics API object. This saves 5 objects + // (= 80 bytes of Java object overhead) per instance of this wrapper. If the first value + // (executorDeserializeTime) is -1L, it means the metrics for this task have not been + // recorded. 
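// Illustrative sketch (not part of this patch): the sentinel convention described in the
// comment above. Instead of wrapping the exploded metric fields in an Option (one more
// object per task), a task with no recorded metrics stores -1L in every metric column and
// readers gate on the first field. MiniTask is a hypothetical stand-in for TaskDataWrapper.
class MiniTask(val executorDeserializeTime: Long, val executorRunTime: Long) {
  def hasMetrics: Boolean = executorDeserializeTime >= 0

  def runTimeOption: Option[Long] =
    if (hasMetrics) Some(executorRunTime) else None
}

object SentinelExample {
  def main(args: Array[String]): Unit = {
    val recorded = new MiniTask(12L, 340L)
    val missing = new MiniTask(-1L, -1L)
    assert(recorded.runTimeOption.contains(340L))
    assert(missing.runTimeOption.isEmpty)
  }
}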
+ @KVIndexParam(value = TaskIndexNames.DESER_TIME, parent = TaskIndexNames.STAGE) + val executorDeserializeTime: Long, + @KVIndexParam(value = TaskIndexNames.DESER_CPU_TIME, parent = TaskIndexNames.STAGE) + val executorDeserializeCpuTime: Long, + @KVIndexParam(value = TaskIndexNames.EXEC_RUN_TIME, parent = TaskIndexNames.STAGE) + val executorRunTime: Long, + @KVIndexParam(value = TaskIndexNames.EXEC_CPU_TIME, parent = TaskIndexNames.STAGE) + val executorCpuTime: Long, + @KVIndexParam(value = TaskIndexNames.RESULT_SIZE, parent = TaskIndexNames.STAGE) + val resultSize: Long, + @KVIndexParam(value = TaskIndexNames.GC_TIME, parent = TaskIndexNames.STAGE) + val jvmGcTime: Long, + @KVIndexParam(value = TaskIndexNames.SER_TIME, parent = TaskIndexNames.STAGE) + val resultSerializationTime: Long, + @KVIndexParam(value = TaskIndexNames.MEM_SPILL, parent = TaskIndexNames.STAGE) + val memoryBytesSpilled: Long, + @KVIndexParam(value = TaskIndexNames.DISK_SPILL, parent = TaskIndexNames.STAGE) + val diskBytesSpilled: Long, + @KVIndexParam(value = TaskIndexNames.PEAK_MEM, parent = TaskIndexNames.STAGE) + val peakExecutionMemory: Long, + @KVIndexParam(value = TaskIndexNames.INPUT_SIZE, parent = TaskIndexNames.STAGE) + val inputBytesRead: Long, + @KVIndexParam(value = TaskIndexNames.INPUT_RECORDS, parent = TaskIndexNames.STAGE) + val inputRecordsRead: Long, + @KVIndexParam(value = TaskIndexNames.OUTPUT_SIZE, parent = TaskIndexNames.STAGE) + val outputBytesWritten: Long, + @KVIndexParam(value = TaskIndexNames.OUTPUT_RECORDS, parent = TaskIndexNames.STAGE) + val outputRecordsWritten: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_REMOTE_BLOCKS, parent = TaskIndexNames.STAGE) + val shuffleRemoteBlocksFetched: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_LOCAL_BLOCKS, parent = TaskIndexNames.STAGE) + val shuffleLocalBlocksFetched: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_READ_TIME, parent = TaskIndexNames.STAGE) + val shuffleFetchWaitTime: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_REMOTE_READS, parent = TaskIndexNames.STAGE) + val shuffleRemoteBytesRead: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_REMOTE_READS_TO_DISK, + parent = TaskIndexNames.STAGE) + val shuffleRemoteBytesReadToDisk: Long, + val shuffleLocalBytesRead: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_READ_RECORDS, parent = TaskIndexNames.STAGE) + val shuffleRecordsRead: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_WRITE_SIZE, parent = TaskIndexNames.STAGE) + val shuffleBytesWritten: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_WRITE_TIME, parent = TaskIndexNames.STAGE) + val shuffleWriteTime: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_WRITE_RECORDS, parent = TaskIndexNames.STAGE) + val shuffleRecordsWritten: Long, + val stageId: Int, val stageAttemptId: Int) { - @JsonIgnore @KVIndex - def id: Long = info.taskId + def hasMetrics: Boolean = executorDeserializeTime >= 0 + + def toApi: TaskData = { + val metrics = if (hasMetrics) { + Some(new TaskMetrics( + executorDeserializeTime, + executorDeserializeCpuTime, + executorRunTime, + executorCpuTime, + resultSize, + jvmGcTime, + resultSerializationTime, + memoryBytesSpilled, + diskBytesSpilled, + peakExecutionMemory, + new InputMetrics( + inputBytesRead, + inputRecordsRead), + new OutputMetrics( + outputBytesWritten, + outputRecordsWritten), + new ShuffleReadMetrics( + shuffleRemoteBlocksFetched, + shuffleLocalBlocksFetched, + shuffleFetchWaitTime, + shuffleRemoteBytesRead, + shuffleRemoteBytesReadToDisk, + 
shuffleLocalBytesRead, + shuffleRecordsRead), + new ShuffleWriteMetrics( + shuffleBytesWritten, + shuffleWriteTime, + shuffleRecordsWritten))) + } else { + None + } - @JsonIgnore @KVIndex("stage") - def stage: Array[Int] = Array(stageId, stageAttemptId) + new TaskData( + taskId, + index, + attempt, + new Date(launchTime), + if (resultFetchStart > 0L) Some(new Date(resultFetchStart)) else None, + if (duration > 0L) Some(duration) else None, + executorId, + host, + status, + taskLocality, + speculative, + accumulatorUpdates, + errorMessage, + metrics) + } + + @JsonIgnore @KVIndex(TaskIndexNames.STAGE) + private def stage: Array[Int] = Array(stageId, stageAttemptId) - @JsonIgnore @KVIndex("runtime") - def runtime: Array[AnyRef] = { - val _runtime = info.taskMetrics.map(_.executorRunTime).getOrElse(-1L) - Array(stageId: JInteger, stageAttemptId: JInteger, _runtime: JLong) + @JsonIgnore @KVIndex(value = TaskIndexNames.SCHEDULER_DELAY, parent = TaskIndexNames.STAGE) + def schedulerDelay: Long = { + if (hasMetrics) { + AppStatusUtils.schedulerDelay(launchTime, resultFetchStart, duration, executorDeserializeTime, + resultSerializationTime, executorRunTime) + } else { + -1L + } } - @JsonIgnore @KVIndex("startTime") - def startTime: Array[AnyRef] = { - Array(stageId: JInteger, stageAttemptId: JInteger, info.launchTime.getTime(): JLong) + @JsonIgnore @KVIndex(value = TaskIndexNames.GETTING_RESULT_TIME, parent = TaskIndexNames.STAGE) + def gettingResultTime: Long = { + if (hasMetrics) { + AppStatusUtils.gettingResultTime(launchTime, resultFetchStart, duration) + } else { + -1L + } } - @JsonIgnore @KVIndex("active") - def active: Boolean = info.duration.isEmpty + /** + * Sorting by accumulators is a little weird, and the previous behavior would generate + * insanely long keys in the index. So this implementation just considers the first + * accumulator and its String representation. + */ + @JsonIgnore @KVIndex(value = TaskIndexNames.ACCUMULATORS, parent = TaskIndexNames.STAGE) + private def accumulators: String = { + if (accumulatorUpdates.nonEmpty) { + val acc = accumulatorUpdates.head + s"${acc.name}:${acc.value}" + } else { + "" + } + } + + @JsonIgnore @KVIndex(value = TaskIndexNames.SHUFFLE_TOTAL_READS, parent = TaskIndexNames.STAGE) + private def shuffleTotalReads: Long = { + if (hasMetrics) { + shuffleLocalBytesRead + shuffleRemoteBytesRead + } else { + -1L + } + } + + @JsonIgnore @KVIndex(value = TaskIndexNames.SHUFFLE_TOTAL_BLOCKS, parent = TaskIndexNames.STAGE) + private def shuffleTotalBlocks: Long = { + if (hasMetrics) { + shuffleLocalBlocksFetched + shuffleRemoteBlocksFetched + } else { + -1L + } + } + + @JsonIgnore @KVIndex(value = TaskIndexNames.ERROR, parent = TaskIndexNames.STAGE) + private def error: String = if (errorMessage.isDefined) errorMessage.get else "" } @@ -134,10 +356,13 @@ private[spark] class ExecutorStageSummaryWrapper( val info: ExecutorStageSummary) { @JsonIgnore @KVIndex - val id: Array[Any] = Array(stageId, stageAttemptId, executorId) + private val _id: Array[Any] = Array(stageId, stageAttemptId, executorId) @JsonIgnore @KVIndex("stage") - private[this] val stage: Array[Int] = Array(stageId, stageAttemptId) + private def stage: Array[Int] = Array(stageId, stageAttemptId) + + @JsonIgnore + def id: Array[Any] = _id } @@ -203,3 +428,53 @@ private[spark] class AppSummary( def id: String = classOf[AppSummary].getName() } + +/** + * A cached view of a specific quantile for one stage attempt's metrics. 
+ */ +private[spark] class CachedQuantile( + val stageId: Int, + val stageAttemptId: Int, + val quantile: String, + val taskCount: Long, + + // The following fields are an exploded view of a single entry for TaskMetricDistributions. + val executorDeserializeTime: Double, + val executorDeserializeCpuTime: Double, + val executorRunTime: Double, + val executorCpuTime: Double, + val resultSize: Double, + val jvmGcTime: Double, + val resultSerializationTime: Double, + val gettingResultTime: Double, + val schedulerDelay: Double, + val peakExecutionMemory: Double, + val memoryBytesSpilled: Double, + val diskBytesSpilled: Double, + + val bytesRead: Double, + val recordsRead: Double, + + val bytesWritten: Double, + val recordsWritten: Double, + + val shuffleReadBytes: Double, + val shuffleRecordsRead: Double, + val shuffleRemoteBlocksFetched: Double, + val shuffleLocalBlocksFetched: Double, + val shuffleFetchWaitTime: Double, + val shuffleRemoteBytesRead: Double, + val shuffleRemoteBytesReadToDisk: Double, + val shuffleTotalBlocksFetched: Double, + + val shuffleWriteBytes: Double, + val shuffleWriteRecords: Double, + val shuffleWriteTime: Double) { + + @KVIndex @JsonIgnore + def id: Array[Any] = Array(stageId, stageAttemptId, quantile) + + @KVIndex("stage") @JsonIgnore + def stage: Array[Int] = Array(stageId, stageAttemptId) + +} diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala index 37e3b3b304a6..ff916bb6a575 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala @@ -65,12 +65,10 @@ private[ui] class AllJobsPage(parent: JobsTab, store: AppStatusStore) extends We }.map { job => val jobId = job.jobId val status = job.status - val displayJobDescription = - if (job.description.isEmpty) { - job.name - } else { - UIUtils.makeDescription(job.description.get, "", plainText = true).text - } + val jobDescription = store.lastStageAttempt(job.stageIds.max).description + val displayJobDescription = jobDescription + .map(UIUtils.makeDescription(_, "", plainText = true).text) + .getOrElse("") val submissionTime = job.submissionTime.get.getTime() val completionTime = job.completionTime.map(_.getTime()).getOrElse(System.currentTimeMillis()) val classNameByStatus = status match { @@ -429,20 +427,23 @@ private[ui] class JobDataSource( val formattedDuration = duration.map(d => UIUtils.formatDuration(d)).getOrElse("Unknown") val submissionTime = jobData.submissionTime val formattedSubmissionTime = submissionTime.map(UIUtils.formatDate).getOrElse("Unknown") - val jobDescription = UIUtils.makeDescription(jobData.description.getOrElse(""), - basePath, plainText = false) + val lastStageAttempt = store.lastStageAttempt(jobData.stageIds.max) + val lastStageDescription = lastStageAttempt.description.getOrElse("") + + val formattedJobDescription = + UIUtils.makeDescription(lastStageDescription, basePath, plainText = false) val detailUrl = "%s/jobs/job?id=%s".format(basePath, jobData.jobId) new JobTableRowData( jobData, - jobData.name, - jobData.description.getOrElse(jobData.name), + lastStageAttempt.name, + lastStageDescription, duration.getOrElse(-1), formattedDuration, submissionTime.map(_.getTime()).getOrElse(-1L), formattedSubmissionTime, - jobDescription, + formattedJobDescription, detailUrl ) } diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala b/core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala 
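// Illustrative sketch (not part of this patch): the caching idea behind CachedQuantile.
// Each computed quantile of a stage attempt's task metrics is keyed by the composite
// (stageId, stageAttemptId, quantile), so repeated summary requests for common quantiles
// can be served without rescanning every task. A plain HashMap stands in for the KVStore.
import scala.collection.mutable

object QuantileCacheExample {
  // (stageId, stageAttemptId, quantile label) -> executorRunTime at that quantile
  private val cache = mutable.HashMap[(Int, Int, String), Double]()

  def runTimeQuantile(stageId: Int, attemptId: Int, quantile: String)
      (compute: => Double): Double = {
    cache.getOrElseUpdate((stageId, attemptId, quantile), compute)
  }

  def main(args: Array[String]): Unit = {
    // The first call computes and caches; the second is served from the cache.
    val p75 = runTimeQuantile(3, 0, "75") { 1234.0 }
    val again = runTimeQuantile(3, 0, "75") { sys.error("should not recompute") }
    assert(p75 == again)
  }
}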
index 41d42b52430a..95c12b1e7365 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala @@ -87,7 +87,9 @@ private[ui] class ExecutorTable(stage: StageData, store: AppStatusStore) { } private def createExecutorTable(stage: StageData) : Seq[Node] = { - stage.executorSummary.getOrElse(Map.empty).toSeq.sortBy(_._1).map { case (k, v) => + val executorSummary = store.executorSummary(stage.stageId, stage.attemptId) + + executorSummary.toSeq.sortBy(_._1).map { case (k, v) => val executor = store.asOption(store.executorSummary(k)) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala index 740f12e7d13d..bf59152c8c0c 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala @@ -201,7 +201,7 @@ private[ui] class JobPage(parent: JobsTab, store: AppStatusStore) extends WebUIP val stages = jobData.stageIds.map { stageId => // This could be empty if the listener hasn't received information about the // stage or if the stage information has been garbage collected - store.stageData(stageId).lastOption.getOrElse { + store.asOption(store.lastStageAttempt(stageId)).getOrElse { new v1.StageData( v1.StageStatus.PENDING, stageId, diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobsTab.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobsTab.scala index 99eab1b2a27d..ff1b75e5c506 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/JobsTab.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobsTab.scala @@ -34,10 +34,10 @@ private[ui] class JobsTab(parent: SparkUI, store: AppStatusStore) val killEnabled = parent.killEnabled def isFairScheduler: Boolean = { - store.environmentInfo().sparkProperties.toMap - .get("spark.scheduler.mode") - .map { mode => mode == SchedulingMode.FAIR } - .getOrElse(false) + store + .environmentInfo() + .sparkProperties + .contains(("spark.scheduler.mode", SchedulingMode.FAIR.toString)) } def getSparkUser: String = parent.getSparkUser diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index 11a6a3434497..af78373ddb4b 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -19,6 +19,7 @@ package org.apache.spark.ui.jobs import java.net.URLEncoder import java.util.Date +import java.util.concurrent.TimeUnit import javax.servlet.http.HttpServletRequest import scala.collection.mutable.{HashMap, HashSet} @@ -29,15 +30,14 @@ import org.apache.commons.lang3.StringEscapeUtils import org.apache.spark.SparkConf import org.apache.spark.internal.config._ import org.apache.spark.scheduler.TaskLocality -import org.apache.spark.status.AppStatusStore +import org.apache.spark.status._ import org.apache.spark.status.api.v1._ import org.apache.spark.ui._ -import org.apache.spark.util.{Distribution, Utils} +import org.apache.spark.util.Utils /** Page showing statistics and task list for a given stage */ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends WebUIPage("stage") { import ApiHelper._ - import StagePage._ private val TIMELINE_LEGEND = {
@@ -67,17 +67,17 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We // if we find that it's okay. private val MAX_TIMELINE_TASKS = parent.conf.getInt("spark.ui.timeline.tasks.maximum", 1000) - private def getLocalitySummaryString(stageData: StageData, taskList: Seq[TaskData]): String = { - val localities = taskList.map(_.taskLocality) - val localityCounts = localities.groupBy(identity).mapValues(_.size) + private def getLocalitySummaryString(localitySummary: Map[String, Long]): String = { val names = Map( TaskLocality.PROCESS_LOCAL.toString() -> "Process local", TaskLocality.NODE_LOCAL.toString() -> "Node local", TaskLocality.RACK_LOCAL.toString() -> "Rack local", TaskLocality.ANY.toString() -> "Any") - val localityNamesAndCounts = localityCounts.toSeq.map { case (locality, count) => - s"${names(locality)}: $count" - } + val localityNamesAndCounts = names.flatMap { case (key, name) => + localitySummary.get(key).map { count => + s"$name: $count" + } + }.toSeq localityNamesAndCounts.sorted.mkString("; ") } @@ -108,7 +108,7 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We val stageHeader = s"Details for Stage $stageId (Attempt $stageAttemptId)" val stageData = parent.store - .asOption(parent.store.stageAttempt(stageId, stageAttemptId, details = true)) + .asOption(parent.store.stageAttempt(stageId, stageAttemptId, details = false)) .getOrElse { val content =
@@ -117,8 +117,11 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We return UIUtils.headerSparkPage(stageHeader, content, parent) } - val tasks = stageData.tasks.getOrElse(Map.empty).values.toSeq - if (tasks.isEmpty) { + val localitySummary = store.localitySummary(stageData.stageId, stageData.attemptId) + + val totalTasks = stageData.numActiveTasks + stageData.numCompleteTasks + + stageData.numFailedTasks + stageData.numKilledTasks + if (totalTasks == 0) { val content =

Summary Metrics

No tasks have started yet @@ -127,18 +130,14 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We return UIUtils.headerSparkPage(stageHeader, content, parent) } + val storedTasks = store.taskCount(stageData.stageId, stageData.attemptId) val numCompleted = stageData.numCompleteTasks - val totalTasks = stageData.numActiveTasks + stageData.numCompleteTasks + - stageData.numFailedTasks + stageData.numKilledTasks - val totalTasksNumStr = if (totalTasks == tasks.size) { + val totalTasksNumStr = if (totalTasks == storedTasks) { s"$totalTasks" } else { - s"$totalTasks, showing ${tasks.size}" + s"$totalTasks, showing ${storedTasks}" } - val externalAccumulables = stageData.accumulatorUpdates - val hasAccumulators = externalAccumulables.size > 0 - val summary =
    @@ -148,7 +147,7 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We
  • Locality Level Summary: - {getLocalitySummaryString(stageData, tasks)} + {getLocalitySummaryString(localitySummary)}
  • {if (hasInput(stageData)) {
  • @@ -266,7 +265,7 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We val accumulableTable = UIUtils.listingTable( accumulableHeaders, accumulableRow, - externalAccumulables.toSeq) + stageData.accumulatorUpdates.toSeq) val page: Int = { // If the user has changed to a larger page size, then go to page 1 in order to avoid @@ -280,16 +279,9 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We val currentTime = System.currentTimeMillis() val (taskTable, taskTableHTML) = try { val _taskTable = new TaskPagedTable( - parent.conf, + stageData, UIUtils.prependBaseUri(parent.basePath) + s"/stages/stage?id=${stageId}&attempt=${stageAttemptId}", - tasks, - hasAccumulators, - hasInput(stageData), - hasOutput(stageData), - hasShuffleRead(stageData), - hasShuffleWrite(stageData), - hasBytesSpilled(stageData), currentTime, pageSize = taskPageSize, sortColumn = taskSortColumn, @@ -320,217 +312,155 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We | } |}); """.stripMargin - } + } } - val taskIdsInPage = if (taskTable == null) Set.empty[Long] - else taskTable.dataSource.slicedTaskIds + val metricsSummary = store.taskSummary(stageData.stageId, stageData.attemptId, + Array(0, 0.25, 0.5, 0.75, 1.0)) - // Excludes tasks which failed and have incomplete metrics - val validTasks = tasks.filter(t => t.status == "SUCCESS" && t.taskMetrics.isDefined) - - val summaryTable: Option[Seq[Node]] = - if (validTasks.size == 0) { - None - } else { - def getDistributionQuantiles(data: Seq[Double]): IndexedSeq[Double] = { - Distribution(data).get.getQuantiles() - } - def getFormattedTimeQuantiles(times: Seq[Double]): Seq[Node] = { - getDistributionQuantiles(times).map { millis => - {UIUtils.formatDuration(millis.toLong)} - } - } - def getFormattedSizeQuantiles(data: Seq[Double]): Seq[Elem] = { - getDistributionQuantiles(data).map(d => {Utils.bytesToString(d.toLong)}) + val summaryTable = metricsSummary.map { metrics => + def timeQuantiles(data: IndexedSeq[Double]): Seq[Node] = { + data.map { millis => + {UIUtils.formatDuration(millis.toLong)} } + } - val deserializationTimes = validTasks.map { task => - task.taskMetrics.get.executorDeserializeTime.toDouble - } - val deserializationQuantiles = - - - Task Deserialization Time - - +: getFormattedTimeQuantiles(deserializationTimes) - - val serviceTimes = validTasks.map(_.taskMetrics.get.executorRunTime.toDouble) - val serviceQuantiles = Duration +: getFormattedTimeQuantiles(serviceTimes) - - val gcTimes = validTasks.map(_.taskMetrics.get.jvmGcTime.toDouble) - val gcQuantiles = - - GC Time - - +: getFormattedTimeQuantiles(gcTimes) - - val serializationTimes = validTasks.map(_.taskMetrics.get.resultSerializationTime.toDouble) - val serializationQuantiles = - - - Result Serialization Time - - +: getFormattedTimeQuantiles(serializationTimes) - - val gettingResultTimes = validTasks.map(getGettingResultTime(_, currentTime).toDouble) - val gettingResultQuantiles = - - - Getting Result Time - - +: - getFormattedTimeQuantiles(gettingResultTimes) - - val peakExecutionMemory = validTasks.map(_.taskMetrics.get.peakExecutionMemory.toDouble) - val peakExecutionMemoryQuantiles = { - - - Peak Execution Memory - - +: getFormattedSizeQuantiles(peakExecutionMemory) + def sizeQuantiles(data: IndexedSeq[Double]): Seq[Node] = { + data.map { size => + {Utils.bytesToString(size.toLong)} } + } - // The scheduler delay includes the network delay to send the task to the worker - // machine and to send back 
the result (but not the time to fetch the task result, - // if it needed to be fetched from the block manager on the worker). - val schedulerDelays = validTasks.map { task => - getSchedulerDelay(task, task.taskMetrics.get, currentTime).toDouble - } - val schedulerDelayTitle = Scheduler Delay - val schedulerDelayQuantiles = schedulerDelayTitle +: - getFormattedTimeQuantiles(schedulerDelays) - def getFormattedSizeQuantilesWithRecords(data: Seq[Double], records: Seq[Double]) - : Seq[Elem] = { - val recordDist = getDistributionQuantiles(records).iterator - getDistributionQuantiles(data).map(d => - {s"${Utils.bytesToString(d.toLong)} / ${recordDist.next().toLong}"} - ) + def sizeQuantilesWithRecords( + data: IndexedSeq[Double], + records: IndexedSeq[Double]) : Seq[Node] = { + data.zip(records).map { case (d, r) => + {s"${Utils.bytesToString(d.toLong)} / ${r.toLong}"} } + } - val inputSizes = validTasks.map(_.taskMetrics.get.inputMetrics.bytesRead.toDouble) - val inputRecords = validTasks.map(_.taskMetrics.get.inputMetrics.recordsRead.toDouble) - val inputQuantiles = Input Size / Records +: - getFormattedSizeQuantilesWithRecords(inputSizes, inputRecords) + def titleCell(title: String, tooltip: String): Seq[Node] = { + + + {title} + + + } - val outputSizes = validTasks.map(_.taskMetrics.get.outputMetrics.bytesWritten.toDouble) - val outputRecords = validTasks.map(_.taskMetrics.get.outputMetrics.recordsWritten.toDouble) - val outputQuantiles = Output Size / Records +: - getFormattedSizeQuantilesWithRecords(outputSizes, outputRecords) + def simpleTitleCell(title: String): Seq[Node] = {title} - val shuffleReadBlockedTimes = validTasks.map { task => - task.taskMetrics.get.shuffleReadMetrics.fetchWaitTime.toDouble - } - val shuffleReadBlockedQuantiles = - - - Shuffle Read Blocked Time - - +: - getFormattedTimeQuantiles(shuffleReadBlockedTimes) - - val shuffleReadTotalSizes = validTasks.map { task => - totalBytesRead(task.taskMetrics.get.shuffleReadMetrics).toDouble - } - val shuffleReadTotalRecords = validTasks.map { task => - task.taskMetrics.get.shuffleReadMetrics.recordsRead.toDouble - } - val shuffleReadTotalQuantiles = - - - Shuffle Read Size / Records - - +: - getFormattedSizeQuantilesWithRecords(shuffleReadTotalSizes, shuffleReadTotalRecords) - - val shuffleReadRemoteSizes = validTasks.map { task => - task.taskMetrics.get.shuffleReadMetrics.remoteBytesRead.toDouble - } - val shuffleReadRemoteQuantiles = - - - Shuffle Remote Reads - - +: - getFormattedSizeQuantiles(shuffleReadRemoteSizes) - - val shuffleWriteSizes = validTasks.map { task => - task.taskMetrics.get.shuffleWriteMetrics.bytesWritten.toDouble - } + val deserializationQuantiles = titleCell("Task Deserialization Time", + ToolTips.TASK_DESERIALIZATION_TIME) ++ timeQuantiles(metrics.executorDeserializeTime) - val shuffleWriteRecords = validTasks.map { task => - task.taskMetrics.get.shuffleWriteMetrics.recordsWritten.toDouble - } + val serviceQuantiles = simpleTitleCell("Duration") ++ timeQuantiles(metrics.executorRunTime) - val shuffleWriteQuantiles = Shuffle Write Size / Records +: - getFormattedSizeQuantilesWithRecords(shuffleWriteSizes, shuffleWriteRecords) + val gcQuantiles = titleCell("GC Time", ToolTips.GC_TIME) ++ timeQuantiles(metrics.jvmGcTime) - val memoryBytesSpilledSizes = validTasks.map(_.taskMetrics.get.memoryBytesSpilled.toDouble) - val memoryBytesSpilledQuantiles = Shuffle spill (memory) +: - getFormattedSizeQuantiles(memoryBytesSpilledSizes) + val serializationQuantiles = titleCell("Result Serialization Time", + 
ToolTips.RESULT_SERIALIZATION_TIME) ++ timeQuantiles(metrics.resultSerializationTime) - val diskBytesSpilledSizes = validTasks.map(_.taskMetrics.get.diskBytesSpilled.toDouble) - val diskBytesSpilledQuantiles = Shuffle spill (disk) +: - getFormattedSizeQuantiles(diskBytesSpilledSizes) + val gettingResultQuantiles = titleCell("Getting Result Time", ToolTips.GETTING_RESULT_TIME) ++ + timeQuantiles(metrics.gettingResultTime) - val listings: Seq[Seq[Node]] = Seq( - {serviceQuantiles}, - {schedulerDelayQuantiles}, - - {deserializationQuantiles} - - {gcQuantiles}, - - {serializationQuantiles} - , - {gettingResultQuantiles}, - - {peakExecutionMemoryQuantiles} - , - if (hasInput(stageData)) {inputQuantiles} else Nil, - if (hasOutput(stageData)) {outputQuantiles} else Nil, - if (hasShuffleRead(stageData)) { - - {shuffleReadBlockedQuantiles} - - {shuffleReadTotalQuantiles} - - {shuffleReadRemoteQuantiles} - - } else { - Nil - }, - if (hasShuffleWrite(stageData)) {shuffleWriteQuantiles} else Nil, - if (hasBytesSpilled(stageData)) {memoryBytesSpilledQuantiles} else Nil, - if (hasBytesSpilled(stageData)) {diskBytesSpilledQuantiles} else Nil) - - val quantileHeaders = Seq("Metric", "Min", "25th percentile", - "Median", "75th percentile", "Max") - // The summary table does not use CSS to stripe rows, which doesn't work with hidden - // rows (instead, JavaScript in table.js is used to stripe the non-hidden rows). - Some(UIUtils.listingTable( - quantileHeaders, - identity[Seq[Node]], - listings, - fixedWidth = true, - id = Some("task-summary-table"), - stripeRowsWithCss = false)) + val peakExecutionMemoryQuantiles = titleCell("Peak Execution Memory", + ToolTips.PEAK_EXECUTION_MEMORY) ++ sizeQuantiles(metrics.peakExecutionMemory) + + // The scheduler delay includes the network delay to send the task to the worker + // machine and to send back the result (but not the time to fetch the task result, + // if it needed to be fetched from the block manager on the worker). 
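// Illustrative sketch (not part of this patch): roughly what the scheduler delay shown here
// boils down to for a finished task, matching the removed StagePage helper above. The exact
// signature of AppStatusUtils.schedulerDelay may differ; this standalone function only shows
// the formula: whatever part of the task's wall-clock duration is not spent running,
// (de)serializing, or fetching the result is attributed to the scheduler.
object SchedulerDelayExample {
  def schedulerDelay(
      duration: Long,
      executorRunTime: Long,
      executorDeserializeTime: Long,
      resultSerializationTime: Long,
      gettingResultTime: Long): Long = {
    val executorOverhead = executorDeserializeTime + resultSerializationTime
    math.max(0L, duration - executorRunTime - executorOverhead - gettingResultTime)
  }

  def main(args: Array[String]): Unit = {
    // 2000 ms total, 1700 ms running, 120 ms (de)serialization, 80 ms fetching the result
    // leaves 100 ms of scheduler delay.
    println(schedulerDelay(2000L, 1700L, 100L, 20L, 80L))
  }
}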
+ val schedulerDelayQuantiles = titleCell("Scheduler Delay", ToolTips.SCHEDULER_DELAY) ++ + timeQuantiles(metrics.schedulerDelay) + + def inputQuantiles: Seq[Node] = { + simpleTitleCell("Input Size / Records") ++ + sizeQuantilesWithRecords(metrics.inputMetrics.bytesRead, metrics.inputMetrics.recordsRead) + } + + def outputQuantiles: Seq[Node] = { + simpleTitleCell("Output Size / Records") ++ + sizeQuantilesWithRecords(metrics.outputMetrics.bytesWritten, + metrics.outputMetrics.recordsWritten) } + def shuffleReadBlockedQuantiles: Seq[Node] = { + titleCell("Shuffle Read Blocked Time", ToolTips.SHUFFLE_READ_BLOCKED_TIME) ++ + timeQuantiles(metrics.shuffleReadMetrics.fetchWaitTime) + } + + def shuffleReadTotalQuantiles: Seq[Node] = { + titleCell("Shuffle Read Size / Records", ToolTips.SHUFFLE_READ) ++ + sizeQuantilesWithRecords(metrics.shuffleReadMetrics.readBytes, + metrics.shuffleReadMetrics.readRecords) + } + + def shuffleReadRemoteQuantiles: Seq[Node] = { + titleCell("Shuffle Remote Reads", ToolTips.SHUFFLE_READ_REMOTE_SIZE) ++ + sizeQuantiles(metrics.shuffleReadMetrics.remoteBytesRead) + } + + def shuffleWriteQuantiles: Seq[Node] = { + simpleTitleCell("Shuffle Write Size / Records") ++ + sizeQuantilesWithRecords(metrics.shuffleWriteMetrics.writeBytes, + metrics.shuffleWriteMetrics.writeRecords) + } + + def memoryBytesSpilledQuantiles: Seq[Node] = { + simpleTitleCell("Shuffle spill (memory)") ++ sizeQuantiles(metrics.memoryBytesSpilled) + } + + def diskBytesSpilledQuantiles: Seq[Node] = { + simpleTitleCell("Shuffle spill (disk)") ++ sizeQuantiles(metrics.diskBytesSpilled) + } + + val listings: Seq[Seq[Node]] = Seq( + {serviceQuantiles}, + {schedulerDelayQuantiles}, + + {deserializationQuantiles} + + {gcQuantiles}, + + {serializationQuantiles} + , + {gettingResultQuantiles}, + + {peakExecutionMemoryQuantiles} + , + if (hasInput(stageData)) {inputQuantiles} else Nil, + if (hasOutput(stageData)) {outputQuantiles} else Nil, + if (hasShuffleRead(stageData)) { + + {shuffleReadBlockedQuantiles} + + {shuffleReadTotalQuantiles} + + {shuffleReadRemoteQuantiles} + + } else { + Nil + }, + if (hasShuffleWrite(stageData)) {shuffleWriteQuantiles} else Nil, + if (hasBytesSpilled(stageData)) {memoryBytesSpilledQuantiles} else Nil, + if (hasBytesSpilled(stageData)) {diskBytesSpilledQuantiles} else Nil) + + val quantileHeaders = Seq("Metric", "Min", "25th percentile", "Median", "75th percentile", + "Max") + // The summary table does not use CSS to stripe rows, which doesn't work with hidden + // rows (instead, JavaScript in table.js is used to stripe the non-hidden rows). + UIUtils.listingTable( + quantileHeaders, + identity[Seq[Node]], + listings, + fixedWidth = true, + id = Some("task-summary-table"), + stripeRowsWithCss = false) + } + val executorTable = new ExecutorTable(stageData, parent.store) val maybeAccumulableTable: Seq[Node] = - if (hasAccumulators) {

    Accumulators

    ++ accumulableTable } else Seq() + if (hasAccumulators(stageData)) {

    Accumulators

    ++ accumulableTable } else Seq() val aggMetrics = taskIdsInPage.contains(t.taskId) }, + Option(taskTable).map(_.dataSource.tasks).getOrElse(Nil), currentTime) ++

    Summary Metrics for {numCompleted} Completed Tasks

    ++
    {summaryTable.getOrElse("No tasks have reported metrics yet.")}
    ++ @@ -593,10 +523,9 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We val serializationTimeProportion = toProportion(serializationTime) val deserializationTime = metricsOpt.map(_.executorDeserializeTime).getOrElse(0L) val deserializationTimeProportion = toProportion(deserializationTime) - val gettingResultTime = getGettingResultTime(taskInfo, currentTime) + val gettingResultTime = AppStatusUtils.gettingResultTime(taskInfo) val gettingResultTimeProportion = toProportion(gettingResultTime) - val schedulerDelay = - metricsOpt.map(getSchedulerDelay(taskInfo, _, currentTime)).getOrElse(0L) + val schedulerDelay = AppStatusUtils.schedulerDelay(taskInfo) val schedulerDelayProportion = toProportion(schedulerDelay) val executorOverhead = serializationTime + deserializationTime @@ -708,7 +637,7 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We { if (MAX_TIMELINE_TASKS < tasks.size) { - This stage has more than the maximum number of tasks that can be shown in the + This page has more than the maximum number of tasks that can be shown in the visualization! Only the most recent {MAX_TIMELINE_TASKS} tasks (of {tasks.size} total) are shown. @@ -733,402 +662,49 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We } -private[ui] object StagePage { - private[ui] def getGettingResultTime(info: TaskData, currentTime: Long): Long = { - info.resultFetchStart match { - case Some(start) => - info.duration match { - case Some(duration) => - info.launchTime.getTime() + duration - start.getTime() - - case _ => - currentTime - start.getTime() - } - - case _ => - 0L - } - } - - private[ui] def getSchedulerDelay( - info: TaskData, - metrics: TaskMetrics, - currentTime: Long): Long = { - info.duration match { - case Some(duration) => - val executorOverhead = metrics.executorDeserializeTime + metrics.resultSerializationTime - math.max( - 0, - duration - metrics.executorRunTime - executorOverhead - - getGettingResultTime(info, currentTime)) - - case _ => - // The task is still running and the metrics like executorRunTime are not available. - 0L - } - } - -} - -private[ui] case class TaskTableRowInputData(inputSortable: Long, inputReadable: String) - -private[ui] case class TaskTableRowOutputData(outputSortable: Long, outputReadable: String) - -private[ui] case class TaskTableRowShuffleReadData( - shuffleReadBlockedTimeSortable: Long, - shuffleReadBlockedTimeReadable: String, - shuffleReadSortable: Long, - shuffleReadReadable: String, - shuffleReadRemoteSortable: Long, - shuffleReadRemoteReadable: String) - -private[ui] case class TaskTableRowShuffleWriteData( - writeTimeSortable: Long, - writeTimeReadable: String, - shuffleWriteSortable: Long, - shuffleWriteReadable: String) - -private[ui] case class TaskTableRowBytesSpilledData( - memoryBytesSpilledSortable: Long, - memoryBytesSpilledReadable: String, - diskBytesSpilledSortable: Long, - diskBytesSpilledReadable: String) - -/** - * Contains all data that needs for sorting and generating HTML. Using this one rather than - * TaskData to avoid creating duplicate contents during sorting the data. 
- */ -private[ui] class TaskTableRowData( - val index: Int, - val taskId: Long, - val attempt: Int, - val speculative: Boolean, - val status: String, - val taskLocality: String, - val executorId: String, - val host: String, - val launchTime: Long, - val duration: Long, - val formatDuration: String, - val schedulerDelay: Long, - val taskDeserializationTime: Long, - val gcTime: Long, - val serializationTime: Long, - val gettingResultTime: Long, - val peakExecutionMemoryUsed: Long, - val accumulators: Option[String], // HTML - val input: Option[TaskTableRowInputData], - val output: Option[TaskTableRowOutputData], - val shuffleRead: Option[TaskTableRowShuffleReadData], - val shuffleWrite: Option[TaskTableRowShuffleWriteData], - val bytesSpilled: Option[TaskTableRowBytesSpilledData], - val error: String, - val logs: Map[String, String]) - private[ui] class TaskDataSource( - tasks: Seq[TaskData], - hasAccumulators: Boolean, - hasInput: Boolean, - hasOutput: Boolean, - hasShuffleRead: Boolean, - hasShuffleWrite: Boolean, - hasBytesSpilled: Boolean, + stage: StageData, currentTime: Long, pageSize: Int, sortColumn: String, desc: Boolean, - store: AppStatusStore) extends PagedDataSource[TaskTableRowData](pageSize) { - import StagePage._ + store: AppStatusStore) extends PagedDataSource[TaskData](pageSize) { + import ApiHelper._ // Keep an internal cache of executor log maps so that long task lists render faster. private val executorIdToLogs = new HashMap[String, Map[String, String]]() - // Convert TaskData to TaskTableRowData which contains the final contents to show in the table - // so that we can avoid creating duplicate contents during sorting the data - private val data = tasks.map(taskRow).sorted(ordering(sortColumn, desc)) - - private var _slicedTaskIds: Set[Long] = _ + private var _tasksToShow: Seq[TaskData] = null - override def dataSize: Int = data.size + override def dataSize: Int = stage.numTasks - override def sliceData(from: Int, to: Int): Seq[TaskTableRowData] = { - val r = data.slice(from, to) - _slicedTaskIds = r.map(_.taskId).toSet - r - } - - def slicedTaskIds: Set[Long] = _slicedTaskIds - - private def taskRow(info: TaskData): TaskTableRowData = { - val metrics = info.taskMetrics - val duration = info.duration.getOrElse(1L) - val formatDuration = info.duration.map(d => UIUtils.formatDuration(d)).getOrElse("") - val schedulerDelay = metrics.map(getSchedulerDelay(info, _, currentTime)).getOrElse(0L) - val gcTime = metrics.map(_.jvmGcTime).getOrElse(0L) - val taskDeserializationTime = metrics.map(_.executorDeserializeTime).getOrElse(0L) - val serializationTime = metrics.map(_.resultSerializationTime).getOrElse(0L) - val gettingResultTime = getGettingResultTime(info, currentTime) - - val externalAccumulableReadable = info.accumulatorUpdates.map { acc => - StringEscapeUtils.escapeHtml4(s"${acc.name}: ${acc.update}") + override def sliceData(from: Int, to: Int): Seq[TaskData] = { + if (_tasksToShow == null) { + _tasksToShow = store.taskList(stage.stageId, stage.attemptId, from, to - from, + indexName(sortColumn), !desc) } - val peakExecutionMemoryUsed = metrics.map(_.peakExecutionMemory).getOrElse(0L) - - val maybeInput = metrics.map(_.inputMetrics) - val inputSortable = maybeInput.map(_.bytesRead).getOrElse(0L) - val inputReadable = maybeInput - .map(m => s"${Utils.bytesToString(m.bytesRead)}") - .getOrElse("") - val inputRecords = maybeInput.map(_.recordsRead.toString).getOrElse("") - - val maybeOutput = metrics.map(_.outputMetrics) - val outputSortable = 
maybeOutput.map(_.bytesWritten).getOrElse(0L) - val outputReadable = maybeOutput - .map(m => s"${Utils.bytesToString(m.bytesWritten)}") - .getOrElse("") - val outputRecords = maybeOutput.map(_.recordsWritten.toString).getOrElse("") - - val maybeShuffleRead = metrics.map(_.shuffleReadMetrics) - val shuffleReadBlockedTimeSortable = maybeShuffleRead.map(_.fetchWaitTime).getOrElse(0L) - val shuffleReadBlockedTimeReadable = - maybeShuffleRead.map(ms => UIUtils.formatDuration(ms.fetchWaitTime)).getOrElse("") - - val totalShuffleBytes = maybeShuffleRead.map(ApiHelper.totalBytesRead) - val shuffleReadSortable = totalShuffleBytes.getOrElse(0L) - val shuffleReadReadable = totalShuffleBytes.map(Utils.bytesToString).getOrElse("") - val shuffleReadRecords = maybeShuffleRead.map(_.recordsRead.toString).getOrElse("") - - val remoteShuffleBytes = maybeShuffleRead.map(_.remoteBytesRead) - val shuffleReadRemoteSortable = remoteShuffleBytes.getOrElse(0L) - val shuffleReadRemoteReadable = remoteShuffleBytes.map(Utils.bytesToString).getOrElse("") - - val maybeShuffleWrite = metrics.map(_.shuffleWriteMetrics) - val shuffleWriteSortable = maybeShuffleWrite.map(_.bytesWritten).getOrElse(0L) - val shuffleWriteReadable = maybeShuffleWrite - .map(m => s"${Utils.bytesToString(m.bytesWritten)}").getOrElse("") - val shuffleWriteRecords = maybeShuffleWrite - .map(_.recordsWritten.toString).getOrElse("") - - val maybeWriteTime = metrics.map(_.shuffleWriteMetrics.writeTime) - val writeTimeSortable = maybeWriteTime.getOrElse(0L) - val writeTimeReadable = maybeWriteTime.map(t => t / (1000 * 1000)).map { ms => - if (ms == 0) "" else UIUtils.formatDuration(ms) - }.getOrElse("") - - val maybeMemoryBytesSpilled = metrics.map(_.memoryBytesSpilled) - val memoryBytesSpilledSortable = maybeMemoryBytesSpilled.getOrElse(0L) - val memoryBytesSpilledReadable = - maybeMemoryBytesSpilled.map(Utils.bytesToString).getOrElse("") - - val maybeDiskBytesSpilled = metrics.map(_.diskBytesSpilled) - val diskBytesSpilledSortable = maybeDiskBytesSpilled.getOrElse(0L) - val diskBytesSpilledReadable = maybeDiskBytesSpilled.map(Utils.bytesToString).getOrElse("") - - val input = - if (hasInput) { - Some(TaskTableRowInputData(inputSortable, s"$inputReadable / $inputRecords")) - } else { - None - } - - val output = - if (hasOutput) { - Some(TaskTableRowOutputData(outputSortable, s"$outputReadable / $outputRecords")) - } else { - None - } - - val shuffleRead = - if (hasShuffleRead) { - Some(TaskTableRowShuffleReadData( - shuffleReadBlockedTimeSortable, - shuffleReadBlockedTimeReadable, - shuffleReadSortable, - s"$shuffleReadReadable / $shuffleReadRecords", - shuffleReadRemoteSortable, - shuffleReadRemoteReadable - )) - } else { - None - } - - val shuffleWrite = - if (hasShuffleWrite) { - Some(TaskTableRowShuffleWriteData( - writeTimeSortable, - writeTimeReadable, - shuffleWriteSortable, - s"$shuffleWriteReadable / $shuffleWriteRecords" - )) - } else { - None - } - - val bytesSpilled = - if (hasBytesSpilled) { - Some(TaskTableRowBytesSpilledData( - memoryBytesSpilledSortable, - memoryBytesSpilledReadable, - diskBytesSpilledSortable, - diskBytesSpilledReadable - )) - } else { - None - } - - new TaskTableRowData( - info.index, - info.taskId, - info.attempt, - info.speculative, - info.status, - info.taskLocality.toString, - info.executorId, - info.host, - info.launchTime.getTime(), - duration, - formatDuration, - schedulerDelay, - taskDeserializationTime, - gcTime, - serializationTime, - gettingResultTime, - peakExecutionMemoryUsed, - if (hasAccumulators) 
Some(externalAccumulableReadable.mkString("
    ")) else None, - input, - output, - shuffleRead, - shuffleWrite, - bytesSpilled, - info.errorMessage.getOrElse(""), - executorLogs(info.executorId)) + _tasksToShow } - private def executorLogs(id: String): Map[String, String] = { + def tasks: Seq[TaskData] = _tasksToShow + + def executorLogs(id: String): Map[String, String] = { executorIdToLogs.getOrElseUpdate(id, store.asOption(store.executorSummary(id)).map(_.executorLogs).getOrElse(Map.empty)) } - /** - * Return Ordering according to sortColumn and desc - */ - private def ordering(sortColumn: String, desc: Boolean): Ordering[TaskTableRowData] = { - val ordering: Ordering[TaskTableRowData] = sortColumn match { - case "Index" => Ordering.by(_.index) - case "ID" => Ordering.by(_.taskId) - case "Attempt" => Ordering.by(_.attempt) - case "Status" => Ordering.by(_.status) - case "Locality Level" => Ordering.by(_.taskLocality) - case "Executor ID" => Ordering.by(_.executorId) - case "Host" => Ordering.by(_.host) - case "Launch Time" => Ordering.by(_.launchTime) - case "Duration" => Ordering.by(_.duration) - case "Scheduler Delay" => Ordering.by(_.schedulerDelay) - case "Task Deserialization Time" => Ordering.by(_.taskDeserializationTime) - case "GC Time" => Ordering.by(_.gcTime) - case "Result Serialization Time" => Ordering.by(_.serializationTime) - case "Getting Result Time" => Ordering.by(_.gettingResultTime) - case "Peak Execution Memory" => Ordering.by(_.peakExecutionMemoryUsed) - case "Accumulators" => - if (hasAccumulators) { - Ordering.by(_.accumulators.get) - } else { - throw new IllegalArgumentException( - "Cannot sort by Accumulators because of no accumulators") - } - case "Input Size / Records" => - if (hasInput) { - Ordering.by(_.input.get.inputSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Input Size / Records because of no inputs") - } - case "Output Size / Records" => - if (hasOutput) { - Ordering.by(_.output.get.outputSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Output Size / Records because of no outputs") - } - // ShuffleRead - case "Shuffle Read Blocked Time" => - if (hasShuffleRead) { - Ordering.by(_.shuffleRead.get.shuffleReadBlockedTimeSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Shuffle Read Blocked Time because of no shuffle reads") - } - case "Shuffle Read Size / Records" => - if (hasShuffleRead) { - Ordering.by(_.shuffleRead.get.shuffleReadSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Shuffle Read Size / Records because of no shuffle reads") - } - case "Shuffle Remote Reads" => - if (hasShuffleRead) { - Ordering.by(_.shuffleRead.get.shuffleReadRemoteSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Shuffle Remote Reads because of no shuffle reads") - } - // ShuffleWrite - case "Write Time" => - if (hasShuffleWrite) { - Ordering.by(_.shuffleWrite.get.writeTimeSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Write Time because of no shuffle writes") - } - case "Shuffle Write Size / Records" => - if (hasShuffleWrite) { - Ordering.by(_.shuffleWrite.get.shuffleWriteSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Shuffle Write Size / Records because of no shuffle writes") - } - // BytesSpilled - case "Shuffle Spill (Memory)" => - if (hasBytesSpilled) { - Ordering.by(_.bytesSpilled.get.memoryBytesSpilledSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Shuffle Spill (Memory) because of no 
spills") - } - case "Shuffle Spill (Disk)" => - if (hasBytesSpilled) { - Ordering.by(_.bytesSpilled.get.diskBytesSpilledSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Shuffle Spill (Disk) because of no spills") - } - case "Errors" => Ordering.by(_.error) - case unknownColumn => throw new IllegalArgumentException(s"Unknown column: $unknownColumn") - } - if (desc) { - ordering.reverse - } else { - ordering - } - } - } private[ui] class TaskPagedTable( - conf: SparkConf, + stage: StageData, basePath: String, - data: Seq[TaskData], - hasAccumulators: Boolean, - hasInput: Boolean, - hasOutput: Boolean, - hasShuffleRead: Boolean, - hasShuffleWrite: Boolean, - hasBytesSpilled: Boolean, currentTime: Long, pageSize: Int, sortColumn: String, desc: Boolean, - store: AppStatusStore) extends PagedTable[TaskTableRowData] { + store: AppStatusStore) extends PagedTable[TaskData] { + + import ApiHelper._ override def tableId: String = "task-table" @@ -1142,13 +718,7 @@ private[ui] class TaskPagedTable( override def pageNumberFormField: String = "task.page" override val dataSource: TaskDataSource = new TaskDataSource( - data, - hasAccumulators, - hasInput, - hasOutput, - hasShuffleRead, - hasShuffleWrite, - hasBytesSpilled, + stage, currentTime, pageSize, sortColumn, @@ -1180,22 +750,22 @@ private[ui] class TaskPagedTable( ("Result Serialization Time", TaskDetailsClassNames.RESULT_SERIALIZATION_TIME), ("Getting Result Time", TaskDetailsClassNames.GETTING_RESULT_TIME), ("Peak Execution Memory", TaskDetailsClassNames.PEAK_EXECUTION_MEMORY)) ++ - {if (hasAccumulators) Seq(("Accumulators", "")) else Nil} ++ - {if (hasInput) Seq(("Input Size / Records", "")) else Nil} ++ - {if (hasOutput) Seq(("Output Size / Records", "")) else Nil} ++ - {if (hasShuffleRead) { + {if (hasAccumulators(stage)) Seq(("Accumulators", "")) else Nil} ++ + {if (hasInput(stage)) Seq(("Input Size / Records", "")) else Nil} ++ + {if (hasOutput(stage)) Seq(("Output Size / Records", "")) else Nil} ++ + {if (hasShuffleRead(stage)) { Seq(("Shuffle Read Blocked Time", TaskDetailsClassNames.SHUFFLE_READ_BLOCKED_TIME), ("Shuffle Read Size / Records", ""), ("Shuffle Remote Reads", TaskDetailsClassNames.SHUFFLE_READ_REMOTE_SIZE)) } else { Nil }} ++ - {if (hasShuffleWrite) { + {if (hasShuffleWrite(stage)) { Seq(("Write Time", ""), ("Shuffle Write Size / Records", "")) } else { Nil }} ++ - {if (hasBytesSpilled) { + {if (hasBytesSpilled(stage)) { Seq(("Shuffle Spill (Memory)", ""), ("Shuffle Spill (Disk)", "")) } else { Nil @@ -1237,7 +807,17 @@ private[ui] class TaskPagedTable( {headerRow} } - def row(task: TaskTableRowData): Seq[Node] = { + def row(task: TaskData): Seq[Node] = { + def formatDuration(value: Option[Long], hideZero: Boolean = false): String = { + value.map { v => + if (v > 0 || !hideZero) UIUtils.formatDuration(v) else "" + }.getOrElse("") + } + + def formatBytes(value: Option[Long]): String = { + Utils.bytesToString(value.getOrElse(0L)) + } + {task.index} {task.taskId} @@ -1249,62 +829,98 @@ private[ui] class TaskPagedTable(
    {task.host}
    { - task.logs.map { + dataSource.executorLogs(task.executorId).map { case (logName, logUrl) => } }
    - {UIUtils.formatDate(new Date(task.launchTime))} - {task.formatDuration} + {UIUtils.formatDate(task.launchTime)} + {formatDuration(task.duration)} - {UIUtils.formatDuration(task.schedulerDelay)} + {UIUtils.formatDuration(AppStatusUtils.schedulerDelay(task))} - {UIUtils.formatDuration(task.taskDeserializationTime)} + {formatDuration(task.taskMetrics.map(_.executorDeserializeTime))} - {if (task.gcTime > 0) UIUtils.formatDuration(task.gcTime) else ""} + {formatDuration(task.taskMetrics.map(_.jvmGcTime), hideZero = true)} - {UIUtils.formatDuration(task.serializationTime)} + {formatDuration(task.taskMetrics.map(_.resultSerializationTime))} - {UIUtils.formatDuration(task.gettingResultTime)} + {UIUtils.formatDuration(AppStatusUtils.gettingResultTime(task))} - {Utils.bytesToString(task.peakExecutionMemoryUsed)} + {formatBytes(task.taskMetrics.map(_.peakExecutionMemory))} - {if (task.accumulators.nonEmpty) { - {Unparsed(task.accumulators.get)} + {if (hasAccumulators(stage)) { + accumulatorsInfo(task) }} - {if (task.input.nonEmpty) { - {task.input.get.inputReadable} + {if (hasInput(stage)) { + metricInfo(task) { m => + val bytesRead = Utils.bytesToString(m.inputMetrics.bytesRead) + val records = m.inputMetrics.recordsRead + {bytesRead} / {records} + } }} - {if (task.output.nonEmpty) { - {task.output.get.outputReadable} + {if (hasOutput(stage)) { + metricInfo(task) { m => + val bytesWritten = Utils.bytesToString(m.outputMetrics.bytesWritten) + val records = m.outputMetrics.recordsWritten + {bytesWritten} / {records} + } }} - {if (task.shuffleRead.nonEmpty) { + {if (hasShuffleRead(stage)) { - {task.shuffleRead.get.shuffleReadBlockedTimeReadable} + {formatDuration(task.taskMetrics.map(_.shuffleReadMetrics.fetchWaitTime))} - {task.shuffleRead.get.shuffleReadReadable} + { + metricInfo(task) { m => + val bytesRead = Utils.bytesToString(totalBytesRead(m.shuffleReadMetrics)) + val records = m.shuffleReadMetrics.recordsRead + Unparsed(s"$bytesRead / $records") + } + } - {task.shuffleRead.get.shuffleReadRemoteReadable} + {formatBytes(task.taskMetrics.map(_.shuffleReadMetrics.remoteBytesRead))} }} - {if (task.shuffleWrite.nonEmpty) { - {task.shuffleWrite.get.writeTimeReadable} - {task.shuffleWrite.get.shuffleWriteReadable} + {if (hasShuffleWrite(stage)) { + { + formatDuration( + task.taskMetrics.map { m => + TimeUnit.NANOSECONDS.toMillis(m.shuffleWriteMetrics.writeTime) + }, + hideZero = true) + } + { + metricInfo(task) { m => + val bytesWritten = Utils.bytesToString(m.shuffleWriteMetrics.bytesWritten) + val records = m.shuffleWriteMetrics.recordsWritten + Unparsed(s"$bytesWritten / $records") + } + } }} - {if (task.bytesSpilled.nonEmpty) { - {task.bytesSpilled.get.memoryBytesSpilledReadable} - {task.bytesSpilled.get.diskBytesSpilledReadable} + {if (hasBytesSpilled(stage)) { + {formatBytes(task.taskMetrics.map(_.memoryBytesSpilled))} + {formatBytes(task.taskMetrics.map(_.diskBytesSpilled))} }} - {errorMessageCell(task.error)} + {errorMessageCell(task.errorMessage.getOrElse(""))} } + private def accumulatorsInfo(task: TaskData): Seq[Node] = { + task.accumulatorUpdates.map { acc => + Unparsed(StringEscapeUtils.escapeHtml4(s"${acc.name}: ${acc.update}")) + } + } + + private def metricInfo(task: TaskData)(fn: TaskMetrics => Seq[Node]): Seq[Node] = { + task.taskMetrics.map(fn).getOrElse(Nil) + } + private def errorMessageCell(error: String): Seq[Node] = { val isMultiline = error.indexOf('\n') >= 0 // Display the first line by default @@ -1333,6 +949,36 @@ private[ui] class TaskPagedTable( private object 
ApiHelper { + + private val COLUMN_TO_INDEX = Map( + "ID" -> null.asInstanceOf[String], + "Index" -> TaskIndexNames.TASK_INDEX, + "Attempt" -> TaskIndexNames.ATTEMPT, + "Status" -> TaskIndexNames.STATUS, + "Locality Level" -> TaskIndexNames.LOCALITY, + "Executor ID / Host" -> TaskIndexNames.EXECUTOR, + "Launch Time" -> TaskIndexNames.LAUNCH_TIME, + "Duration" -> TaskIndexNames.DURATION, + "Scheduler Delay" -> TaskIndexNames.SCHEDULER_DELAY, + "Task Deserialization Time" -> TaskIndexNames.DESER_TIME, + "GC Time" -> TaskIndexNames.GC_TIME, + "Result Serialization Time" -> TaskIndexNames.SER_TIME, + "Getting Result Time" -> TaskIndexNames.GETTING_RESULT_TIME, + "Peak Execution Memory" -> TaskIndexNames.PEAK_MEM, + "Accumulators" -> TaskIndexNames.ACCUMULATORS, + "Input Size / Records" -> TaskIndexNames.INPUT_SIZE, + "Output Size / Records" -> TaskIndexNames.OUTPUT_SIZE, + "Shuffle Read Blocked Time" -> TaskIndexNames.SHUFFLE_READ_TIME, + "Shuffle Read Size / Records" -> TaskIndexNames.SHUFFLE_TOTAL_READS, + "Shuffle Remote Reads" -> TaskIndexNames.SHUFFLE_REMOTE_READS, + "Write Time" -> TaskIndexNames.SHUFFLE_WRITE_TIME, + "Shuffle Write Size / Records" -> TaskIndexNames.SHUFFLE_WRITE_SIZE, + "Shuffle Spill (Memory)" -> TaskIndexNames.MEM_SPILL, + "Shuffle Spill (Disk)" -> TaskIndexNames.DISK_SPILL, + "Errors" -> TaskIndexNames.ERROR) + + def hasAccumulators(stageData: StageData): Boolean = stageData.accumulatorUpdates.size > 0 + def hasInput(stageData: StageData): Boolean = stageData.inputBytes > 0 def hasOutput(stageData: StageData): Boolean = stageData.outputBytes > 0 @@ -1349,4 +995,11 @@ private object ApiHelper { metrics.localBytesRead + metrics.remoteBytesRead } + def indexName(sortColumn: String): Option[String] = { + COLUMN_TO_INDEX.get(sortColumn) match { + case Some(v) => Option(v) + case _ => throw new IllegalArgumentException(s"Invalid sort column: $sortColumn") + } + } + } diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala index be05a963f0e6..10b032084ce4 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala @@ -37,10 +37,10 @@ private[ui] class StagesTab(val parent: SparkUI, val store: AppStatusStore) attachPage(new PoolPage(this)) def isFairScheduler: Boolean = { - store.environmentInfo().sparkProperties.toMap - .get("spark.scheduler.mode") - .map { mode => mode == SchedulingMode.FAIR } - .getOrElse(false) + store + .environmentInfo() + .sparkProperties + .contains(("spark.scheduler.mode", SchedulingMode.FAIR.toString)) } def handleKillRequest(request: HttpServletRequest): Unit = { diff --git a/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala b/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala index 827a8637b9bd..948858224d72 100644 --- a/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala +++ b/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala @@ -116,7 +116,7 @@ private[spark] object RDDOperationGraph extends Logging { // Use a special prefix here to differentiate this cluster from other operation clusters val stageClusterId = STAGE_CLUSTER_PREFIX + stage.stageId val stageClusterName = s"Stage ${stage.stageId}" + - { if (stage.attemptId == 0) "" else s" (attempt ${stage.attemptId})" } + { if (stage.attemptNumber == 0) "" else s" (attempt ${stage.attemptNumber})" } val rootCluster = new RDDOperationCluster(stageClusterId, 
stageClusterName) var rootNodeCount = 0 diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index 5e60218c5740..ff83301d631c 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -263,7 +263,7 @@ private[spark] object JsonProtocol { val completionTime = stageInfo.completionTime.map(JInt(_)).getOrElse(JNothing) val failureReason = stageInfo.failureReason.map(JString(_)).getOrElse(JNothing) ("Stage ID" -> stageInfo.stageId) ~ - ("Stage Attempt ID" -> stageInfo.attemptId) ~ + ("Stage Attempt ID" -> stageInfo.attemptNumber) ~ ("Stage Name" -> stageInfo.name) ~ ("Number of Tasks" -> stageInfo.numTasks) ~ ("RDD Info" -> rddInfo) ~ diff --git a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java index c2261c204cd4..dffa609f1cbd 100644 --- a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java +++ b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java @@ -23,7 +23,9 @@ import java.util.List; import java.util.Map; import java.util.Properties; +import java.util.concurrent.TimeUnit; +import org.junit.Ignore; import org.junit.Test; import static org.junit.Assert.*; import static org.junit.Assume.*; @@ -119,7 +121,8 @@ public void testChildProcLauncher() throws Exception { assertEquals(0, app.waitFor()); } - @Test + // TODO: [SPARK-23020] Re-enable this + @Ignore public void testInProcessLauncher() throws Exception { // Because this test runs SparkLauncher in process and in client mode, it pollutes the system // properties, and that can cause test failures down the test pipeline. So restore the original @@ -133,6 +136,10 @@ public void testInProcessLauncher() throws Exception { p.put(e.getKey(), e.getValue()); } System.setProperties(p); + // Here DAGScheduler is stopped, while SparkContext.clearActiveContext may not be called yet. + // Wait for a reasonable amount of time to avoid creating two active SparkContext in JVM. + // See SPARK-23019 and SparkContext.stop() for details. 
+ TimeUnit.MILLISECONDS.sleep(500); } } diff --git a/core/src/test/java/org/apache/spark/memory/TaskMemoryManagerSuite.java b/core/src/test/java/org/apache/spark/memory/TaskMemoryManagerSuite.java index 46b0516e3614..a0664b30d6cc 100644 --- a/core/src/test/java/org/apache/spark/memory/TaskMemoryManagerSuite.java +++ b/core/src/test/java/org/apache/spark/memory/TaskMemoryManagerSuite.java @@ -21,6 +21,7 @@ import org.junit.Test; import org.apache.spark.SparkConf; +import org.apache.spark.unsafe.memory.MemoryAllocator; import org.apache.spark.unsafe.memory.MemoryBlock; public class TaskMemoryManagerSuite { @@ -68,6 +69,34 @@ public void encodePageNumberAndOffsetOnHeap() { Assert.assertEquals(64, manager.getOffsetInPage(encodedAddress)); } + @Test + public void freeingPageSetsPageNumberToSpecialConstant() { + final TaskMemoryManager manager = new TaskMemoryManager( + new TestMemoryManager(new SparkConf().set("spark.memory.offHeap.enabled", "false")), 0); + final MemoryConsumer c = new TestMemoryConsumer(manager, MemoryMode.ON_HEAP); + final MemoryBlock dataPage = manager.allocatePage(256, c); + c.freePage(dataPage); + Assert.assertEquals(MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER, dataPage.pageNumber); + } + + @Test(expected = AssertionError.class) + public void freeingPageDirectlyInAllocatorTriggersAssertionError() { + final TaskMemoryManager manager = new TaskMemoryManager( + new TestMemoryManager(new SparkConf().set("spark.memory.offHeap.enabled", "false")), 0); + final MemoryConsumer c = new TestMemoryConsumer(manager, MemoryMode.ON_HEAP); + final MemoryBlock dataPage = manager.allocatePage(256, c); + MemoryAllocator.HEAP.free(dataPage); + } + + @Test(expected = AssertionError.class) + public void callingFreePageOnDirectlyAllocatedPageTriggersAssertionError() { + final TaskMemoryManager manager = new TaskMemoryManager( + new TestMemoryManager(new SparkConf().set("spark.memory.offHeap.enabled", "false")), 0); + final MemoryConsumer c = new TestMemoryConsumer(manager, MemoryMode.ON_HEAP); + final MemoryBlock dataPage = MemoryAllocator.HEAP.allocate(256); + manager.freePage(dataPage, c); + } + @Test public void cooperativeSpilling() { final TestMemoryManager memoryManager = new TestMemoryManager(new SparkConf()); diff --git a/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java b/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java index 94f5805853e1..f8e233a05a44 100644 --- a/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java +++ b/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java @@ -38,6 +38,7 @@ public static void test() { tc.attemptNumber(); tc.partitionId(); tc.stageId(); + tc.stageAttemptNumber(); tc.taskAttemptId(); } @@ -51,6 +52,7 @@ public void onTaskCompletion(TaskContext context) { context.isCompleted(); context.isInterrupted(); context.stageId(); + context.stageAttemptNumber(); context.partitionId(); context.addTaskCompletionListener(this); } diff --git a/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w__custom_quantiles_expectation.json b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w__custom_quantiles_expectation.json index f8e27703c0de..5c42ac1d87f4 100644 --- a/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w__custom_quantiles_expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w__custom_quantiles_expectation.json @@ -7,6 +7,9 @@ "resultSize" : [ 2010.0, 2065.0, 2065.0 ], 
"jvmGcTime" : [ 0.0, 0.0, 7.0 ], "resultSerializationTime" : [ 0.0, 0.0, 2.0 ], + "gettingResultTime" : [ 0.0, 0.0, 0.0 ], + "schedulerDelay" : [ 2.0, 6.0, 53.0 ], + "peakExecutionMemory" : [ 0.0, 0.0, 0.0 ], "memoryBytesSpilled" : [ 0.0, 0.0, 0.0 ], "diskBytesSpilled" : [ 0.0, 0.0, 0.0 ], "inputMetrics" : { diff --git a/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_read_expectation.json b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_read_expectation.json index a28bda16a956..e6b705989cc9 100644 --- a/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_read_expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_read_expectation.json @@ -7,6 +7,9 @@ "resultSize" : [ 1034.0, 1034.0, 1034.0, 1034.0, 1034.0 ], "jvmGcTime" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], "resultSerializationTime" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "gettingResultTime" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "schedulerDelay" : [ 4.0, 4.0, 6.0, 7.0, 9.0 ], + "peakExecutionMemory" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], "memoryBytesSpilled" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], "diskBytesSpilled" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], "inputMetrics" : { diff --git a/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_write_expectation.json b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_write_expectation.json index ede3eaed1d1d..788f28cf7b36 100644 --- a/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_write_expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_write_expectation.json @@ -7,6 +7,9 @@ "resultSize" : [ 2010.0, 2065.0, 2065.0, 2065.0, 2065.0 ], "jvmGcTime" : [ 0.0, 0.0, 0.0, 5.0, 7.0 ], "resultSerializationTime" : [ 0.0, 0.0, 0.0, 0.0, 1.0 ], + "gettingResultTime" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "schedulerDelay" : [ 2.0, 4.0, 6.0, 13.0, 40.0 ], + "peakExecutionMemory" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], "memoryBytesSpilled" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], "diskBytesSpilled" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], "inputMetrics" : { diff --git a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala index 3931d53b4ae0..ced5a06516f7 100644 --- a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala +++ b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala @@ -363,14 +363,14 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC // first attempt -- its successful val writer1 = manager.getWriter[Int, Int](shuffleHandle, 0, - new TaskContextImpl(0, 0, 0L, 0, taskMemoryManager, new Properties, metricsSystem)) + new TaskContextImpl(0, 0, 0, 0L, 0, taskMemoryManager, new Properties, metricsSystem)) val data1 = (1 to 10).map { x => x -> x} // second attempt -- also successful. We'll write out different data, // just to simulate the fact that the records may get written differently // depending on what gets spilled, what gets combined, etc. 
val writer2 = manager.getWriter[Int, Int](shuffleHandle, 0, - new TaskContextImpl(0, 0, 1L, 0, taskMemoryManager, new Properties, metricsSystem)) + new TaskContextImpl(0, 0, 0, 1L, 0, taskMemoryManager, new Properties, metricsSystem)) val data2 = (11 to 20).map { x => x -> x} // interleave writes of both attempts -- we want to test that both attempts can occur @@ -398,7 +398,7 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC } val reader = manager.getReader[Int, Int](shuffleHandle, 0, 1, - new TaskContextImpl(1, 0, 2L, 0, taskMemoryManager, new Properties, metricsSystem)) + new TaskContextImpl(1, 0, 0, 2L, 0, taskMemoryManager, new Properties, metricsSystem)) val readData = reader.read().toIndexedSeq assert(readData === data1.toIndexedSeq || readData === data2.toIndexedSeq) diff --git a/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala b/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala index 159629825c67..9ad2e9a5e74a 100644 --- a/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala +++ b/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala @@ -153,6 +153,40 @@ class BroadcastSuite extends SparkFunSuite with LocalSparkContext with Encryptio assert(broadcast.value.sum === 10) } + test("One broadcast value instance per executor") { + val conf = new SparkConf() + .setMaster("local[4]") + .setAppName("test") + + sc = new SparkContext(conf) + val list = List[Int](1, 2, 3, 4) + val broadcast = sc.broadcast(list) + val instances = sc.parallelize(1 to 10) + .map(x => System.identityHashCode(broadcast.value)) + .collect() + .toSet + + assert(instances.size === 1) + } + + test("One broadcast value instance per executor when memory is constrained") { + val conf = new SparkConf() + .setMaster("local[4]") + .setAppName("test") + .set("spark.memory.useLegacyMode", "true") + .set("spark.storage.memoryFraction", "0.0") + + sc = new SparkContext(conf) + val list = List[Int](1, 2, 3, 4) + val broadcast = sc.broadcast(list) + val instances = sc.parallelize(1 to 10) + .map(x => System.identityHashCode(broadcast.value)) + .collect() + .toSet + + assert(instances.size === 1) + } + /** * Verify the persistence of state associated with a TorrentBroadcast in a local-cluster. 
* diff --git a/core/src/test/scala/org/apache/spark/memory/MemoryTestingUtils.scala b/core/src/test/scala/org/apache/spark/memory/MemoryTestingUtils.scala index 362cd861cc24..dcf89e4f75ac 100644 --- a/core/src/test/scala/org/apache/spark/memory/MemoryTestingUtils.scala +++ b/core/src/test/scala/org/apache/spark/memory/MemoryTestingUtils.scala @@ -29,6 +29,7 @@ object MemoryTestingUtils { val taskMemoryManager = new TaskMemoryManager(env.memoryManager, 0) new TaskContextImpl( stageId = 0, + stageAttemptNumber = 0, partitionId = 0, taskAttemptId = 0, attemptNumber = 0, diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala index 1beb36afa95f..da6ecb82c7e4 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala @@ -48,7 +48,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match bus.metrics.metricRegistry.counter(s"queue.$SHARED_QUEUE.numDroppedEvents").getCount } - private def queueSize(bus: LiveListenerBus): Int = { + private def sharedQueueSize(bus: LiveListenerBus): Int = { bus.metrics.metricRegistry.getGauges().get(s"queue.$SHARED_QUEUE.size").getValue() .asInstanceOf[Int] } @@ -73,12 +73,11 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match val conf = new SparkConf() val counter = new BasicJobCounter val bus = new LiveListenerBus(conf) - bus.addToSharedQueue(counter) // Metrics are initially empty. assert(bus.metrics.numEventsPosted.getCount === 0) assert(numDroppedEvents(bus) === 0) - assert(queueSize(bus) === 0) + assert(bus.queuedEvents.size === 0) assert(eventProcessingTimeCount(bus) === 0) // Post five events: @@ -87,7 +86,10 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match // Five messages should be marked as received and queued, but no messages should be posted to // listeners yet because the the listener bus hasn't been started. assert(bus.metrics.numEventsPosted.getCount === 5) - assert(queueSize(bus) === 5) + assert(bus.queuedEvents.size === 5) + + // Add the counter to the bus after messages have been queued for later delivery. + bus.addToSharedQueue(counter) assert(counter.count === 0) // Starting listener bus should flush all buffered events @@ -95,9 +97,12 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match Mockito.verify(mockMetricsSystem).registerSource(bus.metrics) bus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) assert(counter.count === 5) - assert(queueSize(bus) === 0) + assert(sharedQueueSize(bus) === 0) assert(eventProcessingTimeCount(bus) === 5) + // After the bus is started, there should be no more queued events. 
+ assert(bus.queuedEvents === null) + // After listener bus has stopped, posting events should not increment counter bus.stop() (1 to 5).foreach { _ => bus.post(SparkListenerJobEnd(0, jobCompletionTime, JobSucceeded)) } @@ -188,18 +193,18 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match // Post a message to the listener bus and wait for processing to begin: bus.post(SparkListenerJobEnd(0, jobCompletionTime, JobSucceeded)) listenerStarted.acquire() - assert(queueSize(bus) === 0) + assert(sharedQueueSize(bus) === 0) assert(numDroppedEvents(bus) === 0) // If we post an additional message then it should remain in the queue because the listener is // busy processing the first event: bus.post(SparkListenerJobEnd(0, jobCompletionTime, JobSucceeded)) - assert(queueSize(bus) === 1) + assert(sharedQueueSize(bus) === 1) assert(numDroppedEvents(bus) === 0) // The queue is now full, so any additional events posted to the listener will be dropped: bus.post(SparkListenerJobEnd(0, jobCompletionTime, JobSucceeded)) - assert(queueSize(bus) === 1) + assert(sharedQueueSize(bus) === 1) assert(numDroppedEvents(bus) === 1) // Allow the the remaining events to be processed so we can stop the listener bus: diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala index a1d9085fa085..aa9c36c0aaac 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala @@ -29,6 +29,7 @@ import org.apache.spark.memory.TaskMemoryManager import org.apache.spark.metrics.source.JvmSource import org.apache.spark.network.util.JavaUtils import org.apache.spark.rdd.RDD +import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.util._ class TaskContextSuite extends SparkFunSuite with BeforeAndAfter with LocalSparkContext { @@ -158,6 +159,30 @@ class TaskContextSuite extends SparkFunSuite with BeforeAndAfter with LocalSpark assert(attemptIdsWithFailedTask.toSet === Set(0, 1)) } + test("TaskContext.stageAttemptNumber getter") { + sc = new SparkContext("local[1,2]", "test") + + // Check stageAttemptNumbers are 0 for initial stage + val stageAttemptNumbers = sc.parallelize(Seq(1, 2), 2).mapPartitions { _ => + Seq(TaskContext.get().stageAttemptNumber()).iterator + }.collect() + assert(stageAttemptNumbers.toSet === Set(0)) + + // Check stageAttemptNumbers that are resubmitted when tasks have FetchFailedException + val stageAttemptNumbersWithFailedStage = + sc.parallelize(Seq(1, 2, 3, 4), 4).repartition(1).mapPartitions { _ => + val stageAttemptNumber = TaskContext.get().stageAttemptNumber() + if (stageAttemptNumber < 2) { + // Throw FetchFailedException to explicitly trigger stage resubmission. A normal exception + // will only trigger task resubmission in the same stage. + throw new FetchFailedException(null, 0, 0, 0, "Fake") + } + Seq(stageAttemptNumber).iterator + }.collect() + + assert(stageAttemptNumbersWithFailedStage.toSet === Set(2)) + } + test("accumulators are updated on exception failures") { // This means use 1 core and 4 max task failures sc = new SparkContext("local[1,4]", "test") @@ -190,7 +215,7 @@ class TaskContextSuite extends SparkFunSuite with BeforeAndAfter with LocalSpark // accumulator updates from it. 
val taskMetrics = TaskMetrics.empty val task = new Task[Int](0, 0, 0) { - context = new TaskContextImpl(0, 0, 0L, 0, + context = new TaskContextImpl(0, 0, 0, 0L, 0, new TaskMemoryManager(SparkEnv.get.memoryManager, 0L), new Properties, SparkEnv.get.metricsSystem, @@ -213,7 +238,7 @@ class TaskContextSuite extends SparkFunSuite with BeforeAndAfter with LocalSpark // accumulator updates from it. val taskMetrics = TaskMetrics.registered val task = new Task[Int](0, 0, 0) { - context = new TaskContextImpl(0, 0, 0L, 0, + context = new TaskContextImpl(0, 0, 0, 0L, 0, new TaskMemoryManager(SparkEnv.get.memoryManager, 0L), new Properties, SparkEnv.get.metricsSystem, diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala index 997c7de8dd02..ca66b6b9db89 100644 --- a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala @@ -195,7 +195,9 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { val s1Tasks = createTasks(4, execIds) s1Tasks.foreach { task => - listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptId, task)) + listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, + stages.head.attemptNumber, + task)) } assert(store.count(classOf[TaskDataWrapper]) === s1Tasks.size) @@ -211,55 +213,53 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { s1Tasks.foreach { task => check[TaskDataWrapper](task.taskId) { wrapper => - assert(wrapper.info.taskId === task.taskId) + assert(wrapper.taskId === task.taskId) assert(wrapper.stageId === stages.head.stageId) assert(wrapper.stageAttemptId === stages.head.attemptId) - assert(Arrays.equals(wrapper.stage, Array(stages.head.stageId, stages.head.attemptId))) - - val runtime = Array[AnyRef](stages.head.stageId: JInteger, stages.head.attemptId: JInteger, - -1L: JLong) - assert(Arrays.equals(wrapper.runtime, runtime)) - - assert(wrapper.info.index === task.index) - assert(wrapper.info.attempt === task.attemptNumber) - assert(wrapper.info.launchTime === new Date(task.launchTime)) - assert(wrapper.info.executorId === task.executorId) - assert(wrapper.info.host === task.host) - assert(wrapper.info.status === task.status) - assert(wrapper.info.taskLocality === task.taskLocality.toString()) - assert(wrapper.info.speculative === task.speculative) + assert(wrapper.index === task.index) + assert(wrapper.attempt === task.attemptNumber) + assert(wrapper.launchTime === task.launchTime) + assert(wrapper.executorId === task.executorId) + assert(wrapper.host === task.host) + assert(wrapper.status === task.status) + assert(wrapper.taskLocality === task.taskLocality.toString()) + assert(wrapper.speculative === task.speculative) } } - // Send executor metrics update. Only update one metric to avoid a lot of boilerplate code. - s1Tasks.foreach { task => - val accum = new AccumulableInfo(1L, Some(InternalAccumulator.MEMORY_BYTES_SPILLED), - Some(1L), None, true, false, None) - listener.onExecutorMetricsUpdate(SparkListenerExecutorMetricsUpdate( - task.executorId, - Seq((task.taskId, stages.head.stageId, stages.head.attemptId, Seq(accum))))) - } + // Send two executor metrics update. Only update one metric to avoid a lot of boilerplate code. + // The tasks are distributed among the two executors, so the executor-level metrics should + // hold half of the cummulative value of the metric being updated. 
+ Seq(1L, 2L).foreach { value => + s1Tasks.foreach { task => + val accum = new AccumulableInfo(1L, Some(InternalAccumulator.MEMORY_BYTES_SPILLED), + Some(value), None, true, false, None) + listener.onExecutorMetricsUpdate(SparkListenerExecutorMetricsUpdate( + task.executorId, + Seq((task.taskId, stages.head.stageId, stages.head.attemptNumber, Seq(accum))))) + } - check[StageDataWrapper](key(stages.head)) { stage => - assert(stage.info.memoryBytesSpilled === s1Tasks.size) - } + check[StageDataWrapper](key(stages.head)) { stage => + assert(stage.info.memoryBytesSpilled === s1Tasks.size * value) + } - val execs = store.view(classOf[ExecutorStageSummaryWrapper]).index("stage") - .first(key(stages.head)).last(key(stages.head)).asScala.toSeq - assert(execs.size > 0) - execs.foreach { exec => - assert(exec.info.memoryBytesSpilled === s1Tasks.size / 2) + val execs = store.view(classOf[ExecutorStageSummaryWrapper]).index("stage") + .first(key(stages.head)).last(key(stages.head)).asScala.toSeq + assert(execs.size > 0) + execs.foreach { exec => + assert(exec.info.memoryBytesSpilled === s1Tasks.size * value / 2) + } } // Fail one of the tasks, re-start it. time += 1 s1Tasks.head.markFinished(TaskState.FAILED, time) - listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptId, + listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptNumber, "taskType", TaskResultLost, s1Tasks.head, null)) time += 1 val reattempt = newAttempt(s1Tasks.head, nextTaskId()) - listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptId, + listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptNumber, reattempt)) assert(store.count(classOf[TaskDataWrapper]) === s1Tasks.size + 1) @@ -275,13 +275,13 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } check[TaskDataWrapper](s1Tasks.head.taskId) { task => - assert(task.info.status === s1Tasks.head.status) - assert(task.info.errorMessage == Some(TaskResultLost.toErrorString)) + assert(task.status === s1Tasks.head.status) + assert(task.errorMessage == Some(TaskResultLost.toErrorString)) } check[TaskDataWrapper](reattempt.taskId) { task => - assert(task.info.index === s1Tasks.head.index) - assert(task.info.attempt === reattempt.attemptNumber) + assert(task.index === s1Tasks.head.index) + assert(task.attempt === reattempt.attemptNumber) } // Kill one task, restart it. @@ -289,7 +289,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { val killed = s1Tasks.drop(1).head killed.finishTime = time killed.failed = true - listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptId, + listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptNumber, "taskType", TaskKilled("killed"), killed, null)) check[JobDataWrapper](1) { job => @@ -303,21 +303,21 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } check[TaskDataWrapper](killed.taskId) { task => - assert(task.info.index === killed.index) - assert(task.info.errorMessage === Some("killed")) + assert(task.index === killed.index) + assert(task.errorMessage === Some("killed")) } // Start a new attempt and finish it with TaskCommitDenied, make sure it's handled like a kill. 
time += 1 val denied = newAttempt(killed, nextTaskId()) val denyReason = TaskCommitDenied(1, 1, 1) - listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptId, + listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptNumber, denied)) time += 1 denied.finishTime = time denied.failed = true - listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptId, + listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptNumber, "taskType", denyReason, denied, null)) check[JobDataWrapper](1) { job => @@ -331,13 +331,13 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } check[TaskDataWrapper](denied.taskId) { task => - assert(task.info.index === killed.index) - assert(task.info.errorMessage === Some(denyReason.toErrorString)) + assert(task.index === killed.index) + assert(task.errorMessage === Some(denyReason.toErrorString)) } // Start a new attempt. val reattempt2 = newAttempt(denied, nextTaskId()) - listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptId, + listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptNumber, reattempt2)) // Succeed all tasks in stage 1. @@ -350,7 +350,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { time += 1 pending.foreach { task => task.markFinished(TaskState.FINISHED, time) - listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptId, + listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptNumber, "taskType", Success, task, s1Metrics)) } @@ -370,10 +370,10 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { pending.foreach { task => check[TaskDataWrapper](task.taskId) { wrapper => - assert(wrapper.info.errorMessage === None) - assert(wrapper.info.taskMetrics.get.executorCpuTime === 2L) - assert(wrapper.info.taskMetrics.get.executorRunTime === 4L) - assert(wrapper.info.duration === Some(task.duration)) + assert(wrapper.errorMessage === None) + assert(wrapper.executorCpuTime === 2L) + assert(wrapper.executorRunTime === 4L) + assert(wrapper.duration === task.duration) } } @@ -414,13 +414,15 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { time += 1 val s2Tasks = createTasks(4, execIds) s2Tasks.foreach { task => - listener.onTaskStart(SparkListenerTaskStart(stages.last.stageId, stages.last.attemptId, task)) + listener.onTaskStart(SparkListenerTaskStart(stages.last.stageId, + stages.last.attemptNumber, + task)) } time += 1 s2Tasks.foreach { task => task.markFinished(TaskState.FAILED, time) - listener.onTaskEnd(SparkListenerTaskEnd(stages.last.stageId, stages.last.attemptId, + listener.onTaskEnd(SparkListenerTaskEnd(stages.last.stageId, stages.last.attemptNumber, "taskType", TaskResultLost, task, null)) } @@ -455,7 +457,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // - Re-submit stage 2, all tasks, and succeed them and the stage. 
val oldS2 = stages.last - val newS2 = new StageInfo(oldS2.stageId, oldS2.attemptId + 1, oldS2.name, oldS2.numTasks, + val newS2 = new StageInfo(oldS2.stageId, oldS2.attemptNumber + 1, oldS2.name, oldS2.numTasks, oldS2.rddInfos, oldS2.parentIds, oldS2.details, oldS2.taskMetrics) time += 1 @@ -466,14 +468,14 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { val newS2Tasks = createTasks(4, execIds) newS2Tasks.foreach { task => - listener.onTaskStart(SparkListenerTaskStart(newS2.stageId, newS2.attemptId, task)) + listener.onTaskStart(SparkListenerTaskStart(newS2.stageId, newS2.attemptNumber, task)) } time += 1 newS2Tasks.foreach { task => task.markFinished(TaskState.FINISHED, time) - listener.onTaskEnd(SparkListenerTaskEnd(newS2.stageId, newS2.attemptId, "taskType", Success, - task, null)) + listener.onTaskEnd(SparkListenerTaskEnd(newS2.stageId, newS2.attemptNumber, "taskType", + Success, task, null)) } time += 1 @@ -522,14 +524,15 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { val j2s2Tasks = createTasks(4, execIds) j2s2Tasks.foreach { task => - listener.onTaskStart(SparkListenerTaskStart(j2Stages.last.stageId, j2Stages.last.attemptId, + listener.onTaskStart(SparkListenerTaskStart(j2Stages.last.stageId, + j2Stages.last.attemptNumber, task)) } time += 1 j2s2Tasks.foreach { task => task.markFinished(TaskState.FINISHED, time) - listener.onTaskEnd(SparkListenerTaskEnd(j2Stages.last.stageId, j2Stages.last.attemptId, + listener.onTaskEnd(SparkListenerTaskEnd(j2Stages.last.stageId, j2Stages.last.attemptNumber, "taskType", Success, task, null)) } @@ -888,6 +891,23 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { assert(store.count(classOf[StageDataWrapper]) === 3) assert(store.count(classOf[RDDOperationGraphWrapper]) === 3) + val dropped = stages.drop(1).head + + // Cache some quantiles by calling AppStatusStore.taskSummary(). For quantiles to be + // calculcated, we need at least one finished task. + time += 1 + val task = createTasks(1, Array("1")).head + listener.onTaskStart(SparkListenerTaskStart(dropped.stageId, dropped.attemptId, task)) + + time += 1 + task.markFinished(TaskState.FINISHED, time) + listener.onTaskEnd(SparkListenerTaskEnd(dropped.stageId, dropped.attemptId, + "taskType", Success, task, null)) + + new AppStatusStore(store) + .taskSummary(dropped.stageId, dropped.attemptId, Array(0.25d, 0.50d, 0.75d)) + assert(store.count(classOf[CachedQuantile], "stage", key(dropped)) === 3) + stages.drop(1).foreach { s => time += 1 s.completionTime = Some(time) @@ -899,6 +919,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { intercept[NoSuchElementException] { store.read(classOf[StageDataWrapper], Array(2, 0)) } + assert(store.count(classOf[CachedQuantile], "stage", key(dropped)) === 0) val attempt2 = new StageInfo(3, 1, "stage3", 4, Nil, Nil, "details3") time += 1 @@ -919,13 +940,13 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { time += 1 val tasks = createTasks(2, Array("1")) tasks.foreach { task => - listener.onTaskStart(SparkListenerTaskStart(attempt2.stageId, attempt2.attemptId, task)) + listener.onTaskStart(SparkListenerTaskStart(attempt2.stageId, attempt2.attemptNumber, task)) } assert(store.count(classOf[TaskDataWrapper]) === 2) // Start a 3rd task. The finished tasks should be deleted. 
createTasks(1, Array("1")).foreach { task => - listener.onTaskStart(SparkListenerTaskStart(attempt2.stageId, attempt2.attemptId, task)) + listener.onTaskStart(SparkListenerTaskStart(attempt2.stageId, attempt2.attemptNumber, task)) } assert(store.count(classOf[TaskDataWrapper]) === 2) intercept[NoSuchElementException] { @@ -934,7 +955,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // Start a 4th task. The first task should be deleted, even if it's still running. createTasks(1, Array("1")).foreach { task => - listener.onTaskStart(SparkListenerTaskStart(attempt2.stageId, attempt2.attemptId, task)) + listener.onTaskStart(SparkListenerTaskStart(attempt2.stageId, attempt2.attemptNumber, task)) } assert(store.count(classOf[TaskDataWrapper]) === 2) intercept[NoSuchElementException] { @@ -960,7 +981,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } } - private def key(stage: StageInfo): Array[Int] = Array(stage.stageId, stage.attemptId) + private def key(stage: StageInfo): Array[Int] = Array(stage.stageId, stage.attemptNumber) private def check[T: ClassTag](key: Any)(fn: T => Unit): Unit = { val value = store.read(classTag[T].runtimeClass, key).asInstanceOf[T] diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala new file mode 100644 index 000000000000..92f90f3d96dd --- /dev/null +++ b/core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.status + +import org.apache.spark.SparkFunSuite +import org.apache.spark.status.api.v1.TaskMetricDistributions +import org.apache.spark.util.Distribution +import org.apache.spark.util.kvstore._ + +class AppStatusStoreSuite extends SparkFunSuite { + + private val uiQuantiles = Array(0.0, 0.25, 0.5, 0.75, 1.0) + private val stageId = 1 + private val attemptId = 1 + + test("quantile calculation: 1 task") { + compareQuantiles(1, uiQuantiles) + } + + test("quantile calculation: few tasks") { + compareQuantiles(4, uiQuantiles) + } + + test("quantile calculation: more tasks") { + compareQuantiles(100, uiQuantiles) + } + + test("quantile calculation: lots of tasks") { + compareQuantiles(4096, uiQuantiles) + } + + test("quantile calculation: custom quantiles") { + compareQuantiles(4096, Array(0.01, 0.33, 0.5, 0.42, 0.69, 0.99)) + } + + test("quantile cache") { + val store = new InMemoryStore() + (0 until 4096).foreach { i => store.write(newTaskData(i)) } + + val appStore = new AppStatusStore(store) + + appStore.taskSummary(stageId, attemptId, Array(0.13d)) + intercept[NoSuchElementException] { + store.read(classOf[CachedQuantile], Array(stageId, attemptId, "13")) + } + + appStore.taskSummary(stageId, attemptId, Array(0.25d)) + val d1 = store.read(classOf[CachedQuantile], Array(stageId, attemptId, "25")) + + // Add a new task to force the cached quantile to be evicted, and make sure it's updated. + store.write(newTaskData(4096)) + appStore.taskSummary(stageId, attemptId, Array(0.25d, 0.50d, 0.73d)) + + val d2 = store.read(classOf[CachedQuantile], Array(stageId, attemptId, "25")) + assert(d1.taskCount != d2.taskCount) + + store.read(classOf[CachedQuantile], Array(stageId, attemptId, "50")) + intercept[NoSuchElementException] { + store.read(classOf[CachedQuantile], Array(stageId, attemptId, "73")) + } + + assert(store.count(classOf[CachedQuantile]) === 2) + } + + private def compareQuantiles(count: Int, quantiles: Array[Double]): Unit = { + val store = new InMemoryStore() + val values = (0 until count).map { i => + val task = newTaskData(i) + store.write(task) + i.toDouble + }.toArray + + val summary = new AppStatusStore(store).taskSummary(stageId, attemptId, quantiles).get + val dist = new Distribution(values, 0, values.length).getQuantiles(quantiles.sorted) + + dist.zip(summary.executorRunTime).foreach { case (expected, actual) => + assert(expected === actual) + } + } + + private def newTaskData(i: Int): TaskDataWrapper = { + new TaskDataWrapper( + i, i, i, i, i, i, i.toString, i.toString, i.toString, i.toString, false, Nil, None, + i, i, i, i, i, i, i, i, i, i, + i, i, i, i, i, i, i, i, i, i, + i, i, i, i, stageId, attemptId) + } + +} diff --git a/core/src/test/scala/org/apache/spark/storage/BlockInfoManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockInfoManagerSuite.scala index 917db766f7f1..9c0699bc981f 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockInfoManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockInfoManagerSuite.scala @@ -62,7 +62,7 @@ class BlockInfoManagerSuite extends SparkFunSuite with BeforeAndAfterEach { private def withTaskId[T](taskAttemptId: Long)(block: => T): T = { try { TaskContext.setTaskContext( - new TaskContextImpl(0, 0, taskAttemptId, 0, null, new Properties, null)) + new TaskContextImpl(0, 0, 0, taskAttemptId, 0, null, new Properties, null)) block } finally { TaskContext.unset() diff --git a/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala 
b/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala index 661d0d48d2f3..0aeddf730cd3 100644 --- a/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala @@ -28,6 +28,7 @@ import org.apache.spark._ import org.apache.spark.executor.TaskMetrics import org.apache.spark.scheduler._ import org.apache.spark.status.AppStatusStore +import org.apache.spark.status.config._ import org.apache.spark.ui.jobs.{StagePage, StagesTab} class StagePageSuite extends SparkFunSuite with LocalSparkContext { @@ -35,15 +36,13 @@ class StagePageSuite extends SparkFunSuite with LocalSparkContext { private val peakExecutionMemory = 10 test("peak execution memory should displayed") { - val conf = new SparkConf(false) - val html = renderStagePage(conf).toString().toLowerCase(Locale.ROOT) + val html = renderStagePage().toString().toLowerCase(Locale.ROOT) val targetString = "peak execution memory" assert(html.contains(targetString)) } test("SPARK-10543: peak execution memory should be per-task rather than cumulative") { - val conf = new SparkConf(false) - val html = renderStagePage(conf).toString().toLowerCase(Locale.ROOT) + val html = renderStagePage().toString().toLowerCase(Locale.ROOT) // verify min/25/50/75/max show task value not cumulative values assert(html.contains(s"$peakExecutionMemory.0 b" * 5)) } @@ -52,7 +51,8 @@ class StagePageSuite extends SparkFunSuite with LocalSparkContext { * Render a stage page started with the given conf and return the HTML. * This also runs a dummy stage to populate the page with useful content. */ - private def renderStagePage(conf: SparkConf): Seq[Node] = { + private def renderStagePage(): Seq[Node] = { + val conf = new SparkConf(false).set(LIVE_ENTITY_UPDATE_PERIOD, 0L) val statusStore = AppStatusStore.createLiveStore(conf) val listener = statusStore.listener.get diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index c71137468054..a3579f21fc53 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -92,9 +92,9 @@ MVN="build/mvn --force" # Hive-specific profiles for some builds HIVE_PROFILES="-Phive -Phive-thriftserver" # Profiles for publishing snapshots and release to Maven Central -PUBLISH_PROFILES="-Pmesos -Pyarn -Pflume $HIVE_PROFILES -Pspark-ganglia-lgpl -Pkinesis-asl" +PUBLISH_PROFILES="-Pmesos -Pyarn -Pkubernetes -Pflume $HIVE_PROFILES -Pspark-ganglia-lgpl -Pkinesis-asl" # Profiles for building binary releases -BASE_RELEASE_PROFILES="-Pmesos -Pyarn -Pflume -Psparkr" +BASE_RELEASE_PROFILES="-Pmesos -Pyarn -Pkubernetes -Pflume -Psparkr" # Scala 2.11 only profiles for some builds SCALA_2_11_PROFILES="-Pkafka-0-8" # Scala 2.12 only profiles for some builds diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py index 730138195e5f..32f6cbb29f0b 100755 --- a/dev/create-release/releaseutils.py +++ b/dev/create-release/releaseutils.py @@ -185,6 +185,8 @@ def get_commits(tag): "graphx": "GraphX", "input/output": CORE_COMPONENT, "java api": "Java API", + "k8s": "Kubernetes", + "kubernetes": "Kubernetes", "mesos": "Mesos", "ml": "MLlib", "mllib": "MLlib", diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index a7fce2ede0ea..3b2a88537ca5 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -17,6 +17,7 @@ arpack_combined_all-0.1.jar arrow-format-0.8.0.jar arrow-memory-0.8.0.jar arrow-vector-0.8.0.jar +automaton-1.11-8.jar avro-1.7.7.jar 
avro-ipc-1.7.7.jar avro-mapred-1.7.7-hadoop2.jar @@ -60,6 +61,7 @@ datanucleus-rdbms-3.2.9.jar derby-10.12.1.1.jar eigenbase-properties-1.1.5.jar flatbuffers-1.2.0-3f79e055.jar +generex-1.0.1.jar gson-2.2.4.jar guava-14.0.1.jar guice-3.0.jar @@ -91,8 +93,10 @@ jackson-annotations-2.6.7.jar jackson-core-2.6.7.jar jackson-core-asl-1.9.13.jar jackson-databind-2.6.7.1.jar +jackson-dataformat-yaml-2.6.7.jar jackson-jaxrs-1.9.13.jar jackson-mapper-asl-1.9.13.jar +jackson-module-jaxb-annotations-2.6.7.jar jackson-module-paranamer-2.7.9.jar jackson-module-scala_2.11-2.6.7.1.jar jackson-xc-1.9.13.jar @@ -130,10 +134,13 @@ jta-1.1.jar jtransforms-2.4.0.jar jul-to-slf4j-1.7.16.jar kryo-shaded-3.0.3.jar +kubernetes-client-3.0.0.jar +kubernetes-model-2.0.0.jar leveldbjni-all-1.8.jar libfb303-0.9.3.jar libthrift-0.9.3.jar log4j-1.2.17.jar +logging-interceptor-3.8.1.jar lz4-java-1.4.0.jar machinist_2.11-0.6.1.jar macro-compat_2.11-1.1.1.jar @@ -146,6 +153,8 @@ minlog-1.3.0.jar netty-3.9.9.Final.jar netty-all-4.1.17.Final.jar objenesis-2.1.jar +okhttp-3.8.1.jar +okio-1.13.0.jar opencsv-2.3.jar orc-core-1.4.1-nohive.jar orc-mapreduce-1.4.1-nohive.jar @@ -171,6 +180,7 @@ scalap-2.11.8.jar shapeless_2.11-2.3.2.jar slf4j-api-1.7.16.jar slf4j-log4j12-1.7.16.jar +snakeyaml-1.15.jar snappy-0.2.jar snappy-java-1.1.2.6.jar spire-macros_2.11-0.13.0.jar @@ -186,5 +196,6 @@ xbean-asm5-shaded-4.4.jar xercesImpl-2.9.1.jar xmlenc-0.52.jar xz-1.0.jar +zjsonpatch-0.3.0.jar zookeeper-3.4.6.jar zstd-jni-1.3.2-2.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 94b2e98d85e7..757da00788e5 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -17,6 +17,7 @@ arpack_combined_all-0.1.jar arrow-format-0.8.0.jar arrow-memory-0.8.0.jar arrow-vector-0.8.0.jar +automaton-1.11-8.jar avro-1.7.7.jar avro-ipc-1.7.7.jar avro-mapred-1.7.7-hadoop2.jar @@ -60,6 +61,7 @@ datanucleus-rdbms-3.2.9.jar derby-10.12.1.1.jar eigenbase-properties-1.1.5.jar flatbuffers-1.2.0-3f79e055.jar +generex-1.0.1.jar gson-2.2.4.jar guava-14.0.1.jar guice-3.0.jar @@ -91,8 +93,10 @@ jackson-annotations-2.6.7.jar jackson-core-2.6.7.jar jackson-core-asl-1.9.13.jar jackson-databind-2.6.7.1.jar +jackson-dataformat-yaml-2.6.7.jar jackson-jaxrs-1.9.13.jar jackson-mapper-asl-1.9.13.jar +jackson-module-jaxb-annotations-2.6.7.jar jackson-module-paranamer-2.7.9.jar jackson-module-scala_2.11-2.6.7.1.jar jackson-xc-1.9.13.jar @@ -131,10 +135,13 @@ jta-1.1.jar jtransforms-2.4.0.jar jul-to-slf4j-1.7.16.jar kryo-shaded-3.0.3.jar +kubernetes-client-3.0.0.jar +kubernetes-model-2.0.0.jar leveldbjni-all-1.8.jar libfb303-0.9.3.jar libthrift-0.9.3.jar log4j-1.2.17.jar +logging-interceptor-3.8.1.jar lz4-java-1.4.0.jar machinist_2.11-0.6.1.jar macro-compat_2.11-1.1.1.jar @@ -147,6 +154,8 @@ minlog-1.3.0.jar netty-3.9.9.Final.jar netty-all-4.1.17.Final.jar objenesis-2.1.jar +okhttp-3.8.1.jar +okio-1.13.0.jar opencsv-2.3.jar orc-core-1.4.1-nohive.jar orc-mapreduce-1.4.1-nohive.jar @@ -172,6 +181,7 @@ scalap-2.11.8.jar shapeless_2.11-2.3.2.jar slf4j-api-1.7.16.jar slf4j-log4j12-1.7.16.jar +snakeyaml-1.15.jar snappy-0.2.jar snappy-java-1.1.2.6.jar spire-macros_2.11-0.13.0.jar @@ -187,5 +197,6 @@ xbean-asm5-shaded-4.4.jar xercesImpl-2.9.1.jar xmlenc-0.52.jar xz-1.0.jar +zjsonpatch-0.3.0.jar zookeeper-3.4.6.jar zstd-jni-1.3.2-2.jar diff --git a/dev/lint-java b/dev/lint-java index c2e80538ef2a..1f0b0c8379ed 100755 --- a/dev/lint-java +++ b/dev/lint-java @@ -20,7 +20,7 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" 
SPARK_ROOT_DIR="$(dirname $SCRIPT_DIR)" -ERRORS=$($SCRIPT_DIR/../build/mvn -Pkinesis-asl -Pmesos -Pyarn -Phive -Phive-thriftserver checkstyle:check | grep ERROR) +ERRORS=$($SCRIPT_DIR/../build/mvn -Pkinesis-asl -Pmesos -Pkubernetes -Pyarn -Phive -Phive-thriftserver checkstyle:check | grep ERROR) if test ! -z "$ERRORS"; then echo -e "Checkstyle checks failed at following occurrences:\n$ERRORS" diff --git a/dev/mima b/dev/mima index 1e3ca9700bc0..cd2694ff4d3d 100755 --- a/dev/mima +++ b/dev/mima @@ -24,7 +24,7 @@ set -e FWDIR="$(cd "`dirname "$0"`"/..; pwd)" cd "$FWDIR" -SPARK_PROFILES="-Pmesos -Pkafka-0-8 -Pyarn -Pflume -Pspark-ganglia-lgpl -Pkinesis-asl -Phive-thriftserver -Phive" +SPARK_PROFILES="-Pmesos -Pkafka-0-8 -Pkubernetes -Pyarn -Pflume -Pspark-ganglia-lgpl -Pkinesis-asl -Phive-thriftserver -Phive" TOOLS_CLASSPATH="$(build/sbt -DcopyDependencies=false "export tools/fullClasspath" | tail -n1)" OLD_DEPS_CLASSPATH="$(build/sbt -DcopyDependencies=false $SPARK_PROFILES "export oldDeps/fullClasspath" | tail -n1)" diff --git a/dev/scalastyle b/dev/scalastyle index 89ecc8abd6f8..b8053df05fa2 100755 --- a/dev/scalastyle +++ b/dev/scalastyle @@ -24,6 +24,7 @@ ERRORS=$(echo -e "q\n" \ -Pkinesis-asl \ -Pmesos \ -Pkafka-0-8 \ + -Pkubernetes \ -Pyarn \ -Pflume \ -Phive \ diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index f834563da9dd..b900f0bd913c 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -400,6 +400,7 @@ def __hash__(self): "pyspark.sql.functions", "pyspark.sql.readwriter", "pyspark.sql.streaming", + "pyspark.sql.udf", "pyspark.sql.window", "pyspark.sql.tests", ] @@ -539,7 +540,7 @@ def __hash__(self): kubernetes = Module( name="kubernetes", dependencies=[], - source_file_regexes=["resource-managers/kubernetes/core"], + source_file_regexes=["resource-managers/kubernetes"], build_profile_flags=["-Pkubernetes"], sbt_test_goals=["kubernetes/test"] ) diff --git a/dev/test-dependencies.sh b/dev/test-dependencies.sh index 58b295d4f6e0..3bf7618e1ea9 100755 --- a/dev/test-dependencies.sh +++ b/dev/test-dependencies.sh @@ -29,7 +29,7 @@ export LC_ALL=C # TODO: This would be much nicer to do in SBT, once SBT supports Maven-style resolution. # NOTE: These should match those in the release publishing script -HADOOP2_MODULE_PROFILES="-Phive-thriftserver -Pmesos -Pkafka-0-8 -Pyarn -Pflume -Phive" +HADOOP2_MODULE_PROFILES="-Phive-thriftserver -Pmesos -Pkafka-0-8 -Pkubernetes -Pyarn -Pflume -Phive" MVN="build/mvn" HADOOP_PROFILES=( hadoop-2.6 diff --git a/docs/_config.yml b/docs/_config.yml index dcc211204d76..213579037f51 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. -SPARK_VERSION: 2.3.0-SNAPSHOT -SPARK_VERSION_SHORT: 2.3.0 +SPARK_VERSION: 2.3.1-SNAPSHOT +SPARK_VERSION_SHORT: 2.3.1 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/docs/cluster-overview.md b/docs/cluster-overview.md index 658e67f99dd7..7277e2fb2731 100644 --- a/docs/cluster-overview.md +++ b/docs/cluster-overview.md @@ -52,8 +52,8 @@ The system currently supports three cluster managers: * [Apache Mesos](running-on-mesos.html) -- a general cluster manager that can also run Hadoop MapReduce and service applications. * [Hadoop YARN](running-on-yarn.html) -- the resource manager in Hadoop 2. 
-* [Kubernetes](running-on-kubernetes.html) -- [Kubernetes](https://kubernetes.io/docs/concepts/overview/what-is-kubernetes/) -is an open-source platform that provides container-centric infrastructure. +* [Kubernetes](running-on-kubernetes.html) -- an open-source system for automating deployment, scaling, + and management of containerized applications. A third-party project (not supported by the Spark project) exists to add support for [Nomad](https://github.com/hashicorp/nomad-spark) as a cluster manager. diff --git a/docs/configuration.md b/docs/configuration.md index 1189aea2aa71..eecb39dcafc9 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -58,6 +58,10 @@ The following format is accepted: 1t or 1tb (tebibytes = 1024 gibibytes) 1p or 1pb (pebibytes = 1024 tebibytes) +While numbers without units are generally interpreted as bytes, a few are interpreted as KiB or MiB. +See documentation of individual configuration properties. Specifying units is desirable where +possible. + ## Dynamically Loading Spark Properties In some cases, you may want to avoid hard-coding certain configurations in a `SparkConf`. For @@ -136,9 +140,9 @@ of the most common options to set are: spark.driver.maxResultSize 1g - Limit of total size of serialized results of all partitions for each Spark action (e.g. collect). - Should be at least 1M, or 0 for unlimited. Jobs will be aborted if the total size - is above this limit. + Limit of total size of serialized results of all partitions for each Spark action (e.g. + collect) in bytes. Should be at least 1M, or 0 for unlimited. Jobs will be aborted if the total + size is above this limit. Having a high limit may cause out-of-memory errors in driver (depends on spark.driver.memory and memory overhead of objects in JVM). Setting a proper limit can protect the driver from out-of-memory errors. @@ -148,10 +152,10 @@ of the most common options to set are: spark.driver.memory 1g - Amount of memory to use for the driver process, i.e. where SparkContext is initialized. - (e.g. 1g, 2g). - -
    Note: In client mode, this config must not be set through the SparkConf + Amount of memory to use for the driver process, i.e. where SparkContext is initialized, in MiB + unless otherwise specified (e.g. 1g, 2g). +
    + Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the --driver-memory command line option or in your default properties file. @@ -161,27 +165,28 @@ of the most common options to set are: spark.driver.memoryOverhead driverMemory * 0.10, with minimum of 384 - The amount of off-heap memory (in megabytes) to be allocated per driver in cluster mode. This is - memory that accounts for things like VM overheads, interned strings, other native overheads, etc. - This tends to grow with the container size (typically 6-10%). This option is currently supported - on YARN and Kubernetes. + The amount of off-heap memory to be allocated per driver in cluster mode, in MiB unless + otherwise specified. This is memory that accounts for things like VM overheads, interned strings, + other native overheads, etc. This tends to grow with the container size (typically 6-10%). + This option is currently supported on YARN and Kubernetes. spark.executor.memory 1g - Amount of memory to use per executor process (e.g. 2g, 8g). + Amount of memory to use per executor process, in MiB unless otherwise specified. + (e.g. 2g, 8g). spark.executor.memoryOverhead executorMemory * 0.10, with minimum of 384 - The amount of off-heap memory (in megabytes) to be allocated per executor. This is memory that - accounts for things like VM overheads, interned strings, other native overheads, etc. This tends - to grow with the executor size (typically 6-10%). This option is currently supported on YARN and - Kubernetes. + The amount of off-heap memory to be allocated per executor, in MiB unless otherwise specified. + This is memory that accounts for things like VM overheads, interned strings, other native + overheads, etc. This tends to grow with the executor size (typically 6-10%). + This option is currently supported on YARN and Kubernetes. @@ -431,8 +436,9 @@ Apart from these, the following properties are also available, and may be useful 512m Amount of memory to use per python worker process during aggregation, in the same - format as JVM memory strings (e.g. 512m, 2g). If the memory - used during aggregation goes above this amount, it will spill the data into disks. + format as JVM memory strings with a size unit suffix ("k", "m", "g" or "t") + (e.g. 512m, 2g). + If the memory used during aggregation goes above this amount, it will spill the data into disks. @@ -540,9 +546,10 @@ Apart from these, the following properties are also available, and may be useful spark.reducer.maxSizeInFlight 48m - Maximum size of map outputs to fetch simultaneously from each reduce task. Since - each output requires us to create a buffer to receive it, this represents a fixed memory - overhead per reduce task, so keep it small unless you have a large amount of memory. + Maximum size of map outputs to fetch simultaneously from each reduce task, in MiB unless + otherwise specified. Since each output requires us to create a buffer to receive it, this + represents a fixed memory overhead per reduce task, so keep it small unless you have a + large amount of memory. @@ -570,9 +577,9 @@ Apart from these, the following properties are also available, and may be useful spark.maxRemoteBlockSizeFetchToMem Long.MaxValue - The remote block will be fetched to disk when size of the block is above this threshold. + The remote block will be fetched to disk when size of the block is above this threshold in bytes. 
This is to avoid a giant request takes too much memory. We can enable this config by setting - a specific value(e.g. 200m). Note this configuration will affect both shuffle fetch + a specific value(e.g. 200m). Note this configuration will affect both shuffle fetch and block manager remote block fetch. For users who enabled external shuffle service, this feature can only be worked when external shuffle service is newer than Spark 2.2. @@ -589,8 +596,9 @@ Apart from these, the following properties are also available, and may be useful spark.shuffle.file.buffer 32k - Size of the in-memory buffer for each shuffle file output stream. These buffers - reduce the number of disk seeks and system calls made in creating intermediate shuffle files. + Size of the in-memory buffer for each shuffle file output stream, in KiB unless otherwise + specified. These buffers reduce the number of disk seeks and system calls made in creating + intermediate shuffle files. @@ -651,7 +659,7 @@ Apart from these, the following properties are also available, and may be useful spark.shuffle.service.index.cache.size 100m - Cache entries limited to the specified memory footprint. + Cache entries limited to the specified memory footprint in bytes. @@ -685,9 +693,9 @@ Apart from these, the following properties are also available, and may be useful spark.shuffle.accurateBlockThreshold 100 * 1024 * 1024 - When we compress the size of shuffle blocks in HighlyCompressedMapStatus, we will record the - size accurately if it's above this config. This helps to prevent OOM by avoiding - underestimating shuffle block size when fetch shuffle blocks. + Threshold in bytes above which the size of shuffle blocks in HighlyCompressedMapStatus is + accurately recorded. This helps to prevent OOM by avoiding underestimating shuffle + block size when fetch shuffle blocks. @@ -779,7 +787,7 @@ Apart from these, the following properties are also available, and may be useful spark.eventLog.buffer.kb 100k - Buffer size in KB to use when writing to output streams. + Buffer size to use when writing to output streams, in KiB unless otherwise specified. @@ -917,7 +925,7 @@ Apart from these, the following properties are also available, and may be useful spark.io.compression.lz4.blockSize 32k - Block size used in LZ4 compression, in the case when LZ4 compression codec + Block size in bytes used in LZ4 compression, in the case when LZ4 compression codec is used. Lowering this block size will also lower shuffle memory usage when LZ4 is used. @@ -925,7 +933,7 @@ Apart from these, the following properties are also available, and may be useful spark.io.compression.snappy.blockSize 32k - Block size used in Snappy compression, in the case when Snappy compression codec + Block size in bytes used in Snappy compression, in the case when Snappy compression codec is used. Lowering this block size will also lower shuffle memory usage when Snappy is used. @@ -941,7 +949,7 @@ Apart from these, the following properties are also available, and may be useful spark.io.compression.zstd.bufferSize 32k - Buffer size used in Zstd compression, in the case when Zstd compression codec + Buffer size in bytes used in Zstd compression, in the case when Zstd compression codec is used. Lowering this size will lower the shuffle memory usage when Zstd is used, but it might increase the compression cost because of excessive JNI call overhead. 
@@ -1001,8 +1009,8 @@ Apart from these, the following properties are also available, and may be useful spark.kryoserializer.buffer.max 64m - Maximum allowable size of Kryo serialization buffer. This must be larger than any - object you attempt to serialize and must be less than 2048m. + Maximum allowable size of Kryo serialization buffer, in MiB unless otherwise specified. + This must be larger than any object you attempt to serialize and must be less than 2048m. Increase this if you get a "buffer limit exceeded" exception inside Kryo. @@ -1010,9 +1018,9 @@ Apart from these, the following properties are also available, and may be useful spark.kryoserializer.buffer 64k - Initial size of Kryo's serialization buffer. Note that there will be one buffer - per core on each worker. This buffer will grow up to - spark.kryoserializer.buffer.max if needed. + Initial size of Kryo's serialization buffer, in KiB unless otherwise specified. + Note that there will be one buffer per core on each worker. This buffer will grow up to + spark.kryoserializer.buffer.max if needed. @@ -1086,7 +1094,8 @@ Apart from these, the following properties are also available, and may be useful spark.memory.offHeap.enabled false - If true, Spark will attempt to use off-heap memory for certain operations. If off-heap memory use is enabled, then spark.memory.offHeap.size must be positive. + If true, Spark will attempt to use off-heap memory for certain operations. If off-heap memory + use is enabled, then spark.memory.offHeap.size must be positive. @@ -1094,7 +1103,8 @@ Apart from these, the following properties are also available, and may be useful 0 The absolute amount of memory in bytes which can be used for off-heap allocation. - This setting has no impact on heap memory usage, so if your executors' total memory consumption must fit within some hard limit then be sure to shrink your JVM heap size accordingly. + This setting has no impact on heap memory usage, so if your executors' total memory consumption + must fit within some hard limit then be sure to shrink your JVM heap size accordingly. This must be set to a positive value when spark.memory.offHeap.enabled=true. @@ -1202,9 +1212,9 @@ Apart from these, the following properties are also available, and may be useful spark.broadcast.blockSize 4m - Size of each piece of a block for TorrentBroadcastFactory. - Too large a value decreases parallelism during broadcast (makes it slower); however, if it is - too small, BlockManager might take a performance hit. + Size of each piece of a block for TorrentBroadcastFactory, in KiB unless otherwise + specified. Too large a value decreases parallelism during broadcast (makes it slower); however, + if it is too small, BlockManager might take a performance hit. @@ -1312,7 +1322,7 @@ Apart from these, the following properties are also available, and may be useful spark.storage.memoryMapThreshold 2m - Size of a block above which Spark memory maps when reading a block from disk. + Size in bytes of a block above which Spark memory maps when reading a block from disk. This prevents Spark from memory mapping very small blocks. In general, memory mapping has high overhead for blocks close to or below the page size of the operating system. 
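For example, the size-valued properties described above can be written with an explicit unit suffix so the default unit never matters. A minimal Scala sketch follows; the property values are illustrative only, and in practice these are usually passed via `--conf` or `spark-defaults.conf` rather than programmatically:

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical values. An explicit suffix ("k", "m", "g", "t") avoids depending
// on each property's default unit (MiB, KiB or bytes).
val spark = SparkSession.builder()
  .appName("size-config-sketch")
  .config("spark.executor.memory", "4g")              // MiB if no suffix is given
  .config("spark.executor.memoryOverhead", "512m")    // MiB if no suffix is given
  .config("spark.kryoserializer.buffer.max", "128m")  // MiB if no suffix is given
  .config("spark.shuffle.file.buffer", "64k")         // KiB if no suffix is given
  .getOrCreate()
```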
@@ -2490,4 +2500,4 @@ Also, you can modify or add configurations at runtime: --conf "spark.executor.extraJavaOptions=-XX:+PrintGCDetails -XX:+PrintGCTimeStamps" \ --conf spark.hadoop.abc.def=xyz \ myApp.jar -{% endhighlight %} \ No newline at end of file +{% endhighlight %} diff --git a/docs/ml-features.md b/docs/ml-features.md index 72643137d96b..466a8fbe99cf 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -222,9 +222,9 @@ The `FeatureHasher` transformer operates on multiple columns. Each column may co numeric or categorical features. Behavior and handling of column data types is as follows: - Numeric columns: For numeric features, the hash value of the column name is used to map the -feature value to its index in the feature vector. Numeric features are never treated as -categorical, even when they are integers. You must explicitly convert numeric columns containing -categorical features to strings first. +feature value to its index in the feature vector. By default, numeric features are not treated +as categorical (even when they are integers). To treat them as categorical, specify the relevant +columns using the `categoricalCols` parameter. - String columns: For categorical features, the hash value of the string "column_name=value" is used to map to the vector index, with an indicator value of `1.0`. Thus, categorical features are "one-hot" encoded (similarly to using [OneHotEncoder](ml-features.html#onehotencoder) with @@ -775,35 +775,43 @@ for more details on the API.
-## OneHotEncoder +## OneHotEncoder (Deprecated since 2.3.0) -[One-hot encoding](http://en.wikipedia.org/wiki/One-hot) maps a column of label indices to a column of binary vectors, with at most a single one-value. This encoding allows algorithms which expect continuous features, such as Logistic Regression, to use categorical features. +Because this existing `OneHotEncoder` is a stateless transformer, it is not usable on new data where the number of categories may differ from the training data. In order to fix this, a new `OneHotEncoderEstimator` was created that produces a `OneHotEncoderModel` when fitting. For more detail, please see [SPARK-13030](https://issues.apache.org/jira/browse/SPARK-13030). + +`OneHotEncoder` has been deprecated in 2.3.0 and will be removed in 3.0.0. Please use [OneHotEncoderEstimator](ml-features.html#onehotencoderestimator) instead. + +## OneHotEncoderEstimator + +[One-hot encoding](http://en.wikipedia.org/wiki/One-hot) maps a categorical feature, represented as a label index, to a binary vector with at most a single one-value indicating the presence of a specific feature value from among the set of all feature values. This encoding allows algorithms which expect continuous features, such as Logistic Regression, to use categorical features. For string type input data, it is common to encode categorical features using [StringIndexer](ml-features.html#stringindexer) first. + +`OneHotEncoderEstimator` can transform multiple columns, returning a one-hot-encoded output vector column for each input column. It is common to merge these vectors into a single feature vector using [VectorAssembler](ml-features.html#vectorassembler). + +`OneHotEncoderEstimator` supports the `handleInvalid` parameter to choose how to handle invalid input when transforming data. Available options include 'keep' (any invalid inputs are assigned to an extra categorical index) and 'error' (throw an error). **Examples**
-Refer to the [OneHotEncoder Scala docs](api/scala/index.html#org.apache.spark.ml.feature.OneHotEncoder) -for more details on the API. +Refer to the [OneHotEncoderEstimator Scala docs](api/scala/index.html#org.apache.spark.ml.feature.OneHotEncoderEstimator) for more details on the API. -{% include_example scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala %} +{% include_example scala/org/apache/spark/examples/ml/OneHotEncoderEstimatorExample.scala %}
-Refer to the [OneHotEncoder Java docs](api/java/org/apache/spark/ml/feature/OneHotEncoder.html) +Refer to the [OneHotEncoderEstimator Java docs](api/java/org/apache/spark/ml/feature/OneHotEncoderEstimator.html) for more details on the API. -{% include_example java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java %} +{% include_example java/org/apache/spark/examples/ml/JavaOneHotEncoderEstimatorExample.java %}
-Refer to the [OneHotEncoder Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.OneHotEncoder) -for more details on the API. +Refer to the [OneHotEncoderEstimator Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.OneHotEncoderEstimator) for more details on the API. -{% include_example python/ml/onehot_encoder_example.py %} +{% include_example python/ml/onehot_encoder_estimator_example.py %}
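For reference, a minimal Scala sketch of the multi-column usage described above, with `handleInvalid` set; the data and column names mirror the bundled example and are illustrative only:

```scala
import org.apache.spark.ml.feature.OneHotEncoderEstimator
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("ohe-estimator-sketch").getOrCreate()

// Two columns of category indices (as produced by StringIndexer, for example).
val df = spark.createDataFrame(Seq(
  (0.0, 1.0), (1.0, 0.0), (2.0, 1.0), (0.0, 2.0), (0.0, 1.0), (2.0, 0.0)
)).toDF("categoryIndex1", "categoryIndex2")

val encoder = new OneHotEncoderEstimator()
  .setInputCols(Array("categoryIndex1", "categoryIndex2"))
  .setOutputCols(Array("categoryVec1", "categoryVec2"))
  .setHandleInvalid("keep")   // unseen indices at transform time get an extra category

val model = encoder.fit(df)   // fitting records the number of categories per column
model.transform(df).show(false)
```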
diff --git a/docs/running-on-kubernetes.md b/docs/running-on-kubernetes.md index e491329136a3..3c7586e8544b 100644 --- a/docs/running-on-kubernetes.md +++ b/docs/running-on-kubernetes.md @@ -8,6 +8,10 @@ title: Running Spark on Kubernetes Spark can run on clusters managed by [Kubernetes](https://kubernetes.io). This feature makes use of native Kubernetes scheduler that has been added to Spark. +**The Kubernetes scheduler is currently experimental. +In future versions, there may be behavioral changes around configuration, +container images and entrypoints.** + # Prerequisites * A runnable distribution of Spark 2.3 or above. @@ -16,6 +20,9 @@ Kubernetes scheduler that has been added to Spark. you may setup a test cluster on your local machine using [minikube](https://kubernetes.io/docs/getting-started-guides/minikube/). * We recommend using the latest release of minikube with the DNS addon enabled. + * Be aware that the default minikube configuration is not enough for running Spark applications. + We recommend 3 CPUs and 4g of memory to be able to start a simple Spark application with a single + executor. * You must have appropriate permissions to list, create, edit and delete [pods](https://kubernetes.io/docs/user-guide/pods/) in your cluster. You can verify that you can list these resources by running `kubectl auth can-i pods`. @@ -38,11 +45,10 @@ logs and remains in "completed" state in the Kubernetes API until it's eventuall Note that in the completed state, the driver pod does *not* use any computational or memory resources. -The driver and executor pod scheduling is handled by Kubernetes. It will be possible to affect Kubernetes scheduling -decisions for driver and executor pods using advanced primitives like -[node selectors](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#nodeselector) -and [node/pod affinities](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity) -in a future release. +The driver and executor pod scheduling is handled by Kubernetes. It is possible to schedule the +driver and executor pods on a subset of available nodes through a [node selector](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#nodeselector) +using the configuration property for it. It will be possible to use more advanced +scheduling hints like [node/pod affinities](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity) in a future release. # Submitting Applications to Kubernetes @@ -50,20 +56,19 @@ in a future release. Kubernetes requires users to supply images that can be deployed into containers within pods. The images are built to be run in a container runtime environment that Kubernetes supports. Docker is a container runtime environment that is -frequently used with Kubernetes. With Spark 2.3, there are Dockerfiles provided in the runnable distribution that can be customized -and built for your usage. +frequently used with Kubernetes. Spark (starting with version 2.3) ships with a Dockerfile that can be used for this +purpose, or customized to match an individual application's needs. It can be found in the `kubernetes/dockerfiles/` +directory. -You may build these docker images from sources. -There is a script, `sbin/build-push-docker-images.sh` that you can use to build and push -customized Spark distribution images consisting of all the above components. 
+Spark also ships with a `bin/docker-image-tool.sh` script that can be used to build and publish the Docker images to +use with the Kubernetes backend. Example usage is: - ./sbin/build-push-docker-images.sh -r -t my-tag build - ./sbin/build-push-docker-images.sh -r -t my-tag push - -Docker files are under the `kubernetes/dockerfiles/` directory and can be customized further before -building using the supplied script, or manually. +```bash +$ ./bin/docker-image-tool.sh -r -t my-tag build +$ ./bin/docker-image-tool.sh -r -t my-tag push +``` ## Cluster Mode @@ -76,8 +81,7 @@ $ bin/spark-submit \ --name spark-pi \ --class org.apache.spark.examples.SparkPi \ --conf spark.executor.instances=5 \ - --conf spark.kubernetes.driver.container.image= \ - --conf spark.kubernetes.executor.container.image= \ + --conf spark.kubernetes.container.image= \ local:///path/to/examples.jar ``` @@ -95,7 +99,7 @@ must consist of lower case alphanumeric characters, `-`, and `.` and must start If you have a Kubernetes cluster setup, one way to discover the apiserver URL is by executing `kubectl cluster-info`. ```bash -kubectl cluster-info +$ kubectl cluster-info Kubernetes master is running at http://127.0.0.1:6443 ``` @@ -106,7 +110,7 @@ authenticating proxy, `kubectl proxy` to communicate to the Kubernetes API. The local proxy can be started by: ```bash -kubectl proxy +$ kubectl proxy ``` If the local proxy is running at localhost:8001, `--master k8s://http://127.0.0.1:8001` can be used as the argument to @@ -118,18 +122,15 @@ This URI is the location of the example jar that is already in the Docker image. If your application's dependencies are all hosted in remote locations like HDFS or HTTP servers, they may be referred to by their appropriate remote URIs. Also, application dependencies can be pre-mounted into custom-built Docker images. Those dependencies can be added to the classpath by referencing them with `local://` URIs and/or setting the -`SPARK_EXTRA_CLASSPATH` environment variable in your Dockerfiles. +`SPARK_EXTRA_CLASSPATH` environment variable in your Dockerfiles. The `local://` scheme is also required when referring to +dependencies in custom-built Docker images in `spark-submit`. Note that using application dependencies from the submission +client's local file system is currently not yet supported. + ### Using Remote Dependencies When there are application dependencies hosted in remote locations like HDFS or HTTP servers, the driver and executor pods need a Kubernetes [init-container](https://kubernetes.io/docs/concepts/workloads/pods/init-containers/) for downloading -the dependencies so the driver and executor containers can use them locally. This requires users to specify the container -image for the init-container using the configuration property `spark.kubernetes.initContainer.image`. For example, users -simply add the following option to the `spark-submit` command to specify the init-container image: - -``` ---conf spark.kubernetes.initContainer.image= -``` +the dependencies so the driver and executor containers can use them locally. The init-container handles remote dependencies specified in `spark.jars` (or the `--jars` option of `spark-submit`) and `spark.files` (or the `--files` option of `spark-submit`). 
It also handles remotely hosted main application resources, e.g., @@ -144,9 +145,7 @@ $ bin/spark-submit \ --jars https://path/to/dependency1.jar,https://path/to/dependency2.jar --files hdfs://host:port/path/to/file1,hdfs://host:port/path/to/file2 --conf spark.executor.instances=5 \ - --conf spark.kubernetes.driver.container.image= \ - --conf spark.kubernetes.executor.container.image= \ - --conf spark.kubernetes.initContainer.image= + --conf spark.kubernetes.container.image= \ https://path/to/examples.jar ``` @@ -179,7 +178,7 @@ Logs can be accessed using the Kubernetes API and the `kubectl` CLI. When a Spar to stream logs from the application using: ```bash -kubectl -n= logs -f +$ kubectl -n= logs -f ``` The same logs can also be accessed through the @@ -192,12 +191,12 @@ The UI associated with any application can be accessed locally using [`kubectl port-forward`](https://kubernetes.io/docs/tasks/access-application-cluster/port-forward-access-application-cluster/#forward-a-local-port-to-a-port-on-the-pod). ```bash -kubectl port-forward 4040:4040 +$ kubectl port-forward 4040:4040 ``` Then, the Spark driver UI can be accessed on `http://localhost:4040`. -### Debugging +### Debugging There may be several kinds of failures. If the Kubernetes API server rejects the request made from spark-submit, or the connection is refused for a different reason, the submission logic should indicate the error encountered. However, if there @@ -206,17 +205,17 @@ are errors during the running of the application, often, the best way to investi To get some basic information about the scheduling decisions made around the driver pod, you can run: ```bash -kubectl describe pod +$ kubectl describe pod ``` If the pod has encountered a runtime error, the status can be probed further using: ```bash -kubectl logs +$ kubectl logs ``` -Status and logs of failed executor pods can be checked in similar ways. Finally, deleting the driver pod will clean up the entire spark -application, includling all executors, associated service, etc. The driver pod can be thought of as the Kubernetes representation of +Status and logs of failed executor pods can be checked in similar ways. Finally, deleting the driver pod will clean up the entire spark +application, including all executors, associated service, etc. The driver pod can be thought of as the Kubernetes representation of the Spark application. ## Kubernetes Features @@ -260,7 +259,7 @@ To create a custom service account, a user can use the `kubectl create serviceac following command creates a service account named `spark`: ```bash -kubectl create serviceaccount spark +$ kubectl create serviceaccount spark ``` To grant a service account a `Role` or `ClusterRole`, a `RoleBinding` or `ClusterRoleBinding` is needed. To create @@ -269,7 +268,7 @@ for `ClusterRoleBinding`) command. For example, the following command creates an namespace and grants it to the `spark` service account created above: ```bash -kubectl create clusterrolebinding spark-role --clusterrole=edit --serviceaccount=default:spark --namespace=default +$ kubectl create clusterrolebinding spark-role --clusterrole=edit --serviceaccount=default:spark --namespace=default ``` Note that a `Role` can only be used to grant access to resources (like pods) within a single namespace, whereas a @@ -319,21 +318,27 @@ specific to Spark on Kubernetes. - spark.kubernetes.driver.container.image + spark.kubernetes.container.image (none) - Container image to use for the driver. 
- This is usually of the form example.com/repo/spark-driver:v1.0.0. - This configuration is required and must be provided by the user. + Container image to use for the Spark application. + This is usually of the form example.com/repo/spark:v1.0.0. + This configuration is required and must be provided by the user, unless explicit + images are provided for each different container type. + + + + spark.kubernetes.driver.container.image + (value of spark.kubernetes.container.image) + + Custom container image to use for the driver. spark.kubernetes.executor.container.image - (none) + (value of spark.kubernetes.container.image) - Container image to use for the executors. - This is usually of the form example.com/repo/spark-executor:v1.0.0. - This configuration is required and must be provided by the user. + Custom container image to use for executors. @@ -543,14 +548,6 @@ specific to Spark on Kubernetes. to avoid name conflicts. - - spark.kubernetes.executor.podNamePrefix - (none) - - Prefix for naming the executor pods. - If not set, the executor pod name is set to driver pod name suffixed by an integer. - - spark.kubernetes.executor.lostCheck.maxAttempts 10 @@ -640,9 +637,9 @@ specific to Spark on Kubernetes. spark.kubernetes.initContainer.image - (none) + (value of spark.kubernetes.container.image) - Container image for the init-container of the driver and executors for downloading dependencies. This is usually of the form example.com/repo/spark-init:v1.0.0. This configuration is optional and must be provided by the user if any non-container local dependency is used and must be downloaded remotely. + Custom container image for the init container of both driver and executors. diff --git a/docs/sparkr.md b/docs/sparkr.md index 997ea60fb6cf..6685b585a393 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -596,7 +596,7 @@ The following example shows how to save/load a MLlib model by SparkR. # Structured Streaming -SparkR supports the Structured Streaming API (experimental). Structured Streaming is a scalable and fault-tolerant stream processing engine built on the Spark SQL engine. For more information see the R API on the [Structured Streaming Programming Guide](structured-streaming-programming-guide.html) +SparkR supports the Structured Streaming API. Structured Streaming is a scalable and fault-tolerant stream processing engine built on the Spark SQL engine. For more information see the R API on the [Structured Streaming Programming Guide](structured-streaming-programming-guide.html) # R Function Name Conflicts diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index dc3e384008d2..3e2e48a0ef24 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -915,6 +915,14 @@ spark.catalog.refreshTable("my_table")
+
+ +{% highlight r %} +refreshTable("my_table") +{% endhighlight %} + +
+
{% highlight sql %} @@ -953,8 +961,10 @@ Configuration of Parquet can be done using the `setConf` method on `SparkSession spark.sql.parquet.compression.codec snappy - Sets the compression codec use when writing Parquet files. Acceptable values include: - uncompressed, snappy, gzip, lzo. + Sets the compression codec used when writing Parquet files. If either `compression` or + `parquet.compression` is specified in the table-specific options/properties, the precedence would be + `compression`, `parquet.compression`, `spark.sql.parquet.compression.codec`. Acceptable values include: + none, uncompressed, snappy, gzip, lzo. @@ -1496,10 +1506,10 @@ that these options will be deprecated in future release as more optimizations ar ## Broadcast Hint for SQL Queries The `BROADCAST` hint guides Spark to broadcast each specified table when joining them with another table or view. -When Spark deciding the join methods, the broadcast hash join (i.e., BHJ) is preferred, +When Spark decides the join method, the broadcast hash join (i.e., BHJ) is preferred, even if the statistics is above the configuration `spark.sql.autoBroadcastJoinThreshold`. When both sides of a join are specified, Spark broadcasts the one having the lower statistics. -Note Spark does not guarantee BHJ is always chosen, since not all cases (e.g. full outer join) +Note that Spark does not guarantee BHJ is always chosen, since not all cases (e.g. full outer join) support BHJ. When the broadcast nested loop join is selected, we still respect the hint.
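To make the hint concrete, here is a minimal Scala sketch using the `broadcast` function and the equivalent SQL hint; the table names and data are illustrative only:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.broadcast

val spark = SparkSession.builder().appName("broadcast-hint-sketch").getOrCreate()
import spark.implicits._

val largeDf = Seq((1, "a"), (2, "b"), (3, "c")).toDF("key", "value")
val smallDf = Seq((1, "x"), (2, "y")).toDF("key", "label")

// DataFrame API: hint the smaller side so a broadcast hash join is preferred.
val joined = largeDf.join(broadcast(smallDf), Seq("key"))

// SQL form of the same hint.
largeDf.createOrReplaceTempView("large")
smallDf.createOrReplaceTempView("small")
val joinedSql = spark.sql(
  "SELECT /*+ BROADCAST(small) */ * FROM large JOIN small ON large.key = small.key")
```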
@@ -1778,10 +1788,15 @@ options. Note that, for DecimalType(38,0)*, the table above intentionally does not cover all other combinations of scales and precisions because currently we only infer decimal type like `BigInteger`/`BigInt`. For example, 1.1 is inferred as double type. - In PySpark, now we need Pandas 0.19.2 or upper if you want to use Pandas related functionalities, such as `toPandas`, `createDataFrame` from Pandas DataFrame, etc. - In PySpark, the behavior of timestamp values for Pandas related functionalities was changed to respect session timezone. If you want to use the old behavior, you need to set a configuration `spark.sql.execution.pandas.respectSessionTimeZone` to `False`. See [SPARK-22395](https://issues.apache.org/jira/browse/SPARK-22395) for details. - - - Since Spark 2.3, when either broadcast hash join or broadcast nested loop join is applicable, we prefer to broadcasting the table that is explicitly specified in a broadcast hint. For details, see the section [Broadcast Hint](#broadcast-hint-for-sql-queries) and [SPARK-22489](https://issues.apache.org/jira/browse/SPARK-22489). + - In PySpark, `na.fill()` or `fillna` also accepts boolean and replaces nulls with booleans. In prior Spark versions, PySpark just ignores it and returns the original Dataset/DataFrame. + - Since Spark 2.3, when either broadcast hash join or broadcast nested loop join is applicable, we prefer broadcasting the table that is explicitly specified in a broadcast hint. For details, see the section [Broadcast Hint](#broadcast-hint-for-sql-queries) and [SPARK-22489](https://issues.apache.org/jira/browse/SPARK-22489). + - Since Spark 2.3, when all inputs are binary, `functions.concat()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string regardless of input types. To keep the old behavior, set `spark.sql.function.concatBinaryAsString` to `true`. + - Since Spark 2.3, when all inputs are binary, SQL `elt()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string regardless of input types. To keep the old behavior, set `spark.sql.function.eltOutputAsString` to `true`. - - Since Spark 2.3, when all inputs are binary, `functions.concat()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string despite of input types. To keep the old behavior, set `spark.sql.function.concatBinaryAsString` to `true`. + - Since Spark 2.3, by default arithmetic operations between decimals return a rounded value if an exact representation is not possible (instead of returning NULL). This is compliant with the SQL ANSI 2011 specification and Hive's new behavior introduced in Hive 2.2 (HIVE-15331). This involves the following changes: + - The rules to determine the result type of an arithmetic operation have been updated. In particular, if the precision / scale needed are out of the range of available values, the scale is reduced up to 6, in order to prevent the truncation of the integer part of the decimals. All the arithmetic operations are affected by the change, i.e. addition (`+`), subtraction (`-`), multiplication (`*`), division (`/`), remainder (`%`) and positive modulus (`pmod`). + - Literal values used in SQL operations are converted to DECIMAL with the exact precision and scale needed by them. + - The configuration `spark.sql.decimalOperations.allowPrecisionLoss` has been introduced.
It defaults to `true`, which means the new behavior described here; if set to `false`, Spark uses previous rules, ie. it doesn't adjust the needed scale to represent the values and it returns NULL if an exact representation of the value is not possible. ## Upgrading From Spark SQL 2.1 to 2.2 @@ -2163,7 +2178,7 @@ Not all the APIs of the Hive UDF/UDTF/UDAF are supported by Spark SQL. Below are Spark SQL currently does not support the reuse of aggregation. * `getWindowingEvaluator` (`GenericUDAFEvaluator`) is a function to optimize aggregation by evaluating an aggregate over a fixed window. - + ### Incompatible Hive UDF Below are the scenarios in which Hive and Spark generate different results: diff --git a/docs/structured-streaming-kafka-integration.md b/docs/structured-streaming-kafka-integration.md index bab0be8ddeb9..461c29ce1ba8 100644 --- a/docs/structured-streaming-kafka-integration.md +++ b/docs/structured-streaming-kafka-integration.md @@ -61,7 +61,7 @@ df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") {% highlight java %} // Subscribe to 1 topic -DataFrame df = spark +Dataset df = spark .readStream() .format("kafka") .option("kafka.bootstrap.servers", "host1:port1,host2:port2") @@ -70,7 +70,7 @@ DataFrame df = spark df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") // Subscribe to multiple topics -DataFrame df = spark +Dataset df = spark .readStream() .format("kafka") .option("kafka.bootstrap.servers", "host1:port1,host2:port2") @@ -79,7 +79,7 @@ DataFrame df = spark df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") // Subscribe to a pattern -DataFrame df = spark +Dataset df = spark .readStream() .format("kafka") .option("kafka.bootstrap.servers", "host1:port1,host2:port2") @@ -171,7 +171,7 @@ df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") {% highlight java %} // Subscribe to 1 topic defaults to the earliest and latest offsets -DataFrame df = spark +Dataset df = spark .read() .format("kafka") .option("kafka.bootstrap.servers", "host1:port1,host2:port2") @@ -180,7 +180,7 @@ DataFrame df = spark df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)"); // Subscribe to multiple topics, specifying explicit Kafka offsets -DataFrame df = spark +Dataset df = spark .read() .format("kafka") .option("kafka.bootstrap.servers", "host1:port1,host2:port2") @@ -191,7 +191,7 @@ DataFrame df = spark df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)"); // Subscribe to a pattern, at the earliest and latest offsets -DataFrame df = spark +Dataset df = spark .read() .format("kafka") .option("kafka.bootstrap.servers", "host1:port1,host2:port2") diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index 31fcfabb9cac..2ddba2f0d942 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -10,7 +10,9 @@ title: Structured Streaming Programming Guide # Overview Structured Streaming is a scalable and fault-tolerant stream processing engine built on the Spark SQL engine. You can express your streaming computation the same way you would express a batch computation on static data. The Spark SQL engine will take care of running it incrementally and continuously and updating the final result as streaming data continues to arrive. You can use the [Dataset/DataFrame API](sql-programming-guide.html) in Scala, Java, Python or R to express streaming aggregations, event-time windows, stream-to-batch joins, etc. 
The computation is executed on the same optimized Spark SQL engine. Finally, the system ensures end-to-end exactly-once fault-tolerance guarantees through checkpointing and Write Ahead Logs. In short, *Structured Streaming provides fast, scalable, fault-tolerant, end-to-end exactly-once stream processing without the user having to reason about streaming.* -In this guide, we are going to walk you through the programming model and the APIs. First, let's start with a simple example - a streaming word count. +Internally, by default, Structured Streaming queries are processed using a *micro-batch processing* engine, which processes data streams as a series of small batch jobs, thereby achieving end-to-end latencies as low as 100 milliseconds and exactly-once fault-tolerance guarantees. However, since Spark 2.3, we have introduced a new low-latency processing mode called **Continuous Processing**, which can achieve end-to-end latencies as low as 1 millisecond with at-least-once guarantees. Without changing the Dataset/DataFrame operations in your queries, you will be able to choose the mode based on your application requirements. + +In this guide, we are going to walk you through the programming model and the APIs. We are going to explain the concepts mostly using the default micro-batch processing model, and then [later](#continuous-processing-experimental) discuss the Continuous Processing model. First, let's start with a simple example of a Structured Streaming query - a streaming word count. # Quick Example Let’s say you want to maintain a running word count of text data received from a data server listening on a TCP socket. Let’s see how you can express this using Structured Streaming. You can see the full code in @@ -827,8 +829,8 @@ df.isStreaming() {% endhighlight %}
-{% highlight bash %} -Not available. +{% highlight r %} +isStreaming(df) {% endhighlight %}
@@ -885,6 +887,19 @@ windowedCounts = words.groupBy( ).count() {% endhighlight %} +
+
+{% highlight r %} +words <- ... # streaming DataFrame of schema { timestamp: Timestamp, word: String } + +# Group the data by window and word and compute the count of each group +windowedCounts <- count( + groupBy( + words, + window(words$timestamp, "10 minutes", "5 minutes"), + words$word)) +{% endhighlight %} +
@@ -959,6 +974,21 @@ windowedCounts = words \ .count() {% endhighlight %} + +
+{% highlight r %} +words <- ... # streaming DataFrame of schema { timestamp: Timestamp, word: String } + +# Group the data by window and word and compute the count of each group + +words <- withWatermark(words, "timestamp", "10 minutes") +windowedCounts <- count( + groupBy( + words, + window(words$timestamp, "10 minutes", "5 minutes"), + words$word)) +{% endhighlight %} +
@@ -1023,7 +1053,19 @@ output mode. ### Join Operations -Streaming DataFrames can be joined with static DataFrames to create new streaming DataFrames. Here are a few examples. +Structured Streaming supports joining a streaming Dataset/DataFrame with a static Dataset/DataFrame +as well as another streaming Dataset/DataFrame. The result of the streaming join is generated +incrementally, similar to the results of streaming aggregations in the previous section. In this +section we will explore what types of joins (i.e. inner, outer, etc.) are supported in the above +cases. Note that in all the supported join types, the result of the join with a streaming +Dataset/DataFrame will be exactly the same as if it were with a static Dataset/DataFrame +containing the same data in the stream. + + +#### Stream-static joins + +Since the introduction in Spark 2.0, Structured Streaming has supported joins (inner joins and some +types of outer joins) between a streaming and a static DataFrame/Dataset. Here is a simple example.
@@ -1061,6 +1103,300 @@ streamingDf.join(staticDf, "type", "right_join") # right outer join with a stat
+Note that stream-static joins are not stateful, so no state management is necessary. +However, a few types of stream-static outer joins are not yet supported. +These are listed at the [end of this Join section](#support-matrix-for-joins-in-streaming-queries). + +#### Stream-stream Joins +In Spark 2.3, we have added support for stream-stream joins, that is, you can join two streaming +Datasets/DataFrames. The challenge of generating join results between two data streams is that, +at any point in time, the view of the dataset is incomplete for both sides of the join, making +it much harder to find matches between inputs. Any row received from one input stream can match +with any future, yet-to-be-received row from the other input stream. Hence, for both the input +streams, we buffer past input as streaming state, so that we can match every future input with +past input and accordingly generate joined results. Furthermore, similar to streaming aggregations, +we automatically handle late, out-of-order data and can limit the state using watermarks. +Let’s discuss the different types of supported stream-stream joins and how to use them. + +##### Inner Joins with optional Watermarking +Inner joins on any kind of columns along with any kind of join conditions are supported. +However, as the stream runs, the size of streaming state will keep growing indefinitely, since +*all* past input must be saved because any new input can match with any input from the past. +To avoid unbounded state, you have to define additional join conditions such that indefinitely +old inputs cannot match with future inputs and therefore can be cleared from the state. +In other words, you will have to do the following additional steps in the join. + +1. Define watermark delays on both inputs such that the engine knows how delayed the input can be +(similar to streaming aggregations) + +1. Define a constraint on event-time across the two inputs such that the engine can figure out when +old rows of one input are not going to be required (i.e. will not satisfy the time constraint) for +matches with the other input. This constraint can be defined in one of two ways. + + 1. Time range join conditions (e.g. `...JOIN ON leftTime BETWEEN rightTime AND rightTime + INTERVAL 1 HOUR`), + + 1. Join on event-time windows (e.g. `...JOIN ON leftTimeWindow = rightTimeWindow`); a sketch of this form appears after the code examples below. + +Let’s understand this with an example. + +Let’s say we want to join a stream of advertisement impressions (when an ad was shown) with +another stream of user clicks on advertisements to correlate when impressions led to +monetizable clicks. To allow state cleanup in this stream-stream join, you will have to +specify the watermarking delays and the time constraints as follows. + +1. Watermark delays: Say, the impressions and the corresponding clicks can be late/out-of-order +in event-time by at most 2 and 3 hours, respectively. + +1. Event-time range condition: Say, a click can occur within a time range of 0 seconds to 1 hour +after the corresponding impression. + +The code would look like this. +
+
+ +{% highlight scala %} +import org.apache.spark.sql.functions.expr + +val impressions = spark.readStream. ... +val clicks = spark.readStream. ... + +// Apply watermarks on event-time columns +val impressionsWithWatermark = impressions.withWatermark("impressionTime", "2 hours") +val clicksWithWatermark = clicks.withWatermark("clickTime", "3 hours") + +// Join with event-time constraints +impressionsWithWatermark.join( + clicksWithWatermark, + expr(""" + clickAdId = impressionAdId AND + clickTime >= impressionTime AND + clickTime <= impressionTime + interval 1 hour + """) +) + +{% endhighlight %} + +
+
+ +{% highlight java %} +import static org.apache.spark.sql.functions.expr; + +Dataset<Row> impressions = spark.readStream(). ... +Dataset<Row> clicks = spark.readStream(). ... + +// Apply watermarks on event-time columns +Dataset<Row> impressionsWithWatermark = impressions.withWatermark("impressionTime", "2 hours"); +Dataset<Row> clicksWithWatermark = clicks.withWatermark("clickTime", "3 hours"); + +// Join with event-time constraints +impressionsWithWatermark.join( + clicksWithWatermark, + expr( + "clickAdId = impressionAdId AND " + + "clickTime >= impressionTime AND " + + "clickTime <= impressionTime + interval 1 hour ") +); + +{% endhighlight %} + +
+
+ +{% highlight python %} +from pyspark.sql.functions import expr + +impressions = spark.readStream. ... +clicks = spark.readStream. ... + +# Apply watermarks on event-time columns +impressionsWithWatermark = impressions.withWatermark("impressionTime", "2 hours") +clicksWithWatermark = clicks.withWatermark("clickTime", "3 hours") + +# Join with event-time constraints +impressionsWithWatermark.join( + clicksWithWatermark, + expr(""" + clickAdId = impressionAdId AND + clickTime >= impressionTime AND + clickTime <= impressionTime + interval 1 hour + """) +) + +{% endhighlight %} + +
+
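The examples above use a time-range condition. The sketch below illustrates the other supported constraint form, equality on event-time windows, reusing the hypothetical `impressionsWithWatermark` and `clicksWithWatermark` streams defined in the snippets above:

```scala
import org.apache.spark.sql.functions.{col, expr, window}

// Assign each event to a 1-hour event-time window on both sides.
val impressionsByWindow = impressionsWithWatermark
  .withColumn("impressionWindow", window(col("impressionTime"), "1 hour"))
val clicksByWindow = clicksWithWatermark
  .withColumn("clickWindow", window(col("clickTime"), "1 hour"))

// Equality on the event-time window (plus the key) bounds how long either
// side's state must be kept once the watermark passes the end of a window.
val joined = impressionsByWindow.join(
  clicksByWindow,
  expr("clickAdId = impressionAdId AND impressionWindow = clickWindow"))
```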
+ +##### Outer Joins with Watermarking +While the watermark + event-time constraints are optional for inner joins, for left and right outer +joins they must be specified. This is because for generating the NULL results in outer join, the +engine must know when an input row is not going to match with anything in the future. Hence, the +watermark + event-time constraints must be specified for generating correct results. Therefore, +a query with an outer join will look quite like the ad-monetization example earlier, except that +there will be an additional parameter specifying it to be an outer join. + +
+
+ +{% highlight scala %} + +impressionsWithWatermark.join( + clicksWithWatermark, + expr(""" + clickAdId = impressionAdId AND + clickTime >= impressionTime AND + clickTime <= impressionTime + interval 1 hour + """), + joinType = "leftOuter" // can be "inner", "leftOuter", "rightOuter" + ) + +{% endhighlight %} + +
+
+ +{% highlight java %} +impressionsWithWatermark.join( + clicksWithWatermark, + expr( + "clickAdId = impressionAdId AND " + + "clickTime >= impressionTime AND " + + "clickTime <= impressionTime + interval 1 hour "), + "leftOuter" // can be "inner", "leftOuter", "rightOuter" +); + +{% endhighlight %} + + +
+
+ +{% highlight python %} +impressionsWithWatermark.join( + clicksWithWatermark, + expr(""" + clickAdId = impressionAdId AND + clickTime >= impressionTime AND + clickTime <= impressionTime + interval 1 hour + """), + "leftOuter" # can be "inner", "leftOuter", "rightOuter" +) + +{% endhighlight %} + +
+
+ +However, note that the outer NULL results will be generated with a delay (depends on the specified +watermark delay and the time range condition) because the engine has to wait for that long to ensure +there were no matches and there will be no more matches in future. + +##### Support matrix for joins in streaming queries + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Left InputRight InputJoin Type
StaticStaticAll types + Supported, since it's not on streaming data even though it + can be present in a streaming query +
StreamStaticInnerSupported, not stateful
Left OuterSupported, not stateful
Right OuterNot supported
Full OuterNot supported
StaticStreamInnerSupported, not stateful
Left OuterNot supported
Right OuterSupported, not stateful
Full OuterNot supported
StreamStreamInner + Supported, optionally specify watermark on both sides + + time constraints for state cleanup +
Left Outer + Conditionally supported, must specify watermark on right + time constraints for correct + results, optionally specify watermark on left for all state cleanup +
Right Outer + Conditionally supported, must specify watermark on left + time constraints for correct + results, optionally specify watermark on right for all state cleanup +
Full OuterNot supported
+ +Additional details on supported joins: + +- Joins can be cascaded, that is, you can do `df1.join(df2, ...).join(df3, ...).join(df4, ....)`. + +- As of Spark 2.3, you can use joins only when the query is in Append output mode. Other output modes are not yet supported. + +- As of Spark 2.3, you cannot use other non-map-like operations before joins. Here are a few examples of + what cannot be used. + + - Cannot use streaming aggregations before joins. + + - Cannot use mapGroupsWithState and flatMapGroupsWithState in Update mode before joins. + + ### Streaming Deduplication You can deduplicate records in data streams using a unique identifier in the events. This is exactly same as deduplication on static using a unique identifier column. The query will store the necessary amount of data from previous records such that it can filter duplicate records. Similar to aggregations, you can use deduplication with or without watermarking. @@ -1132,15 +1468,9 @@ Some of them are as follows. - Sorting operations are supported on streaming Datasets only after an aggregation and in Complete Output Mode. -- Outer joins between a streaming and a static Datasets are conditionally supported. - - + Full outer join with a streaming Dataset is not supported - - + Left outer join with a streaming Dataset on the right is not supported - - + Right outer join with a streaming Dataset on the left is not supported - -- Any kind of joins between two streaming Datasets is not yet supported. +- A few types of outer joins on streaming Datasets are not supported. See the + support matrix in the Join Operations section + for more details. In addition, there are some Dataset methods that will not work on streaming Datasets. They are actions that will immediately run queries and return results, which does not make sense on a streaming Dataset. Rather, those functionalities can be done by explicitly starting a streaming query (see the next section regarding that). @@ -1248,6 +1578,15 @@ Here is the compatibility matrix. Aggregations not allowed after flatMapGroupsWithState. + + Queries with joins + Append + + Update and Complete mode not supported yet. See the + support matrix in the Join Operations section + for more details on what types of joins are supported. + + Other queries Append, Update @@ -2097,6 +2436,100 @@ write.stream(aggDF, "memory", outputMode = "complete", checkpointLocation = "pat + +# Continuous Processing [Experimental] +**Continuous processing** is a new, experimental streaming execution mode introduced in Spark 2.3 that enables low (~1 ms) end-to-end latency with at-least-once fault-tolerance guarantees. Compare this with the default *micro-batch processing* engine which can achieve exactly-once guarantees but achieves latencies of ~100ms at best. For some types of queries (discussed below), you can choose which mode to execute them in without modifying the application logic (i.e. without changing the DataFrame/Dataset operations). + +To run a supported query in continuous processing mode, all you need to do is specify a **continuous trigger** with the desired checkpoint interval as a parameter. For example,
+
+ +{% highlight scala %} +import org.apache.spark.sql.streaming.Trigger + +spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", "host1:port1,host2:port2") + .option("subscribe", "topic1") + .load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .writeStream + .format("kafka") + .option("kafka.bootstrap.servers", "host1:port1,host2:port2") + .option("topic", "topic1") + .trigger(Trigger.Continuous("1 second")) // only change in query + .start() +{% endhighlight %}
+
+{% highlight java %} +import org.apache.spark.sql.streaming.Trigger; + +spark + .readStream() + .format("kafka") + .option("kafka.bootstrap.servers", "host1:port1,host2:port2") + .option("subscribe", "topic1") + .load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .writeStream() + .format("kafka") + .option("kafka.bootstrap.servers", "host1:port1,host2:port2") + .option("topic", "topic1") + .trigger(Trigger.Continuous("1 second")) // only change in query + .start(); +{% endhighlight %}
+
+{% highlight python %} +spark \ + .readStream \ + .format("kafka") \ + .option("kafka.bootstrap.servers", "host1:port1,host2:port2") \ + .option("subscribe", "topic1") \ + .load() \ + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") \ + .writeStream \ + .format("kafka") \ + .option("kafka.bootstrap.servers", "host1:port1,host2:port2") \ + .option("topic", "topic1") \ + .trigger(continuous="1 second") \ + .start()  # the continuous trigger is the only change in the query + +{% endhighlight %}
+
+ +A checkpoint interval of 1 second means that the continuous processing engine will record the progress of the query every second. The resulting checkpoints are in a format compatible with the micro-batch engine, hence any query can be restarted with any trigger. For example, a supported query started with the micro-batch mode can be restarted in continuous mode, and vice versa. Note that any time you switch to continuous mode, you will get at-least-once fault-tolerance guarantees. + +## Supported Queries +As of Spark 2.3, only the following types of queries are supported in the continuous processing mode. + +- *Operations*: Only map-like Dataset/DataFrame operations are supported in continuous mode, that is, only projections (`select`, `map`, `flatMap`, `mapPartitions`, etc.) and selections (`where`, `filter`, etc.). + + All SQL functions are supported except aggregation functions (since aggregations are not yet supported), `current_timestamp()` and `current_date()` (deterministic computations using time are challenging). + +- *Sources*: + + Kafka source: All options are supported. + + Rate source: Good for testing. Only options that are supported in the continuous mode are `numPartitions` and `rowsPerSecond`. + +- *Sinks*: + + Kafka sink: All options are supported. + + Memory sink: Good for debugging. + + Console sink: Good for debugging. All options are supported. Note that the console will print every checkpoint interval that you have specified in the continuous trigger. + +See the [Input Sources](#input-sources) and [Output Sinks](#output-sinks) sections for more details on them. While the console sink is good for testing, the end-to-end low-latency processing can be best observed with Kafka as the source and sink, as this allows the engine to process the data and make the results available in the output topic within milliseconds of the input data being available in the input topic. + +## Caveats +- The continuous processing engine launches multiple long-running tasks that continuously read data from sources, process it and continuously write to sinks. The number of tasks required by the query depends on how many partitions the query can read from the sources in parallel. Therefore, before starting a continuous processing query, you must ensure there are enough cores in the cluster to run all the tasks in parallel. For example, if you are reading from a Kafka topic that has 10 partitions, then the cluster must have at least 10 cores for the query to make progress. +- Stopping a continuous processing stream may produce spurious task termination warnings. These can be safely ignored. +- There are currently no automatic retries of failed tasks. Any failure will lead to the query being stopped, and it needs to be manually restarted from the checkpoint.
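To illustrate the checkpoint compatibility noted above, here is a minimal Scala sketch that starts a query with a micro-batch trigger and later restarts it from the same checkpoint with a continuous trigger; `spark` is an existing `SparkSession`, and the Kafka addresses, topic names and checkpoint path are placeholders:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.Trigger

val spark = SparkSession.builder().getOrCreate()

val input = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "host1:port1")   // placeholder address
  .option("subscribe", "topic1")
  .load()
  .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

// First run: micro-batch engine, checkpointing to a hypothetical location.
input.writeStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "host1:port1")
  .option("topic", "out")
  .option("checkpointLocation", "/tmp/cp")
  .trigger(Trigger.ProcessingTime("10 seconds"))
  .start()

// Later run (after stopping the first query): same sink and checkpoint,
// restarted on the continuous engine.
input.writeStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "host1:port1")
  .option("topic", "out")
  .option("checkpointLocation", "/tmp/cp")
  .trigger(Trigger.Continuous("1 second"))
  .start()
```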
+ # Additional Information **Further Reading** @@ -2114,6 +2547,11 @@ write.stream(aggDF, "memory", outputMode = "complete", checkpointLocation = "pat **Talks** -- Spark Summit 2017 Talk - [Easy, Scalable, Fault-tolerant Stream Processing with Structured Streaming in Apache Spark](https://spark-summit.org/2017/events/easy-scalable-fault-tolerant-stream-processing-with-structured-streaming-in-apache-spark/) -- Spark Summit 2016 Talk - [A Deep Dive into Structured Streaming](https://spark-summit.org/2016/events/a-deep-dive-into-structured-streaming/) +- Spark Summit Europe 2017 + - Easy, Scalable, Fault-tolerant Stream Processing with Structured Streaming in Apache Spark - + [Part 1 slides/video](https://databricks.com/session/easy-scalable-fault-tolerant-stream-processing-with-structured-streaming-in-apache-spark), [Part 2 slides/video](https://databricks.com/session/easy-scalable-fault-tolerant-stream-processing-with-structured-streaming-in-apache-spark-continues) + - Deep Dive into Stateful Stream Processing in Structured Streaming - [slides/video](https://databricks.com/session/deep-dive-into-stateful-stream-processing-in-structured-streaming) +- Spark Summit 2016 + - A Deep Dive into Structured Streaming - [slides/video](https://spark-summit.org/2016/events/a-deep-dive-into-structured-streaming/) + diff --git a/examples/pom.xml b/examples/pom.xml index 1791dbaad775..e584feed1b33 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../pom.xml diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderEstimatorExample.java similarity index 62% rename from examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java rename to examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderEstimatorExample.java index 99af37676ba9..6f93cff94b72 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderEstimatorExample.java @@ -23,9 +23,8 @@ import java.util.Arrays; import java.util.List; -import org.apache.spark.ml.feature.OneHotEncoder; -import org.apache.spark.ml.feature.StringIndexer; -import org.apache.spark.ml.feature.StringIndexerModel; +import org.apache.spark.ml.feature.OneHotEncoderEstimator; +import org.apache.spark.ml.feature.OneHotEncoderModel; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; @@ -35,41 +34,37 @@ import org.apache.spark.sql.types.StructType; // $example off$ -public class JavaOneHotEncoderExample { +public class JavaOneHotEncoderEstimatorExample { public static void main(String[] args) { SparkSession spark = SparkSession .builder() - .appName("JavaOneHotEncoderExample") + .appName("JavaOneHotEncoderEstimatorExample") .getOrCreate(); + // Note: categorical features are usually first encoded with StringIndexer // $example on$ List data = Arrays.asList( - RowFactory.create(0, "a"), - RowFactory.create(1, "b"), - RowFactory.create(2, "c"), - RowFactory.create(3, "a"), - RowFactory.create(4, "a"), - RowFactory.create(5, "c") + RowFactory.create(0.0, 1.0), + RowFactory.create(1.0, 0.0), + RowFactory.create(2.0, 1.0), + RowFactory.create(0.0, 2.0), + RowFactory.create(0.0, 1.0), + RowFactory.create(2.0, 0.0) ); StructType schema = new StructType(new StructField[]{ - new 
StructField("id", DataTypes.IntegerType, false, Metadata.empty()), - new StructField("category", DataTypes.StringType, false, Metadata.empty()) + new StructField("categoryIndex1", DataTypes.DoubleType, false, Metadata.empty()), + new StructField("categoryIndex2", DataTypes.DoubleType, false, Metadata.empty()) }); Dataset df = spark.createDataFrame(data, schema); - StringIndexerModel indexer = new StringIndexer() - .setInputCol("category") - .setOutputCol("categoryIndex") - .fit(df); - Dataset indexed = indexer.transform(df); + OneHotEncoderEstimator encoder = new OneHotEncoderEstimator() + .setInputCols(new String[] {"categoryIndex1", "categoryIndex2"}) + .setOutputCols(new String[] {"categoryVec1", "categoryVec2"}); - OneHotEncoder encoder = new OneHotEncoder() - .setInputCol("categoryIndex") - .setOutputCol("categoryVec"); - - Dataset encoded = encoder.transform(indexed); + OneHotEncoderModel model = encoder.fit(df); + Dataset encoded = model.transform(df); encoded.show(); // $example off$ diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java index dd20cac62110..43cc30c1a899 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java @@ -66,7 +66,7 @@ public static void main(String[] args) { .setNumBuckets(3); Dataset result = discretizer.fit(df).transform(df); - result.show(); + result.show(false); // $example off$ spark.stop(); } diff --git a/examples/src/main/python/ml/onehot_encoder_example.py b/examples/src/main/python/ml/onehot_encoder_estimator_example.py similarity index 65% rename from examples/src/main/python/ml/onehot_encoder_example.py rename to examples/src/main/python/ml/onehot_encoder_estimator_example.py index e1996c7f0a55..2723e681cea7 100644 --- a/examples/src/main/python/ml/onehot_encoder_example.py +++ b/examples/src/main/python/ml/onehot_encoder_estimator_example.py @@ -18,32 +18,31 @@ from __future__ import print_function # $example on$ -from pyspark.ml.feature import OneHotEncoder, StringIndexer +from pyspark.ml.feature import OneHotEncoderEstimator # $example off$ from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession\ .builder\ - .appName("OneHotEncoderExample")\ + .appName("OneHotEncoderEstimatorExample")\ .getOrCreate() + # Note: categorical features are usually first encoded with StringIndexer # $example on$ df = spark.createDataFrame([ - (0, "a"), - (1, "b"), - (2, "c"), - (3, "a"), - (4, "a"), - (5, "c") - ], ["id", "category"]) + (0.0, 1.0), + (1.0, 0.0), + (2.0, 1.0), + (0.0, 2.0), + (0.0, 1.0), + (2.0, 0.0) + ], ["categoryIndex1", "categoryIndex2"]) - stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex") - model = stringIndexer.fit(df) - indexed = model.transform(df) - - encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec") - encoded = encoder.transform(indexed) + encoder = OneHotEncoderEstimator(inputCols=["categoryIndex1", "categoryIndex2"], + outputCols=["categoryVec1", "categoryVec2"]) + model = encoder.fit(df) + encoded = model.transform(df) encoded.show() # $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/SimpleSkewedGroupByTest.scala b/examples/src/main/scala/org/apache/spark/examples/SimpleSkewedGroupByTest.scala index e64dcbd182d9..2332a661f26a 100644 --- 
a/examples/src/main/scala/org/apache/spark/examples/SimpleSkewedGroupByTest.scala +++ b/examples/src/main/scala/org/apache/spark/examples/SimpleSkewedGroupByTest.scala @@ -60,10 +60,6 @@ object SimpleSkewedGroupByTest { pairs1.count println(s"RESULT: ${pairs1.groupByKey(numReducers).count}") - // Print how many keys each reducer got (for debugging) - // println("RESULT: " + pairs1.groupByKey(numReducers) - // .map{case (k,v) => (k, v.size)} - // .collectAsMap) spark.stop() } diff --git a/examples/src/main/scala/org/apache/spark/examples/graphx/Analytics.scala b/examples/src/main/scala/org/apache/spark/examples/graphx/Analytics.scala index 92936bd30dbc..815404d1218b 100644 --- a/examples/src/main/scala/org/apache/spark/examples/graphx/Analytics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/graphx/Analytics.scala @@ -145,9 +145,11 @@ object Analytics extends Logging { // TriangleCount requires the graph to be partitioned .partitionBy(partitionStrategy.getOrElse(RandomVertexCut)).cache() val triangles = TriangleCount.run(graph) - println("Triangles: " + triangles.vertices.map { + val triangleTypes = triangles.vertices.map { case (vid, data) => data.toLong - }.reduce(_ + _) / 3) + }.reduce(_ + _) / 3 + + println(s"Triangles: ${triangleTypes}") sc.stop() case _ => diff --git a/examples/src/main/scala/org/apache/spark/examples/graphx/SynthBenchmark.scala b/examples/src/main/scala/org/apache/spark/examples/graphx/SynthBenchmark.scala index 6d2228c8742a..57b2edf99220 100644 --- a/examples/src/main/scala/org/apache/spark/examples/graphx/SynthBenchmark.scala +++ b/examples/src/main/scala/org/apache/spark/examples/graphx/SynthBenchmark.scala @@ -52,7 +52,7 @@ object SynthBenchmark { arg => arg.dropWhile(_ == '-').split('=') match { case Array(opt, v) => (opt -> v) - case _ => throw new IllegalArgumentException("Invalid argument: " + arg) + case _ => throw new IllegalArgumentException(s"Invalid argument: $arg") } } @@ -76,7 +76,7 @@ object SynthBenchmark { case ("sigma", v) => sigma = v.toDouble case ("degFile", v) => degFile = v case ("seed", v) => seed = v.toInt - case (opt, _) => throw new IllegalArgumentException("Invalid option: " + opt) + case (opt, _) => throw new IllegalArgumentException(s"Invalid option: $opt") } val conf = new SparkConf() @@ -86,7 +86,7 @@ object SynthBenchmark { val sc = new SparkContext(conf) // Create the graph - println(s"Creating graph...") + println("Creating graph...") val unpartitionedGraph = GraphGenerators.logNormalGraph(sc, numVertices, numEPart.getOrElse(sc.defaultParallelism), mu, sigma, seed) // Repartition the graph diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ChiSquareTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ChiSquareTestExample.scala index dcee1e427ce5..5146fd031646 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/ChiSquareTestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/ChiSquareTestExample.scala @@ -52,9 +52,9 @@ object ChiSquareTestExample { val df = data.toDF("label", "features") val chi = ChiSquareTest.test(df, "features", "label").head - println("pValues = " + chi.getAs[Vector](0)) - println("degreesOfFreedom = " + chi.getSeq[Int](1).mkString("[", ",", "]")) - println("statistics = " + chi.getAs[Vector](2)) + println(s"pValues = ${chi.getAs[Vector](0)}") + println(s"degreesOfFreedom ${chi.getSeq[Int](1).mkString("[", ",", "]")}") + println(s"statistics ${chi.getAs[Vector](2)}") // $example off$ spark.stop() diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationExample.scala index 3f57dc342eb0..d7f1fc8ed74d 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationExample.scala @@ -51,10 +51,10 @@ object CorrelationExample { val df = data.map(Tuple1.apply).toDF("features") val Row(coeff1: Matrix) = Correlation.corr(df, "features").head - println("Pearson correlation matrix:\n" + coeff1.toString) + println(s"Pearson correlation matrix:\n $coeff1") val Row(coeff2: Matrix) = Correlation.corr(df, "features", "spearman").head - println("Spearman correlation matrix:\n" + coeff2.toString) + println(s"Spearman correlation matrix:\n $coeff2") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala index 0658bddf1696..ee4469faab3a 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala @@ -47,7 +47,7 @@ object DataFrameExample { val parser = new OptionParser[Params]("DataFrameExample") { head("DataFrameExample: an example app using DataFrame for ML.") opt[String]("input") - .text(s"input path to dataframe") + .text("input path to dataframe") .action((x, c) => c.copy(input = x)) checkConfig { params => success @@ -93,7 +93,7 @@ object DataFrameExample { // Load the records back. println(s"Loading Parquet file with UDT from $outputDir.") val newDF = spark.read.parquet(outputDir) - println(s"Schema from Parquet:") + println("Schema from Parquet:") newDF.printSchema() spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala index bc6d3275933e..276cedab11ab 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala @@ -83,10 +83,10 @@ object DecisionTreeClassificationExample { .setPredictionCol("prediction") .setMetricName("accuracy") val accuracy = evaluator.evaluate(predictions) - println("Test Error = " + (1.0 - accuracy)) + println(s"Test Error = ${(1.0 - accuracy)}") val treeModel = model.stages(2).asInstanceOf[DecisionTreeClassificationModel] - println("Learned classification tree model:\n" + treeModel.toDebugString) + println(s"Learned classification tree model:\n ${treeModel.toDebugString}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala index ee61200ad1d0..aaaecaea4708 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala @@ -73,10 +73,10 @@ object DecisionTreeRegressionExample { .setPredictionCol("prediction") .setMetricName("rmse") val rmse = evaluator.evaluate(predictions) - println("Root Mean Squared Error (RMSE) on test data = " + rmse) + println(s"Root Mean Squared Error (RMSE) on test data = $rmse") val treeModel = 
model.stages(1).asInstanceOf[DecisionTreeRegressionModel] - println("Learned regression tree model:\n" + treeModel.toDebugString) + println(s"Learned regression tree model:\n ${treeModel.toDebugString}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala index d94d837d10e9..2dc11b07d88e 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala @@ -53,7 +53,7 @@ object DeveloperApiExample { // Create a LogisticRegression instance. This instance is an Estimator. val lr = new MyLogisticRegression() // Print out the parameters, documentation, and any default values. - println("MyLogisticRegression parameters:\n" + lr.explainParams() + "\n") + println(s"MyLogisticRegression parameters:\n ${lr.explainParams()}") // We may set parameters using setter methods. lr.setMaxIter(10) @@ -169,10 +169,10 @@ private class MyLogisticRegressionModel( Vectors.dense(-margin, margin) } - /** Number of classes the label can take. 2 indicates binary classification. */ + // Number of classes the label can take. 2 indicates binary classification. override val numClasses: Int = 2 - /** Number of features the model was trained on. */ + // Number of features the model was trained on. override val numFeatures: Int = coefficients.size /** diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala index f18d86e1a692..e5d91f132a3f 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala @@ -46,7 +46,7 @@ object EstimatorTransformerParamExample { // Create a LogisticRegression instance. This instance is an Estimator. val lr = new LogisticRegression() // Print out the parameters, documentation, and any default values. - println("LogisticRegression parameters:\n" + lr.explainParams() + "\n") + println(s"LogisticRegression parameters:\n ${lr.explainParams()}\n") // We may set parameters using setter methods. lr.setMaxIter(10) @@ -58,7 +58,7 @@ object EstimatorTransformerParamExample { // we can view the parameters it used during fit(). // This prints the parameter (name: value) pairs, where names are unique IDs for this // LogisticRegression instance. - println("Model 1 was fit using parameters: " + model1.parent.extractParamMap) + println(s"Model 1 was fit using parameters: ${model1.parent.extractParamMap}") // We may alternatively specify parameters using a ParamMap, // which supports several methods for specifying parameters. @@ -73,7 +73,7 @@ object EstimatorTransformerParamExample { // Now learn a new model using the paramMapCombined parameters. // paramMapCombined overrides all parameters set earlier via lr.set* methods. val model2 = lr.fit(training, paramMapCombined) - println("Model 2 was fit using parameters: " + model2.parent.extractParamMap) + println(s"Model 2 was fit using parameters: ${model2.parent.extractParamMap}") // Prepare test data. 
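The EstimatorTransformerParamExample hunks here only touch how fitted parameters are printed; for readers without the rest of that file, a minimal self-contained sketch of the ParamMap handling those prints sit around (toy rows and illustrative parameter values, assuming the Spark 2.3 spark.ml API):

```scala
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.SparkSession

object ParamMapSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("ParamMapSketch").getOrCreate()

    // Toy (label, features) rows; the values are made up for illustration.
    val training = spark.createDataFrame(Seq(
      (1.0, Vectors.dense(0.0, 1.1, 0.1)),
      (0.0, Vectors.dense(2.0, 1.0, -1.0)),
      (0.0, Vectors.dense(2.0, 1.3, 1.0)),
      (1.0, Vectors.dense(0.0, 1.2, -0.5))
    )).toDF("label", "features")

    val lr = new LogisticRegression()

    // A ParamMap is built incrementally; a later put overrides an earlier one.
    val paramMap = ParamMap(lr.maxIter -> 20)
      .put(lr.maxIter, 30)
      .put(lr.regParam -> 0.1, lr.threshold -> 0.55)

    // ParamMaps combine with ++, and anything passed to fit() overrides lr.set* calls.
    val paramMapCombined = paramMap ++ ParamMap(lr.probabilityCol -> "myProbability")

    val model = lr.fit(training, paramMapCombined)
    println(s"Model was fit using parameters: ${model.parent.extractParamMap}")

    spark.stop()
  }
}
```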
val test = spark.createDataFrame(Seq( diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala index 3656773c8b81..ef78c0a1145e 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala @@ -86,10 +86,10 @@ object GradientBoostedTreeClassifierExample { .setPredictionCol("prediction") .setMetricName("accuracy") val accuracy = evaluator.evaluate(predictions) - println("Test Error = " + (1.0 - accuracy)) + println(s"Test Error = ${1.0 - accuracy}") val gbtModel = model.stages(2).asInstanceOf[GBTClassificationModel] - println("Learned classification GBT model:\n" + gbtModel.toDebugString) + println(s"Learned classification GBT model:\n ${gbtModel.toDebugString}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala index e53aab7f326d..3feb2343f6a8 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala @@ -73,10 +73,10 @@ object GradientBoostedTreeRegressorExample { .setPredictionCol("prediction") .setMetricName("rmse") val rmse = evaluator.evaluate(predictions) - println("Root Mean Squared Error (RMSE) on test data = " + rmse) + println(s"Root Mean Squared Error (RMSE) on test data = $rmse") val gbtModel = model.stages(1).asInstanceOf[GBTRegressionModel] - println("Learned regression GBT model:\n" + gbtModel.toDebugString) + println(s"Learned regression GBT model:\n ${gbtModel.toDebugString}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MulticlassLogisticRegressionWithElasticNetExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MulticlassLogisticRegressionWithElasticNetExample.scala index 42f0ace7a353..3e61dbe628c2 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MulticlassLogisticRegressionWithElasticNetExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MulticlassLogisticRegressionWithElasticNetExample.scala @@ -48,7 +48,7 @@ object MulticlassLogisticRegressionWithElasticNetExample { // Print the coefficients and intercept for multinomial logistic regression println(s"Coefficients: \n${lrModel.coefficientMatrix}") - println(s"Intercepts: ${lrModel.interceptVector}") + println(s"Intercepts: \n${lrModel.interceptVector}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala index 6fce82d294f8..646f46a92506 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala @@ -66,7 +66,7 @@ object MultilayerPerceptronClassifierExample { val evaluator = new MulticlassClassificationEvaluator() .setMetricName("accuracy") - println("Test set accuracy = " + evaluator.evaluate(predictionAndLabels)) + println(s"Test set accuracy = 
${evaluator.evaluate(predictionAndLabels)}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala index bd9fcc420a66..50c70c626b12 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala @@ -52,7 +52,7 @@ object NaiveBayesExample { .setPredictionCol("prediction") .setMetricName("accuracy") val accuracy = evaluator.evaluate(predictions) - println("Test set accuracy = " + accuracy) + println(s"Test set accuracy = $accuracy") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderEstimatorExample.scala similarity index 65% rename from examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala rename to examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderEstimatorExample.scala index 274cc1268f4d..45d816808ed8 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderEstimatorExample.scala @@ -19,38 +19,34 @@ package org.apache.spark.examples.ml // $example on$ -import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer} +import org.apache.spark.ml.feature.OneHotEncoderEstimator // $example off$ import org.apache.spark.sql.SparkSession -object OneHotEncoderExample { +object OneHotEncoderEstimatorExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder - .appName("OneHotEncoderExample") + .appName("OneHotEncoderEstimatorExample") .getOrCreate() + // Note: categorical features are usually first encoded with StringIndexer // $example on$ val df = spark.createDataFrame(Seq( - (0, "a"), - (1, "b"), - (2, "c"), - (3, "a"), - (4, "a"), - (5, "c") - )).toDF("id", "category") - - val indexer = new StringIndexer() - .setInputCol("category") - .setOutputCol("categoryIndex") - .fit(df) - val indexed = indexer.transform(df) - - val encoder = new OneHotEncoder() - .setInputCol("categoryIndex") - .setOutputCol("categoryVec") - - val encoded = encoder.transform(indexed) + (0.0, 1.0), + (1.0, 0.0), + (2.0, 1.0), + (0.0, 2.0), + (0.0, 1.0), + (2.0, 0.0) + )).toDF("categoryIndex1", "categoryIndex2") + + val encoder = new OneHotEncoderEstimator() + .setInputCols(Array("categoryIndex1", "categoryIndex2")) + .setOutputCols(Array("categoryVec1", "categoryVec2")) + val model = encoder.fit(df) + + val encoded = model.transform(df) encoded.show() // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala index aedb9e7d3bb7..0fe16fb6dfa9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala @@ -36,7 +36,7 @@ object QuantileDiscretizerExample { // Output of QuantileDiscretizer for such small datasets can depend on the number of // partitions. Here we force a single partition to ensure consistent results. 
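The renamed OneHotEncoderEstimator examples above start from columns that are already numeric indices and only note in a comment that StringIndexer usually runs first. A minimal sketch of that full chain, with a hypothetical string column and toy rows, assuming the Spark 2.3 spark.ml API:

```scala
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer}
import org.apache.spark.sql.SparkSession

object StringIndexerWithOneHotSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("StringIndexerWithOneHotSketch").getOrCreate()

    // Hypothetical raw categorical column; the renamed examples assume this
    // indexing step has already happened.
    val df = spark.createDataFrame(Seq(
      (0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")
    )).toDF("id", "category")

    val indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")

    val encoder = new OneHotEncoderEstimator()
      .setInputCols(Array("categoryIndex"))
      .setOutputCols(Array("categoryVec"))

    // Chaining both stages in a Pipeline keeps the fitted indexer and encoder together.
    val model = new Pipeline().setStages(Array(indexer, encoder)).fit(df)
    model.transform(df).show(false)

    spark.stop()
  }
}
```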
// Note this is not necessary for normal use cases - .repartition(1) + .repartition(1) // $example on$ val discretizer = new QuantileDiscretizer() @@ -45,7 +45,7 @@ object QuantileDiscretizerExample { .setNumBuckets(3) val result = discretizer.fit(df).transform(df) - result.show() + result.show(false) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala index 5eafda8ce428..6265f8390252 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala @@ -85,10 +85,10 @@ object RandomForestClassifierExample { .setPredictionCol("prediction") .setMetricName("accuracy") val accuracy = evaluator.evaluate(predictions) - println("Test Error = " + (1.0 - accuracy)) + println(s"Test Error = ${(1.0 - accuracy)}") val rfModel = model.stages(2).asInstanceOf[RandomForestClassificationModel] - println("Learned classification forest model:\n" + rfModel.toDebugString) + println(s"Learned classification forest model:\n ${rfModel.toDebugString}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala index 9a0a001c26ef..2679fcb353a8 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala @@ -72,10 +72,10 @@ object RandomForestRegressorExample { .setPredictionCol("prediction") .setMetricName("rmse") val rmse = evaluator.evaluate(predictions) - println("Root Mean Squared Error (RMSE) on test data = " + rmse) + println(s"Root Mean Squared Error (RMSE) on test data = $rmse") val rfModel = model.stages(1).asInstanceOf[RandomForestRegressionModel] - println("Learned regression forest model:\n" + rfModel.toDebugString) + println(s"Learned regression forest model:\n ${rfModel.toDebugString}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala index afa761aee0b9..96bb8ea2338a 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala @@ -41,8 +41,8 @@ object VectorIndexerExample { val indexerModel = indexer.fit(data) val categoricalFeatures: Set[Int] = indexerModel.categoryMaps.keys.toSet - println(s"Chose ${categoricalFeatures.size} categorical features: " + - categoricalFeatures.mkString(", ")) + println(s"Chose ${categoricalFeatures.size} " + + s"categorical features: ${categoricalFeatures.mkString(", ")}") // Create new column "indexed" with categorical values transformed to indices val indexedData = indexerModel.transform(data) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/AssociationRulesExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/AssociationRulesExample.scala index ff44de56839e..a07535bb5a38 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/AssociationRulesExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/AssociationRulesExample.scala @@ -42,9 +42,8 @@ 
object AssociationRulesExample { val results = ar.run(freqItemsets) results.collect().foreach { rule => - println("[" + rule.antecedent.mkString(",") - + "=>" - + rule.consequent.mkString(",") + "]," + rule.confidence) + println(s"[${rule.antecedent.mkString(",")}=>${rule.consequent.mkString(",")} ]" + + s" ${rule.confidence}") } // $example off$ @@ -53,3 +52,4 @@ object AssociationRulesExample { } // scalastyle:on println + diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala index b9263ac6fcff..c6312d71cc91 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala @@ -86,7 +86,7 @@ object BinaryClassificationMetricsExample { // AUPRC val auPRC = metrics.areaUnderPR - println("Area under precision-recall curve = " + auPRC) + println(s"Area under precision-recall curve = $auPRC") // Compute thresholds used in ROC and PR curves val thresholds = precision.map(_._1) @@ -96,7 +96,7 @@ object BinaryClassificationMetricsExample { // AUROC val auROC = metrics.areaUnderROC - println("Area under ROC = " + auROC) + println(s"Area under ROC = $auROC") // $example off$ sc.stop() } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeClassificationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeClassificationExample.scala index b50b4592777c..c2f89b72c9a2 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeClassificationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeClassificationExample.scala @@ -55,8 +55,8 @@ object DecisionTreeClassificationExample { (point.label, prediction) } val testErr = labelAndPreds.filter(r => r._1 != r._2).count().toDouble / testData.count() - println("Test Error = " + testErr) - println("Learned classification tree model:\n" + model.toDebugString) + println(s"Test Error = $testErr") + println(s"Learned classification tree model:\n ${model.toDebugString}") // Save and load model model.save(sc, "target/tmp/myDecisionTreeClassificationModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRegressionExample.scala index 2af45afae3d5..1ecf6426e1f9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRegressionExample.scala @@ -54,8 +54,8 @@ object DecisionTreeRegressionExample { (point.label, prediction) } val testMSE = labelsAndPredictions.map{ case (v, p) => math.pow(v - p, 2) }.mean() - println("Test Mean Squared Error = " + testMSE) - println("Learned regression tree model:\n" + model.toDebugString) + println(s"Test Mean Squared Error = $testMSE") + println(s"Learned regression tree model:\n ${model.toDebugString}") // Save and load model model.save(sc, "target/tmp/myDecisionTreeRegressionModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala index 6435abc12775..f724ee1030f0 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala +++ 
b/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala @@ -74,7 +74,7 @@ object FPGrowthExample { println(s"Number of frequent itemsets: ${model.freqItemsets.count()}") model.freqItemsets.collect().foreach { itemset => - println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq) + println(s"${itemset.items.mkString("[", ",", "]")}, ${itemset.freq}") } sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingClassificationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingClassificationExample.scala index 00bb3348d2a3..3c56e1941aec 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingClassificationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingClassificationExample.scala @@ -54,8 +54,8 @@ object GradientBoostingClassificationExample { (point.label, prediction) } val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() - println("Test Error = " + testErr) - println("Learned classification GBT model:\n" + model.toDebugString) + println(s"Test Error = $testErr") + println(s"Learned classification GBT model:\n ${model.toDebugString}") // Save and load model model.save(sc, "target/tmp/myGradientBoostingClassificationModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingRegressionExample.scala index d8c263460839..c288bf29bf25 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingRegressionExample.scala @@ -53,8 +53,8 @@ object GradientBoostingRegressionExample { (point.label, prediction) } val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 2)}.mean() - println("Test Mean Squared Error = " + testMSE) - println("Learned regression GBT model:\n" + model.toDebugString) + println(s"Test Mean Squared Error = $testMSE") + println(s"Learned regression GBT model:\n ${model.toDebugString}") // Save and load model model.save(sc, "target/tmp/myGradientBoostingRegressionModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala index 0d391a3637c0..add171973953 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala @@ -68,7 +68,7 @@ object HypothesisTestingExample { // against the label. 
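The HypothesisTestingExample change here only reformats how each per-column result is printed; for context, a minimal self-contained sketch of the Statistics.chiSqTest call being exercised, with toy LabeledPoint observations whose values are purely illustrative, assuming the Spark 2.3 spark.mllib API:

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.stat.test.ChiSqTestResult

object ChiSqPerFeatureSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("ChiSqPerFeatureSketch"))

    // Toy observations: each LabeledPoint pairs a label with categorical feature values.
    val obs = sc.parallelize(Seq(
      LabeledPoint(0.0, Vectors.dense(1.0, 0.0, 3.0)),
      LabeledPoint(1.0, Vectors.dense(4.0, 5.0, 0.0)),
      LabeledPoint(1.0, Vectors.dense(3.0, 2.0, 1.0))
    ))

    // One ChiSqTestResult per feature column, testing independence from the label.
    val results: Array[ChiSqTestResult] = Statistics.chiSqTest(obs)
    results.zipWithIndex.foreach { case (result, i) =>
      println(s"Column ${i + 1}:")
      println(result)
    }

    sc.stop()
  }
}
```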
val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs) featureTestResults.zipWithIndex.foreach { case (k, v) => - println("Column " + (v + 1).toString + ":") + println(s"Column ${(v + 1)} :") println(k) } // summary of the test // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/IsotonicRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/IsotonicRegressionExample.scala index 4aee951f5b04..a10d6f0dda88 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/IsotonicRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/IsotonicRegressionExample.scala @@ -56,7 +56,7 @@ object IsotonicRegressionExample { // Calculate mean squared error between predicted and real labels. val meanSquaredError = predictionAndLabel.map { case (p, l) => math.pow((p - l), 2) }.mean() - println("Mean Squared Error = " + meanSquaredError) + println(s"Mean Squared Error = $meanSquaredError") // Save and load model model.save(sc, "target/tmp/myIsotonicRegressionModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/KMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/KMeansExample.scala index c4d71d862f37..b0a6f1671a89 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/KMeansExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/KMeansExample.scala @@ -43,7 +43,7 @@ object KMeansExample { // Evaluate clustering by computing Within Set Sum of Squared Errors val WSSSE = clusters.computeCost(parsedData) - println("Within Set Sum of Squared Errors = " + WSSSE) + println(s"Within Set Sum of Squared Errors = $WSSSE") // Save and load model clusters.save(sc, "target/org/apache/spark/KMeansExample/KMeansModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LBFGSExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LBFGSExample.scala index fedcefa09838..123782fa6b9c 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/LBFGSExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LBFGSExample.scala @@ -82,7 +82,7 @@ object LBFGSExample { println("Loss of each step in training process") loss.foreach(println) - println("Area under ROC = " + auROC) + println(s"Area under ROC = $auROC") // $example off$ sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LatentDirichletAllocationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LatentDirichletAllocationExample.scala index f2c8ec01439f..d25962c5500e 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/LatentDirichletAllocationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LatentDirichletAllocationExample.scala @@ -42,11 +42,13 @@ object LatentDirichletAllocationExample { val ldaModel = new LDA().setK(3).run(corpus) // Output topics. 
Each is a distribution over words (matching word count vectors) - println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize + " words):") + println(s"Learned topics (as distributions over vocab of ${ldaModel.vocabSize} words):") val topics = ldaModel.topicsMatrix for (topic <- Range(0, 3)) { - print("Topic " + topic + ":") - for (word <- Range(0, ldaModel.vocabSize)) { print(" " + topics(word, topic)); } + print(s"Topic $topic :") + for (word <- Range(0, ldaModel.vocabSize)) { + print(s"${topics(word, topic)}") + } println() } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegressionWithSGDExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegressionWithSGDExample.scala index d39961809448..449b725d1d17 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegressionWithSGDExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegressionWithSGDExample.scala @@ -52,7 +52,7 @@ object LinearRegressionWithSGDExample { (point.label, prediction) } val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2) }.mean() - println("training Mean Squared Error = " + MSE) + println(s"training Mean Squared Error $MSE") // Save and load model model.save(sc, "target/tmp/scalaLinearRegressionWithSGDModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/PCAExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/PCAExample.scala index eb36697d94ba..eff2393cc3ab 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/PCAExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/PCAExample.scala @@ -65,8 +65,8 @@ object PCAExample { val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean() val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean() - println("Mean Squared Error = " + MSE) - println("PCA Mean Squared Error = " + MSE_pca) + println(s"Mean Squared Error = $MSE") + println(s"PCA Mean Squared Error = $MSE_pca") // $example off$ sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/PMMLModelExportExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/PMMLModelExportExample.scala index d74d74a37fb1..96deafd469bc 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/PMMLModelExportExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/PMMLModelExportExample.scala @@ -41,7 +41,7 @@ object PMMLModelExportExample { val clusters = KMeans.train(parsedData, numClusters, numIterations) // Export to PMML to a String in PMML format - println("PMML Model:\n" + clusters.toPMML) + println(s"PMML Model:\n ${clusters.toPMML}") // Export the model to a local file in PMML format clusters.toPMML("/tmp/kmeans.xml") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/PrefixSpanExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/PrefixSpanExample.scala index 69c72c433657..8b789277774a 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/PrefixSpanExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/PrefixSpanExample.scala @@ -42,8 +42,8 @@ object PrefixSpanExample { val model = prefixSpan.run(sequences) model.freqSequences.collect().foreach { freqSequence => println( - freqSequence.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]") + - ", " + freqSequence.freq) + s"${freqSequence.sequence.map(_.mkString("[", ", ", 
"]")).mkString("[", ", ", "]")}," + + s" ${freqSequence.freq}") } // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestClassificationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestClassificationExample.scala index f1ebdf1a733e..246e71de2561 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestClassificationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestClassificationExample.scala @@ -55,8 +55,8 @@ object RandomForestClassificationExample { (point.label, prediction) } val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() - println("Test Error = " + testErr) - println("Learned classification forest model:\n" + model.toDebugString) + println(s"Test Error = $testErr") + println(s"Learned classification forest model:\n ${model.toDebugString}") // Save and load model model.save(sc, "target/tmp/myRandomForestClassificationModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestRegressionExample.scala index 11d612e651b4..770e30276bc3 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestRegressionExample.scala @@ -55,8 +55,8 @@ object RandomForestRegressionExample { (point.label, prediction) } val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 2)}.mean() - println("Test Mean Squared Error = " + testMSE) - println("Learned regression forest model:\n" + model.toDebugString) + println(s"Test Mean Squared Error = $testMSE") + println(s"Learned regression forest model:\n ${model.toDebugString}") // Save and load model model.save(sc, "target/tmp/myRandomForestRegressionModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RecommendationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RecommendationExample.scala index 6df742d737e7..0bb2b8c8c2b4 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RecommendationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RecommendationExample.scala @@ -56,7 +56,7 @@ object RecommendationExample { val err = (r1 - r2) err * err }.mean() - println("Mean Squared Error = " + MSE) + println(s"Mean Squared Error = $MSE") // Save and load model model.save(sc, "target/tmp/myCollaborativeFilter") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SVMWithSGDExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SVMWithSGDExample.scala index b73fe9b2b3fa..285e2ce51263 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/SVMWithSGDExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SVMWithSGDExample.scala @@ -57,7 +57,7 @@ object SVMWithSGDExample { val metrics = new BinaryClassificationMetrics(scoreAndLabels) val auROC = metrics.areaUnderROC() - println("Area under ROC = " + auROC) + println(s"Area under ROC = $auROC") // Save and load model model.save(sc, "target/tmp/scalaSVMWithSGDModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SimpleFPGrowth.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SimpleFPGrowth.scala index b5c3033bcba0..694c3bb18b04 100644 --- 
a/examples/src/main/scala/org/apache/spark/examples/mllib/SimpleFPGrowth.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SimpleFPGrowth.scala @@ -42,15 +42,13 @@ object SimpleFPGrowth { val model = fpg.run(transactions) model.freqItemsets.collect().foreach { itemset => - println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq) + println(s"${itemset.items.mkString("[", ",", "]")},${itemset.freq}") } val minConfidence = 0.8 model.generateAssociationRules(minConfidence).collect().foreach { rule => - println( - rule.antecedent.mkString("[", ",", "]") - + " => " + rule.consequent .mkString("[", ",", "]") - + ", " + rule.confidence) + println(s"${rule.antecedent.mkString("[", ",", "]")}=> " + + s"${rule.consequent .mkString("[", ",", "]")},${rule.confidence}") } // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala index 16b074ef6069..3d41bef0af88 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala @@ -41,10 +41,10 @@ object StratifiedSamplingExample { val exactSample = data.sampleByKeyExact(withReplacement = false, fractions = fractions) // $example off$ - println("approxSample size is " + approxSample.collect().size.toString) + println(s"approxSample size is ${approxSample.collect().size}") approxSample.collect().foreach(println) - println("exactSample its size is " + exactSample.collect().size.toString) + println(s"exactSample its size is ${exactSample.collect().size}") exactSample.collect().foreach(println) sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnyPCA.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnyPCA.scala index 03bc675299c5..071d341b8161 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnyPCA.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnyPCA.scala @@ -54,7 +54,7 @@ object TallSkinnyPCA { // Compute principal components. val pc = mat.computePrincipalComponents(mat.numCols().toInt) - println("Principal components are:\n" + pc) + println(s"Principal components are:\n $pc") sc.stop() } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnySVD.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnySVD.scala index 067e49b9599e..8ae6de16d80e 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnySVD.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnySVD.scala @@ -54,7 +54,7 @@ object TallSkinnySVD { // Compute SVD. 
val svd = mat.computeSVD(mat.numCols().toInt) - println("Singular values are " + svd.s) + println(s"Singular values are ${svd.s}") sc.stop() } diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/CustomReceiver.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/CustomReceiver.scala index 43044d01b120..25c7bf287197 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/CustomReceiver.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/CustomReceiver.scala @@ -82,9 +82,9 @@ class CustomReceiver(host: String, port: Int) var socket: Socket = null var userInput: String = null try { - logInfo("Connecting to " + host + ":" + port) + logInfo(s"Connecting to $host : $port") socket = new Socket(host, port) - logInfo("Connected to " + host + ":" + port) + logInfo(s"Connected to $host : $port") val reader = new BufferedReader( new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8)) userInput = reader.readLine() @@ -98,7 +98,7 @@ class CustomReceiver(host: String, port: Int) restart("Trying to connect again") } catch { case e: java.net.ConnectException => - restart("Error connecting to " + host + ":" + port, e) + restart(s"Error connecting to $host : $port", e) case t: Throwable => restart("Error receiving data", t) } diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/RawNetworkGrep.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/RawNetworkGrep.scala index 5322929d177b..437ccf0898d7 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/RawNetworkGrep.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/RawNetworkGrep.scala @@ -54,7 +54,7 @@ object RawNetworkGrep { ssc.rawSocketStream[String](host, port, StorageLevel.MEMORY_ONLY_SER_2)).toArray val union = ssc.union(rawStreams) union.filter(_.contains("the")).count().foreachRDD(r => - println("Grep count: " + r.collect().mkString)) + println(s"Grep count: ${r.collect().mkString}")) ssc.start() ssc.awaitTermination() } diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/RecoverableNetworkWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/RecoverableNetworkWordCount.scala index 49c042732113..f018f3a26d2e 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/RecoverableNetworkWordCount.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/RecoverableNetworkWordCount.scala @@ -130,10 +130,10 @@ object RecoverableNetworkWordCount { true } }.collect().mkString("[", ", ", "]") - val output = "Counts at time " + time + " " + counts + val output = s"Counts at time $time $counts" println(output) - println("Dropped " + droppedWordsCounter.value + " word(s) totally") - println("Appending to " + outputFile.getAbsolutePath) + println(s"Dropped ${droppedWordsCounter.value} word(s) totally") + println(s"Appending to ${outputFile.getAbsolutePath}") Files.append(output + "\n", outputFile, Charset.defaultCharset()) } ssc @@ -141,7 +141,7 @@ object RecoverableNetworkWordCount { def main(args: Array[String]) { if (args.length != 4) { - System.err.println("Your arguments were " + args.mkString("[", ", ", "]")) + System.err.println(s"Your arguments were ${args.mkString("[", ", ", "]")}") System.err.println( """ |Usage: RecoverableNetworkWordCount diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala 
b/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala index 0ddd065f0db2..2108bc63edea 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala @@ -90,13 +90,13 @@ object PageViewGenerator { val viewsPerSecond = args(1).toFloat val sleepDelayMs = (1000.0 / viewsPerSecond).toInt val listener = new ServerSocket(port) - println("Listening on port: " + port) + println(s"Listening on port: $port") while (true) { val socket = listener.accept() new Thread() { override def run(): Unit = { - println("Got client connected from: " + socket.getInetAddress) + println(s"Got client connected from: ${socket.getInetAddress}") val out = new PrintWriter(socket.getOutputStream(), true) while (true) { diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewStream.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewStream.scala index 1ba093f57b32..b8e7c7e9e915 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewStream.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewStream.scala @@ -104,8 +104,8 @@ object PageViewStream { .foreachRDD((rdd, time) => rdd.join(userList) .map(_._2._2) .take(10) - .foreach(u => println("Saw user %s at time %s".format(u, time)))) - case _ => println("Invalid metric entered: " + metric) + .foreach(u => println(s"Saw user $u at time $time"))) + case _ => println(s"Invalid metric entered: $metric") } ssc.start() diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index 485b562dce99..6c4031b392f6 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/docker/spark-test/base/Dockerfile b/external/docker/spark-test/base/Dockerfile index 5a95a9387c31..c70cd7136767 100644 --- a/external/docker/spark-test/base/Dockerfile +++ b/external/docker/spark-test/base/Dockerfile @@ -15,14 +15,14 @@ # limitations under the License. 
# -FROM ubuntu:precise +FROM ubuntu:xenial # Upgrade package index -# install a few other useful packages plus Open Jdk 7 +# install a few other useful packages plus Open Jdk 8 # Remove unneeded /var/lib/apt/lists/* after install to reduce the # docker image size (by ~30MB) RUN apt-get update && \ - apt-get install -y less openjdk-7-jre-headless net-tools vim-tiny sudo openssh-server && \ + apt-get install -y less openjdk-8-jre-headless iproute2 vim-tiny sudo openssh-server && \ rm -rf /var/lib/apt/lists/* ENV SCALA_VERSION 2.11.8 diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 71016bc645ca..6935ade33546 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 12630840e79d..60aa5fb1a103 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 87a09642405a..d8ed3364c4bd 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index d6f97316b326..15d550d3bb44 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 0c9f0aa765a3..7ee7156efe22 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala new file mode 100644 index 000000000000..fc977977504f --- /dev/null +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala @@ -0,0 +1,260 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.kafka010 + +import java.{util => ju} +import java.util.concurrent.TimeoutException + +import org.apache.kafka.clients.consumer.{ConsumerRecord, OffsetOutOfRangeException} +import org.apache.kafka.common.TopicPartition + +import org.apache.spark.TaskContext +import org.apache.spark.internal.Logging +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter} +import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.kafka010.KafkaSource.{INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE, INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE} +import org.apache.spark.sql.sources.v2.reader._ +import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousDataReader, ContinuousReader, Offset, PartitionOffset} +import org.apache.spark.sql.types.StructType +import org.apache.spark.unsafe.types.UTF8String + +/** + * A [[ContinuousReader]] for data from kafka. + * + * @param offsetReader a reader used to get kafka offsets. Note that the actual data will be + * read by per-task consumers generated later. + * @param kafkaParams String params for per-task Kafka consumers. + * @param sourceOptions The [[org.apache.spark.sql.sources.v2.DataSourceV2Options]] params which + * are not Kafka consumer params. + * @param metadataPath Path to a directory this reader can use for writing metadata. + * @param initialOffsets The Kafka offsets to start reading data at. + * @param failOnDataLoss Flag indicating whether reading should fail in data loss + * scenarios, where some offsets after the specified initial ones can't be + * properly read. + */ +class KafkaContinuousReader( + offsetReader: KafkaOffsetReader, + kafkaParams: ju.Map[String, Object], + sourceOptions: Map[String, String], + metadataPath: String, + initialOffsets: KafkaOffsetRangeLimit, + failOnDataLoss: Boolean) + extends ContinuousReader with SupportsScanUnsafeRow with Logging { + + private lazy val session = SparkSession.getActiveSession.get + private lazy val sc = session.sparkContext + + private val pollTimeoutMs = sourceOptions.getOrElse("kafkaConsumer.pollTimeoutMs", "512").toLong + + // Initialized when creating read tasks. If this diverges from the partitions at the latest + // offsets, we need to reconfigure. + // Exposed outside this object only for unit tests. 
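KafkaContinuousReader is a DataSourceV2 reader rather than something user code calls directly; the continuous path would typically be reached through the ordinary readStream/writeStream API with a continuous trigger. A minimal sketch, where the broker address, topic name and checkpoint path are placeholders and the API used is the public Spark 2.3 structured streaming surface:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.Trigger

object KafkaContinuousReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("KafkaContinuousReadSketch").getOrCreate()

    // Placeholder broker list and topic name.
    val df = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "host1:9092")
      .option("subscribe", "events")
      .option("failOnDataLoss", "false")
      .load()

    // The Kafka source schema carries key, value, topic, partition, offset,
    // timestamp and timestampType; key and value are binary.
    val query = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
      .writeStream
      .format("console")
      .trigger(Trigger.Continuous("1 second"))                  // selects continuous execution
      .option("checkpointLocation", "/tmp/kafka-continuous-cp") // placeholder path
      .start()

    query.awaitTermination()
  }
}
```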
+ private[sql] var knownPartitions: Set[TopicPartition] = _ + + override def readSchema: StructType = KafkaOffsetReader.kafkaSchema + + private var offset: Offset = _ + override def setOffset(start: ju.Optional[Offset]): Unit = { + offset = start.orElse { + val offsets = initialOffsets match { + case EarliestOffsetRangeLimit => KafkaSourceOffset(offsetReader.fetchEarliestOffsets()) + case LatestOffsetRangeLimit => KafkaSourceOffset(offsetReader.fetchLatestOffsets()) + case SpecificOffsetRangeLimit(p) => offsetReader.fetchSpecificOffsets(p, reportDataLoss) + } + logInfo(s"Initial offsets: $offsets") + offsets + } + } + + override def getStartOffset(): Offset = offset + + override def deserializeOffset(json: String): Offset = { + KafkaSourceOffset(JsonUtils.partitionOffsets(json)) + } + + override def createUnsafeRowReadTasks(): ju.List[ReadTask[UnsafeRow]] = { + import scala.collection.JavaConverters._ + + val oldStartPartitionOffsets = KafkaSourceOffset.getPartitionOffsets(offset) + + val currentPartitionSet = offsetReader.fetchEarliestOffsets().keySet + val newPartitions = currentPartitionSet.diff(oldStartPartitionOffsets.keySet) + val newPartitionOffsets = offsetReader.fetchEarliestOffsets(newPartitions.toSeq) + + val deletedPartitions = oldStartPartitionOffsets.keySet.diff(currentPartitionSet) + if (deletedPartitions.nonEmpty) { + reportDataLoss(s"Some partitions were deleted: $deletedPartitions") + } + + val startOffsets = newPartitionOffsets ++ + oldStartPartitionOffsets.filterKeys(!deletedPartitions.contains(_)) + knownPartitions = startOffsets.keySet + + startOffsets.toSeq.map { + case (topicPartition, start) => + KafkaContinuousReadTask( + topicPartition, start, kafkaParams, pollTimeoutMs, failOnDataLoss) + .asInstanceOf[ReadTask[UnsafeRow]] + }.asJava + } + + /** Stop this source and free any resources it has allocated. */ + def stop(): Unit = synchronized { + offsetReader.close() + } + + override def commit(end: Offset): Unit = {} + + override def mergeOffsets(offsets: Array[PartitionOffset]): Offset = { + val mergedMap = offsets.map { + case KafkaSourcePartitionOffset(p, o) => Map(p -> o) + }.reduce(_ ++ _) + KafkaSourceOffset(mergedMap) + } + + override def needsReconfiguration(): Boolean = { + knownPartitions != null && offsetReader.fetchLatestOffsets().keySet != knownPartitions + } + + override def toString(): String = s"KafkaSource[$offsetReader]" + + /** + * If `failOnDataLoss` is true, this method will throw an `IllegalStateException`. + * Otherwise, just log a warning. + */ + private def reportDataLoss(message: String): Unit = { + if (failOnDataLoss) { + throw new IllegalStateException(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE") + } else { + logWarning(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE") + } + } +} + +/** + * A read task for continuous Kafka processing. This will be serialized and transformed into a + * full reader on executors. + * + * @param topicPartition The (topic, partition) pair this task is responsible for. + * @param startOffset The offset to start reading from within the partition. + * @param kafkaParams Kafka consumer params to use. + * @param pollTimeoutMs The timeout for Kafka consumer polling. + * @param failOnDataLoss Flag indicating whether data reader should fail if some offsets + * are skipped. 
+ */ +case class KafkaContinuousReadTask( + topicPartition: TopicPartition, + startOffset: Long, + kafkaParams: ju.Map[String, Object], + pollTimeoutMs: Long, + failOnDataLoss: Boolean) extends ReadTask[UnsafeRow] { + override def createDataReader(): KafkaContinuousDataReader = { + new KafkaContinuousDataReader( + topicPartition, startOffset, kafkaParams, pollTimeoutMs, failOnDataLoss) + } +} + +/** + * A per-task data reader for continuous Kafka processing. + * + * @param topicPartition The (topic, partition) pair this data reader is responsible for. + * @param startOffset The offset to start reading from within the partition. + * @param kafkaParams Kafka consumer params to use. + * @param pollTimeoutMs The timeout for Kafka consumer polling. + * @param failOnDataLoss Flag indicating whether data reader should fail if some offsets + * are skipped. + */ +class KafkaContinuousDataReader( + topicPartition: TopicPartition, + startOffset: Long, + kafkaParams: ju.Map[String, Object], + pollTimeoutMs: Long, + failOnDataLoss: Boolean) extends ContinuousDataReader[UnsafeRow] { + private val topic = topicPartition.topic + private val kafkaPartition = topicPartition.partition + private val consumer = CachedKafkaConsumer.createUncached(topic, kafkaPartition, kafkaParams) + + private val sharedRow = new UnsafeRow(7) + private val bufferHolder = new BufferHolder(sharedRow) + private val rowWriter = new UnsafeRowWriter(bufferHolder, 7) + + private var nextKafkaOffset = startOffset + private var currentRecord: ConsumerRecord[Array[Byte], Array[Byte]] = _ + + override def next(): Boolean = { + var r: ConsumerRecord[Array[Byte], Array[Byte]] = null + while (r == null) { + if (TaskContext.get().isInterrupted() || TaskContext.get().isCompleted()) return false + // Our consumer.get is not interruptible, so we have to set a low poll timeout, leaving + // interrupt points to end the query rather than waiting for new data that might never come. + try { + r = consumer.get( + nextKafkaOffset, + untilOffset = Long.MaxValue, + pollTimeoutMs, + failOnDataLoss) + } catch { + // We didn't read within the timeout. We're supposed to block indefinitely for new data, so + // swallow and ignore this. + case _: TimeoutException => + + // This is a failOnDataLoss exception. Retry if nextKafkaOffset is within the data range, + // or if it's the endpoint of the data range (i.e. the "true" next offset). 
+ case e: IllegalStateException if e.getCause.isInstanceOf[OffsetOutOfRangeException] => + val range = consumer.getAvailableOffsetRange() + if (range.latest >= nextKafkaOffset && range.earliest <= nextKafkaOffset) { + // retry + } else { + throw e + } + } + } + nextKafkaOffset = r.offset + 1 + currentRecord = r + true + } + + override def get(): UnsafeRow = { + bufferHolder.reset() + + if (currentRecord.key == null) { + rowWriter.setNullAt(0) + } else { + rowWriter.write(0, currentRecord.key) + } + rowWriter.write(1, currentRecord.value) + rowWriter.write(2, UTF8String.fromString(currentRecord.topic)) + rowWriter.write(3, currentRecord.partition) + rowWriter.write(4, currentRecord.offset) + rowWriter.write(5, + DateTimeUtils.fromJavaTimestamp(new java.sql.Timestamp(currentRecord.timestamp))) + rowWriter.write(6, currentRecord.timestampType.id) + sharedRow.setTotalSize(bufferHolder.totalSize) + sharedRow + } + + override def getOffset(): KafkaSourcePartitionOffset = { + KafkaSourcePartitionOffset(topicPartition, nextKafkaOffset) + } + + override def close(): Unit = { + consumer.close() + } +} diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala new file mode 100644 index 000000000000..9843f469c5b2 --- /dev/null +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.kafka010 + +import org.apache.kafka.clients.producer.{Callback, ProducerRecord, RecordMetadata} +import scala.collection.JavaConverters._ + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Literal, UnsafeProjection} +import org.apache.spark.sql.kafka010.KafkaSourceProvider.{kafkaParamsForProducer, TOPIC_OPTION_KEY} +import org.apache.spark.sql.kafka010.KafkaWriter.validateQuery +import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter +import org.apache.spark.sql.sources.v2.writer._ +import org.apache.spark.sql.streaming.OutputMode +import org.apache.spark.sql.types.{BinaryType, StringType, StructType} + +/** + * Dummy commit message. The DataSourceV2 framework requires a commit message implementation but we + * don't need to really send one. + */ +case object KafkaWriterCommitMessage extends WriterCommitMessage + +/** + * A [[ContinuousWriter]] for Kafka writing. Responsible for generating the writer factory. + * @param topic The topic this writer is responsible for. 
If None, topic will be inferred from + * a `topic` field in the incoming data. + * @param producerParams Parameters for Kafka producers in each task. + * @param schema The schema of the input data. + */ +class KafkaContinuousWriter( + topic: Option[String], producerParams: Map[String, String], schema: StructType) + extends ContinuousWriter with SupportsWriteInternalRow { + + validateQuery(schema.toAttributes, producerParams.toMap[String, Object].asJava, topic) + + override def createInternalRowWriterFactory(): KafkaContinuousWriterFactory = + KafkaContinuousWriterFactory(topic, producerParams, schema) + + override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} + override def abort(messages: Array[WriterCommitMessage]): Unit = {} +} + +/** + * A [[DataWriterFactory]] for Kafka writing. Will be serialized and sent to executors to generate + * the per-task data writers. + * @param topic The topic that should be written to. If None, topic will be inferred from + * a `topic` field in the incoming data. + * @param producerParams Parameters for Kafka producers in each task. + * @param schema The schema of the input data. + */ +case class KafkaContinuousWriterFactory( + topic: Option[String], producerParams: Map[String, String], schema: StructType) + extends DataWriterFactory[InternalRow] { + + override def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[InternalRow] = { + new KafkaContinuousDataWriter(topic, producerParams, schema.toAttributes) + } +} + +/** + * A [[DataWriter]] for Kafka writing. One data writer will be created in each partition to + * process incoming rows. + * + * @param targetTopic The topic that this data writer is targeting. If None, topic will be inferred + * from a `topic` field in the incoming data. + * @param producerParams Parameters to use for the Kafka producer. + * @param inputSchema The attributes in the input data. + */ +class KafkaContinuousDataWriter( + targetTopic: Option[String], producerParams: Map[String, String], inputSchema: Seq[Attribute]) + extends KafkaRowWriter(inputSchema, targetTopic) with DataWriter[InternalRow] { + import scala.collection.JavaConverters._ + + private lazy val producer = CachedKafkaProducer.getOrCreate( + new java.util.HashMap[String, Object](producerParams.asJava)) + + def write(row: InternalRow): Unit = { + checkForErrors() + sendRow(row, producer) + } + + def commit(): WriterCommitMessage = { + // Send is asynchronous, but we can't commit until all rows are actually in Kafka. + // This requires flushing and then checking that no callbacks produced errors. + // We also check for errors before to fail as soon as possible - the check is cheap. 
+ checkForErrors() + producer.flush() + checkForErrors() + KafkaWriterCommitMessage + } + + def abort(): Unit = {} + + def close(): Unit = { + checkForErrors() + if (producer != null) { + producer.flush() + checkForErrors() + CachedKafkaProducer.close(new java.util.HashMap[String, Object](producerParams.asJava)) + } + } +} diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala index 3e65949a6fd1..551641cfdbca 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala @@ -117,10 +117,14 @@ private[kafka010] class KafkaOffsetReader( * Resolves the specific offsets based on Kafka seek positions. * This method resolves offset value -1 to the latest and -2 to the * earliest Kafka seek position. + * + * @param partitionOffsets the specific offsets to resolve + * @param reportDataLoss callback to either report or log data loss depending on setting */ def fetchSpecificOffsets( - partitionOffsets: Map[TopicPartition, Long]): Map[TopicPartition, Long] = - runUninterruptibly { + partitionOffsets: Map[TopicPartition, Long], + reportDataLoss: String => Unit): KafkaSourceOffset = { + val fetched = runUninterruptibly { withRetriesWithoutInterrupt { // Poll to get the latest assigned partitions consumer.poll(0) @@ -145,6 +149,19 @@ private[kafka010] class KafkaOffsetReader( } } + partitionOffsets.foreach { + case (tp, off) if off != KafkaOffsetRangeLimit.LATEST && + off != KafkaOffsetRangeLimit.EARLIEST => + if (fetched(tp) != off) { + reportDataLoss( + s"startingOffsets for $tp was $off but consumer reset to ${fetched(tp)}") + } + case _ => + // no real way to check that beginning or end is reasonable + } + KafkaSourceOffset(fetched) + } + /** * Fetch the earliest offsets for the topic partitions that are indicated * in the [[ConsumerStrategy]]. 
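For context on how the classes added above are exercised end to end: with a continuous trigger, KafkaSourceProvider resolves the source to KafkaContinuousReader (one KafkaContinuousReadTask per topic partition on the executors) and the sink to KafkaContinuousWriter, which reuses the shared row-writing logic to push rows to the Kafka producer. Below is a minimal usage sketch that mirrors the options used in the new test suites later in this patch; the broker address, topic names and checkpoint path are placeholders, not values taken from the patch.

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.streaming.Trigger

    val spark = SparkSession.builder().appName("kafka-continuous-sketch").getOrCreate()

    // Continuous source: under a continuous trigger this goes through
    // KafkaSourceProvider.createContinuousReader.
    val input = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")  // placeholder broker
      .option("subscribe", "in-topic")                       // placeholder topic
      .option("startingOffsets", "earliest")
      .load()

    // Continuous sink: goes through KafkaSourceProvider.createContinuousWriter. Rows need a
    // string or binary 'value' column (and optionally 'key'); the target topic comes from the
    // 'topic' option or a 'topic' column in the data.
    val query = input
      .selectExpr("CAST(key AS STRING) AS key", "CAST(value AS STRING) AS value")
      .writeStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")   // placeholder broker
      .option("topic", "out-topic")                           // placeholder topic
      .option("checkpointLocation", "/tmp/continuous-cp")     // placeholder path
      .trigger(Trigger.Continuous(1000))  // long-running continuous epochs instead of micro-batches
      .start()

With Trigger.Continuous the query runs as long-lived tasks; KafkaContinuousReader.needsReconfiguration() signals the continuous execution to restart those tasks whenever the set of Kafka partitions it knows about no longer matches the latest fetched partitions, which is what the topic-deletion test later in this patch relies on.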
diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala index e9cff04ba5f2..169a5d006fb0 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala @@ -130,7 +130,7 @@ private[kafka010] class KafkaSource( val offsets = startingOffsets match { case EarliestOffsetRangeLimit => KafkaSourceOffset(kafkaReader.fetchEarliestOffsets()) case LatestOffsetRangeLimit => KafkaSourceOffset(kafkaReader.fetchLatestOffsets()) - case SpecificOffsetRangeLimit(p) => fetchAndVerify(p) + case SpecificOffsetRangeLimit(p) => kafkaReader.fetchSpecificOffsets(p, reportDataLoss) } metadataLog.add(0, offsets) logInfo(s"Initial offsets: $offsets") @@ -138,21 +138,6 @@ private[kafka010] class KafkaSource( }.partitionToOffsets } - private def fetchAndVerify(specificOffsets: Map[TopicPartition, Long]) = { - val result = kafkaReader.fetchSpecificOffsets(specificOffsets) - specificOffsets.foreach { - case (tp, off) if off != KafkaOffsetRangeLimit.LATEST && - off != KafkaOffsetRangeLimit.EARLIEST => - if (result(tp) != off) { - reportDataLoss( - s"startingOffsets for $tp was $off but consumer reset to ${result(tp)}") - } - case _ => - // no real way to check that beginning or end is reasonable - } - KafkaSourceOffset(result) - } - private var currentPartitionOffsets: Option[Map[TopicPartition, Long]] = None override def schema: StructType = KafkaOffsetReader.kafkaSchema @@ -223,6 +208,14 @@ private[kafka010] class KafkaSource( logInfo(s"GetBatch called with start = $start, end = $end") val untilPartitionOffsets = KafkaSourceOffset.getPartitionOffsets(end) + // On recovery, getBatch will get called before getOffset + if (currentPartitionOffsets.isEmpty) { + currentPartitionOffsets = Some(untilPartitionOffsets) + } + if (start.isDefined && start.get == end) { + return sqlContext.internalCreateDataFrame( + sqlContext.sparkContext.emptyRDD, schema, isStreaming = true) + } val fromPartitionOffsets = start match { case Some(prevBatchEndOffset) => KafkaSourceOffset.getPartitionOffsets(prevBatchEndOffset) @@ -305,11 +298,6 @@ private[kafka010] class KafkaSource( logInfo("GetBatch generating RDD of offset range: " + offsetRanges.sortBy(_.topicPartition.toString).mkString(", ")) - // On recovery, getBatch will get called before getOffset - if (currentPartitionOffsets.isEmpty) { - currentPartitionOffsets = Some(untilPartitionOffsets) - } - sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true) } diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala index b5da415b3097..c82154cfbad7 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala @@ -20,17 +20,22 @@ package org.apache.spark.sql.kafka010 import org.apache.kafka.common.TopicPartition import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset} +import org.apache.spark.sql.sources.v2.streaming.reader.{Offset => OffsetV2, PartitionOffset} /** * An [[Offset]] for the [[KafkaSource]]. This one tracks all partitions of subscribed topics and * their offsets. 
*/ private[kafka010] -case class KafkaSourceOffset(partitionToOffsets: Map[TopicPartition, Long]) extends Offset { +case class KafkaSourceOffset(partitionToOffsets: Map[TopicPartition, Long]) extends OffsetV2 { override val json = JsonUtils.partitionOffsets(partitionToOffsets) } +private[kafka010] +case class KafkaSourcePartitionOffset(topicPartition: TopicPartition, partitionOffset: Long) + extends PartitionOffset + /** Companion object of the [[KafkaSourceOffset]] */ private[kafka010] object KafkaSourceOffset { diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala index 3cb4d8cad12c..3914370a9659 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.kafka010 import java.{util => ju} -import java.util.{Locale, UUID} +import java.util.{Locale, Optional, UUID} import scala.collection.JavaConverters._ @@ -27,9 +27,12 @@ import org.apache.kafka.clients.producer.ProducerConfig import org.apache.kafka.common.serialization.{ByteArrayDeserializer, ByteArraySerializer} import org.apache.spark.internal.Logging -import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext} -import org.apache.spark.sql.execution.streaming.{Sink, Source} +import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SparkSession, SQLContext} +import org.apache.spark.sql.execution.streaming.{Offset, Sink, Source} import org.apache.spark.sql.sources._ +import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} +import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, ContinuousWriteSupport} +import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType @@ -43,6 +46,8 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister with StreamSinkProvider with RelationProvider with CreatableRelationProvider + with ContinuousWriteSupport + with ContinuousReadSupport with Logging { import KafkaSourceProvider._ @@ -101,6 +106,43 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister failOnDataLoss(caseInsensitiveParams)) } + override def createContinuousReader( + schema: Optional[StructType], + metadataPath: String, + options: DataSourceV2Options): KafkaContinuousReader = { + val parameters = options.asMap().asScala.toMap + validateStreamOptions(parameters) + // Each running query should use its own group id. Otherwise, the query may be only assigned + // partial data since Kafka will assign partitions to multiple consumers having the same group + // id. Hence, we should generate a unique id for each query. 
+ val uniqueGroupId = s"spark-kafka-source-${UUID.randomUUID}-${metadataPath.hashCode}" + + val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } + val specifiedKafkaParams = + parameters + .keySet + .filter(_.toLowerCase(Locale.ROOT).startsWith("kafka.")) + .map { k => k.drop(6).toString -> parameters(k) } + .toMap + + val startingStreamOffsets = KafkaSourceProvider.getKafkaOffsetRangeLimit(caseInsensitiveParams, + STARTING_OFFSETS_OPTION_KEY, LatestOffsetRangeLimit) + + val kafkaOffsetReader = new KafkaOffsetReader( + strategy(caseInsensitiveParams), + kafkaParamsForDriver(specifiedKafkaParams), + parameters, + driverGroupIdPrefix = s"$uniqueGroupId-driver") + + new KafkaContinuousReader( + kafkaOffsetReader, + kafkaParamsForExecutors(specifiedKafkaParams, uniqueGroupId), + parameters, + metadataPath, + startingStreamOffsets, + failOnDataLoss(caseInsensitiveParams)) + } + /** * Returns a new base relation with the given parameters. * @@ -181,26 +223,22 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister } } - private def kafkaParamsForProducer(parameters: Map[String, String]): Map[String, String] = { - val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } - if (caseInsensitiveParams.contains(s"kafka.${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}")) { - throw new IllegalArgumentException( - s"Kafka option '${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}' is not supported as keys " - + "are serialized with ByteArraySerializer.") - } + override def createContinuousWriter( + queryId: String, + schema: StructType, + mode: OutputMode, + options: DataSourceV2Options): Optional[ContinuousWriter] = { + import scala.collection.JavaConverters._ - if (caseInsensitiveParams.contains(s"kafka.${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}")) - { - throw new IllegalArgumentException( - s"Kafka option '${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}' is not supported as " - + "value are serialized with ByteArraySerializer.") - } - parameters - .keySet - .filter(_.toLowerCase(Locale.ROOT).startsWith("kafka.")) - .map { k => k.drop(6).toString -> parameters(k) } - .toMap + (ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName, - ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName) + val spark = SparkSession.getActiveSession.get + val topic = Option(options.get(TOPIC_OPTION_KEY).orElse(null)).map(_.trim) + // We convert the options argument from V2 -> Java map -> scala mutable -> scala immutable. 
+ val producerParams = kafkaParamsForProducer(options.asMap.asScala.toMap) + + KafkaWriter.validateQuery( + schema.toAttributes, new java.util.HashMap[String, Object](producerParams.asJava), topic) + + Optional.of(new KafkaContinuousWriter(topic, producerParams, schema)) } private def strategy(caseInsensitiveParams: Map[String, String]) = @@ -450,4 +488,27 @@ private[kafka010] object KafkaSourceProvider extends Logging { def build(): ju.Map[String, Object] = map } + + private[kafka010] def kafkaParamsForProducer( + parameters: Map[String, String]): Map[String, String] = { + val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } + if (caseInsensitiveParams.contains(s"kafka.${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}")) { + throw new IllegalArgumentException( + s"Kafka option '${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}' is not supported as keys " + + "are serialized with ByteArraySerializer.") + } + + if (caseInsensitiveParams.contains(s"kafka.${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}")) + { + throw new IllegalArgumentException( + s"Kafka option '${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}' is not supported as " + + "value are serialized with ByteArraySerializer.") + } + parameters + .keySet + .filter(_.toLowerCase(Locale.ROOT).startsWith("kafka.")) + .map { k => k.drop(6).toString -> parameters(k) } + .toMap + (ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName, + ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName) + } } diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala index 6fd333e2f43b..baa60febf661 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala @@ -33,10 +33,8 @@ import org.apache.spark.sql.types.{BinaryType, StringType} private[kafka010] class KafkaWriteTask( producerConfiguration: ju.Map[String, Object], inputSchema: Seq[Attribute], - topic: Option[String]) { + topic: Option[String]) extends KafkaRowWriter(inputSchema, topic) { // used to synchronize with Kafka callbacks - @volatile private var failedWrite: Exception = null - private val projection = createProjection private var producer: KafkaProducer[Array[Byte], Array[Byte]] = _ /** @@ -46,23 +44,7 @@ private[kafka010] class KafkaWriteTask( producer = CachedKafkaProducer.getOrCreate(producerConfiguration) while (iterator.hasNext && failedWrite == null) { val currentRow = iterator.next() - val projectedRow = projection(currentRow) - val topic = projectedRow.getUTF8String(0) - val key = projectedRow.getBinary(1) - val value = projectedRow.getBinary(2) - if (topic == null) { - throw new NullPointerException(s"null topic present in the data. 
Use the " + - s"${KafkaSourceProvider.TOPIC_OPTION_KEY} option for setting a default topic.") - } - val record = new ProducerRecord[Array[Byte], Array[Byte]](topic.toString, key, value) - val callback = new Callback() { - override def onCompletion(recordMetadata: RecordMetadata, e: Exception): Unit = { - if (failedWrite == null && e != null) { - failedWrite = e - } - } - } - producer.send(record, callback) + sendRow(currentRow, producer) } } @@ -74,8 +56,49 @@ private[kafka010] class KafkaWriteTask( producer = null } } +} + +private[kafka010] abstract class KafkaRowWriter( + inputSchema: Seq[Attribute], topic: Option[String]) { + + // used to synchronize with Kafka callbacks + @volatile protected var failedWrite: Exception = _ + protected val projection = createProjection + + private val callback = new Callback() { + override def onCompletion(recordMetadata: RecordMetadata, e: Exception): Unit = { + if (failedWrite == null && e != null) { + failedWrite = e + } + } + } - private def createProjection: UnsafeProjection = { + /** + * Send the specified row to the producer, with a callback that will save any exception + * to failedWrite. Note that send is asynchronous; subclasses must flush() their producer before + * assuming the row is in Kafka. + */ + protected def sendRow( + row: InternalRow, producer: KafkaProducer[Array[Byte], Array[Byte]]): Unit = { + val projectedRow = projection(row) + val topic = projectedRow.getUTF8String(0) + val key = projectedRow.getBinary(1) + val value = projectedRow.getBinary(2) + if (topic == null) { + throw new NullPointerException(s"null topic present in the data. Use the " + + s"${KafkaSourceProvider.TOPIC_OPTION_KEY} option for setting a default topic.") + } + val record = new ProducerRecord[Array[Byte], Array[Byte]](topic.toString, key, value) + producer.send(record, callback) + } + + protected def checkForErrors(): Unit = { + if (failedWrite != null) { + throw failedWrite + } + } + + private def createProjection = { val topicExpression = topic.map(Literal(_)).orElse { inputSchema.find(_.name == KafkaWriter.TOPIC_ATTRIBUTE_NAME) }.getOrElse { @@ -112,11 +135,5 @@ private[kafka010] class KafkaWriteTask( Seq(topicExpression, Cast(keyExpression, BinaryType), Cast(valueExpression, BinaryType)), inputSchema) } - - private def checkForErrors(): Unit = { - if (failedWrite != null) { - throw failedWrite - } - } } diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala index 5e9ae35b3f00..15cd44812cb0 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala @@ -43,10 +43,9 @@ private[kafka010] object KafkaWriter extends Logging { override def toString: String = "KafkaWriter" def validateQuery( - queryExecution: QueryExecution, + schema: Seq[Attribute], kafkaParameters: ju.Map[String, Object], topic: Option[String] = None): Unit = { - val schema = queryExecution.analyzed.output schema.find(_.name == TOPIC_ATTRIBUTE_NAME).getOrElse( if (topic.isEmpty) { throw new AnalysisException(s"topic option required when no " + @@ -84,7 +83,7 @@ private[kafka010] object KafkaWriter extends Logging { kafkaParameters: ju.Map[String, Object], topic: Option[String] = None): Unit = { val schema = queryExecution.analyzed.output - validateQuery(queryExecution, kafkaParameters, topic) + validateQuery(schema, 
kafkaParameters, topic) queryExecution.toRdd.foreachPartition { iter => val writeTask = new KafkaWriteTask(kafkaParameters, schema, topic) Utils.tryWithSafeFinally(block = writeTask.execute(iter))( diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala new file mode 100644 index 000000000000..8487a6985123 --- /dev/null +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala @@ -0,0 +1,476 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.kafka010 + +import java.util.Locale +import java.util.concurrent.atomic.AtomicInteger + +import org.apache.kafka.clients.producer.ProducerConfig +import org.apache.kafka.common.serialization.ByteArraySerializer +import org.scalatest.time.SpanSugar._ +import scala.collection.JavaConverters._ + +import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SaveMode} +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, SpecificInternalRow, UnsafeProjection} +import org.apache.spark.sql.execution.streaming.MemoryStream +import org.apache.spark.sql.streaming._ +import org.apache.spark.sql.types.{BinaryType, DataType} +import org.apache.spark.util.Utils + +/** + * This is a temporary port of KafkaSinkSuite, since we do not yet have a V2 memory stream. + * Once we have one, this will be changed to a specialization of KafkaSinkSuite and we won't have + * to duplicate all the code. 
+ */ +class KafkaContinuousSinkSuite extends KafkaContinuousTest { + import testImplicits._ + + override val streamingTimeout = 30.seconds + + override def beforeAll(): Unit = { + super.beforeAll() + testUtils = new KafkaTestUtils( + withBrokerProps = Map("auto.create.topics.enable" -> "false")) + testUtils.setup() + } + + override def afterAll(): Unit = { + if (testUtils != null) { + testUtils.teardown() + testUtils = null + } + super.afterAll() + } + + test("streaming - write to kafka with topic field") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + + val topic = newTopic() + testUtils.createTopic(topic) + + val writer = createKafkaWriter( + input.toDF(), + withTopic = None, + withOutputMode = Some(OutputMode.Append))( + withSelectExpr = s"'$topic' as topic", "value") + + val reader = createKafkaReader(topic) + .selectExpr("CAST(key as STRING) key", "CAST(value as STRING) value") + .selectExpr("CAST(key as INT) key", "CAST(value as INT) value") + .as[(Int, Int)] + .map(_._2) + + try { + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + eventually(timeout(streamingTimeout)) { + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5) + } + testUtils.sendMessages(inputTopic, Array("6", "7", "8", "9", "10")) + eventually(timeout(streamingTimeout)) { + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) + } + } finally { + writer.stop() + } + } + + test("streaming - write w/o topic field, with topic option") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + + val topic = newTopic() + testUtils.createTopic(topic) + + val writer = createKafkaWriter( + input.toDF(), + withTopic = Some(topic), + withOutputMode = Some(OutputMode.Append()))() + + val reader = createKafkaReader(topic) + .selectExpr("CAST(key as STRING) key", "CAST(value as STRING) value") + .selectExpr("CAST(key as INT) key", "CAST(value as INT) value") + .as[(Int, Int)] + .map(_._2) + + try { + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + eventually(timeout(streamingTimeout)) { + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5) + } + testUtils.sendMessages(inputTopic, Array("6", "7", "8", "9", "10")) + eventually(timeout(streamingTimeout)) { + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) + } + } finally { + writer.stop() + } + } + + test("streaming - topic field and topic option") { + /* The purpose of this test is to ensure that the topic option + * overrides the topic field. We begin by writing some data that + * includes a topic field and value (e.g., 'foo') along with a topic + * option. 
Then when we read from the topic specified in the option + * we should see the data i.e., the data was written to the topic + * option, and not to the topic in the data e.g., foo + */ + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + + val topic = newTopic() + testUtils.createTopic(topic) + + val writer = createKafkaWriter( + input.toDF(), + withTopic = Some(topic), + withOutputMode = Some(OutputMode.Append()))( + withSelectExpr = "'foo' as topic", "CAST(value as STRING) value") + + val reader = createKafkaReader(topic) + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .selectExpr("CAST(key AS INT)", "CAST(value AS INT)") + .as[(Int, Int)] + .map(_._2) + + try { + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + eventually(timeout(streamingTimeout)) { + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5) + } + testUtils.sendMessages(inputTopic, Array("6", "7", "8", "9", "10")) + eventually(timeout(streamingTimeout)) { + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) + } + } finally { + writer.stop() + } + } + + test("null topic attribute") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + val topic = newTopic() + testUtils.createTopic(topic) + + /* No topic field or topic option */ + var writer: StreamingQuery = null + var ex: Exception = null + try { + writer = createKafkaWriter(input.toDF())( + withSelectExpr = "CAST(null as STRING) as topic", "value" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + eventually(timeout(streamingTimeout)) { + assert(writer.exception.isDefined) + ex = writer.exception.get + } + } finally { + writer.stop() + } + assert(ex.getCause.getCause.getMessage + .toLowerCase(Locale.ROOT) + .contains("null topic present in the data.")) + } + + test("streaming - write data with bad schema") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + val topic = newTopic() + testUtils.createTopic(topic) + + /* No topic field or topic option */ + var writer: StreamingQuery = null + var ex: Exception = null + try { + writer = createKafkaWriter(input.toDF())( + withSelectExpr = "value as key", "value" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + eventually(timeout(streamingTimeout)) { + assert(writer.exception.isDefined) + ex = writer.exception.get + } + } finally { + writer.stop() + } + assert(ex.getMessage + .toLowerCase(Locale.ROOT) + .contains("topic option required when no 'topic' attribute is present")) + + try { + /* No value field */ + writer = createKafkaWriter(input.toDF())( + withSelectExpr = s"'$topic' as topic", "value as key" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + eventually(timeout(streamingTimeout)) { + assert(writer.exception.isDefined) + ex = writer.exception.get + } + } finally { + writer.stop() + } + 
assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( + "required attribute 'value' not found")) + } + + test("streaming - write data with valid schema but wrong types") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + .selectExpr("CAST(value as STRING) value") + val topic = newTopic() + testUtils.createTopic(topic) + + var writer: StreamingQuery = null + var ex: Exception = null + try { + /* topic field wrong type */ + writer = createKafkaWriter(input.toDF())( + withSelectExpr = s"CAST('1' as INT) as topic", "value" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + eventually(timeout(streamingTimeout)) { + assert(writer.exception.isDefined) + ex = writer.exception.get + } + } finally { + writer.stop() + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains("topic type must be a string")) + + try { + /* value field wrong type */ + writer = createKafkaWriter(input.toDF())( + withSelectExpr = s"'$topic' as topic", "CAST(value as INT) as value" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + eventually(timeout(streamingTimeout)) { + assert(writer.exception.isDefined) + ex = writer.exception.get + } + } finally { + writer.stop() + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( + "value attribute type must be a string or binarytype")) + + try { + /* key field wrong type */ + writer = createKafkaWriter(input.toDF())( + withSelectExpr = s"'$topic' as topic", "CAST(value as INT) as key", "value" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + eventually(timeout(streamingTimeout)) { + assert(writer.exception.isDefined) + ex = writer.exception.get + } + } finally { + writer.stop() + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( + "key attribute type must be a string or binarytype")) + } + + test("streaming - write to non-existing topic") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + val topic = newTopic() + + var writer: StreamingQuery = null + var ex: Exception = null + try { + ex = intercept[StreamingQueryException] { + writer = createKafkaWriter(input.toDF(), withTopic = Some(topic))() + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + eventually(timeout(streamingTimeout)) { + assert(writer.exception.isDefined) + } + throw writer.exception.get + } + } finally { + writer.stop() + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains("job aborted")) + } + + test("streaming - exception on config serializer") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + testUtils.sendMessages(inputTopic, Array("0")) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .load() + var writer: StreamingQuery = null + var ex: Exception = null + try { + writer = createKafkaWriter( + input.toDF(), + withOptions = Map("kafka.key.serializer" -> "foo"))() + eventually(timeout(streamingTimeout)) { + assert(writer.exception.isDefined) + ex = writer.exception.get + } + 
assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( + "kafka option 'key.serializer' is not supported")) + } finally { + writer.stop() + } + + try { + writer = createKafkaWriter( + input.toDF(), + withOptions = Map("kafka.value.serializer" -> "foo"))() + eventually(timeout(streamingTimeout)) { + assert(writer.exception.isDefined) + ex = writer.exception.get + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( + "kafka option 'value.serializer' is not supported")) + } finally { + writer.stop() + } + } + + test("generic - write big data with small producer buffer") { + /* This test ensures that we understand the semantics of Kafka when + * is comes to blocking on a call to send when the send buffer is full. + * This test will configure the smallest possible producer buffer and + * indicate that we should block when it is full. Thus, no exception should + * be thrown in the case of a full buffer. + */ + val topic = newTopic() + testUtils.createTopic(topic, 1) + val options = new java.util.HashMap[String, String] + options.put("bootstrap.servers", testUtils.brokerAddress) + options.put("buffer.memory", "16384") // min buffer size + options.put("block.on.buffer.full", "true") + options.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[ByteArraySerializer].getName) + options.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, classOf[ByteArraySerializer].getName) + val inputSchema = Seq(AttributeReference("value", BinaryType)()) + val data = new Array[Byte](15000) // large value + val writeTask = new KafkaContinuousDataWriter(Some(topic), options.asScala.toMap, inputSchema) + try { + val fieldTypes: Array[DataType] = Array(BinaryType) + val converter = UnsafeProjection.create(fieldTypes) + val row = new SpecificInternalRow(fieldTypes) + row.update(0, data) + val iter = Seq.fill(1000)(converter.apply(row)).iterator + iter.foreach(writeTask.write(_)) + writeTask.commit() + } finally { + writeTask.close() + } + } + + private def createKafkaReader(topic: String): DataFrame = { + spark.read + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("startingOffsets", "earliest") + .option("endingOffsets", "latest") + .option("subscribe", topic) + .load() + } + + private def createKafkaWriter( + input: DataFrame, + withTopic: Option[String] = None, + withOutputMode: Option[OutputMode] = None, + withOptions: Map[String, String] = Map[String, String]()) + (withSelectExpr: String*): StreamingQuery = { + var stream: DataStreamWriter[Row] = null + val checkpointDir = Utils.createTempDir() + var df = input.toDF() + if (withSelectExpr.length > 0) { + df = df.selectExpr(withSelectExpr: _*) + } + stream = df.writeStream + .format("kafka") + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + // We need to reduce blocking time to efficiently test non-existent partition behavior. 
+ .option("kafka.max.block.ms", "1000") + .trigger(Trigger.Continuous(1000)) + .queryName("kafkaStream") + withTopic.foreach(stream.option("topic", _)) + withOutputMode.foreach(stream.outputMode(_)) + withOptions.foreach(opt => stream.option(opt._1, opt._2)) + stream.start() + } +} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala new file mode 100644 index 000000000000..b3dade414f62 --- /dev/null +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.kafka010 + +import java.util.Properties +import java.util.concurrent.atomic.AtomicInteger + +import org.scalatest.time.SpanSugar._ +import scala.collection.mutable +import scala.util.Random + +import org.apache.spark.SparkContext +import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row} +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation +import org.apache.spark.sql.execution.streaming.StreamExecution +import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution +import org.apache.spark.sql.streaming.{StreamTest, Trigger} +import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession} + +// Run tests in KafkaSourceSuiteBase in continuous execution mode. 
+class KafkaContinuousSourceSuite extends KafkaSourceSuiteBase with KafkaContinuousTest + +class KafkaContinuousSourceTopicDeletionSuite extends KafkaContinuousTest { + import testImplicits._ + + override val brokerProps = Map("auto.create.topics.enable" -> "false") + + test("subscribing topic by pattern with topic deletions") { + val topicPrefix = newTopic() + val topic = topicPrefix + "-seems" + val topic2 = topicPrefix + "-bad" + testUtils.createTopic(topic, partitions = 5) + testUtils.sendMessages(topic, Array("-1")) + require(testUtils.getLatestOffsets(Set(topic)).size === 5) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("subscribePattern", s"$topicPrefix-.*") + .option("failOnDataLoss", "false") + + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val mapped = kafka.map(kv => kv._2.toInt + 1) + + testStream(mapped)( + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(2, 3, 4), + Execute { query => + testUtils.deleteTopic(topic) + testUtils.createTopic(topic2, partitions = 5) + eventually(timeout(streamingTimeout)) { + assert( + query.lastExecution.logical.collectFirst { + case DataSourceV2Relation(_, r: KafkaContinuousReader) => r + }.exists { r => + // Ensure the new topic is present and the old topic is gone. + r.knownPartitions.exists(_.topic == topic2) + }, + s"query never reconfigured to new topic $topic2") + } + }, + AddKafkaData(Set(topic2), 4, 5, 6), + CheckAnswer(2, 3, 4, 5, 6, 7) + ) + } +} + +class KafkaContinuousSourceStressForDontFailOnDataLossSuite + extends KafkaSourceStressForDontFailOnDataLossSuite { + override protected def startStream(ds: Dataset[Int]) = { + ds.writeStream + .format("memory") + .queryName("memory") + .start() + } +} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala new file mode 100644 index 000000000000..5a1a14f7a307 --- /dev/null +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.kafka010 + +import java.util.concurrent.atomic.AtomicInteger + +import org.apache.spark.SparkContext +import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd, SparkListenerTaskStart} +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation +import org.apache.spark.sql.execution.streaming.StreamExecution +import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution +import org.apache.spark.sql.streaming.Trigger +import org.apache.spark.sql.test.TestSparkSession + +// Trait to configure StreamTest for kafka continuous execution tests. +trait KafkaContinuousTest extends KafkaSourceTest { + override val defaultTrigger = Trigger.Continuous(1000) + override val defaultUseV2Sink = true + + // We need more than the default local[2] to be able to schedule all partitions simultaneously. + override protected def createSparkSession = new TestSparkSession( + new SparkContext( + "local[10]", + "continuous-stream-test-sql-context", + sparkConf.set("spark.sql.testkey", "true"))) + + // In addition to setting the partitions in Kafka, we have to wait until the query has + // reconfigured to the new count so the test framework can hook in properly. + override protected def setTopicPartitions( + topic: String, newCount: Int, query: StreamExecution) = { + testUtils.addPartitions(topic, newCount) + eventually(timeout(streamingTimeout)) { + assert( + query.lastExecution.logical.collectFirst { + case DataSourceV2Relation(_, r: KafkaContinuousReader) => r + }.exists(_.knownPartitions.size == newCount), + s"query never reconfigured to $newCount partitions") + } + } + + // Continuous processing tasks end asynchronously, so test that they actually end. + private val tasksEndedListener = new SparkListener() { + val activeTaskIdCount = new AtomicInteger(0) + + override def onTaskStart(start: SparkListenerTaskStart): Unit = { + activeTaskIdCount.incrementAndGet() + } + + override def onTaskEnd(end: SparkListenerTaskEnd): Unit = { + activeTaskIdCount.decrementAndGet() + } + } + + override def beforeEach(): Unit = { + super.beforeEach() + spark.sparkContext.addSparkListener(tasksEndedListener) + } + + override def afterEach(): Unit = { + eventually(timeout(streamingTimeout)) { + assert(tasksEndedListener.activeTaskIdCount.get() == 0) + } + spark.sparkContext.removeSparkListener(tasksEndedListener) + super.afterEach() + } + + + test("ensure continuous stream is being used") { + val query = spark.readStream + .format("rate") + .option("numPartitions", "1") + .option("rowsPerSecond", "1") + .load() + + testStream(query)( + Execute(q => assert(q.isInstanceOf[ContinuousExecution])) + ) + } +} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index 2034b9be07f2..27dbb3f7a8f3 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -34,11 +34,14 @@ import org.scalatest.concurrent.PatienceConfiguration.Timeout import org.scalatest.time.SpanSugar._ import org.apache.spark.SparkContext -import org.apache.spark.sql.ForeachWriter +import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row} +import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, WriteToDataSourceV2Exec} import org.apache.spark.sql.execution.streaming._ +import 
org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution +import org.apache.spark.sql.execution.streaming.sources.ContinuousMemoryWriter import org.apache.spark.sql.functions.{count, window} import org.apache.spark.sql.kafka010.KafkaSourceProvider._ -import org.apache.spark.sql.streaming.{ProcessingTime, StreamTest} +import org.apache.spark.sql.streaming.{ProcessingTime, StreamTest, Trigger} import org.apache.spark.sql.streaming.util.StreamManualClock import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession} import org.apache.spark.util.Utils @@ -49,9 +52,11 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { override val streamingTimeout = 30.seconds + protected val brokerProps = Map[String, Object]() + override def beforeAll(): Unit = { super.beforeAll() - testUtils = new KafkaTestUtils + testUtils = new KafkaTestUtils(brokerProps) testUtils.setup() } @@ -59,18 +64,25 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { if (testUtils != null) { testUtils.teardown() testUtils = null - super.afterAll() } + super.afterAll() } protected def makeSureGetOffsetCalled = AssertOnQuery { q => // Because KafkaSource's initialPartitionOffsets is set lazily, we need to make sure - // its "getOffset" is called before pushing any data. Otherwise, because of the race contion, + // its "getOffset" is called before pushing any data. Otherwise, because of the race condition, // we don't know which data should be fetched when `startingOffsets` is latest. - q.processAllAvailable() + q match { + case c: ContinuousExecution => c.awaitEpoch(0) + case m: MicroBatchExecution => m.processAllAvailable() + } true } + protected def setTopicPartitions(topic: String, newCount: Int, query: StreamExecution) : Unit = { + testUtils.addPartitions(topic, newCount) + } + /** * Add data to Kafka. * @@ -82,10 +94,11 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { message: String = "", topicAction: (String, Option[Int]) => Unit = (_, _) => {}) extends AddData { - override def addData(query: Option[StreamExecution]): (Source, Offset) = { - if (query.get.isActive) { + override def addData(query: Option[StreamExecution]): (BaseStreamingSource, Offset) = { + query match { // Make sure no Spark job is running when deleting a topic - query.get.processAllAvailable() + case Some(m: MicroBatchExecution) => m.processAllAvailable() + case _ => } val existingTopics = testUtils.getAllTopicsAndPartitionSize().toMap @@ -97,16 +110,18 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { topicAction(existingTopicPartitions._1, Some(existingTopicPartitions._2)) } - // Read all topics again in case some topics are delete. 
- val allTopics = testUtils.getAllTopicsAndPartitionSize().toMap.keys require( query.nonEmpty, "Cannot add data when there is no query for finding the active kafka source") val sources = query.get.logicalPlan.collect { - case StreamingExecutionRelation(source, _) if source.isInstanceOf[KafkaSource] => - source.asInstanceOf[KafkaSource] - } + case StreamingExecutionRelation(source: KafkaSource, _) => source + } ++ (query.get.lastExecution match { + case null => Seq() + case e => e.logical.collect { + case DataSourceV2Relation(_, reader: KafkaContinuousReader) => reader + } + }) if (sources.isEmpty) { throw new Exception( "Could not find Kafka source in the StreamExecution logical plan to add data to") @@ -137,14 +152,158 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { override def toString: String = s"AddKafkaData(topics = $topics, data = $data, message = $message)" } -} + private val topicId = new AtomicInteger(0) + protected def newTopic(): String = s"topic-${topicId.getAndIncrement()}" +} -class KafkaSourceSuite extends KafkaSourceTest { +class KafkaMicroBatchSourceSuite extends KafkaSourceSuiteBase { import testImplicits._ - private val topicId = new AtomicInteger(0) + test("(de)serialization of initial offsets") { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 5) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", topic) + + testStream(reader.load)( + makeSureGetOffsetCalled, + StopStream, + StartStream(), + StopStream) + } + + test("maxOffsetsPerTrigger") { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 3) + testUtils.sendMessages(topic, (100 to 200).map(_.toString).toArray, Some(0)) + testUtils.sendMessages(topic, (10 to 20).map(_.toString).toArray, Some(1)) + testUtils.sendMessages(topic, Array("1"), Some(2)) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("maxOffsetsPerTrigger", 10) + .option("subscribe", topic) + .option("startingOffsets", "earliest") + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val mapped: org.apache.spark.sql.Dataset[_] = kafka.map(kv => kv._2.toInt) + + val clock = new StreamManualClock + + val waitUntilBatchProcessed = AssertOnQuery { q => + eventually(Timeout(streamingTimeout)) { + if (!q.exception.isDefined) { + assert(clock.isStreamWaitingAt(clock.getTimeMillis())) + } + } + if (q.exception.isDefined) { + throw q.exception.get + } + true + } + + testStream(mapped)( + StartStream(ProcessingTime(100), clock), + waitUntilBatchProcessed, + // 1 from smallest, 1 from middle, 8 from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107), + AdvanceManualClock(100), + waitUntilBatchProcessed, + // smallest now empty, 1 more from middle, 9 more from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, + 11, 108, 109, 110, 111, 112, 113, 114, 115, 116 + ), + StopStream, + StartStream(ProcessingTime(100), clock), + waitUntilBatchProcessed, + // smallest now empty, 1 more from middle, 9 more from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, + 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 12, 117, 118, 119, 120, 121, 122, 123, 124, 125 + ), + AdvanceManualClock(100), + waitUntilBatchProcessed, + // smallest now empty, 1 more from middle, 9 more from biggest + 
CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, + 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 12, 117, 118, 119, 120, 121, 122, 123, 124, 125, + 13, 126, 127, 128, 129, 130, 131, 132, 133, 134 + ) + ) + } + + test("input row metrics") { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 5) + testUtils.sendMessages(topic, Array("-1")) + require(testUtils.getLatestOffsets(Set(topic)).size === 5) + + val kafka = spark + .readStream + .format("kafka") + .option("subscribe", topic) + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + + val mapped = kafka.map(kv => kv._2.toInt + 1) + testStream(mapped)( + StartStream(trigger = ProcessingTime(1)), + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(2, 3, 4), + AssertOnQuery { query => + val recordsRead = query.recentProgress.map(_.numInputRows).sum + recordsRead == 3 + } + ) + } + + test("subscribing topic by pattern with topic deletions") { + val topicPrefix = newTopic() + val topic = topicPrefix + "-seems" + val topic2 = topicPrefix + "-bad" + testUtils.createTopic(topic, partitions = 5) + testUtils.sendMessages(topic, Array("-1")) + require(testUtils.getLatestOffsets(Set(topic)).size === 5) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("subscribePattern", s"$topicPrefix-.*") + .option("failOnDataLoss", "false") + + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val mapped = kafka.map(kv => kv._2.toInt + 1) + + testStream(mapped)( + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(2, 3, 4), + Assert { + testUtils.deleteTopic(topic) + testUtils.createTopic(topic2, partitions = 5) + true + }, + AddKafkaData(Set(topic2), 4, 5, 6), + CheckAnswer(2, 3, 4, 5, 6, 7) + ) + } testWithUninterruptibleThread( "deserialization of initial offset with Spark 2.1.0") { @@ -237,42 +396,115 @@ class KafkaSourceSuite extends KafkaSourceTest { } } - test("(de)serialization of initial offsets") { + test("KafkaSource with watermark") { + val now = System.currentTimeMillis() val topic = newTopic() - testUtils.createTopic(topic, partitions = 64) + testUtils.createTopic(newTopic(), partitions = 1) + testUtils.sendMessages(topic, Array(1).map(_.toString)) - val reader = spark + val kafka = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("startingOffsets", s"earliest") .option("subscribe", topic) + .load() - testStream(reader.load)( - makeSureGetOffsetCalled, - StopStream, - StartStream(), - StopStream) + val windowedAggregation = kafka + .withWatermark("timestamp", "10 seconds") + .groupBy(window($"timestamp", "5 seconds") as 'window) + .agg(count("*") as 'count) + .select($"window".getField("start") as 'window, $"count") + + val query = windowedAggregation + .writeStream + .format("memory") + .outputMode("complete") + .queryName("kafkaWatermark") + .start() + query.processAllAvailable() + val rows = spark.table("kafkaWatermark").collect() + assert(rows.length === 1, s"Unexpected results: ${rows.toList}") + val row = rows(0) + // We cannot check the exact window start time as it depands on the time that messages were + // inserted by the producer. 
So here we just use a low bound to make sure the internal + // conversion works. + assert( + row.getAs[java.sql.Timestamp]("window").getTime >= now - 5 * 1000, + s"Unexpected results: $row") + assert(row.getAs[Int]("count") === 1, s"Unexpected results: $row") + query.stop() } - test("maxOffsetsPerTrigger") { + test("delete a topic when a Spark job is running") { + KafkaSourceSuite.collectedData.clear() + val topic = newTopic() - testUtils.createTopic(topic, partitions = 3) - testUtils.sendMessages(topic, (100 to 200).map(_.toString).toArray, Some(0)) - testUtils.sendMessages(topic, (10 to 20).map(_.toString).toArray, Some(1)) - testUtils.sendMessages(topic, Array("1"), Some(2)) + testUtils.createTopic(topic, partitions = 1) + testUtils.sendMessages(topic, (1 to 10).map(_.toString).toArray) val reader = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", testUtils.brokerAddress) .option("kafka.metadata.max.age.ms", "1") - .option("maxOffsetsPerTrigger", 10) .option("subscribe", topic) + // If a topic is deleted and we try to poll data starting from offset 0, + // the Kafka consumer will just block until timeout and return an empty result. + // So set the timeout to 1 second to make this test fast. + .option("kafkaConsumer.pollTimeoutMs", "1000") .option("startingOffsets", "earliest") + .option("failOnDataLoss", "false") val kafka = reader.load() .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") .as[(String, String)] - val mapped: org.apache.spark.sql.Dataset[_] = kafka.map(kv => kv._2.toInt) + KafkaSourceSuite.globalTestUtils = testUtils + // The following ForeachWriter will delete the topic before fetching data from Kafka + // in executors. + val query = kafka.map(kv => kv._2.toInt).writeStream.foreach(new ForeachWriter[Int] { + override def open(partitionId: Long, version: Long): Boolean = { + KafkaSourceSuite.globalTestUtils.deleteTopic(topic) + true + } + + override def process(value: Int): Unit = { + KafkaSourceSuite.collectedData.add(value) + } + + override def close(errorOrNull: Throwable): Unit = {} + }).start() + query.processAllAvailable() + query.stop() + // `failOnDataLoss` is `false`, we should not fail the query + assert(query.exception.isEmpty) + } + + test("SPARK-22956: currentPartitionOffsets should be set when no new data comes in") { + def getSpecificDF(range: Range.Inclusive): org.apache.spark.sql.Dataset[Int] = { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 1) + testUtils.sendMessages(topic, range.map(_.toString).toArray, Some(0)) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("maxOffsetsPerTrigger", 5) + .option("subscribe", topic) + .option("startingOffsets", "earliest") + + reader.load() + .selectExpr("CAST(value AS STRING)") + .as[String] + .map(k => k.toInt) + } + + val df1 = getSpecificDF(0 to 9) + val df2 = getSpecificDF(100 to 199) + + val kafka = df1.union(df2) val clock = new StreamManualClock @@ -288,35 +520,35 @@ class KafkaSourceSuite extends KafkaSourceTest { true } - testStream(mapped)( + testStream(kafka)( StartStream(ProcessingTime(100), clock), waitUntilBatchProcessed, - // 1 from smallest, 1 from middle, 8 from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107), + // 5 from smaller topic, 5 from bigger one + CheckLastBatch((0 to 4) ++ (100 to 104): _*), AdvanceManualClock(100), waitUntilBatchProcessed, - // smallest now empty, 1 more from middle, 9 more from 
biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, - 11, 108, 109, 110, 111, 112, 113, 114, 115, 116 - ), + // 5 from smaller topic, 5 from bigger one + CheckLastBatch((5 to 9) ++ (105 to 109): _*), + AdvanceManualClock(100), + waitUntilBatchProcessed, + // smaller topic empty, 5 from bigger one + CheckLastBatch(110 to 114: _*), StopStream, StartStream(ProcessingTime(100), clock), waitUntilBatchProcessed, - // smallest now empty, 1 more from middle, 9 more from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, - 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, - 12, 117, 118, 119, 120, 121, 122, 123, 124, 125 - ), + // smallest now empty, 5 from bigger one + CheckLastBatch(115 to 119: _*), AdvanceManualClock(100), waitUntilBatchProcessed, - // smallest now empty, 1 more from middle, 9 more from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, - 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, - 12, 117, 118, 119, 120, 121, 122, 123, 124, 125, - 13, 126, 127, 128, 129, 130, 131, 132, 133, 134 - ) + // smallest now empty, 5 from bigger one + CheckLastBatch(120 to 124: _*) ) } +} + +class KafkaSourceSuiteBase extends KafkaSourceTest { + + import testImplicits._ test("cannot stop Kafka stream") { val topic = newTopic() @@ -328,7 +560,7 @@ class KafkaSourceSuite extends KafkaSourceTest { .format("kafka") .option("kafka.bootstrap.servers", testUtils.brokerAddress) .option("kafka.metadata.max.age.ms", "1") - .option("subscribePattern", s"topic-.*") + .option("subscribePattern", s"$topic.*") val kafka = reader.load() .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") @@ -422,65 +654,6 @@ class KafkaSourceSuite extends KafkaSourceTest { } } - test("subscribing topic by pattern with topic deletions") { - val topicPrefix = newTopic() - val topic = topicPrefix + "-seems" - val topic2 = topicPrefix + "-bad" - testUtils.createTopic(topic, partitions = 5) - testUtils.sendMessages(topic, Array("-1")) - require(testUtils.getLatestOffsets(Set(topic)).size === 5) - - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("subscribePattern", s"$topicPrefix-.*") - .option("failOnDataLoss", "false") - - val kafka = reader.load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - val mapped = kafka.map(kv => kv._2.toInt + 1) - - testStream(mapped)( - makeSureGetOffsetCalled, - AddKafkaData(Set(topic), 1, 2, 3), - CheckAnswer(2, 3, 4), - Assert { - testUtils.deleteTopic(topic) - testUtils.createTopic(topic2, partitions = 5) - true - }, - AddKafkaData(Set(topic2), 4, 5, 6), - CheckAnswer(2, 3, 4, 5, 6, 7) - ) - } - - test("starting offset is latest by default") { - val topic = newTopic() - testUtils.createTopic(topic, partitions = 5) - testUtils.sendMessages(topic, Array("0")) - require(testUtils.getLatestOffsets(Set(topic)).size === 5) - - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", topic) - - val kafka = reader.load() - .selectExpr("CAST(value AS STRING)") - .as[String] - val mapped = kafka.map(_.toInt) - - testStream(mapped)( - makeSureGetOffsetCalled, - AddKafkaData(Set(topic), 1, 2, 3), - CheckAnswer(1, 2, 3) // should not have 0 - ) - } - test("bad source options") { def testBadOptions(options: (String, String)*)(expectedMsgs: String*): Unit = { val ex = intercept[IllegalArgumentException] { @@ -540,77 
+713,6 @@ class KafkaSourceSuite extends KafkaSourceTest { testUnsupportedConfig("kafka.auto.offset.reset", "latest") } - test("input row metrics") { - val topic = newTopic() - testUtils.createTopic(topic, partitions = 5) - testUtils.sendMessages(topic, Array("-1")) - require(testUtils.getLatestOffsets(Set(topic)).size === 5) - - val kafka = spark - .readStream - .format("kafka") - .option("subscribe", topic) - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - - val mapped = kafka.map(kv => kv._2.toInt + 1) - testStream(mapped)( - StartStream(trigger = ProcessingTime(1)), - makeSureGetOffsetCalled, - AddKafkaData(Set(topic), 1, 2, 3), - CheckAnswer(2, 3, 4), - AssertOnQuery { query => - val recordsRead = query.recentProgress.map(_.numInputRows).sum - recordsRead == 3 - } - ) - } - - test("delete a topic when a Spark job is running") { - KafkaSourceSuite.collectedData.clear() - - val topic = newTopic() - testUtils.createTopic(topic, partitions = 1) - testUtils.sendMessages(topic, (1 to 10).map(_.toString).toArray) - - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("subscribe", topic) - // If a topic is deleted and we try to poll data starting from offset 0, - // the Kafka consumer will just block until timeout and return an empty result. - // So set the timeout to 1 second to make this test fast. - .option("kafkaConsumer.pollTimeoutMs", "1000") - .option("startingOffsets", "earliest") - .option("failOnDataLoss", "false") - val kafka = reader.load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - KafkaSourceSuite.globalTestUtils = testUtils - // The following ForeachWriter will delete the topic before fetching data from Kafka - // in executors. 
- val query = kafka.map(kv => kv._2.toInt).writeStream.foreach(new ForeachWriter[Int] { - override def open(partitionId: Long, version: Long): Boolean = { - KafkaSourceSuite.globalTestUtils.deleteTopic(topic) - true - } - - override def process(value: Int): Unit = { - KafkaSourceSuite.collectedData.add(value) - } - - override def close(errorOrNull: Throwable): Unit = {} - }).start() - query.processAllAvailable() - query.stop() - // `failOnDataLoss` is `false`, we should not fail the query - assert(query.exception.isEmpty) - } - test("get offsets from case insensitive parameters") { for ((optionKey, optionValue, answer) <- Seq( (STARTING_OFFSETS_OPTION_KEY, "earLiEst", EarliestOffsetRangeLimit), @@ -629,8 +731,6 @@ class KafkaSourceSuite extends KafkaSourceTest { } } - private def newTopic(): String = s"topic-${topicId.getAndIncrement()}" - private def assignString(topic: String, partitions: Iterable[Int]): String = { JsonUtils.partitions(partitions.map(p => new TopicPartition(topic, p))) } @@ -676,6 +776,10 @@ class KafkaSourceSuite extends KafkaSourceTest { testStream(mapped)( makeSureGetOffsetCalled, + Execute { q => + // wait to reach the last offset in every partition + q.awaitOffset(0, KafkaSourceOffset(partitionOffsets.mapValues(_ => 3L))) + }, CheckAnswer(-20, -21, -22, 0, 1, 2, 11, 12, 22), StopStream, StartStream(), @@ -704,13 +808,14 @@ class KafkaSourceSuite extends KafkaSourceTest { val query = kafka .writeStream .format("memory") - .outputMode("append") .queryName("kafkaColumnTypes") + .trigger(defaultTrigger) .start() - query.processAllAvailable() - val rows = spark.table("kafkaColumnTypes").collect() - assert(rows.length === 1, s"Unexpected results: ${rows.toList}") - val row = rows(0) + eventually(timeout(streamingTimeout)) { + assert(spark.table("kafkaColumnTypes").count == 1, + s"Unexpected results: ${spark.table("kafkaColumnTypes").collectAsList()}") + } + val row = spark.table("kafkaColumnTypes").head() assert(row.getAs[Array[Byte]]("key") === null, s"Unexpected results: $row") assert(row.getAs[Array[Byte]]("value") === "1".getBytes(UTF_8), s"Unexpected results: $row") assert(row.getAs[String]("topic") === topic, s"Unexpected results: $row") @@ -723,47 +828,6 @@ class KafkaSourceSuite extends KafkaSourceTest { query.stop() } - test("KafkaSource with watermark") { - val now = System.currentTimeMillis() - val topic = newTopic() - testUtils.createTopic(newTopic(), partitions = 1) - testUtils.sendMessages(topic, Array(1).map(_.toString)) - - val kafka = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("startingOffsets", s"earliest") - .option("subscribe", topic) - .load() - - val windowedAggregation = kafka - .withWatermark("timestamp", "10 seconds") - .groupBy(window($"timestamp", "5 seconds") as 'window) - .agg(count("*") as 'count) - .select($"window".getField("start") as 'window, $"count") - - val query = windowedAggregation - .writeStream - .format("memory") - .outputMode("complete") - .queryName("kafkaWatermark") - .start() - query.processAllAvailable() - val rows = spark.table("kafkaWatermark").collect() - assert(rows.length === 1, s"Unexpected results: ${rows.toList}") - val row = rows(0) - // We cannot check the exact window start time as it depands on the time that messages were - // inserted by the producer. So here we just use a low bound to make sure the internal - // conversion works. 
- assert( - row.getAs[java.sql.Timestamp]("window").getTime >= now - 5 * 1000, - s"Unexpected results: $row") - assert(row.getAs[Int]("count") === 1, s"Unexpected results: $row") - query.stop() - } - private def testFromLatestOffsets( topic: String, addPartitions: Boolean, @@ -800,9 +864,7 @@ class KafkaSourceSuite extends KafkaSourceTest { AddKafkaData(Set(topic), 7, 8), CheckAnswer(2, 3, 4, 5, 6, 7, 8, 9), AssertOnQuery("Add partitions") { query: StreamExecution => - if (addPartitions) { - testUtils.addPartitions(topic, 10) - } + if (addPartitions) setTopicPartitions(topic, 10, query) true }, AddKafkaData(Set(topic), 9, 10, 11, 12, 13, 14, 15, 16), @@ -843,9 +905,7 @@ class KafkaSourceSuite extends KafkaSourceTest { StartStream(), CheckAnswer(2, 3, 4, 5, 6, 7, 8, 9), AssertOnQuery("Add partitions") { query: StreamExecution => - if (addPartitions) { - testUtils.addPartitions(topic, 10) - } + if (addPartitions) setTopicPartitions(topic, 10, query) true }, AddKafkaData(Set(topic), 9, 10, 11, 12, 13, 14, 15, 16), @@ -977,20 +1037,8 @@ class KafkaSourceStressForDontFailOnDataLossSuite extends StreamTest with Shared } } - test("stress test for failOnDataLoss=false") { - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("subscribePattern", "failOnDataLoss.*") - .option("startingOffsets", "earliest") - .option("failOnDataLoss", "false") - .option("fetchOffset.retryIntervalMs", "3000") - val kafka = reader.load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - val query = kafka.map(kv => kv._2.toInt).writeStream.foreach(new ForeachWriter[Int] { + protected def startStream(ds: Dataset[Int]) = { + ds.writeStream.foreach(new ForeachWriter[Int] { override def open(partitionId: Long, version: Long): Boolean = { true @@ -1004,6 +1052,22 @@ class KafkaSourceStressForDontFailOnDataLossSuite extends StreamTest with Shared override def close(errorOrNull: Throwable): Unit = { } }).start() + } + + test("stress test for failOnDataLoss=false") { + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("subscribePattern", "failOnDataLoss.*") + .option("startingOffsets", "earliest") + .option("failOnDataLoss", "false") + .option("fetchOffset.retryIntervalMs", "3000") + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val query = startStream(kafka.map(kv => kv._2.toInt)) val testTime = 1.minutes val startTime = System.currentTimeMillis() diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 6eb7ba5f0092..1f28798fd245 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index 786349474389..ad3cd0059e18 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 849c8b465f99..53d903fc58ae 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT 
../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 48783d65826a..4206a1ac4ad9 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 40a751a652fa..7e838812bdc2 100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 36d555066b18..ee7eb98e3dfc 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index cb30e4a4af4b..2ea2e4a7adbd 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index aa36dd4774d8..35ddfac1d662 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index e9b46c4cf0ff..29c1e53f480a 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../pom.xml diff --git a/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java index 0d6a73a3da3e..acd64c962604 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java +++ b/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java @@ -17,7 +17,6 @@ package org.apache.spark.launcher; -import java.io.IOException; import java.lang.reflect.Method; import java.util.concurrent.atomic.AtomicLong; import java.util.logging.Level; diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index 043d13609fd2..61eb9dbcf6f7 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index a906c9e02cd4..edba0a72ee6a 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../pom.xml diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoderEstimator.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoderEstimator.scala index 074622d41e28..bd1e3426c878 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoderEstimator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoderEstimator.scala @@ -30,24 +30,27 @@ import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.functions.{col, lit, udf} -import org.apache.spark.sql.types.{DoubleType, NumericType, StructField, StructType} +import org.apache.spark.sql.types.{DoubleType, StructField, StructType} /** Private trait for params and common methods for OneHotEncoderEstimator and OneHotEncoderModel */ private[ml] trait OneHotEncoderBase 
extends Params with HasHandleInvalid with HasInputCols with HasOutputCols { /** - * Param for how to handle invalid data. + * Param for how to handle invalid data during transform(). * Options are 'keep' (invalid data presented as an extra categorical feature) or * 'error' (throw an error). + * Note that this Param is only used during transform; during fitting, invalid data + * will result in an error. * Default: "error" * @group param */ @Since("2.3.0") override val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", - "How to handle invalid data " + + "How to handle invalid data during transform(). " + "Options are 'keep' (invalid data presented as an extra categorical feature) " + - "or error (throw an error).", + "or error (throw an error). Note that this Param is only used during transform; " + + "during fitting, invalid data will result in an error.", ParamValidators.inArray(OneHotEncoderEstimator.supportedHandleInvalids)) setDefault(handleInvalid, OneHotEncoderEstimator.ERROR_INVALID) @@ -66,10 +69,11 @@ private[ml] trait OneHotEncoderBase extends Params with HasHandleInvalid def getDropLast: Boolean = $(dropLast) protected def validateAndTransformSchema( - schema: StructType, dropLast: Boolean, keepInvalid: Boolean): StructType = { + schema: StructType, + dropLast: Boolean, + keepInvalid: Boolean): StructType = { val inputColNames = $(inputCols) val outputColNames = $(outputCols) - val existingFields = schema.fields require(inputColNames.length == outputColNames.length, s"The number of input columns ${inputColNames.length} must be the same as the number of " + @@ -197,6 +201,10 @@ object OneHotEncoderEstimator extends DefaultParamsReadable[OneHotEncoderEstimat override def load(path: String): OneHotEncoderEstimator = super.load(path) } +/** + * @param categorySizes Original number of categories for each feature being encoded. + * The array contains one value for each input column, in order. + */ @Since("2.3.0") class OneHotEncoderModel private[ml] ( @Since("2.3.0") override val uid: String, @@ -205,60 +213,58 @@ class OneHotEncoderModel private[ml] ( import OneHotEncoderModel._ - // Returns the category size for a given index with `dropLast` and `handleInvalid` + // Returns the category size for each index with `dropLast` and `handleInvalid` // taken into account. - private def configedCategorySize(orgCategorySize: Int, idx: Int): Int = { + private def getConfigedCategorySizes: Array[Int] = { val dropLast = getDropLast val keepInvalid = getHandleInvalid == OneHotEncoderEstimator.KEEP_INVALID if (!dropLast && keepInvalid) { // When `handleInvalid` is "keep", an extra category is added as last category // for invalid data. - orgCategorySize + 1 + categorySizes.map(_ + 1) } else if (dropLast && !keepInvalid) { // When `dropLast` is true, the last category is removed. - orgCategorySize - 1 + categorySizes.map(_ - 1) } else { // When `dropLast` is true and `handleInvalid` is "keep", the extra category for invalid // data is removed. Thus, it is the same as the plain number of categories. 
- orgCategorySize + categorySizes } } private def encoder: UserDefinedFunction = { - val oneValue = Array(1.0) - val emptyValues = Array.empty[Double] - val emptyIndices = Array.empty[Int] - val dropLast = getDropLast - val handleInvalid = getHandleInvalid - val keepInvalid = handleInvalid == OneHotEncoderEstimator.KEEP_INVALID + val keepInvalid = getHandleInvalid == OneHotEncoderEstimator.KEEP_INVALID + val configedSizes = getConfigedCategorySizes + val localCategorySizes = categorySizes // The udf performed on input data. The first parameter is the input value. The second - // parameter is the index of input. - udf { (label: Double, idx: Int) => - val plainNumCategories = categorySizes(idx) - val size = configedCategorySize(plainNumCategories, idx) - - if (label < 0) { - throw new SparkException(s"Negative value: $label. Input can't be negative.") - } else if (label == size && dropLast && !keepInvalid) { - // When `dropLast` is true and `handleInvalid` is not "keep", - // the last category is removed. - Vectors.sparse(size, emptyIndices, emptyValues) - } else if (label >= plainNumCategories && keepInvalid) { - // When `handleInvalid` is "keep", encodes invalid data to last category (and removed - // if `dropLast` is true) - if (dropLast) { - Vectors.sparse(size, emptyIndices, emptyValues) + // parameter is the index in inputCols of the column being encoded. + udf { (label: Double, colIdx: Int) => + val origCategorySize = localCategorySizes(colIdx) + // idx: index in vector of the single 1-valued element + val idx = if (label >= 0 && label < origCategorySize) { + label + } else { + if (keepInvalid) { + origCategorySize } else { - Vectors.sparse(size, Array(size - 1), oneValue) + if (label < 0) { + throw new SparkException(s"Negative value: $label. Input can't be negative. " + + s"To handle invalid values, set Param handleInvalid to " + + s"${OneHotEncoderEstimator.KEEP_INVALID}") + } else { + throw new SparkException(s"Unseen value: $label. To handle unseen values, " + + s"set Param handleInvalid to ${OneHotEncoderEstimator.KEEP_INVALID}.") + } } - } else if (label < plainNumCategories) { - Vectors.sparse(size, Array(label.toInt), oneValue) + } + + val size = configedSizes(colIdx) + if (idx < size) { + Vectors.sparse(size, Array(idx.toInt), Array(1.0)) } else { - assert(handleInvalid == OneHotEncoderEstimator.ERROR_INVALID) - throw new SparkException(s"Unseen value: $label. To handle unseen values, " + - s"set Param handleInvalid to ${OneHotEncoderEstimator.KEEP_INVALID}.") + Vectors.sparse(size, Array.empty[Int], Array.empty[Double]) } } } @@ -282,7 +288,6 @@ class OneHotEncoderModel private[ml] ( @Since("2.3.0") override def transformSchema(schema: StructType): StructType = { val inputColNames = $(inputCols) - val outputColNames = $(outputCols) require(inputColNames.length == categorySizes.length, s"The number of input columns ${inputColNames.length} must be the same as the number of " + @@ -300,6 +305,7 @@ class OneHotEncoderModel private[ml] ( * account. Mismatched numbers will cause exception. */ private def verifyNumOfValues(schema: StructType): StructType = { + val configedSizes = getConfigedCategorySizes $(outputCols).zipWithIndex.foreach { case (outputColName, idx) => val inputColName = $(inputCols)(idx) val attrGroup = AttributeGroup.fromStructField(schema(outputColName)) @@ -308,9 +314,9 @@ class OneHotEncoderModel private[ml] ( // comparing with expected category number with `handleInvalid` and // `dropLast` taken into account. 
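A rough Scala sketch of the encoding rules spelled out in the comments above; the SparkSession setup, column names, and toy data are illustrative assumptions rather than anything taken from the patch. With handleInvalid = "keep" and dropLast = true, the configured vector size stays at the original category count, so an unseen label collapses to an all-zeros vector.

import org.apache.spark.ml.feature.OneHotEncoderEstimator
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[2]").appName("ohe-sketch").getOrCreate()
import spark.implicits._

// Three categories are seen during fitting: 0.0, 1.0, 2.0.
val train = Seq(0.0, 1.0, 2.0).toDF("input")

val encoder = new OneHotEncoderEstimator()
  .setInputCols(Array("input"))
  .setOutputCols(Array("encoded"))
  .setHandleInvalid("keep") // unseen labels map to an extra, last category...
  .setDropLast(true)        // ...which dropLast then removes, giving an all-zeros vector

val model = encoder.fit(train)

// With keep + dropLast the vector size stays at the original category count (3):
//   0.0 -> (3,[0],[1.0])   2.0 -> (3,[2],[1.0])   unseen 5.0 -> (3,[],[])
model.transform(Seq(0.0, 2.0, 5.0).toDF("input")).show(truncate = false)

The expected outputs here follow from the size logic and the rewritten udf shown in this hunk: the unseen value 5.0 is mapped to the extra invalid-data category, which falls outside the configured size and is therefore emitted as an empty sparse vector.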
if (attrGroup.attributes.nonEmpty) { - val numCategories = configedCategorySize(categorySizes(idx), idx) + val numCategories = configedSizes(idx) require(attrGroup.size == numCategories, "OneHotEncoderModel expected " + - s"$numCategories categorical values for input column ${inputColName}, " + + s"$numCategories categorical values for input column $inputColName, " + s"but the input column had metadata specifying ${attrGroup.size} values.") } } @@ -322,7 +328,7 @@ class OneHotEncoderModel private[ml] ( val transformedSchema = transformSchema(dataset.schema, logging = true) val keepInvalid = $(handleInvalid) == OneHotEncoderEstimator.KEEP_INVALID - val encodedColumns = (0 until $(inputCols).length).map { idx => + val encodedColumns = $(inputCols).indices.map { idx => val inputColName = $(inputCols)(idx) val outputColName = $(outputCols)(idx) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index 7da3339f8b48..1155ea5fdd85 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -25,7 +25,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer} import org.apache.spark.ml.attribute.AttributeGroup -import org.apache.spark.ml.linalg.VectorUDT +import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasHandleInvalid, HasLabelCol} import org.apache.spark.ml.util._ @@ -199,6 +199,7 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) val parsedFormula = RFormulaParser.parse($(formula)) val resolvedFormula = parsedFormula.resolve(dataset.schema) val encoderStages = ArrayBuffer[PipelineStage]() + val oneHotEncodeColumns = ArrayBuffer[(String, String)]() val prefixesToRewrite = mutable.Map[String, String]() val tempColumns = ArrayBuffer[String]() @@ -210,8 +211,8 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) // First we index each string column referenced by the input terms. 
val indexed: Map[String, String] = resolvedFormula.terms.flatten.distinct.map { term => - dataset.schema(term) match { - case column if column.dataType == StringType => + dataset.schema(term).dataType match { + case _: StringType => val indexCol = tmpColumn("stridx") encoderStages += new StringIndexer() .setInputCol(term) @@ -220,6 +221,18 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) .setHandleInvalid($(handleInvalid)) prefixesToRewrite(indexCol + "_") = term + "_" (term, indexCol) + case _: VectorUDT => + val group = AttributeGroup.fromStructField(dataset.schema(term)) + val size = if (group.size < 0) { + dataset.select(term).first().getAs[Vector](0).size + } else { + group.size + } + encoderStages += new VectorSizeHint(uid) + .setHandleInvalid("optimistic") + .setInputCol(term) + .setSize(size) + (term, term) case _ => (term, term) } @@ -230,16 +243,17 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) val encodedTerms = resolvedFormula.terms.map { case Seq(term) if dataset.schema(term).dataType == StringType => val encodedCol = tmpColumn("onehot") - var encoder = new OneHotEncoder() - .setInputCol(indexed(term)) - .setOutputCol(encodedCol) // Formula w/o intercept, one of the categories in the first category feature is // being used as reference category, we will not drop any category for that feature. if (!hasIntercept && !keepReferenceCategory) { - encoder = encoder.setDropLast(false) + encoderStages += new OneHotEncoderEstimator(uid) + .setInputCols(Array(indexed(term))) + .setOutputCols(Array(encodedCol)) + .setDropLast(false) keepReferenceCategory = true + } else { + oneHotEncodeColumns += indexed(term) -> encodedCol } - encoderStages += encoder prefixesToRewrite(encodedCol + "_") = term + "_" encodedCol case Seq(term) => @@ -253,6 +267,14 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) interactionCol } + if (oneHotEncodeColumns.nonEmpty) { + val (inputCols, outputCols) = oneHotEncodeColumns.toArray.unzip + encoderStages += new OneHotEncoderEstimator(uid) + .setInputCols(inputCols) + .setOutputCols(outputCols) + .setDropLast(true) + } + encoderStages += new VectorAssembler(uid) .setInputCols(encodedTerms.toArray) .setOutputCol($(featuresCol)) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala index a5d57a15317e..6ad44af9ef7e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala @@ -63,7 +63,9 @@ private[shared] object SharedParamsCodeGen { ParamDesc[Array[String]]("outputCols", "output column names"), ParamDesc[Int]("checkpointInterval", "set checkpoint interval (>= 1) or " + "disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed " + - "every 10 iterations", isValid = "(interval: Int) => interval == -1 || interval >= 1"), + "every 10 iterations. Note: this setting will be ignored if the checkpoint directory " + + "is not set in the SparkContext", + isValid = "(interval: Int) => interval == -1 || interval >= 1"), ParamDesc[Boolean]("fitIntercept", "whether to fit an intercept term", Some("true")), ParamDesc[String]("handleInvalid", "how to handle invalid entries. Options are skip (which " + "will filter out rows with bad values), or error (which will throw an error). 
More " + diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala index 13425dacc9f1..be8b2f273164 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala @@ -282,10 +282,10 @@ trait HasOutputCols extends Params { trait HasCheckpointInterval extends Params { /** - * Param for set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. + * Param for set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. * @group param */ - final val checkpointInterval: IntParam = new IntParam(this, "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations", (interval: Int) => interval == -1 || interval >= 1) + final val checkpointInterval: IntParam = new IntParam(this, "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext", (interval: Int) => interval == -1 || interval >= 1) /** @group getParam */ final def getCheckpointInterval: Int = $(checkpointInterval) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index 095b54c0fe83..a0b507d2e718 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -160,8 +160,10 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String) } (executionContext) } - // Wait for metrics to be calculated before unpersisting validation dataset + // Wait for metrics to be calculated val foldMetrics = foldMetricFutures.map(ThreadUtils.awaitResult(_, Duration.Inf)) + + // Unpersist training & validation set once all metrics have been produced trainingDataset.unpersist() validationDataset.unpersist() foldMetrics diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala index c73bd1847547..8826ef3271bc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala @@ -143,24 +143,13 @@ class TrainValidationSplit @Since("1.5.0") (@Since("1.5.0") override val uid: St // Fit models in a Future for training in parallel logDebug(s"Train split with multiple sets of parameters.") - val modelFutures = epm.zipWithIndex.map { case (paramMap, paramIndex) => - Future[Model[_]] { + val metricFutures = epm.zipWithIndex.map { case (paramMap, paramIndex) => + Future[Double] { val model = est.fit(trainingDataset, paramMap).asInstanceOf[Model[_]] if (collectSubModelsParam) { subModels.get(paramIndex) = model } - model - } (executionContext) - } - - // Unpersist training data only when all models have trained - Future.sequence[Model[_], Iterable](modelFutures)(implicitly, executionContext) - .onComplete { _ => trainingDataset.unpersist() } 
(executionContext) - - // Evaluate models in a Future that will calulate a metric and allow model to be cleaned up - val metricFutures = modelFutures.zip(epm).map { case (modelFuture, paramMap) => - modelFuture.map { model => // TODO: duplicate evaluator to take extra params from input val metric = eval.evaluate(model.transform(validationDataset, paramMap)) logDebug(s"Got metric $metric for model trained with $paramMap.") @@ -171,7 +160,8 @@ class TrainValidationSplit @Since("1.5.0") (@Since("1.5.0") override val uid: St // Wait for all metrics to be calculated val metrics = metricFutures.map(ThreadUtils.awaitResult(_, Duration.Inf)) - // Unpersist validation set once all metrics have been produced + // Unpersist training & validation set once all metrics have been produced + trainingDataset.unpersist() validationDataset.unpersist() logInfo(s"Train validation split metrics: ${metrics.toSeq}") diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala index 5d09c90ec6df..bfe38d32dd77 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala @@ -17,18 +17,29 @@ package org.apache.spark.ml.feature -import org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.SparkException import org.apache.spark.ml.attribute._ -import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} -import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} +import org.apache.spark.sql.{DataFrame, Encoder, Row} import org.apache.spark.sql.types.DoubleType -class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class RFormulaSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ + def testRFormulaTransform[A: Encoder]( + dataframe: DataFrame, + formulaModel: RFormulaModel, + expected: DataFrame): Unit = { + val (first +: rest) = expected.schema.fieldNames.toSeq + val expectedRows = expected.collect() + testTransformerByGlobalCheckFunc[A](dataframe, formulaModel, first, rest: _*) { rows => + assert(rows === expectedRows) + } + } + test("params") { ParamsSuite.checkParams(new RFormula()) } @@ -47,7 +58,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul // TODO(ekl) make schema comparisons ignore metadata, to avoid .toString assert(result.schema.toString == resultSchema.toString) assert(resultSchema == expected.schema) - assert(result.collect() === expected.collect()) + testRFormulaTransform[(Int, Double, Double)](original, model, expected) } test("features column already exists") { @@ -109,7 +120,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul (7, 8.0, 9.0, Vectors.dense(8.0, 9.0)) ).toDF("id", "a", "b", "features") assert(result.schema.toString == resultSchema.toString) - assert(result.collect() === expected.collect()) + testRFormulaTransform[(Int, Double, Double)](original, model, expected) } test("encodes string terms") { @@ -126,7 +137,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul (4, "baz", 5, Vectors.dense(0.0, 0.0, 5.0), 4.0) ).toDF("id", "a", "b", "features", "label") assert(result.schema.toString == 
resultSchema.toString) - assert(result.collect() === expected.collect()) + testRFormulaTransform[(Int, String, Int)](original, model, expected) } test("encodes string terms with string indexer order type") { @@ -167,7 +178,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul val result = model.transform(original) val resultSchema = model.transformSchema(original.schema) assert(result.schema.toString == resultSchema.toString) - assert(result.collect() === expected(idx).collect()) + testRFormulaTransform[(Int, String, Int)](original, model, expected(idx)) idx += 1 } } @@ -210,7 +221,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul val result = model.transform(original) val resultSchema = model.transformSchema(original.schema) assert(result.schema.toString == resultSchema.toString) - assert(result.collect() === expected.collect()) + testRFormulaTransform[(Int, String, Int)](original, model, expected) } test("formula w/o intercept, we should output reference category when encoding string terms") { @@ -253,7 +264,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul (4, "baz", "zz", 5, Vectors.dense(0.0, 1.0, 0.0, 1.0, 5.0), 4.0) ).toDF("id", "a", "b", "c", "features", "label") assert(result1.schema.toString == resultSchema1.toString) - assert(result1.collect() === expected1.collect()) + testRFormulaTransform[(Int, String, String, Int)](original, model1, expected1) val attrs1 = AttributeGroup.fromStructField(result1.schema("features")) val expectedAttrs1 = new AttributeGroup( @@ -280,7 +291,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul (4, "baz", "zz", 5, Vectors.sparse(7, Array(2, 6), Array(1.0, 5.0)), 4.0) ).toDF("id", "a", "b", "c", "features", "label") assert(result2.schema.toString == resultSchema2.toString) - assert(result2.collect() === expected2.collect()) + testRFormulaTransform[(Int, String, String, Int)](original, model2, expected2) val attrs2 = AttributeGroup.fromStructField(result2.schema("features")) val expectedAttrs2 = new AttributeGroup( @@ -302,7 +313,6 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul Seq(("male", "foo", 4), ("female", "bar", 4), ("female", "bar", 5), ("male", "baz", 5)) .toDF("id", "a", "b") val model = formula.fit(original) - val result = model.transform(original) val expected = Seq( ("male", "foo", 4, Vectors.dense(0.0, 1.0, 4.0), 1.0), ("female", "bar", 4, Vectors.dense(1.0, 0.0, 4.0), 0.0), @@ -310,7 +320,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul ("male", "baz", 5, Vectors.dense(0.0, 0.0, 5.0), 1.0) ).toDF("id", "a", "b", "features", "label") // assert(result.schema.toString == resultSchema.toString) - assert(result.collect() === expected.collect()) + testRFormulaTransform[(String, String, Int)](original, model, expected) } test("force to index label even it is numeric type") { @@ -319,7 +329,6 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul Seq((1.0, "foo", 4), (1.0, "bar", 4), (0.0, "bar", 5), (1.0, "baz", 5)) ).toDF("id", "a", "b") val model = formula.fit(original) - val result = model.transform(original) val expected = spark.createDataFrame( Seq( (1.0, "foo", 4, Vectors.dense(0.0, 1.0, 4.0), 0.0), @@ -327,7 +336,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul (0.0, "bar", 5, Vectors.dense(1.0, 0.0, 5.0), 1.0), (1.0, "baz", 5, Vectors.dense(0.0, 0.0, 5.0), 0.0)) 
).toDF("id", "a", "b", "features", "label") - assert(result.collect() === expected.collect()) + testRFormulaTransform[(Double, String, Int)](original, model, expected) } test("attribute generation") { @@ -391,7 +400,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul (1, 2, 4, 2, Vectors.dense(16.0), 1.0), (2, 3, 4, 1, Vectors.dense(12.0), 2.0) ).toDF("a", "b", "c", "d", "features", "label") - assert(result.collect() === expected.collect()) + testRFormulaTransform[(Int, Int, Int, Int)](original, model, expected) val attrs = AttributeGroup.fromStructField(result.schema("features")) val expectedAttrs = new AttributeGroup( "features", @@ -414,7 +423,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul (4, "baz", 5, Vectors.dense(5.0, 0.0, 0.0), 4.0), (4, "baz", 5, Vectors.dense(5.0, 0.0, 0.0), 4.0) ).toDF("id", "a", "b", "features", "label") - assert(result.collect() === expected.collect()) + testRFormulaTransform[(Int, String, Int)](original, model, expected) val attrs = AttributeGroup.fromStructField(result.schema("features")) val expectedAttrs = new AttributeGroup( "features", @@ -436,7 +445,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul (2, "bar", "zq", Vectors.dense(1.0, 0.0, 0.0, 0.0), 2.0), (3, "bar", "zz", Vectors.dense(0.0, 1.0, 0.0, 0.0), 3.0) ).toDF("id", "a", "b", "features", "label") - assert(result.collect() === expected.collect()) + testRFormulaTransform[(Int, String, String)](original, model, expected) val attrs = AttributeGroup.fromStructField(result.schema("features")) val expectedAttrs = new AttributeGroup( "features", @@ -511,8 +520,8 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul intercept[SparkException] { formula1.fit(df1).transform(df2).collect() } - val result1 = formula1.setHandleInvalid("skip").fit(df1).transform(df2) - val result2 = formula1.setHandleInvalid("keep").fit(df1).transform(df2) + val model1 = formula1.setHandleInvalid("skip").fit(df1) + val model2 = formula1.setHandleInvalid("keep").fit(df1) val expected1 = Seq( (1, "foo", "zq", Vectors.dense(0.0, 1.0), 1.0), @@ -524,16 +533,16 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul (3, "bar", "zy", Vectors.dense(1.0, 0.0, 0.0, 0.0), 3.0) ).toDF("id", "a", "b", "features", "label") - assert(result1.collect() === expected1.collect()) - assert(result2.collect() === expected2.collect()) + testRFormulaTransform[(Int, String, String)](df2, model1, expected1) + testRFormulaTransform[(Int, String, String)](df2, model2, expected2) // Handle unseen labels. 
val formula2 = new RFormula().setFormula("b ~ a + id") intercept[SparkException] { formula2.fit(df1).transform(df2).collect() } - val result3 = formula2.setHandleInvalid("skip").fit(df1).transform(df2) - val result4 = formula2.setHandleInvalid("keep").fit(df1).transform(df2) + val model3 = formula2.setHandleInvalid("skip").fit(df1) + val model4 = formula2.setHandleInvalid("keep").fit(df1) val expected3 = Seq( (1, "foo", "zq", Vectors.dense(0.0, 1.0), 0.0), @@ -545,7 +554,34 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul (3, "bar", "zy", Vectors.dense(1.0, 0.0, 3.0), 2.0) ).toDF("id", "a", "b", "features", "label") - assert(result3.collect() === expected3.collect()) - assert(result4.collect() === expected4.collect()) + testRFormulaTransform[(Int, String, String)](df2, model3, expected3) + testRFormulaTransform[(Int, String, String)](df2, model4, expected4) + } + + test("Use Vectors as inputs to formula.") { + val original = Seq( + (1, 4, Vectors.dense(0.0, 0.0, 4.0)), + (2, 4, Vectors.dense(1.0, 0.0, 4.0)), + (3, 5, Vectors.dense(1.0, 0.0, 5.0)), + (4, 5, Vectors.dense(0.0, 1.0, 5.0)) + ).toDF("id", "a", "b") + val formula = new RFormula().setFormula("id ~ a + b") + val (first +: rest) = Seq("id", "a", "b", "features", "label") + testTransformer[(Int, Int, Vector)](original, formula.fit(original), first, rest: _*) { + case Row(id: Int, a: Int, b: Vector, features: Vector, label: Double) => + assert(label === id) + assert(features.toArray === a +: b.toArray) + } + + val group = new AttributeGroup("b", 3) + val vectorColWithMetadata = original("b").as("b", group.toMetadata()) + val dfWithMetadata = original.withColumn("b", vectorColWithMetadata) + val model = formula.fit(dfWithMetadata) + // model should work even when applied to dataframe without metadata. 
+ testTransformer[(Int, Int, Vector)](original, model, first, rest: _*) { + case Row(id: Int, a: Int, b: Vector, features: Vector, label: Double) => + assert(label === id) + assert(features.toArray === a +: b.toArray) + } } } diff --git a/pom.xml b/pom.xml index 1b3716437646..a8e448a80c13 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ diff --git a/project/MimaBuild.scala b/project/MimaBuild.scala index 2ef0e7b40d94..adde213e361f 100644 --- a/project/MimaBuild.scala +++ b/project/MimaBuild.scala @@ -88,7 +88,7 @@ object MimaBuild { def mimaSettings(sparkHome: File, projectRef: ProjectRef) = { val organization = "org.apache.spark" - val previousSparkVersion = "2.0.0" + val previousSparkVersion = "2.2.0" val project = projectRef.project val fullId = "spark-" + project + "_2.11" mimaDefaultSettings ++ diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 81584af6813e..eec2e2b1757a 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -36,6 +36,9 @@ object MimaExcludes { // Exclude rules for 2.3.x lazy val v23excludes = v22excludes ++ Seq( + // [SPARK-22897] Expose stageAttemptId in TaskContext + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.TaskContext.stageAttemptNumber"), + // SPARK-22789: Map-only continuous processing execution ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.sql.streaming.StreamingQueryManager.startQuery$default$8"), ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StreamingQueryManager.startQuery$default$6"), @@ -95,7 +98,40 @@ object MimaExcludes { // [SPARK-21087] CrossValidator, TrainValidationSplit expose sub models after fitting: Scala ProblemFilters.exclude[FinalClassProblem]("org.apache.spark.ml.tuning.CrossValidatorModel$CrossValidatorModelWriter"), - ProblemFilters.exclude[FinalClassProblem]("org.apache.spark.ml.tuning.TrainValidationSplitModel$TrainValidationSplitModelWriter") + ProblemFilters.exclude[FinalClassProblem]("org.apache.spark.ml.tuning.TrainValidationSplitModel$TrainValidationSplitModelWriter"), + + // [SPARK-21728][CORE] Allow SparkSubmit to use Logging + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.deploy.SparkSubmit.downloadFileList"), + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.deploy.SparkSubmit.downloadFile"), + + // [SPARK-21714][CORE][YARN] Avoiding re-uploading remote resources in yarn client mode + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment"), + + // [SPARK-22324][SQL][PYTHON] Upgrade Arrow to 0.8.0 + ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.network.util.AbstractFileRegion.transfered"), + + // [SPARK-20643][CORE] Add listener implementation to collect app state + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.status.api.v1.TaskData.$default$5"), + + // [SPARK-20648][CORE] Port JobsTab and StageTab to the new UI backend + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.status.api.v1.TaskData.$default$12"), + + // [SPARK-21462][SS] Added batchId to StreamingQueryProgress.json + // [SPARK-21409][SS] Expose state store memory usage in SQL metrics and progress updates + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StateOperatorProgress.this"), + + // [SPARK-22278][SS] Expose 
current event time watermark and current processing time in GroupState + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.streaming.GroupState.getCurrentWatermarkMs"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.streaming.GroupState.getCurrentProcessingTimeMs"), + + // [SPARK-20542][ML][SQL] Add an API to Bucketizer that can bin multiple columns + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.param.shared.HasOutputCols.org$apache$spark$ml$param$shared$HasOutputCols$_setter_$outputCols_="), + + // [SPARK-18619][ML] Make QuantileDiscretizer/Bucketizer/StringIndexer/RFormula inherit from HasHandleInvalid + ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.feature.Bucketizer.getHandleInvalid"), + ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.feature.StringIndexer.getHandleInvalid"), + ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.feature.QuantileDiscretizer.getHandleInvalid"), + ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.feature.StringIndexerModel.getHandleInvalid") ) // Exclude rules for 2.2.x diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 13bf95cce40b..eb79b193103e 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -45,6 +45,7 @@ 'NGram', 'Normalizer', 'OneHotEncoder', + 'OneHotEncoderEstimator', 'OneHotEncoderModel', 'PCA', 'PCAModel', 'PolynomialExpansion', 'QuantileDiscretizer', @@ -1577,6 +1578,9 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, .. note:: This is different from scikit-learn's OneHotEncoder, which keeps all categories. The output vectors are sparse. + .. note:: Deprecated in 2.3.0. :py:class:`OneHotEncoderEstimator` will be renamed to + :py:class:`OneHotEncoder` and this :py:class:`OneHotEncoder` will be removed in 3.0.0. + .. seealso:: :py:class:`StringIndexer` for converting categorical values into @@ -1641,6 +1645,118 @@ def getDropLast(self): return self.getOrDefault(self.dropLast) +@inherit_doc +class OneHotEncoderEstimator(JavaEstimator, HasInputCols, HasOutputCols, HasHandleInvalid, + JavaMLReadable, JavaMLWritable): + """ + A one-hot encoder that maps a column of category indices to a column of binary vectors, with + at most a single one-value per row that indicates the input category index. + For example with 5 categories, an input value of 2.0 would map to an output vector of + `[0.0, 0.0, 1.0, 0.0]`. + The last category is not included by default (configurable via `dropLast`), + because it makes the vector entries sum up to one, and hence linearly dependent. + So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`. + + Note: This is different from scikit-learn's OneHotEncoder, which keeps all categories. + The output vectors are sparse. + + When `handleInvalid` is configured to 'keep', an extra "category" indicating invalid values is + added as last category. So when `dropLast` is true, invalid values are encoded as all-zeros + vector. + + Note: When encoding multi-column by using `inputCols` and `outputCols` params, input/output + cols come in pairs, specified by the order in the arrays, and each pair is treated + independently. 
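A rough Scala sketch of the multi-column pairing this note describes; the column names and data are made up for illustration. inputCols(i) is encoded into outputCols(i), and each pair is fitted and sized independently.

import org.apache.spark.ml.feature.OneHotEncoderEstimator
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[2]").appName("ohe-multicol-sketch").getOrCreate()
import spark.implicits._

// Two index columns encoded independently; inputCols(i) pairs with outputCols(i).
val df = Seq((0.0, 1.0), (1.0, 0.0), (2.0, 1.0)).toDF("colorIdx", "sizeIdx")

val encoder = new OneHotEncoderEstimator()
  .setInputCols(Array("colorIdx", "sizeIdx"))
  .setOutputCols(Array("colorVec", "sizeVec"))

val model = encoder.fit(df)
// colorIdx has 3 seen categories -> vectors of size 2 with the default dropLast = true;
// sizeIdx has 2 seen categories -> vectors of size 1.
model.transform(df).show(truncate = false)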
+ + See `StringIndexer` for converting categorical values into category indices + + >>> from pyspark.ml.linalg import Vectors + >>> df = spark.createDataFrame([(0.0,), (1.0,), (2.0,)], ["input"]) + >>> ohe = OneHotEncoderEstimator(inputCols=["input"], outputCols=["output"]) + >>> model = ohe.fit(df) + >>> model.transform(df).head().output + SparseVector(2, {0: 1.0}) + >>> ohePath = temp_path + "/oheEstimator" + >>> ohe.save(ohePath) + >>> loadedOHE = OneHotEncoderEstimator.load(ohePath) + >>> loadedOHE.getInputCols() == ohe.getInputCols() + True + >>> modelPath = temp_path + "/ohe-model" + >>> model.save(modelPath) + >>> loadedModel = OneHotEncoderModel.load(modelPath) + >>> loadedModel.categorySizes == model.categorySizes + True + + .. versionadded:: 2.3.0 + """ + + handleInvalid = Param(Params._dummy(), "handleInvalid", "How to handle invalid data during " + + "transform(). Options are 'keep' (invalid data presented as an extra " + + "categorical feature) or error (throw an error). Note that this Param " + + "is only used during transform; during fitting, invalid data will " + + "result in an error.", + typeConverter=TypeConverters.toString) + + dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category", + typeConverter=TypeConverters.toBoolean) + + @keyword_only + def __init__(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True): + """ + __init__(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True) + """ + super(OneHotEncoderEstimator, self).__init__() + self._java_obj = self._new_java_obj( + "org.apache.spark.ml.feature.OneHotEncoderEstimator", self.uid) + self._setDefault(handleInvalid="error", dropLast=True) + kwargs = self._input_kwargs + self.setParams(**kwargs) + + @keyword_only + @since("2.3.0") + def setParams(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True): + """ + setParams(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True) + Sets params for this OneHotEncoderEstimator. + """ + kwargs = self._input_kwargs + return self._set(**kwargs) + + @since("2.3.0") + def setDropLast(self, value): + """ + Sets the value of :py:attr:`dropLast`. + """ + return self._set(dropLast=value) + + @since("2.3.0") + def getDropLast(self): + """ + Gets the value of dropLast or its default value. + """ + return self.getOrDefault(self.dropLast) + + def _create_model(self, java_model): + return OneHotEncoderModel(java_model) + + +class OneHotEncoderModel(JavaModel, JavaMLReadable, JavaMLWritable): + """ + Model fitted by :py:class:`OneHotEncoderEstimator`. + + .. versionadded:: 2.3.0 + """ + + @property + @since("2.3.0") + def categorySizes(self): + """ + Original number of categories for each feature being encoded. + The array contains one value for each input column, in order. + """ + return self._call_java("categorySizes") + + @inherit_doc class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): diff --git a/python/pyspark/ml/image.py b/python/pyspark/ml/image.py index c9b840276f67..2d86c7f03860 100644 --- a/python/pyspark/ml/image.py +++ b/python/pyspark/ml/image.py @@ -194,9 +194,9 @@ def readImages(self, path, recursive=False, numPartitions=-1, :return: a :class:`DataFrame` with a single column of "images", see ImageSchema for details. - >>> df = ImageSchema.readImages('python/test_support/image/kittens', recursive=True) + >>> df = ImageSchema.readImages('data/mllib/images/kittens', recursive=True) >>> df.count() - 4 + 5 .. 
versionadded:: 2.3.0 """ @@ -216,3 +216,25 @@ def readImages(self, path, recursive=False, numPartitions=-1, def _disallow_instance(_): raise RuntimeError("Creating instance of _ImageSchema class is disallowed.") _ImageSchema.__init__ = _disallow_instance + + +def _test(): + import doctest + import pyspark.ml.image + globs = pyspark.ml.image.__dict__.copy() + spark = SparkSession.builder\ + .master("local[2]")\ + .appName("ml.image tests")\ + .getOrCreate() + globs['spark'] = spark + + (failure_count, test_count) = doctest.testmod( + pyspark.ml.image, globs=globs, + optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) + spark.stop() + if failure_count: + exit(-1) + + +if __name__ == "__main__": + _test() diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index d55d209d0939..db951d81de1e 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -119,10 +119,12 @@ def get$Name(self): ("inputCol", "input column name.", None, "TypeConverters.toString"), ("inputCols", "input column names.", None, "TypeConverters.toListString"), ("outputCol", "output column name.", "self.uid + '__output'", "TypeConverters.toString"), + ("outputCols", "output column names.", None, "TypeConverters.toListString"), ("numFeatures", "number of features.", None, "TypeConverters.toInt"), ("checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). " + - "E.g. 10 means that the cache will get checkpointed every 10 iterations.", None, - "TypeConverters.toInt"), + "E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: " + + "this setting will be ignored if the checkpoint directory is not set in the SparkContext.", + None, "TypeConverters.toInt"), ("seed", "random seed.", "hash(type(self).__name__)", "TypeConverters.toInt"), ("tol", "the convergence tolerance for iterative algorithms (>= 0).", None, "TypeConverters.toFloat"), diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index e5c5ddfba6c1..474c38764e5a 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -256,6 +256,29 @@ def getOutputCol(self): return self.getOrDefault(self.outputCol) +class HasOutputCols(Params): + """ + Mixin for param outputCols: output column names. + """ + + outputCols = Param(Params._dummy(), "outputCols", "output column names.", typeConverter=TypeConverters.toListString) + + def __init__(self): + super(HasOutputCols, self).__init__() + + def setOutputCols(self, value): + """ + Sets the value of :py:attr:`outputCols`. + """ + return self._set(outputCols=value) + + def getOutputCols(self): + """ + Gets the value of outputCols or its default value. + """ + return self.getOrDefault(self.outputCols) + + class HasNumFeatures(Params): """ Mixin for param numFeatures: number of features. @@ -281,10 +304,10 @@ def getNumFeatures(self): class HasCheckpointInterval(Params): """ - Mixin for param checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. + Mixin for param checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. 
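As a side note to the checkpointInterval clarification above, a minimal sketch (hypothetical checkpoint path, assuming a live `spark` session) of what actually makes the parameter take effect:

    from pyspark.ml.recommendation import ALS

    # Without this call, checkpointInterval below is silently ignored.
    spark.sparkContext.setCheckpointDir("/tmp/spark-checkpoints")  # hypothetical path

    als = ALS(userCol="user", itemCol="item", ratingCol="rating",
              checkpointInterval=10)  # checkpoint intermediate state every 10 iterations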
""" - checkpointInterval = Param(Params._dummy(), "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.", typeConverter=TypeConverters.toInt) + checkpointInterval = Param(Params._dummy(), "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.", typeConverter=TypeConverters.toInt) def __init__(self): super(HasCheckpointInterval, self).__init__() diff --git a/python/pyspark/sql/catalog.py b/python/pyspark/sql/catalog.py index 659bc65701a0..6aef0f22340b 100644 --- a/python/pyspark/sql/catalog.py +++ b/python/pyspark/sql/catalog.py @@ -224,42 +224,17 @@ def dropGlobalTempView(self, viewName): """ self._jcatalog.dropGlobalTempView(viewName) - @ignore_unicode_prefix @since(2.0) - def registerFunction(self, name, f, returnType=StringType()): - """Registers a python function (including lambda function) as a UDF - so it can be used in SQL statements. - - In addition to a name and the function itself, the return type can be optionally specified. - When the return type is not given it default to a string and conversion will automatically - be done. For any other return type, the produced object must match the specified type. - - :param name: name of the UDF - :param f: python function - :param returnType: a :class:`pyspark.sql.types.DataType` object - :return: a wrapped :class:`UserDefinedFunction` - - >>> strlen = spark.catalog.registerFunction("stringLengthString", len) - >>> spark.sql("SELECT stringLengthString('test')").collect() - [Row(stringLengthString(test)=u'4')] - - >>> spark.sql("SELECT 'foo' AS text").select(strlen("text")).collect() - [Row(stringLengthString(text)=u'3')] - - >>> from pyspark.sql.types import IntegerType - >>> _ = spark.catalog.registerFunction("stringLengthInt", len, IntegerType()) - >>> spark.sql("SELECT stringLengthInt('test')").collect() - [Row(stringLengthInt(test)=4)] - - >>> from pyspark.sql.types import IntegerType - >>> _ = spark.udf.register("stringLengthInt", len, IntegerType()) - >>> spark.sql("SELECT stringLengthInt('test')").collect() - [Row(stringLengthInt(test)=4)] + def registerFunction(self, name, f, returnType=None): + """An alias for :func:`spark.udf.register`. + See :meth:`pyspark.sql.UDFRegistration.register`. + + .. note:: Deprecated in 2.3.0. Use :func:`spark.udf.register` instead. """ - udf = UserDefinedFunction(f, returnType=returnType, name=name, - evalType=PythonEvalType.SQL_BATCHED_UDF) - self._jsparkSession.udf().registerPython(name, udf._judf) - return udf._wrapped() + warnings.warn( + "Deprecated in 2.3.0. 
Use spark.udf.register instead.", + DeprecationWarning) + return self._sparkSession.udf.register(name, f, returnType) @since(2.0) def isCached(self, tableName): diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index b1e723cdecef..cc1cd1a5842d 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -29,9 +29,10 @@ from pyspark.sql.readwriter import DataFrameReader from pyspark.sql.streaming import DataStreamReader from pyspark.sql.types import IntegerType, Row, StringType +from pyspark.sql.udf import UDFRegistration from pyspark.sql.utils import install_exception_handler -__all__ = ["SQLContext", "HiveContext", "UDFRegistration"] +__all__ = ["SQLContext", "HiveContext"] class SQLContext(object): @@ -147,7 +148,7 @@ def udf(self): :return: :class:`UDFRegistration` """ - return UDFRegistration(self) + return self.sparkSession.udf @since(1.4) def range(self, start, end=None, step=1, numPartitions=None): @@ -172,82 +173,29 @@ def range(self, start, end=None, step=1, numPartitions=None): """ return self.sparkSession.range(start, end, step, numPartitions) - @ignore_unicode_prefix @since(1.2) - def registerFunction(self, name, f, returnType=StringType()): - """Registers a python function (including lambda function) as a UDF - so it can be used in SQL statements. - - In addition to a name and the function itself, the return type can be optionally specified. - When the return type is not given it default to a string and conversion will automatically - be done. For any other return type, the produced object must match the specified type. - - :param name: name of the UDF - :param f: python function - :param returnType: a :class:`pyspark.sql.types.DataType` object - :return: a wrapped :class:`UserDefinedFunction` - - >>> strlen = sqlContext.registerFunction("stringLengthString", lambda x: len(x)) - >>> sqlContext.sql("SELECT stringLengthString('test')").collect() - [Row(stringLengthString(test)=u'4')] - - >>> sqlContext.sql("SELECT 'foo' AS text").select(strlen("text")).collect() - [Row(stringLengthString(text)=u'3')] - - >>> from pyspark.sql.types import IntegerType - >>> _ = sqlContext.registerFunction("stringLengthInt", lambda x: len(x), IntegerType()) - >>> sqlContext.sql("SELECT stringLengthInt('test')").collect() - [Row(stringLengthInt(test)=4)] - - >>> from pyspark.sql.types import IntegerType - >>> _ = sqlContext.udf.register("stringLengthInt", lambda x: len(x), IntegerType()) - >>> sqlContext.sql("SELECT stringLengthInt('test')").collect() - [Row(stringLengthInt(test)=4)] + def registerFunction(self, name, f, returnType=None): + """An alias for :func:`spark.udf.register`. + See :meth:`pyspark.sql.UDFRegistration.register`. + + .. note:: Deprecated in 2.3.0. Use :func:`spark.udf.register` instead. """ - return self.sparkSession.catalog.registerFunction(name, f, returnType) + warnings.warn( + "Deprecated in 2.3.0. Use spark.udf.register instead.", + DeprecationWarning) + return self.sparkSession.udf.register(name, f, returnType) - @ignore_unicode_prefix @since(2.1) def registerJavaFunction(self, name, javaClassName, returnType=None): - """Register a java UDF so it can be used in SQL statements. - - In addition to a name and the function itself, the return type can be optionally specified. - When the return type is not specified we would infer it via reflection. 
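All of the deprecations in this hunk funnel into the same replacement call; a minimal before/after sketch, assuming a SparkSession named `spark`:

    from pyspark.sql.types import IntegerType

    # Deprecated in 2.3.0:
    #   sqlContext.registerFunction("strlen", lambda s: len(s), IntegerType())
    # Preferred:
    strlen = spark.udf.register("strlen", lambda s: len(s), IntegerType())
    spark.sql("SELECT strlen('test')").collect()  # [Row(strlen(test)=4)]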
- :param name: name of the UDF - :param javaClassName: fully qualified name of java class - :param returnType: a :class:`pyspark.sql.types.DataType` object - - >>> sqlContext.registerJavaFunction("javaStringLength", - ... "test.org.apache.spark.sql.JavaStringLength", IntegerType()) - >>> sqlContext.sql("SELECT javaStringLength('test')").collect() - [Row(UDF:javaStringLength(test)=4)] - >>> sqlContext.registerJavaFunction("javaStringLength2", - ... "test.org.apache.spark.sql.JavaStringLength") - >>> sqlContext.sql("SELECT javaStringLength2('test')").collect() - [Row(UDF:javaStringLength2(test)=4)] + """An alias for :func:`spark.udf.registerJavaFunction`. + See :meth:`pyspark.sql.UDFRegistration.registerJavaFunction`. + .. note:: Deprecated in 2.3.0. Use :func:`spark.udf.registerJavaFunction` instead. """ - jdt = None - if returnType is not None: - jdt = self.sparkSession._jsparkSession.parseDataType(returnType.json()) - self.sparkSession._jsparkSession.udf().registerJava(name, javaClassName, jdt) - - @ignore_unicode_prefix - @since(2.3) - def registerJavaUDAF(self, name, javaClassName): - """Register a java UDAF so it can be used in SQL statements. - - :param name: name of the UDAF - :param javaClassName: fully qualified name of java class - - >>> sqlContext.registerJavaUDAF("javaUDAF", - ... "test.org.apache.spark.sql.MyDoubleAvg") - >>> df = sqlContext.createDataFrame([(1, "a"),(2, "b"), (3, "a")],["id", "name"]) - >>> df.registerTempTable("df") - >>> sqlContext.sql("SELECT name, javaUDAF(id) as avg from df group by name").collect() - [Row(name=u'b', avg=102.0), Row(name=u'a', avg=102.0)] - """ - self.sparkSession._jsparkSession.udf().registerJavaUDAF(name, javaClassName) + warnings.warn( + "Deprecated in 2.3.0. Use spark.udf.registerJavaFunction instead.", + DeprecationWarning) + return self.sparkSession.udf.registerJavaFunction(name, javaClassName, returnType) # TODO(andrew): delete this once we refactor things to take in SparkSession def _inferSchema(self, rdd, samplingRatio=None): @@ -559,24 +507,6 @@ def refreshTable(self, tableName): self._ssql_ctx.refreshTable(tableName) -class UDFRegistration(object): - """Wrapper for user-defined function registration.""" - - def __init__(self, sqlContext): - self.sqlContext = sqlContext - - def register(self, name, f, returnType=StringType()): - return self.sqlContext.registerFunction(name, f, returnType) - - def registerJavaFunction(self, name, javaClassName, returnType=None): - self.sqlContext.registerJavaFunction(name, javaClassName, returnType) - - def registerJavaUDAF(self, name, javaClassName): - self.sqlContext.registerJavaUDAF(name, javaClassName) - - register.__doc__ = SQLContext.registerFunction.__doc__ - - def _test(): import os import doctest diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 95eca76fa988..2d5e9b91468c 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1364,7 +1364,8 @@ def subtract(self, other): """ Return a new :class:`DataFrame` containing rows in this frame but not in another frame. - This is equivalent to `EXCEPT` in SQL. + This is equivalent to `EXCEPT DISTINCT` in SQL. 
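The wording fix just above ('EXCEPT DISTINCT' rather than 'EXCEPT') matters because subtract also de-duplicates the rows it keeps; a small sketch, assuming a SparkSession `spark`:

    df1 = spark.createDataFrame([(1,), (1,), (2,)], ["a"])
    df2 = spark.createDataFrame([(2,)], ["a"])

    # EXCEPT DISTINCT semantics: a single row with a=1 comes back,
    # even though df1 contains that value twice.
    df1.subtract(df2).show()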
+ """ return DataFrame(getattr(self._jdf, "except")(other._jdf), self.sql_ctx) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index a4ed562ad48b..961b3267b44c 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1849,14 +1849,14 @@ def explode_outer(col): +---+----------+----+-----+ >>> df.select("id", "a_map", explode_outer("an_array")).show() - +---+-------------+----+ - | id| a_map| col| - +---+-------------+----+ - | 1|Map(x -> 1.0)| foo| - | 1|Map(x -> 1.0)| bar| - | 2| Map()|null| - | 3| null|null| - +---+-------------+----+ + +---+----------+----+ + | id| a_map| col| + +---+----------+----+ + | 1|[x -> 1.0]| foo| + | 1|[x -> 1.0]| bar| + | 2| []|null| + | 3| null|null| + +---+----------+----+ """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.explode_outer(_to_java_column(col)) @@ -1881,14 +1881,14 @@ def posexplode_outer(col): | 3| null|null|null| null| +---+----------+----+----+-----+ >>> df.select("id", "a_map", posexplode_outer("an_array")).show() - +---+-------------+----+----+ - | id| a_map| pos| col| - +---+-------------+----+----+ - | 1|Map(x -> 1.0)| 0| foo| - | 1|Map(x -> 1.0)| 1| bar| - | 2| Map()|null|null| - | 3| null|null|null| - +---+-------------+----+----+ + +---+----------+----+----+ + | id| a_map| pos| col| + +---+----------+----+----+ + | 1|[x -> 1.0]| 0| foo| + | 1|[x -> 1.0]| 1| bar| + | 2| []|null|null| + | 3| null|null|null| + +---+----------+----+----+ """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.posexplode_outer(_to_java_column(col)) @@ -2103,12 +2103,13 @@ def udf(f=None, returnType=StringType()): >>> import random >>> random_udf = udf(lambda: int(random.random() * 100), IntegerType()).asNondeterministic() - .. note:: The user-defined functions do not support conditional expressions or short curcuiting + .. note:: The user-defined functions do not support conditional expressions or short circuiting in boolean expressions and it ends up with being executed all internally. If the functions can fail on special rows, the workaround is to incorporate the condition into the functions. :param f: python function if used as a standalone function - :param returnType: a :class:`pyspark.sql.types.DataType` object + :param returnType: the return type of the user-defined function. The value can be either a + :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. >>> from pyspark.sql.types import IntegerType >>> slen = udf(lambda s: len(s), IntegerType()) @@ -2148,7 +2149,8 @@ def pandas_udf(f=None, returnType=None, functionType=None): Creates a vectorized user defined function (UDF). :param f: user-defined function. A python function if used as a standalone function - :param returnType: a :class:`pyspark.sql.types.DataType` object + :param returnType: the return type of the user-defined function. The value can be either a + :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. :param functionType: an enum value in :class:`pyspark.sql.functions.PandasUDFType`. Default: SCALAR. @@ -2184,6 +2186,11 @@ def pandas_udf(f=None, returnType=None, functionType=None): | 8| JOHN DOE| 22| +----------+--------------+------------+ + .. note:: The length of `pandas.Series` within a scalar UDF is not that of the whole input + column, but is the length of an internal batch used for each call to the function. 
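The updated returnType documentation in this hunk means a UDF's type can now be given as a DDL-formatted string as well as a DataType object; a short sketch, assuming `spark`:

    from pyspark.sql.functions import udf, col

    slen = udf(lambda s: len(s), "int")                         # DDL string, same as IntegerType()
    to_pair = udf(lambda s: (s, len(s)), "word string, n int")  # struct type from a DDL string

    df = spark.createDataFrame([("test",)], ["s"])
    df.select(slen(col("s")), to_pair(col("s"))).show()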
+ Therefore, this can be used, for example, to ensure the length of each returned + `pandas.Series`, and can not be used as the column length. + 2. GROUP_MAP A group map UDF defines transformation: A `pandas.DataFrame` -> A `pandas.DataFrame` @@ -2214,9 +2221,19 @@ def pandas_udf(f=None, returnType=None, functionType=None): .. seealso:: :meth:`pyspark.sql.GroupedData.apply` - .. note:: The user-defined function must be deterministic. + .. note:: The user-defined functions are considered deterministic by default. Due to + optimization, duplicate invocations may be eliminated or the function may even be invoked + more times than it is present in the query. If your function is not deterministic, call + `asNondeterministic` on the user defined function. E.g.: + + >>> @pandas_udf('double', PandasUDFType.SCALAR) # doctest: +SKIP + ... def random(v): + ... import numpy as np + ... import pandas as pd + ... return pd.Series(np.random.randn(len(v)) + >>> random = random.asNondeterministic() # doctest: +SKIP - .. note:: The user-defined functions do not support conditional expressions or short curcuiting + .. note:: The user-defined functions do not support conditional expressions or short circuiting in boolean expressions and it ends up with being executed all internally. If the functions can fail on special rows, the workaround is to incorporate the condition into the functions. """ diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py index 09fae46adf01..22061b83eb78 100644 --- a/python/pyspark/sql/group.py +++ b/python/pyspark/sql/group.py @@ -212,7 +212,8 @@ def apply(self, udf): This function does not support partial aggregation, and requires shuffling all the data in the :class:`DataFrame`. - :param udf: A function object returned by :meth:`pyspark.sql.functions.pandas_udf` + :param udf: a group map user-defined function returned by + :meth:`pyspark.sql.functions.pandas_udf`. >>> from pyspark.sql.functions import pandas_udf, PandasUDFType >>> df = spark.createDataFrame( diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index 6e5eec48e8ac..6c84023c43fb 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -29,7 +29,6 @@ from pyspark import since from pyspark.rdd import RDD, ignore_unicode_prefix -from pyspark.sql.catalog import Catalog from pyspark.sql.conf import RuntimeConfig from pyspark.sql.dataframe import DataFrame from pyspark.sql.readwriter import DataFrameReader @@ -280,6 +279,7 @@ def catalog(self): :return: :class:`Catalog` """ + from pyspark.sql.catalog import Catalog if not hasattr(self, "_catalog"): self._catalog = Catalog(self) return self._catalog @@ -291,8 +291,8 @@ def udf(self): :return: :class:`UDFRegistration` """ - from pyspark.sql.context import UDFRegistration - return UDFRegistration(self._wrapped) + from pyspark.sql.udf import UDFRegistration + return UDFRegistration(self) @since(2.0) def range(self, start, end=None, step=1, numPartitions=None): @@ -325,11 +325,12 @@ def range(self, start, end=None, step=1, numPartitions=None): return DataFrame(jdf, self._wrapped) - def _inferSchemaFromList(self, data): + def _inferSchemaFromList(self, data, names=None): """ Infer schema from list of Row or tuple. 
:param data: list of Row or tuple + :param names: list of column names :return: :class:`pyspark.sql.types.StructType` """ if not data: @@ -338,12 +339,12 @@ def _inferSchemaFromList(self, data): if type(first) is dict: warnings.warn("inferring schema from dict is deprecated," "please use pyspark.sql.Row instead") - schema = reduce(_merge_type, map(_infer_schema, data)) + schema = reduce(_merge_type, (_infer_schema(row, names) for row in data)) if _has_nulltype(schema): raise ValueError("Some of types cannot be determined after inferring") return schema - def _inferSchema(self, rdd, samplingRatio=None): + def _inferSchema(self, rdd, samplingRatio=None, names=None): """ Infer schema from an RDD of Row or tuple. @@ -360,10 +361,10 @@ def _inferSchema(self, rdd, samplingRatio=None): "Use pyspark.sql.Row instead") if samplingRatio is None: - schema = _infer_schema(first) + schema = _infer_schema(first, names=names) if _has_nulltype(schema): for row in rdd.take(100)[1:]: - schema = _merge_type(schema, _infer_schema(row)) + schema = _merge_type(schema, _infer_schema(row, names=names)) if not _has_nulltype(schema): break else: @@ -372,7 +373,7 @@ def _inferSchema(self, rdd, samplingRatio=None): else: if samplingRatio < 0.99: rdd = rdd.sample(False, float(samplingRatio)) - schema = rdd.map(_infer_schema).reduce(_merge_type) + schema = rdd.map(lambda row: _infer_schema(row, names)).reduce(_merge_type) return schema def _createFromRDD(self, rdd, schema, samplingRatio): @@ -380,7 +381,7 @@ def _createFromRDD(self, rdd, schema, samplingRatio): Create an RDD for DataFrame from an existing RDD, returns the RDD and schema. """ if schema is None or isinstance(schema, (list, tuple)): - struct = self._inferSchema(rdd, samplingRatio) + struct = self._inferSchema(rdd, samplingRatio, names=schema) converter = _create_converter(struct) rdd = rdd.map(converter) if isinstance(schema, (list, tuple)): @@ -406,7 +407,7 @@ def _createFromLocal(self, data, schema): data = list(data) if schema is None or isinstance(schema, (list, tuple)): - struct = self._inferSchemaFromList(data) + struct = self._inferSchemaFromList(data, names=schema) converter = _create_converter(struct) data = map(converter, data) if isinstance(schema, (list, tuple)): @@ -458,21 +459,23 @@ def _convert_from_pandas(self, pdf, schema, timezone): # TODO: handle nested timestamps, such as ArrayType(TimestampType())? 
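The names plumbing above is what lets createDataFrame honour an explicit list of column names during schema inference; a sketch of the resulting behaviour (assuming `spark`), mirroring the new tests later in this patch:

    # Fewer names than columns: the remainder fall back to positional _N names.
    df = spark.createDataFrame([["a", "b"]], ["col1"])
    df.columns   # ['col1', '_2']

    # A dict row no longer overrides the explicitly supplied schema names.
    df2 = spark.createDataFrame([{"a": 1}], ["b"])
    df2.columns  # ['b']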
if isinstance(field.dataType, TimestampType): s = _check_series_convert_timestamps_tz_local(pdf[field.name], timezone) - if not copied and s is not pdf[field.name]: - # Copy once if the series is modified to prevent the original Pandas - # DataFrame from being updated - pdf = pdf.copy() - copied = True - pdf[field.name] = s + if s is not pdf[field.name]: + if not copied: + # Copy once if the series is modified to prevent the original + # Pandas DataFrame from being updated + pdf = pdf.copy() + copied = True + pdf[field.name] = s else: for column, series in pdf.iteritems(): - s = _check_series_convert_timestamps_tz_local(pdf[column], timezone) - if not copied and s is not pdf[column]: - # Copy once if the series is modified to prevent the original Pandas - # DataFrame from being updated - pdf = pdf.copy() - copied = True - pdf[column] = s + s = _check_series_convert_timestamps_tz_local(series, timezone) + if s is not series: + if not copied: + # Copy once if the series is modified to prevent the original + # Pandas DataFrame from being updated + pdf = pdf.copy() + copied = True + pdf[column] = s # Convert pandas.DataFrame to list of numpy records np_records = pdf.to_records(index=False) @@ -645,7 +648,9 @@ def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=Tr # If no schema supplied by user then get the names of columns only if schema is None: - schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in data.columns] + schema = [str(x) if not isinstance(x, basestring) else + (x.encode('utf-8') if not isinstance(x, str) else x) + for x in data.columns] if self.conf.get("spark.sql.execution.arrow.enabled", "false").lower() == "true" \ and len(data) > 0: diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index fb228f99ba7a..e2a97acb5e2a 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -786,35 +786,54 @@ def queryName(self, queryName): @keyword_only @since(2.0) - def trigger(self, processingTime=None, once=None): + def trigger(self, processingTime=None, once=None, continuous=None): """Set the trigger for the stream query. If this is not set it will run the query as fast as possible, which is equivalent to setting the trigger to ``processingTime='0 seconds'``. .. note:: Evolving. :param processingTime: a processing time interval as a string, e.g. '5 seconds', '1 minute'. + Set a trigger that runs a query periodically based on the processing + time. Only one trigger can be set. + :param once: if set to True, set a trigger that processes only one batch of data in a + streaming query then terminates the query. Only one trigger can be set. >>> # trigger the query for execution every 5 seconds >>> writer = sdf.writeStream.trigger(processingTime='5 seconds') >>> # trigger the query for just once batch of data >>> writer = sdf.writeStream.trigger(once=True) + >>> # trigger the query for execution every 5 seconds + >>> writer = sdf.writeStream.trigger(continuous='5 seconds') """ + params = [processingTime, once, continuous] + + if params.count(None) == 3: + raise ValueError('No trigger provided') + elif params.count(None) < 2: + raise ValueError('Multiple triggers not allowed.') + jTrigger = None if processingTime is not None: - if once is not None: - raise ValueError('Multiple triggers not allowed.') if type(processingTime) != str or len(processingTime.strip()) == 0: raise ValueError('Value for processingTime must be a non empty string. 
Got: %s' % processingTime) interval = processingTime.strip() jTrigger = self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.ProcessingTime( interval) + elif once is not None: if once is not True: raise ValueError('Value for once must be True. Got: %s' % once) jTrigger = self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.Once() + else: - raise ValueError('No trigger provided') + if type(continuous) != str or len(continuous.strip()) == 0: + raise ValueError('Value for continuous must be a non empty string. Got: %s' % + continuous) + interval = continuous.strip() + jTrigger = self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.Continuous( + interval) + self._jwrite = self._jwrite.trigger(jTrigger) return self diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 67bdb3d72d93..4fee2ecde391 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -68,6 +68,7 @@ from pyspark.sql.types import UserDefinedType, _infer_type, _make_type_verifier from pyspark.sql.types import _array_signed_int_typecode_ctype_mappings, _array_type_mappings from pyspark.sql.types import _array_unsigned_int_typecode_ctype_mappings +from pyspark.sql.types import _merge_type from pyspark.tests import QuietTest, ReusedPySparkTestCase, SparkSubmitTests from pyspark.sql.functions import UserDefinedFunction, sha2, lit from pyspark.sql.window import Window @@ -371,6 +372,12 @@ def test_udf(self): [row] = self.spark.sql("SELECT twoArgs('test', 1)").collect() self.assertEqual(row[0], 5) + # This is to check if a deprecated 'SQLContext.registerFunction' can call its alias. + sqlContext = self.spark._wrapped + sqlContext.registerFunction("oneArg", lambda x: len(x), IntegerType()) + [row] = sqlContext.sql("SELECT oneArg('test')").collect() + self.assertEqual(row[0], 4) + def test_udf2(self): self.spark.catalog.registerFunction("strlen", lambda string: len(string), IntegerType()) self.spark.createDataFrame(self.sc.parallelize([Row(a="test")]))\ @@ -378,6 +385,68 @@ def test_udf2(self): [res] = self.spark.sql("SELECT strlen(a) FROM test WHERE strlen(a) > 1").collect() self.assertEqual(4, res[0]) + def test_udf3(self): + two_args = self.spark.catalog.registerFunction( + "twoArgs", UserDefinedFunction(lambda x, y: len(x) + y)) + self.assertEqual(two_args.deterministic, True) + [row] = self.spark.sql("SELECT twoArgs('test', 1)").collect() + self.assertEqual(row[0], u'5') + + def test_udf_registration_return_type_none(self): + two_args = self.spark.catalog.registerFunction( + "twoArgs", UserDefinedFunction(lambda x, y: len(x) + y, "integer"), None) + self.assertEqual(two_args.deterministic, True) + [row] = self.spark.sql("SELECT twoArgs('test', 1)").collect() + self.assertEqual(row[0], 5) + + def test_udf_registration_return_type_not_none(self): + with QuietTest(self.sc): + with self.assertRaisesRegexp(TypeError, "Invalid returnType"): + self.spark.catalog.registerFunction( + "f", UserDefinedFunction(lambda x, y: len(x) + y, StringType()), StringType()) + + def test_nondeterministic_udf(self): + # Test that nondeterministic UDFs are evaluated only once in chained UDF evaluations + from pyspark.sql.functions import udf + import random + udf_random_col = udf(lambda: int(100 * random.random()), IntegerType()).asNondeterministic() + self.assertEqual(udf_random_col.deterministic, False) + df = self.spark.createDataFrame([Row(1)]).select(udf_random_col().alias('RAND')) + udf_add_ten = udf(lambda rand: rand + 10, IntegerType()) + [row] = df.withColumn('RAND_PLUS_TEN', 
udf_add_ten('RAND')).collect() + self.assertEqual(row[0] + 10, row[1]) + + def test_nondeterministic_udf2(self): + import random + from pyspark.sql.functions import udf + random_udf = udf(lambda: random.randint(6, 6), IntegerType()).asNondeterministic() + self.assertEqual(random_udf.deterministic, False) + random_udf1 = self.spark.catalog.registerFunction("randInt", random_udf) + self.assertEqual(random_udf1.deterministic, False) + [row] = self.spark.sql("SELECT randInt()").collect() + self.assertEqual(row[0], 6) + [row] = self.spark.range(1).select(random_udf1()).collect() + self.assertEqual(row[0], 6) + [row] = self.spark.range(1).select(random_udf()).collect() + self.assertEqual(row[0], 6) + # render_doc() reproduces the help() exception without printing output + pydoc.render_doc(udf(lambda: random.randint(6, 6), IntegerType())) + pydoc.render_doc(random_udf) + pydoc.render_doc(random_udf1) + pydoc.render_doc(udf(lambda x: x).asNondeterministic) + + def test_nondeterministic_udf_in_aggregate(self): + from pyspark.sql.functions import udf, sum + import random + udf_random_col = udf(lambda: int(100 * random.random()), 'int').asNondeterministic() + df = self.spark.range(10) + + with QuietTest(self.sc): + with self.assertRaisesRegexp(AnalysisException, "nondeterministic"): + df.groupby('id').agg(sum(udf_random_col())).collect() + with self.assertRaisesRegexp(AnalysisException, "nondeterministic"): + df.agg(sum(udf_random_col())).collect() + def test_chained_udf(self): self.spark.catalog.registerFunction("double", lambda x: x + x, IntegerType()) [row] = self.spark.sql("SELECT double(1)").collect() @@ -435,15 +504,6 @@ def test_udf_with_array_type(self): self.assertEqual(list(range(3)), l1) self.assertEqual(1, l2) - def test_nondeterministic_udf(self): - from pyspark.sql.functions import udf - import random - udf_random_col = udf(lambda: int(100 * random.random()), IntegerType()).asNondeterministic() - df = self.spark.createDataFrame([Row(1)]).select(udf_random_col().alias('RAND')) - udf_add_ten = udf(lambda rand: rand + 10, IntegerType()) - [row] = df.withColumn('RAND_PLUS_TEN', udf_add_ten('RAND')).collect() - self.assertEqual(row[0] + 10, row[1]) - def test_broadcast_in_udf(self): bar = {"a": "aa", "b": "bb", "c": "abc"} foo = self.sc.broadcast(bar) @@ -523,11 +583,25 @@ def test_udf_registration_returns_udf(self): df.select(add_three("id").alias("plus_three")).collect() ) + # This is to check if a 'SQLContext.udf' can call its alias. + sqlContext = self.spark._wrapped + add_four = sqlContext.udf.register("add_four", lambda x: x + 4, IntegerType()) + + self.assertListEqual( + df.selectExpr("add_four(id) AS plus_four").collect(), + df.select(add_four("id").alias("plus_four")).collect() + ) + def test_non_existed_udf(self): spark = self.spark self.assertRaisesRegexp(AnalysisException, "Can not load class non_existed_udf", lambda: spark.udf.registerJavaFunction("udf1", "non_existed_udf")) + # This is to check if a deprecated 'SQLContext.registerJavaFunction' can call its alias. 
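These tests exercise the nondeterministic-UDF plumbing; a condensed sketch of the user-facing behaviour they check (assuming `spark`; the aggregate call is left commented out because it raises):

    import random
    from pyspark.sql.functions import udf, sum as sum_
    from pyspark.sql.types import IntegerType

    rand_udf = udf(lambda: random.randint(0, 100), IntegerType()).asNondeterministic()
    rand_udf.deterministic                        # False, and registration preserves the flag
    registered = spark.udf.register("rand_udf", rand_udf)

    # Nondeterministic expressions are rejected inside aggregates (AnalysisException):
    # spark.range(10).agg(sum_(rand_udf())).collect()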
+ sqlContext = spark._wrapped + self.assertRaisesRegexp(AnalysisException, "Can not load class non_existed_udf", + lambda: sqlContext.registerJavaFunction("udf1", "non_existed_udf")) + def test_non_existed_udaf(self): spark = self.spark self.assertRaisesRegexp(AnalysisException, "Can not load class non_existed_udaf", @@ -567,7 +641,6 @@ def test_read_multiple_orc_file(self): def test_udf_with_input_file_name(self): from pyspark.sql.functions import udf, input_file_name - from pyspark.sql.types import StringType sourceFile = udf(lambda path: path, StringType()) filePath = "python/test_support/sql/people1.json" row = self.spark.read.json(filePath).select(sourceFile(input_file_name())).first() @@ -575,7 +648,6 @@ def test_udf_with_input_file_name(self): def test_udf_with_input_file_name_for_hadooprdd(self): from pyspark.sql.functions import udf, input_file_name - from pyspark.sql.types import StringType def filename(path): return path @@ -635,7 +707,6 @@ def test_udf_with_string_return_type(self): def test_udf_shouldnt_accept_noncallable_object(self): from pyspark.sql.functions import UserDefinedFunction - from pyspark.sql.types import StringType non_callable = None self.assertRaises(TypeError, UserDefinedFunction, non_callable, StringType()) @@ -861,6 +932,15 @@ def test_infer_schema(self): result = self.spark.sql("SELECT l[0].a from test2 where d['key'].d = '2'") self.assertEqual(1, result.head()[0]) + def test_infer_schema_not_enough_names(self): + df = self.spark.createDataFrame([["a", "b"]], ["col1"]) + self.assertEqual(df.columns, ['col1', '_2']) + + def test_infer_schema_fails(self): + with self.assertRaisesRegexp(TypeError, 'field a'): + self.spark.createDataFrame(self.spark.sparkContext.parallelize([[1, 1], ["x", 1]]), + schema=["a", "b"], samplingRatio=0.99) + def test_infer_nested_schema(self): NestedRow = Row("f1", "f2") nestedRdd1 = self.sc.parallelize([NestedRow([1, 2], {"row1": 1.0}), @@ -881,6 +961,10 @@ def test_infer_nested_schema(self): df = self.spark.createDataFrame(rdd) self.assertEqual(Row(field1=1, field2=u'row1'), df.first()) + def test_create_dataframe_from_dict_respects_schema(self): + df = self.spark.createDataFrame([{'a': 1}], ["b"]) + self.assertEqual(df.columns, ['b']) + def test_create_dataframe_from_objects(self): data = [MyObject(1, "1"), MyObject(2, "2")] df = self.spark.createDataFrame(data) @@ -1105,6 +1189,17 @@ def test_union_with_udt(self): ] ) + def test_cast_to_string_with_udt(self): + from pyspark.sql.tests import ExamplePointUDT, ExamplePoint + from pyspark.sql.functions import col + row = (ExamplePoint(1.0, 2.0), PythonOnlyPoint(3.0, 4.0)) + schema = StructType([StructField("point", ExamplePointUDT(), False), + StructField("pypoint", PythonOnlyUDT(), False)]) + df = self.spark.createDataFrame([row], schema) + + result = df.select(col('point').cast('string'), col('pypoint').cast('string')).head() + self.assertEqual(result, Row(point=u'(1.0, 2.0)', pypoint=u'[3.0, 4.0]')) + def test_column_operators(self): ci = self.df.key cs = self.df.value @@ -1299,7 +1394,6 @@ def test_between_function(self): df.filter(df.a.between(df.b, df.c)).collect()) def test_struct_type(self): - from pyspark.sql.types import StructType, StringType, StructField struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None) struct2 = StructType([StructField("f1", StringType(), True), StructField("f2", StringType(), True, None)]) @@ -1368,7 +1462,6 @@ def test_parse_datatype_string(self): _parse_datatype_string("a INT, c DOUBLE")) def 
test_metadata_null(self): - from pyspark.sql.types import StructType, StringType, StructField schema = StructType([StructField("f1", StringType(), True, None), StructField("f2", StringType(), True, {'a': None})]) rdd = self.sc.parallelize([["a", "b"], ["c", "d"]]) @@ -1456,6 +1549,12 @@ def test_stream_trigger(self): except ValueError: pass + # Should not take multiple args + try: + df.writeStream.trigger(processingTime='5 seconds', continuous='1 second') + except ValueError: + pass + # Should take only keyword args try: df.writeStream.trigger('5 seconds') @@ -1737,6 +1836,92 @@ def test_infer_long_type(self): self.assertEqual(_infer_type(2**61), LongType()) self.assertEqual(_infer_type(2**71), LongType()) + def test_merge_type(self): + self.assertEqual(_merge_type(LongType(), NullType()), LongType()) + self.assertEqual(_merge_type(NullType(), LongType()), LongType()) + + self.assertEqual(_merge_type(LongType(), LongType()), LongType()) + + self.assertEqual(_merge_type( + ArrayType(LongType()), + ArrayType(LongType()) + ), ArrayType(LongType())) + with self.assertRaisesRegexp(TypeError, 'element in array'): + _merge_type(ArrayType(LongType()), ArrayType(DoubleType())) + + self.assertEqual(_merge_type( + MapType(StringType(), LongType()), + MapType(StringType(), LongType()) + ), MapType(StringType(), LongType())) + with self.assertRaisesRegexp(TypeError, 'key of map'): + _merge_type( + MapType(StringType(), LongType()), + MapType(DoubleType(), LongType())) + with self.assertRaisesRegexp(TypeError, 'value of map'): + _merge_type( + MapType(StringType(), LongType()), + MapType(StringType(), DoubleType())) + + self.assertEqual(_merge_type( + StructType([StructField("f1", LongType()), StructField("f2", StringType())]), + StructType([StructField("f1", LongType()), StructField("f2", StringType())]) + ), StructType([StructField("f1", LongType()), StructField("f2", StringType())])) + with self.assertRaisesRegexp(TypeError, 'field f1'): + _merge_type( + StructType([StructField("f1", LongType()), StructField("f2", StringType())]), + StructType([StructField("f1", DoubleType()), StructField("f2", StringType())])) + + self.assertEqual(_merge_type( + StructType([StructField("f1", StructType([StructField("f2", LongType())]))]), + StructType([StructField("f1", StructType([StructField("f2", LongType())]))]) + ), StructType([StructField("f1", StructType([StructField("f2", LongType())]))])) + with self.assertRaisesRegexp(TypeError, 'field f2 in field f1'): + _merge_type( + StructType([StructField("f1", StructType([StructField("f2", LongType())]))]), + StructType([StructField("f1", StructType([StructField("f2", StringType())]))])) + + self.assertEqual(_merge_type( + StructType([StructField("f1", ArrayType(LongType())), StructField("f2", StringType())]), + StructType([StructField("f1", ArrayType(LongType())), StructField("f2", StringType())]) + ), StructType([StructField("f1", ArrayType(LongType())), StructField("f2", StringType())])) + with self.assertRaisesRegexp(TypeError, 'element in array field f1'): + _merge_type( + StructType([ + StructField("f1", ArrayType(LongType())), + StructField("f2", StringType())]), + StructType([ + StructField("f1", ArrayType(DoubleType())), + StructField("f2", StringType())])) + + self.assertEqual(_merge_type( + StructType([ + StructField("f1", MapType(StringType(), LongType())), + StructField("f2", StringType())]), + StructType([ + StructField("f1", MapType(StringType(), LongType())), + StructField("f2", StringType())]) + ), StructType([ + StructField("f1", 
MapType(StringType(), LongType())), + StructField("f2", StringType())])) + with self.assertRaisesRegexp(TypeError, 'value of map field f1'): + _merge_type( + StructType([ + StructField("f1", MapType(StringType(), LongType())), + StructField("f2", StringType())]), + StructType([ + StructField("f1", MapType(StringType(), DoubleType())), + StructField("f2", StringType())])) + + self.assertEqual(_merge_type( + StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]), + StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]) + ), StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))])) + with self.assertRaisesRegexp(TypeError, 'key of map element in array field f1'): + _merge_type( + StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]), + StructType([StructField("f1", ArrayType(MapType(DoubleType(), LongType())))]) + ) + def test_filter_with_datetime(self): time = datetime.datetime(2015, 4, 17, 23, 1, 2, 3000) date = time.date() @@ -3397,6 +3582,15 @@ def test_toPandas_with_array_type(self): self.assertTrue(expected[r][e] == result_arrow[r][e] and result[r][e] == result_arrow[r][e]) + def test_createDataFrame_with_int_col_names(self): + import numpy as np + import pandas as pd + pdf = pd.DataFrame(np.random.rand(4, 2)) + df, df_arrow = self._createDataFrame_toggle(pdf) + pdf_col_names = [str(c) for c in pdf.columns] + self.assertEqual(pdf_col_names, df.columns) + self.assertEqual(pdf_col_names, df_arrow.columns) + @unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed") class PandasUDFTests(ReusedSQLTestCase): @@ -3546,6 +3740,18 @@ def tearDownClass(cls): time.tzset() ReusedSQLTestCase.tearDownClass() + @property + def nondeterministic_vectorized_udf(self): + from pyspark.sql.functions import pandas_udf + + @pandas_udf('double') + def random_udf(v): + import pandas as pd + import numpy as np + return pd.Series(np.random.random(len(v))) + random_udf = random_udf.asNondeterministic() + return random_udf + def test_vectorized_udf_basic(self): from pyspark.sql.functions import pandas_udf, col df = self.spark.range(10).select( @@ -3570,6 +3776,21 @@ def test_vectorized_udf_basic(self): bool_f(col('bool'))) self.assertEquals(df.collect(), res.collect()) + def test_register_nondeterministic_vectorized_udf_basic(self): + from pyspark.sql.functions import pandas_udf + from pyspark.rdd import PythonEvalType + import random + random_pandas_udf = pandas_udf( + lambda x: random.randint(6, 6) + x, IntegerType()).asNondeterministic() + self.assertEqual(random_pandas_udf.deterministic, False) + self.assertEqual(random_pandas_udf.evalType, PythonEvalType.SQL_PANDAS_SCALAR_UDF) + nondeterministic_pandas_udf = self.spark.catalog.registerFunction( + "randomPandasUDF", random_pandas_udf) + self.assertEqual(nondeterministic_pandas_udf.deterministic, False) + self.assertEqual(nondeterministic_pandas_udf.evalType, PythonEvalType.SQL_PANDAS_SCALAR_UDF) + [row] = self.spark.sql("SELECT randomPandasUDF(1)").collect() + self.assertEqual(row[0], 7) + def test_vectorized_udf_null_boolean(self): from pyspark.sql.functions import pandas_udf, col data = [(True,), (True,), (None,), (False,)] @@ -3929,6 +4150,50 @@ def test_vectorized_udf_timestamps_respect_session_timezone(self): finally: self.spark.conf.set("spark.sql.session.timeZone", orig_tz) + def test_nondeterministic_vectorized_udf(self): + # Test that nondeterministic UDFs are evaluated only once in chained UDF evaluations + from pyspark.sql.functions 
import udf, pandas_udf, col + + @pandas_udf('double') + def plus_ten(v): + return v + 10 + random_udf = self.nondeterministic_vectorized_udf + + df = self.spark.range(10).withColumn('rand', random_udf(col('id'))) + result1 = df.withColumn('plus_ten(rand)', plus_ten(df['rand'])).toPandas() + + self.assertEqual(random_udf.deterministic, False) + self.assertTrue(result1['plus_ten(rand)'].equals(result1['rand'] + 10)) + + def test_nondeterministic_vectorized_udf_in_aggregate(self): + from pyspark.sql.functions import pandas_udf, sum + + df = self.spark.range(10) + random_udf = self.nondeterministic_vectorized_udf + + with QuietTest(self.sc): + with self.assertRaisesRegexp(AnalysisException, 'nondeterministic'): + df.groupby(df.id).agg(sum(random_udf(df.id))).collect() + with self.assertRaisesRegexp(AnalysisException, 'nondeterministic'): + df.agg(sum(random_udf(df.id))).collect() + + def test_register_vectorized_udf_basic(self): + from pyspark.rdd import PythonEvalType + from pyspark.sql.functions import pandas_udf, col, expr + df = self.spark.range(10).select( + col('id').cast('int').alias('a'), + col('id').cast('int').alias('b')) + original_add = pandas_udf(lambda x, y: x + y, IntegerType()) + self.assertEqual(original_add.deterministic, True) + self.assertEqual(original_add.evalType, PythonEvalType.SQL_PANDAS_SCALAR_UDF) + new_add = self.spark.catalog.registerFunction("add1", original_add) + res1 = df.select(new_add(col('a'), col('b'))) + res2 = self.spark.sql( + "SELECT add1(t.a, t.b) FROM (SELECT id as a, id as b FROM range(10)) t") + expected = df.select(expr('a + b')) + self.assertEquals(expected.collect(), res1.collect()) + self.assertEquals(expected.collect(), res2.collect()) + @unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed") class GroupbyApplyTests(ReusedSQLTestCase): @@ -3964,6 +4229,15 @@ def test_simple(self): expected = df.toPandas().groupby('id').apply(foo_udf.func).reset_index(drop=True) self.assertFramesEqual(expected, result) + def test_register_group_map_udf(self): + from pyspark.sql.functions import pandas_udf, PandasUDFType + + foo_udf = pandas_udf(lambda x: x, "id long", PandasUDFType.GROUP_MAP) + with QuietTest(self.sc): + with self.assertRaisesRegexp(ValueError, 'f must be either SQL_BATCHED_UDF or ' + 'SQL_PANDAS_SCALAR_UDF'): + self.spark.catalog.registerFunction("foo_udf", foo_udf) + def test_decorator(self): from pyspark.sql.functions import pandas_udf, PandasUDFType df = self.data diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 146e673ae975..0dc5823f72a3 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1073,7 +1073,7 @@ def _infer_type(obj): raise TypeError("not supported type: %s" % type(obj)) -def _infer_schema(row): +def _infer_schema(row, names=None): """Infer the schema from dict/namedtuple/object""" if isinstance(row, dict): items = sorted(row.items()) @@ -1084,7 +1084,10 @@ def _infer_schema(row): elif hasattr(row, "_fields"): # namedtuple items = zip(row._fields, tuple(row)) else: - names = ['_%d' % i for i in range(1, len(row) + 1)] + if names is None: + names = ['_%d' % i for i in range(1, len(row) + 1)] + elif len(names) < len(row): + names.extend('_%d' % i for i in range(len(names) + 1, len(row) + 1)) items = zip(names, row) elif hasattr(row, "__dict__"): # object @@ -1109,19 +1112,27 @@ def _has_nulltype(dt): return isinstance(dt, NullType) -def _merge_type(a, b): +def _merge_type(a, b, name=None): + if name is None: + new_msg = lambda msg: msg + new_name = 
lambda n: "field %s" % n + else: + new_msg = lambda msg: "%s: %s" % (name, msg) + new_name = lambda n: "field %s in %s" % (n, name) + if isinstance(a, NullType): return b elif isinstance(b, NullType): return a elif type(a) is not type(b): # TODO: type cast (such as int -> long) - raise TypeError("Can not merge type %s and %s" % (type(a), type(b))) + raise TypeError(new_msg("Can not merge type %s and %s" % (type(a), type(b)))) # same type if isinstance(a, StructType): nfs = dict((f.name, f.dataType) for f in b.fields) - fields = [StructField(f.name, _merge_type(f.dataType, nfs.get(f.name, NullType()))) + fields = [StructField(f.name, _merge_type(f.dataType, nfs.get(f.name, NullType()), + name=new_name(f.name))) for f in a.fields] names = set([f.name for f in fields]) for n in nfs: @@ -1130,11 +1141,12 @@ def _merge_type(a, b): return StructType(fields) elif isinstance(a, ArrayType): - return ArrayType(_merge_type(a.elementType, b.elementType), True) + return ArrayType(_merge_type(a.elementType, b.elementType, + name='element in array %s' % name), True) elif isinstance(a, MapType): - return MapType(_merge_type(a.keyType, b.keyType), - _merge_type(a.valueType, b.valueType), + return MapType(_merge_type(a.keyType, b.keyType, name='key of map %s' % name), + _merge_type(a.valueType, b.valueType, name='value of map %s' % name), True) else: return a diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 54b5a8656e1c..c77f19f89a44 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -19,11 +19,13 @@ """ import functools -from pyspark import SparkContext -from pyspark.rdd import _prepare_for_python_RDD, PythonEvalType +from pyspark import SparkContext, since +from pyspark.rdd import _prepare_for_python_RDD, PythonEvalType, ignore_unicode_prefix from pyspark.sql.column import Column, _to_java_column, _to_seq from pyspark.sql.types import StringType, DataType, StructType, _parse_datatype_string +__all__ = ["UDFRegistration"] + def _wrap_function(sc, func, returnType): command = (func, returnType) @@ -56,7 +58,8 @@ def _create_udf(f, returnType, evalType): ) # Set the name of the UserDefinedFunction object to be the name of function f - udf_obj = UserDefinedFunction(f, returnType=returnType, name=None, evalType=evalType) + udf_obj = UserDefinedFunction( + f, returnType=returnType, name=None, evalType=evalType, deterministic=True) return udf_obj._wrapped() @@ -67,8 +70,10 @@ class UserDefinedFunction(object): .. 
versionadded:: 1.3 """ def __init__(self, func, - returnType=StringType(), name=None, - evalType=PythonEvalType.SQL_BATCHED_UDF): + returnType=StringType(), + name=None, + evalType=PythonEvalType.SQL_BATCHED_UDF, + deterministic=True): if not callable(func): raise TypeError( "Invalid function: not a function or callable (__call__ is not defined): " @@ -92,7 +97,7 @@ def __init__(self, func, func.__name__ if hasattr(func, '__name__') else func.__class__.__name__) self.evalType = evalType - self._deterministic = True + self.deterministic = deterministic @property def returnType(self): @@ -130,7 +135,7 @@ def _create_judf(self): wrapped_func = _wrap_function(sc, self.func, self.returnType) jdt = spark._jsparkSession.parseDataType(self.returnType.json()) judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction( - self._name, wrapped_func, jdt, self.evalType, self._deterministic) + self._name, wrapped_func, jdt, self.evalType, self.deterministic) return judf def __call__(self, *cols): @@ -138,6 +143,9 @@ def __call__(self, *cols): sc = SparkContext._active_spark_context return Column(judf.apply(_to_seq(sc, cols, _to_java_column))) + # This function is for improving the online help system in the interactive interpreter. + # For example, the built-in help / pydoc.help. It wraps the UDF with the docstring and + # argument annotation. (See: SPARK-19161) def _wrapped(self): """ Wrap this udf with a function and attach docstring from func @@ -162,8 +170,9 @@ def wrapper(*args): wrapper.func = self.func wrapper.returnType = self.returnType wrapper.evalType = self.evalType - wrapper.asNondeterministic = self.asNondeterministic - + wrapper.deterministic = self.deterministic + wrapper.asNondeterministic = functools.wraps( + self.asNondeterministic)(lambda: self.asNondeterministic()._wrapped()) return wrapper def asNondeterministic(self): @@ -172,5 +181,191 @@ def asNondeterministic(self): .. versionadded:: 2.3 """ - self._deterministic = False + self.deterministic = False return self + + +class UDFRegistration(object): + """ + Wrapper for user-defined function registration. This instance can be accessed by + :attr:`spark.udf` or :attr:`sqlContext.udf`. + + .. versionadded:: 1.3.1 + """ + + def __init__(self, sparkSession): + self.sparkSession = sparkSession + + @ignore_unicode_prefix + @since("1.3.1") + def register(self, name, f, returnType=None): + """Registers a Python function (including lambda function) or a user-defined function + in SQL statements. + + :param name: name of the user-defined function in SQL statements. + :param f: a Python function, or a user-defined function. The user-defined function can + be either row-at-a-time or vectorized. See :meth:`pyspark.sql.functions.udf` and + :meth:`pyspark.sql.functions.pandas_udf`. + :param returnType: the return type of the registered user-defined function. The value can + be either a :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. + :return: a user-defined function. + + `returnType` can be optionally specified when `f` is a Python function but not + when `f` is a user-defined function. Please see below. + + 1. When `f` is a Python function: + + `returnType` defaults to string type and can be optionally specified. The produced + object must match the specified type. In this case, this API works as if + `register(name, f, returnType=StringType())`. 
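One point the register docstring implies but does not show directly: the returned object is itself usable on the DataFrame side as well as by name in SQL (this mirrors test_udf_registration_returns_udf earlier in the patch); a sketch assuming `spark`:

    from pyspark.sql.types import IntegerType

    plus_three = spark.udf.register("plus_three", lambda x: x + 3, IntegerType())

    spark.sql("SELECT plus_three(id) FROM range(3)").show()  # by name, in SQL
    spark.range(3).select(plus_three("id")).show()           # as a column function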
+ + >>> strlen = spark.udf.register("stringLengthString", lambda x: len(x)) + >>> spark.sql("SELECT stringLengthString('test')").collect() + [Row(stringLengthString(test)=u'4')] + + >>> spark.sql("SELECT 'foo' AS text").select(strlen("text")).collect() + [Row(stringLengthString(text)=u'3')] + + >>> from pyspark.sql.types import IntegerType + >>> _ = spark.udf.register("stringLengthInt", lambda x: len(x), IntegerType()) + >>> spark.sql("SELECT stringLengthInt('test')").collect() + [Row(stringLengthInt(test)=4)] + + >>> from pyspark.sql.types import IntegerType + >>> _ = spark.udf.register("stringLengthInt", lambda x: len(x), IntegerType()) + >>> spark.sql("SELECT stringLengthInt('test')").collect() + [Row(stringLengthInt(test)=4)] + + 2. When `f` is a user-defined function: + + Spark uses the return type of the given user-defined function as the return type of + the registered user-defined function. `returnType` should not be specified. + In this case, this API works as if `register(name, f)`. + + >>> from pyspark.sql.types import IntegerType + >>> from pyspark.sql.functions import udf + >>> slen = udf(lambda s: len(s), IntegerType()) + >>> _ = spark.udf.register("slen", slen) + >>> spark.sql("SELECT slen('test')").collect() + [Row(slen(test)=4)] + + >>> import random + >>> from pyspark.sql.functions import udf + >>> from pyspark.sql.types import IntegerType + >>> random_udf = udf(lambda: random.randint(0, 100), IntegerType()).asNondeterministic() + >>> new_random_udf = spark.udf.register("random_udf", random_udf) + >>> spark.sql("SELECT random_udf()").collect() # doctest: +SKIP + [Row(random_udf()=82)] + + >>> from pyspark.sql.functions import pandas_udf, PandasUDFType + >>> @pandas_udf("integer", PandasUDFType.SCALAR) # doctest: +SKIP + ... def add_one(x): + ... return x + 1 + ... + >>> _ = spark.udf.register("add_one", add_one) # doctest: +SKIP + >>> spark.sql("SELECT add_one(id) FROM range(3)").collect() # doctest: +SKIP + [Row(add_one(id)=1), Row(add_one(id)=2), Row(add_one(id)=3)] + + .. note:: Registration for a user-defined function (case 2.) was added from + Spark 2.3.0. + """ + + # This is to check whether the input function is from a user-defined function or + # Python function. + if hasattr(f, 'asNondeterministic'): + if returnType is not None: + raise TypeError( + "Invalid returnType: data type can not be specified when f is" + "a user-defined function, but got %s." % returnType) + if f.evalType not in [PythonEvalType.SQL_BATCHED_UDF, + PythonEvalType.SQL_PANDAS_SCALAR_UDF]: + raise ValueError( + "Invalid f: f must be either SQL_BATCHED_UDF or SQL_PANDAS_SCALAR_UDF") + register_udf = UserDefinedFunction(f.func, returnType=f.returnType, name=name, + evalType=f.evalType, + deterministic=f.deterministic) + return_udf = f + else: + if returnType is None: + returnType = StringType() + register_udf = UserDefinedFunction(f, returnType=returnType, name=name, + evalType=PythonEvalType.SQL_BATCHED_UDF) + return_udf = register_udf._wrapped() + self.sparkSession._jsparkSession.udf().registerPython(name, register_udf._judf) + return return_udf + + @ignore_unicode_prefix + @since(2.3) + def registerJavaFunction(self, name, javaClassName, returnType=None): + """Register a Java user-defined function so it can be used in SQL statements. + + In addition to a name and the function itself, the return type can be optionally specified. + When the return type is not specified we would infer it via reflection. 
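The registration path also rejects a couple of easy-to-hit misuses; a sketch of the two guarded cases (both failing calls are left commented out because they raise), assuming `spark` and pyarrow are available:

    from pyspark.sql.functions import pandas_udf, udf, PandasUDFType
    from pyspark.sql.types import IntegerType

    slen = udf(lambda s: len(s), IntegerType())
    # TypeError: returnType must not be given when f is already a user-defined function.
    # spark.udf.register("slen", slen, IntegerType())

    group_map = pandas_udf(lambda pdf: pdf, "id long", PandasUDFType.GROUP_MAP)
    # ValueError: only SQL_BATCHED_UDF and SQL_PANDAS_SCALAR_UDF can be registered.
    # spark.udf.register("group_map", group_map)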
+ + :param name: name of the user-defined function + :param javaClassName: fully qualified name of java class + :param returnType: the return type of the registered Java function. The value can be either + a :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. + + >>> from pyspark.sql.types import IntegerType + >>> spark.udf.registerJavaFunction( + ... "javaStringLength", "test.org.apache.spark.sql.JavaStringLength", IntegerType()) + >>> spark.sql("SELECT javaStringLength('test')").collect() + [Row(UDF:javaStringLength(test)=4)] + + >>> spark.udf.registerJavaFunction( + ... "javaStringLength2", "test.org.apache.spark.sql.JavaStringLength") + >>> spark.sql("SELECT javaStringLength2('test')").collect() + [Row(UDF:javaStringLength2(test)=4)] + + >>> spark.udf.registerJavaFunction( + ... "javaStringLength3", "test.org.apache.spark.sql.JavaStringLength", "integer") + >>> spark.sql("SELECT javaStringLength3('test')").collect() + [Row(UDF:javaStringLength3(test)=4)] + """ + + jdt = None + if returnType is not None: + if not isinstance(returnType, DataType): + returnType = _parse_datatype_string(returnType) + jdt = self.sparkSession._jsparkSession.parseDataType(returnType.json()) + self.sparkSession._jsparkSession.udf().registerJava(name, javaClassName, jdt) + + @ignore_unicode_prefix + @since(2.3) + def registerJavaUDAF(self, name, javaClassName): + """Register a Java user-defined aggregate function so it can be used in SQL statements. + + :param name: name of the user-defined aggregate function + :param javaClassName: fully qualified name of java class + + >>> spark.udf.registerJavaUDAF("javaUDAF", "test.org.apache.spark.sql.MyDoubleAvg") + >>> df = spark.createDataFrame([(1, "a"),(2, "b"), (3, "a")],["id", "name"]) + >>> df.registerTempTable("df") + >>> spark.sql("SELECT name, javaUDAF(id) as avg from df group by name").collect() + [Row(name=u'b', avg=102.0), Row(name=u'a', avg=102.0)] + """ + + self.sparkSession._jsparkSession.udf().registerJavaUDAF(name, javaClassName) + + +def _test(): + import doctest + from pyspark.sql import SparkSession + import pyspark.sql.udf + globs = pyspark.sql.udf.__dict__.copy() + spark = SparkSession.builder\ + .master("local[4]")\ + .appName("sql.udf tests")\ + .getOrCreate() + globs['spark'] = spark + (failure_count, test_count) = doctest.testmod( + pyspark.sql.udf, globs=globs, + optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) + spark.stop() + if failure_count: + exit(-1) + + +if __name__ == "__main__": + _test() diff --git a/python/pyspark/version.py b/python/pyspark/version.py index 12dd53b9d290..b09fa9822184 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.0.dev0" +__version__ = "2.3.1.dev0" diff --git a/repl/pom.xml b/repl/pom.xml index 1cb0098d0eca..3b445bfaf773 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 7d35aea8a414..ae1c69f30558 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala index e5d79d9a9d9d..471196ac0e3f 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala @@ -29,17 +29,23 @@ private[spark] object Config extends Logging { .stringConf .createWithDefault("default") + val CONTAINER_IMAGE = + ConfigBuilder("spark.kubernetes.container.image") + .doc("Container image to use for Spark containers. Individual container types " + + "(e.g. driver or executor) can also be configured to use different images if desired, " + + "by setting the container type-specific image name.") + .stringConf + .createOptional + val DRIVER_CONTAINER_IMAGE = ConfigBuilder("spark.kubernetes.driver.container.image") .doc("Container image to use for the driver.") - .stringConf - .createOptional + .fallbackConf(CONTAINER_IMAGE) val EXECUTOR_CONTAINER_IMAGE = ConfigBuilder("spark.kubernetes.executor.container.image") .doc("Container image to use for the executors.") - .stringConf - .createOptional + .fallbackConf(CONTAINER_IMAGE) val CONTAINER_IMAGE_PULL_POLICY = ConfigBuilder("spark.kubernetes.container.image.pullPolicy") @@ -148,8 +154,7 @@ private[spark] object Config extends Logging { val INIT_CONTAINER_IMAGE = ConfigBuilder("spark.kubernetes.initContainer.image") .doc("Image for the driver and executor's init-container for downloading dependencies.") - .stringConf - .createOptional + .fallbackConf(CONTAINER_IMAGE) val INIT_CONTAINER_MOUNT_TIMEOUT = ConfigBuilder("spark.kubernetes.mountDependencies.timeout") diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala index 111cb2a3b75e..941195699684 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala @@ -60,10 +60,9 @@ private[spark] object Constants { val ENV_APPLICATION_ID = "SPARK_APPLICATION_ID" val ENV_EXECUTOR_ID = "SPARK_EXECUTOR_ID" val ENV_EXECUTOR_POD_IP = "SPARK_EXECUTOR_POD_IP" - val ENV_EXECUTOR_EXTRA_CLASSPATH = "SPARK_EXECUTOR_EXTRA_CLASSPATH" val ENV_MOUNTED_CLASSPATH = "SPARK_MOUNTED_CLASSPATH" val ENV_JAVA_OPT_PREFIX = "SPARK_JAVA_OPT_" - val ENV_SUBMIT_EXTRA_CLASSPATH = "SPARK_SUBMIT_EXTRA_CLASSPATH" + val ENV_CLASSPATH = "SPARK_CLASSPATH" val ENV_DRIVER_MAIN_CLASS = "SPARK_DRIVER_CLASS" val ENV_DRIVER_ARGS = "SPARK_DRIVER_ARGS" val ENV_DRIVER_JAVA_OPTS = "SPARK_DRIVER_JAVA_OPTS" diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/InitContainerBootstrap.scala 
b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/InitContainerBootstrap.scala index dfeccf9e2bd1..f6a57dfe0017 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/InitContainerBootstrap.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/InitContainerBootstrap.scala @@ -77,6 +77,7 @@ private[spark] class InitContainerBootstrap( .withMountPath(INIT_CONTAINER_PROPERTIES_FILE_DIR) .endVolumeMount() .addToVolumeMounts(sharedVolumeMounts: _*) + .addToArgs("init") .addToArgs(INIT_CONTAINER_PROPERTIES_FILE_PATH) .build() diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/MountSecretsBootstrap.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/MountSecretsBootstrap.scala index 8286546ce064..c35e7db51d40 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/MountSecretsBootstrap.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/MountSecretsBootstrap.scala @@ -24,26 +24,36 @@ import io.fabric8.kubernetes.api.model.{Container, ContainerBuilder, Pod, PodBui private[spark] class MountSecretsBootstrap(secretNamesToMountPaths: Map[String, String]) { /** - * Mounts Kubernetes secrets as secret volumes into the given container in the given pod. + * Add new secret volumes for the secrets specified in secretNamesToMountPaths into the given pod. * * @param pod the pod into which the secret volumes are being added. - * @param container the container into which the secret volumes are being mounted. - * @return the updated pod and container with the secrets mounted. + * @return the updated pod with the secret volumes added. */ - def mountSecrets(pod: Pod, container: Container): (Pod, Container) = { + def addSecretVolumes(pod: Pod): Pod = { var podBuilder = new PodBuilder(pod) secretNamesToMountPaths.keys.foreach { name => podBuilder = podBuilder .editOrNewSpec() .addNewVolume() - .withName(secretVolumeName(name)) - .withNewSecret() - .withSecretName(name) - .endSecret() - .endVolume() + .withName(secretVolumeName(name)) + .withNewSecret() + .withSecretName(name) + .endSecret() + .endVolume() .endSpec() } + podBuilder.build() + } + + /** + * Mounts Kubernetes secret volumes of the secrets specified in secretNamesToMountPaths into the + * given container. + * + * @param container the container into which the secret volumes are being mounted. + * @return the updated container with the secrets mounted. 
+ */ + def mountSecrets(container: Container): Container = { var containerBuilder = new ContainerBuilder(container) secretNamesToMountPaths.foreach { case (name, path) => containerBuilder = containerBuilder @@ -53,7 +63,7 @@ private[spark] class MountSecretsBootstrap(secretNamesToMountPaths: Map[String, .endVolumeMount() } - (podBuilder.build(), containerBuilder.build()) + containerBuilder.build() } private def secretVolumeName(secretName: String): String = { diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainer.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkPodInitContainer.scala similarity index 99% rename from resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainer.scala rename to resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkPodInitContainer.scala index 4a4b628aedbb..c0f08786b76a 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainer.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkPodInitContainer.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.deploy.rest.k8s +package org.apache.spark.deploy.k8s import java.io.File import java.util.concurrent.TimeUnit diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala index 00c9c4ee4917..ae7090462118 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala @@ -20,7 +20,7 @@ import java.util.UUID import com.google.common.primitives.Longs -import org.apache.spark.SparkConf +import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.k8s.{KubernetesUtils, MountSecretsBootstrap} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ @@ -117,6 +117,12 @@ private[spark] class DriverConfigOrchestrator( .map(_.split(",")) .getOrElse(Array.empty[String]) + // TODO(SPARK-23153): remove once submission client local dependencies are supported. 
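For context on the guard added just below: "submission client local" dependencies are the ones whose URIs resolve to the file scheme, and a plain path such as /path/to/file1 also resolves to file via Utils.resolveURI. The following is only a rough sketch of that scheme test, written against java.net.URI; the object and method names are hypothetical and not part of this change.

import java.net.URI

object SubmissionLocalCheckSketch {
  // Roughly mirrors existSubmissionLocalFiles defined later in this hunk: a URI
  // counts as submission-client local when its scheme is, or defaults to, "file".
  def isSubmissionLocal(uri: String): Boolean = {
    val scheme = Option(new URI(uri).getScheme).getOrElse("file")
    scheme == "file"
  }

  def main(args: Array[String]): Unit = {
    println(isSubmissionLocal("file:///var/apps/jars/main.jar"))    // true  -> rejected
    println(isSubmissionLocal("/path/to/file1"))                    // true  -> rejected (no scheme)
    println(isSubmissionLocal("local:///var/apps/jars/main.jar"))   // false -> image-local, allowed
    println(isSubmissionLocal("hdfs://namenode:8020/jars/app.jar")) // false -> remote, allowed
  }
}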
+ if (existSubmissionLocalFiles(sparkJars) || existSubmissionLocalFiles(sparkFiles)) { + throw new SparkException("The Kubernetes mode does not yet support referencing application " + + "dependencies in the local file system.") + } + val dependencyResolutionStep = if (sparkJars.nonEmpty || sparkFiles.nonEmpty) { Seq(new DependencyResolutionStep( sparkJars, @@ -127,6 +133,12 @@ private[spark] class DriverConfigOrchestrator( Nil } + val mountSecretsStep = if (secretNamesToMountPaths.nonEmpty) { + Seq(new DriverMountSecretsStep(new MountSecretsBootstrap(secretNamesToMountPaths))) + } else { + Nil + } + val initContainerBootstrapStep = if (existNonContainerLocalFiles(sparkJars ++ sparkFiles)) { val orchestrator = new InitContainerConfigOrchestrator( sparkJars, @@ -147,19 +159,19 @@ private[spark] class DriverConfigOrchestrator( Nil } - val mountSecretsStep = if (secretNamesToMountPaths.nonEmpty) { - Seq(new DriverMountSecretsStep(new MountSecretsBootstrap(secretNamesToMountPaths))) - } else { - Nil - } - Seq( initialSubmissionStep, serviceBootstrapStep, kubernetesCredentialsStep) ++ dependencyResolutionStep ++ - initContainerBootstrapStep ++ - mountSecretsStep + mountSecretsStep ++ + initContainerBootstrapStep + } + + private def existSubmissionLocalFiles(files: Seq[String]): Boolean = { + files.exists { uri => + Utils.resolveURI(uri).getScheme == "file" + } } private def existNonContainerLocalFiles(files: Seq[String]): Boolean = { diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala index b7a69a7dfd47..164e2e559477 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala @@ -66,7 +66,7 @@ private[spark] class BasicDriverConfigurationStep( override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = { val driverExtraClasspathEnv = driverExtraClasspath.map { classPath => new EnvVarBuilder() - .withName(ENV_SUBMIT_EXTRA_CLASSPATH) + .withName(ENV_CLASSPATH) .withValue(classPath) .build() } @@ -119,7 +119,7 @@ private[spark] class BasicDriverConfigurationStep( .endEnv() .addNewEnv() .withName(ENV_DRIVER_ARGS) - .withValue(appArgs.map(arg => "\"" + arg + "\"").mkString(" ")) + .withValue(appArgs.mkString(" ")) .endEnv() .addNewEnv() .withName(ENV_DRIVER_BIND_ADDRESS) @@ -133,6 +133,7 @@ private[spark] class BasicDriverConfigurationStep( .addToLimits("memory", driverMemoryLimitQuantity) .addToLimits(maybeCpuLimitQuantity.toMap.asJava) .endResources() + .addToArgs("driver") .build() val baseDriverPod = new PodBuilder(driverSpec.driverPod) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStep.scala index f872e0f4b65d..91e9a9f21133 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStep.scala @@ -28,8 +28,8 @@ private[spark] class DriverMountSecretsStep( bootstrap: 
MountSecretsBootstrap) extends DriverConfigurationStep { override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = { - val (pod, container) = bootstrap.mountSecrets( - driverSpec.driverPod, driverSpec.driverContainer) + val pod = bootstrap.addSecretVolumes(driverSpec.driverPod) + val container = bootstrap.mountSecrets(driverSpec.driverContainer) driverSpec.copy( driverPod = pod, driverContainer = container diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala index eb594e4f16ec..34af7cde6c1a 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala @@ -83,7 +83,7 @@ private[spark] class DriverServiceBootstrapStep( .build() val namespace = sparkConf.get(KUBERNETES_NAMESPACE) - val driverHostname = s"${driverService.getMetadata.getName}.$namespace.svc.cluster.local" + val driverHostname = s"${driverService.getMetadata.getName}.$namespace.svc" val resolvedSparkConf = driverSpec.driverSparkConf.clone() .set(DRIVER_HOST_KEY, driverHostname) .set("spark.driver.port", driverPort.toString) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStep.scala index c0e7bb20cce8..0daa7b95e8aa 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStep.scala @@ -28,12 +28,9 @@ private[spark] class InitContainerMountSecretsStep( bootstrap: MountSecretsBootstrap) extends InitContainerConfigurationStep { override def configureInitContainer(spec: InitContainerSpec) : InitContainerSpec = { - val (driverPod, initContainer) = bootstrap.mountSecrets( - spec.driverPod, - spec.initContainer) - spec.copy( - driverPod = driverPod, - initContainer = initContainer - ) + // Mount the secret volumes given that the volumes have already been added to the driver pod + // when mounting the secrets into the main driver container. 
+ val initContainer = bootstrap.mountSecrets(spec.initContainer) + spec.copy(initContainer = initContainer) } } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala index ba5d891f4c77..141bd2827e7c 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala @@ -94,6 +94,8 @@ private[spark] class ExecutorPodFactory( private val executorCores = sparkConf.getDouble("spark.executor.cores", 1) private val executorLimitCores = sparkConf.get(KUBERNETES_EXECUTOR_LIMIT_CORES) + private val executorJarsDownloadDir = sparkConf.get(JARS_DOWNLOAD_LOCATION) + /** * Configure and construct an executor pod with the given parameters. */ @@ -126,7 +128,7 @@ private[spark] class ExecutorPodFactory( .build() val executorExtraClasspathEnv = executorExtraClasspath.map { cp => new EnvVarBuilder() - .withName(ENV_EXECUTOR_EXTRA_CLASSPATH) + .withName(ENV_CLASSPATH) .withValue(cp) .build() } @@ -145,7 +147,8 @@ private[spark] class ExecutorPodFactory( (ENV_EXECUTOR_CORES, math.ceil(executorCores).toInt.toString), (ENV_EXECUTOR_MEMORY, executorMemoryString), (ENV_APPLICATION_ID, applicationId), - (ENV_EXECUTOR_ID, executorId)) ++ executorEnvs) + (ENV_EXECUTOR_ID, executorId), + (ENV_MOUNTED_CLASSPATH, s"$executorJarsDownloadDir/*")) ++ executorEnvs) .map(env => new EnvVarBuilder() .withName(env._1) .withValue(env._2) @@ -178,6 +181,7 @@ private[spark] class ExecutorPodFactory( .endResources() .addAllToEnv(executorEnv.asJava) .withPorts(requiredPorts.asJava) + .addToArgs("executor") .build() val executorPod = new PodBuilder() @@ -214,7 +218,7 @@ private[spark] class ExecutorPodFactory( val (maybeSecretsMountedPod, maybeSecretsMountedContainer) = mountSecretsBootstrap.map { bootstrap => - bootstrap.mountSecrets(executorPod, containerWithLimitCores) + (bootstrap.addSecretVolumes(executorPod), bootstrap.mountSecrets(containerWithLimitCores)) }.getOrElse((executorPod, containerWithLimitCores)) val (bootstrappedPod, bootstrappedContainer) = @@ -227,7 +231,9 @@ private[spark] class ExecutorPodFactory( val (pod, mayBeSecretsMountedInitContainer) = initContainerMountSecretsBootstrap.map { bootstrap => - bootstrap.mountSecrets(podWithInitContainer.pod, podWithInitContainer.initContainer) + // Mount the secret volumes given that the volumes have already been added to the + // executor pod when mounting the secrets into the main executor container. 
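As an aside on the reworked MountSecretsBootstrap API used in this and the surrounding hunks: the secret volumes are now added to the pod exactly once with addSecretVolumes, and then mounted into each container (main container and, when present, init-container) with mountSecrets. The sketch below only illustrates that calling pattern; the object name and the placeholder pod and containers built with the fabric8 builders are assumptions, and the class itself is private[spark], so it is not expected to compile outside Spark.

import io.fabric8.kubernetes.api.model.{ContainerBuilder, PodBuilder}

import org.apache.spark.deploy.k8s.MountSecretsBootstrap

object MountSecretsSketch {
  def main(args: Array[String]): Unit = {
    val bootstrap = new MountSecretsBootstrap(Map("secret1" -> "/var/secret1"))

    // Step 1: declare the secret volumes on the pod spec once.
    val pod = bootstrap.addSecretVolumes(
      new PodBuilder().withNewMetadata().withName("example-pod").endMetadata().build())

    // Step 2: mount those volumes into every container that needs them.
    val mainContainer = bootstrap.mountSecrets(new ContainerBuilder().withName("main").build())
    val initContainer = bootstrap.mountSecrets(new ContainerBuilder().withName("init").build())

    println(pod.getSpec.getVolumes.size())        // 1, a volume named "secret1-volume"
    println(mainContainer.getVolumeMounts.size()) // 1, mounted at /var/secret1
    println(initContainer.getVolumeMounts.size()) // 1, mounted at /var/secret1
  }
}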
+ (podWithInitContainer.pod, bootstrap.mountSecrets(podWithInitContainer.initContainer)) }.getOrElse((podWithInitContainer.pod, podWithInitContainer.initContainer)) val bootstrappedPod = KubernetesUtils.appendInitContainer( diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/SecretVolumeUtils.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SecretVolumeUtils.scala similarity index 71% rename from resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/SecretVolumeUtils.scala rename to resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SecretVolumeUtils.scala index 8388c16ded26..16780584a674 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/SecretVolumeUtils.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SecretVolumeUtils.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.deploy.k8s.submit +package org.apache.spark.deploy.k8s import scala.collection.JavaConverters._ @@ -22,15 +22,15 @@ import io.fabric8.kubernetes.api.model.{Container, Pod} private[spark] object SecretVolumeUtils { - def podHasVolume(driverPod: Pod, volumeName: String): Boolean = { - driverPod.getSpec.getVolumes.asScala.exists(volume => volume.getName == volumeName) + def podHasVolume(pod: Pod, volumeName: String): Boolean = { + pod.getSpec.getVolumes.asScala.exists { volume => + volume.getName == volumeName + } } - def containerHasVolume( - driverContainer: Container, - volumeName: String, - mountPath: String): Boolean = { - driverContainer.getVolumeMounts.asScala.exists(volumeMount => - volumeMount.getName == volumeName && volumeMount.getMountPath == mountPath) + def containerHasVolume(container: Container, volumeName: String, mountPath: String): Boolean = { + container.getVolumeMounts.asScala.exists { volumeMount => + volumeMount.getName == volumeName && volumeMount.getMountPath == mountPath + } } } diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainerSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SparkPodInitContainerSuite.scala similarity index 98% rename from resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainerSuite.scala rename to resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SparkPodInitContainerSuite.scala index 6c557ec4a7c9..e0f29ecd0fb5 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainerSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SparkPodInitContainerSuite.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.spark.deploy.rest.k8s +package org.apache.spark.deploy.k8s import java.io.File import java.util.UUID diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala index f193b1f4d366..033d303e946f 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala @@ -16,7 +16,7 @@ */ package org.apache.spark.deploy.k8s.submit -import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.{SparkConf, SparkException, SparkFunSuite} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.submit.steps._ @@ -34,8 +34,7 @@ class DriverConfigOrchestratorSuite extends SparkFunSuite { private val SECRET_MOUNT_PATH = "/etc/secrets/driver" test("Base submission steps with a main app resource.") { - val sparkConf = new SparkConf(false) - .set(DRIVER_CONTAINER_IMAGE, DRIVER_IMAGE) + val sparkConf = new SparkConf(false).set(CONTAINER_IMAGE, DRIVER_IMAGE) val mainAppResource = JavaMainAppResource("local:///var/apps/jars/main.jar") val orchestrator = new DriverConfigOrchestrator( APP_ID, @@ -55,8 +54,7 @@ class DriverConfigOrchestratorSuite extends SparkFunSuite { } test("Base submission steps without a main app resource.") { - val sparkConf = new SparkConf(false) - .set(DRIVER_CONTAINER_IMAGE, DRIVER_IMAGE) + val sparkConf = new SparkConf(false).set(CONTAINER_IMAGE, DRIVER_IMAGE) val orchestrator = new DriverConfigOrchestrator( APP_ID, LAUNCH_TIME, @@ -75,8 +73,8 @@ class DriverConfigOrchestratorSuite extends SparkFunSuite { test("Submission steps with an init-container.") { val sparkConf = new SparkConf(false) - .set(DRIVER_CONTAINER_IMAGE, DRIVER_IMAGE) - .set(INIT_CONTAINER_IMAGE, IC_IMAGE) + .set(CONTAINER_IMAGE, DRIVER_IMAGE) + .set(INIT_CONTAINER_IMAGE.key, IC_IMAGE) .set("spark.jars", "hdfs://localhost:9000/var/apps/jars/jar1.jar") val mainAppResource = JavaMainAppResource("local:///var/apps/jars/main.jar") val orchestrator = new DriverConfigOrchestrator( @@ -98,7 +96,7 @@ class DriverConfigOrchestratorSuite extends SparkFunSuite { test("Submission steps with driver secrets to mount") { val sparkConf = new SparkConf(false) - .set(DRIVER_CONTAINER_IMAGE, DRIVER_IMAGE) + .set(CONTAINER_IMAGE, DRIVER_IMAGE) .set(s"$KUBERNETES_DRIVER_SECRETS_PREFIX$SECRET_FOO", SECRET_MOUNT_PATH) .set(s"$KUBERNETES_DRIVER_SECRETS_PREFIX$SECRET_BAR", SECRET_MOUNT_PATH) val mainAppResource = JavaMainAppResource("local:///var/apps/jars/main.jar") @@ -119,6 +117,35 @@ class DriverConfigOrchestratorSuite extends SparkFunSuite { classOf[DriverMountSecretsStep]) } + test("Submission using client local dependencies") { + val sparkConf = new SparkConf(false) + .set(CONTAINER_IMAGE, DRIVER_IMAGE) + var orchestrator = new DriverConfigOrchestrator( + APP_ID, + LAUNCH_TIME, + Some(JavaMainAppResource("file:///var/apps/jars/main.jar")), + APP_NAME, + MAIN_CLASS, + APP_ARGS, + sparkConf) + assertThrows[SparkException] { + orchestrator.getAllConfigurationSteps + } + + sparkConf.set("spark.files", "/path/to/file1,/path/to/file2") + orchestrator = new DriverConfigOrchestrator( + APP_ID, + LAUNCH_TIME, + Some(JavaMainAppResource("local:///var/apps/jars/main.jar")), + APP_NAME, + MAIN_CLASS, + 
APP_ARGS, + sparkConf) + assertThrows[SparkException] { + orchestrator.getAllConfigurationSteps + } + } + private def validateStepTypes( orchestrator: DriverConfigOrchestrator, types: Class[_ <: DriverConfigurationStep]*): Unit = { diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala index e864c6a16eeb..b136f2c02ffb 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala @@ -33,7 +33,7 @@ class BasicDriverConfigurationStepSuite extends SparkFunSuite { private val CONTAINER_IMAGE_PULL_POLICY = "IfNotPresent" private val APP_NAME = "spark-test" private val MAIN_CLASS = "org.apache.spark.examples.SparkPi" - private val APP_ARGS = Array("arg1", "arg2", "arg 3") + private val APP_ARGS = Array("arg1", "arg2", "\"arg 3\"") private val CUSTOM_ANNOTATION_KEY = "customAnnotation" private val CUSTOM_ANNOTATION_VALUE = "customAnnotationValue" private val DRIVER_CUSTOM_ENV_KEY1 = "customDriverEnv1" @@ -47,7 +47,7 @@ class BasicDriverConfigurationStepSuite extends SparkFunSuite { .set(KUBERNETES_DRIVER_LIMIT_CORES, "4") .set(org.apache.spark.internal.config.DRIVER_MEMORY.key, "256M") .set(org.apache.spark.internal.config.DRIVER_MEMORY_OVERHEAD, 200L) - .set(DRIVER_CONTAINER_IMAGE, "spark-driver:latest") + .set(CONTAINER_IMAGE, "spark-driver:latest") .set(s"$KUBERNETES_DRIVER_ANNOTATION_PREFIX$CUSTOM_ANNOTATION_KEY", CUSTOM_ANNOTATION_VALUE) .set(s"$KUBERNETES_DRIVER_ENV_KEY$DRIVER_CUSTOM_ENV_KEY1", "customDriverEnv1") .set(s"$KUBERNETES_DRIVER_ENV_KEY$DRIVER_CUSTOM_ENV_KEY2", "customDriverEnv2") @@ -79,10 +79,10 @@ class BasicDriverConfigurationStepSuite extends SparkFunSuite { .asScala .map(env => (env.getName, env.getValue)) .toMap - assert(envs(ENV_SUBMIT_EXTRA_CLASSPATH) === "/opt/spark/spark-examples.jar") + assert(envs(ENV_CLASSPATH) === "/opt/spark/spark-examples.jar") assert(envs(ENV_DRIVER_MEMORY) === "256M") assert(envs(ENV_DRIVER_MAIN_CLASS) === MAIN_CLASS) - assert(envs(ENV_DRIVER_ARGS) === "\"arg1\" \"arg2\" \"arg 3\"") + assert(envs(ENV_DRIVER_ARGS) === "arg1 arg2 \"arg 3\"") assert(envs(DRIVER_CUSTOM_ENV_KEY1) === "customDriverEnv1") assert(envs(DRIVER_CUSTOM_ENV_KEY2) === "customDriverEnv2") diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStepSuite.scala index 9ec0cb55de5a..960d0bda1d01 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStepSuite.scala @@ -17,8 +17,8 @@ package org.apache.spark.deploy.k8s.submit.steps import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.deploy.k8s.MountSecretsBootstrap -import org.apache.spark.deploy.k8s.submit.{KubernetesDriverSpec, SecretVolumeUtils} +import org.apache.spark.deploy.k8s.{MountSecretsBootstrap, SecretVolumeUtils} +import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec class 
DriverMountSecretsStepSuite extends SparkFunSuite { diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStepSuite.scala index 006ce2668f8a..78c8c3ba1afb 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStepSuite.scala @@ -85,7 +85,7 @@ class DriverServiceBootstrapStepSuite extends SparkFunSuite with BeforeAndAfter val resolvedDriverSpec = configurationStep.configureDriver(baseDriverSpec) val expectedServiceName = SHORT_RESOURCE_NAME_PREFIX + DriverServiceBootstrapStep.DRIVER_SVC_POSTFIX - val expectedHostName = s"$expectedServiceName.my-namespace.svc.cluster.local" + val expectedHostName = s"$expectedServiceName.my-namespace.svc" verifySparkConfHostNames(resolvedDriverSpec.driverSparkConf, expectedHostName) } @@ -120,7 +120,7 @@ class DriverServiceBootstrapStepSuite extends SparkFunSuite with BeforeAndAfter val driverService = resolvedDriverSpec.otherKubernetesResources.head.asInstanceOf[Service] val expectedServiceName = s"spark-10000${DriverServiceBootstrapStep.DRIVER_SVC_POSTFIX}" assert(driverService.getMetadata.getName === expectedServiceName) - val expectedHostName = s"$expectedServiceName.my-namespace.svc.cluster.local" + val expectedHostName = s"$expectedServiceName.my-namespace.svc" verifySparkConfHostNames(resolvedDriverSpec.driverSparkConf, expectedHostName) } diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestratorSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestratorSuite.scala index 20f2e5bc15df..09b42e4484d8 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestratorSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestratorSuite.scala @@ -40,7 +40,7 @@ class InitContainerConfigOrchestratorSuite extends SparkFunSuite { test("including basic configuration step") { val sparkConf = new SparkConf(true) - .set(INIT_CONTAINER_IMAGE, DOCKER_IMAGE) + .set(CONTAINER_IMAGE, DOCKER_IMAGE) .set(s"$KUBERNETES_DRIVER_LABEL_PREFIX$CUSTOM_LABEL_KEY", CUSTOM_LABEL_VALUE) val orchestrator = new InitContainerConfigOrchestrator( @@ -59,7 +59,7 @@ class InitContainerConfigOrchestratorSuite extends SparkFunSuite { test("including step to mount user-specified secrets") { val sparkConf = new SparkConf(false) - .set(INIT_CONTAINER_IMAGE, DOCKER_IMAGE) + .set(CONTAINER_IMAGE, DOCKER_IMAGE) .set(s"$KUBERNETES_DRIVER_SECRETS_PREFIX$SECRET_FOO", SECRET_MOUNT_PATH) .set(s"$KUBERNETES_DRIVER_SECRETS_PREFIX$SECRET_BAR", SECRET_MOUNT_PATH) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStepSuite.scala index eab4e1765945..7ac0bde80dfe 100644 --- 
a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStepSuite.scala @@ -19,8 +19,7 @@ package org.apache.spark.deploy.k8s.submit.steps.initcontainer import io.fabric8.kubernetes.api.model.{ContainerBuilder, PodBuilder} import org.apache.spark.SparkFunSuite -import org.apache.spark.deploy.k8s.MountSecretsBootstrap -import org.apache.spark.deploy.k8s.submit.SecretVolumeUtils +import org.apache.spark.deploy.k8s.{MountSecretsBootstrap, SecretVolumeUtils} class InitContainerMountSecretsStepSuite extends SparkFunSuite { @@ -44,12 +43,8 @@ class InitContainerMountSecretsStepSuite extends SparkFunSuite { val initContainerMountSecretsStep = new InitContainerMountSecretsStep(mountSecretsBootstrap) val configuredInitContainerSpec = initContainerMountSecretsStep.configureInitContainer( baseInitContainerSpec) - - val podWithSecretsMounted = configuredInitContainerSpec.driverPod val initContainerWithSecretsMounted = configuredInitContainerSpec.initContainer - Seq(s"$SECRET_FOO-volume", s"$SECRET_BAR-volume").foreach(volumeName => - assert(SecretVolumeUtils.podHasVolume(podWithSecretsMounted, volumeName))) Seq(s"$SECRET_FOO-volume", s"$SECRET_BAR-volume").foreach(volumeName => assert(SecretVolumeUtils.containerHasVolume( initContainerWithSecretsMounted, volumeName, SECRET_MOUNT_PATH))) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala index 7121a802c69c..a3c615be031d 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala @@ -25,7 +25,7 @@ import org.mockito.Mockito._ import org.scalatest.{BeforeAndAfter, BeforeAndAfterEach} import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.deploy.k8s.{InitContainerBootstrap, MountSecretsBootstrap, PodWithDetachedInitContainer} +import org.apache.spark.deploy.k8s.{InitContainerBootstrap, MountSecretsBootstrap, PodWithDetachedInitContainer, SecretVolumeUtils} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ @@ -54,7 +54,7 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef baseConf = new SparkConf() .set(KUBERNETES_DRIVER_POD_NAME, driverPodName) .set(KUBERNETES_EXECUTOR_POD_NAME_PREFIX, executorPrefix) - .set(EXECUTOR_CONTAINER_IMAGE, executorImage) + .set(CONTAINER_IMAGE, executorImage) } test("basic executor pod has reasonable defaults") { @@ -107,7 +107,7 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef checkEnv(executor, Map("SPARK_JAVA_OPT_0" -> "foo=bar", - "SPARK_EXECUTOR_EXTRA_CLASSPATH" -> "bar=baz", + ENV_CLASSPATH -> "bar=baz", "qux" -> "quux")) checkOwnerReferences(executor, driverPodUid) } @@ -165,17 +165,19 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef val factory = new ExecutorPodFactory( conf, - None, + Some(secretsBootstrap), Some(initContainerBootstrap), Some(secretsBootstrap)) val executor = factory.createExecutorPod( "1", "dummy", "dummy", Seq[(String, String)](), 
driverPod, Map[String, Int]()) + assert(executor.getSpec.getVolumes.size() === 1) + assert(SecretVolumeUtils.podHasVolume(executor, "secret1-volume")) + assert(SecretVolumeUtils.containerHasVolume( + executor.getSpec.getContainers.get(0), "secret1-volume", "/var/secret1")) assert(executor.getSpec.getInitContainers.size() === 1) - assert(executor.getSpec.getInitContainers.get(0).getVolumeMounts.get(0).getName - === "secret1-volume") - assert(executor.getSpec.getInitContainers.get(0).getVolumeMounts.get(0) - .getMountPath === "/var/secret1") + assert(SecretVolumeUtils.containerHasVolume( + executor.getSpec.getInitContainers.get(0), "secret1-volume", "/var/secret1")) checkOwnerReferences(executor, driverPodUid) } @@ -195,7 +197,8 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef ENV_EXECUTOR_CORES -> "1", ENV_EXECUTOR_MEMORY -> "1g", ENV_APPLICATION_ID -> "dummy", - ENV_EXECUTOR_POD_IP -> null) ++ additionalEnvVars + ENV_EXECUTOR_POD_IP -> null, + ENV_MOUNTED_CLASSPATH -> "/var/spark-data/spark-jars/*") ++ additionalEnvVars assert(executor.getSpec.getContainers.size() === 1) assert(executor.getSpec.getContainers.get(0).getEnv.size() === defaultEnvs.size) diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile deleted file mode 100644 index 45fbcd9cd0de..000000000000 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile +++ /dev/null @@ -1,35 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -FROM spark-base - -# Before building the docker image, first build and make a Spark distribution following -# the instructions in http://spark.apache.org/docs/latest/building-spark.html. -# If this docker file is being used in the context of building your images from a Spark -# distribution, the docker build command should be invoked from the top level directory -# of the Spark distribution. E.g.: -# docker build -t spark-driver:latest -f kubernetes/dockerfiles/driver/Dockerfile . - -COPY examples /opt/spark/examples - -CMD SPARK_CLASSPATH="${SPARK_HOME}/jars/*" && \ - env | grep SPARK_JAVA_OPT_ | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt && \ - readarray -t SPARK_DRIVER_JAVA_OPTS < /tmp/java_opts.txt && \ - if ! [ -z ${SPARK_MOUNTED_CLASSPATH+x} ]; then SPARK_CLASSPATH="$SPARK_MOUNTED_CLASSPATH:$SPARK_CLASSPATH"; fi && \ - if ! [ -z ${SPARK_SUBMIT_EXTRA_CLASSPATH+x} ]; then SPARK_CLASSPATH="$SPARK_SUBMIT_EXTRA_CLASSPATH:$SPARK_CLASSPATH"; fi && \ - if ! [ -z ${SPARK_MOUNTED_FILES_DIR+x} ]; then cp -R "$SPARK_MOUNTED_FILES_DIR/." 
.; fi && \ - ${JAVA_HOME}/bin/java "${SPARK_DRIVER_JAVA_OPTS[@]}" -cp "$SPARK_CLASSPATH" -Xms$SPARK_DRIVER_MEMORY -Xmx$SPARK_DRIVER_MEMORY -Dspark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS $SPARK_DRIVER_CLASS $SPARK_DRIVER_ARGS diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile deleted file mode 100644 index 0f806cf7e148..000000000000 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile +++ /dev/null @@ -1,35 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -FROM spark-base - -# Before building the docker image, first build and make a Spark distribution following -# the instructions in http://spark.apache.org/docs/latest/building-spark.html. -# If this docker file is being used in the context of building your images from a Spark -# distribution, the docker build command should be invoked from the top level directory -# of the Spark distribution. E.g.: -# docker build -t spark-executor:latest -f kubernetes/dockerfiles/executor/Dockerfile . - -COPY examples /opt/spark/examples - -CMD SPARK_CLASSPATH="${SPARK_HOME}/jars/*" && \ - env | grep SPARK_JAVA_OPT_ | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt && \ - readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt && \ - if ! [ -z ${SPARK_MOUNTED_CLASSPATH}+x} ]; then SPARK_CLASSPATH="$SPARK_MOUNTED_CLASSPATH:$SPARK_CLASSPATH"; fi && \ - if ! [ -z ${SPARK_EXECUTOR_EXTRA_CLASSPATH+x} ]; then SPARK_CLASSPATH="$SPARK_EXECUTOR_EXTRA_CLASSPATH:$SPARK_CLASSPATH"; fi && \ - if ! [ -z ${SPARK_MOUNTED_FILES_DIR+x} ]; then cp -R "$SPARK_MOUNTED_FILES_DIR/." .; fi && \ - ${JAVA_HOME}/bin/java "${SPARK_EXECUTOR_JAVA_OPTS[@]}" -Xms$SPARK_EXECUTOR_MEMORY -Xmx$SPARK_EXECUTOR_MEMORY -cp "$SPARK_CLASSPATH" org.apache.spark.executor.CoarseGrainedExecutorBackend --driver-url $SPARK_DRIVER_URL --executor-id $SPARK_EXECUTOR_ID --cores $SPARK_EXECUTOR_CORES --app-id $SPARK_APPLICATION_ID --hostname $SPARK_EXECUTOR_POD_IP diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile deleted file mode 100644 index 055493188fcb..000000000000 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile +++ /dev/null @@ -1,24 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -FROM spark-base - -# If this docker file is being used in the context of building your images from a Spark distribution, the docker build -# command should be invoked from the top level directory of the Spark distribution. E.g.: -# docker build -t spark-init:latest -f kubernetes/dockerfiles/init-container/Dockerfile . - -ENTRYPOINT [ "/opt/entrypoint.sh", "/opt/spark/bin/spark-class", "org.apache.spark.deploy.rest.k8s.SparkPodInitContainer" ] diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/entrypoint.sh b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/entrypoint.sh deleted file mode 100755 index 82559889f4be..000000000000 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/entrypoint.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# echo commands to the terminal output -set -ex - -# Check whether there is a passwd entry for the container UID -myuid=$(id -u) -mygid=$(id -g) -uidentry=$(getent passwd $myuid) - -# If there is no passwd entry for the container UID, attempt to create one -if [ -z "$uidentry" ] ; then - if [ -w /etc/passwd ] ; then - echo "$myuid:x:$myuid:$mygid:anonymous uid:$SPARK_HOME:/bin/false" >> /etc/passwd - else - echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" - fi -fi - -# Execute the container CMD under tini for better hygiene -/sbin/tini -s -- "$@" diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile similarity index 85% rename from resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile rename to resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile index 222e777db3a8..491b7cf69247 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile @@ -17,12 +17,15 @@ FROM openjdk:8-alpine +ARG spark_jars=jars +ARG img_path=kubernetes/dockerfiles + # Before building the docker image, first build and make a Spark distribution following # the instructions in http://spark.apache.org/docs/latest/building-spark.html. 
# If this docker file is being used in the context of building your images from a Spark # distribution, the docker build command should be invoked from the top level directory # of the Spark distribution. E.g.: -# docker build -t spark-base:latest -f kubernetes/dockerfiles/spark-base/Dockerfile . +# docker build -t spark:latest -f kubernetes/dockerfiles/spark/Dockerfile . RUN set -ex && \ apk upgrade --no-cache && \ @@ -34,11 +37,13 @@ RUN set -ex && \ ln -sv /bin/bash /bin/sh && \ chgrp root /etc/passwd && chmod ug+rw /etc/passwd -COPY jars /opt/spark/jars +COPY ${spark_jars} /opt/spark/jars COPY bin /opt/spark/bin COPY sbin /opt/spark/sbin COPY conf /opt/spark/conf -COPY kubernetes/dockerfiles/spark-base/entrypoint.sh /opt/ +COPY ${img_path}/spark/entrypoint.sh /opt/ +COPY examples /opt/spark/examples +COPY data /opt/spark/data ENV SPARK_HOME /opt/spark diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh new file mode 100755 index 000000000000..b9090dc2852a --- /dev/null +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh @@ -0,0 +1,97 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# echo commands to the terminal output +set -ex + +# Check whether there is a passwd entry for the container UID +myuid=$(id -u) +mygid=$(id -g) +uidentry=$(getent passwd $myuid) + +# If there is no passwd entry for the container UID, attempt to create one +if [ -z "$uidentry" ] ; then + if [ -w /etc/passwd ] ; then + echo "$myuid:x:$myuid:$mygid:anonymous uid:$SPARK_HOME:/bin/false" >> /etc/passwd + else + echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" + fi +fi + +SPARK_K8S_CMD="$1" +if [ -z "$SPARK_K8S_CMD" ]; then + echo "No command to execute has been provided." 1>&2 + exit 1 +fi +shift 1 + +SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" +env | grep SPARK_JAVA_OPT_ | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt +readarray -t SPARK_JAVA_OPTS < /tmp/java_opts.txt +if [ -n "$SPARK_MOUNTED_CLASSPATH" ]; then + SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_MOUNTED_CLASSPATH" +fi +if [ -n "$SPARK_MOUNTED_FILES_DIR" ]; then + cp -R "$SPARK_MOUNTED_FILES_DIR/." . 
+fi + +case "$SPARK_K8S_CMD" in + driver) + CMD=( + ${JAVA_HOME}/bin/java + "${SPARK_JAVA_OPTS[@]}" + -cp "$SPARK_CLASSPATH" + -Xms$SPARK_DRIVER_MEMORY + -Xmx$SPARK_DRIVER_MEMORY + -Dspark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS + $SPARK_DRIVER_CLASS + $SPARK_DRIVER_ARGS + ) + ;; + + executor) + CMD=( + ${JAVA_HOME}/bin/java + "${SPARK_JAVA_OPTS[@]}" + -Xms$SPARK_EXECUTOR_MEMORY + -Xmx$SPARK_EXECUTOR_MEMORY + -cp "$SPARK_CLASSPATH" + org.apache.spark.executor.CoarseGrainedExecutorBackend + --driver-url $SPARK_DRIVER_URL + --executor-id $SPARK_EXECUTOR_ID + --cores $SPARK_EXECUTOR_CORES + --app-id $SPARK_APPLICATION_ID + --hostname $SPARK_EXECUTOR_POD_IP + ) + ;; + + init) + CMD=( + "$SPARK_HOME/bin/spark-class" + "org.apache.spark.deploy.k8s.SparkPodInitContainer" + "$@" + ) + ;; + + *) + echo "Unknown command: $SPARK_K8S_CMD" 1>&2 + exit 1 +esac + +# Execute the container CMD under tini for better hygiene +exec /sbin/tini -s -- "${CMD[@]}" diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 70d0c1750b14..ea31b9c12143 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 43a7ce95bd3d..d42bfff85157 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index b2576b0d7263..4d5e3bb04367 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -427,11 +427,8 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends uiAddress: Option[String]) = { val appId = client.getAttemptId().getApplicationId().toString() val attemptId = client.getAttemptId().getAttemptId().toString() - val historyAddress = - _sparkConf.get(HISTORY_SERVER_ADDRESS) - .map { text => SparkHadoopUtil.get.substituteHadoopVariables(text, yarnConf) } - .map { address => s"${address}${HistoryServer.UI_PATH_PREFIX}/${appId}/${attemptId}" } - .getOrElse("") + val historyAddress = ApplicationMaster + .getHistoryServerAddress(_sparkConf, yarnConf, appId, attemptId) val driverUrl = RpcEndpointAddress( _sparkConf.get("spark.driver.host"), @@ -834,6 +831,16 @@ object ApplicationMaster extends Logging { master.getAttemptId } + private[spark] def getHistoryServerAddress( + sparkConf: SparkConf, + yarnConf: YarnConfiguration, + appId: String, + attemptId: String): String = { + sparkConf.get(HISTORY_SERVER_ADDRESS) + .map { text => SparkHadoopUtil.get.substituteHadoopVariables(text, yarnConf) } + .map { address => s"${address}${HistoryServer.UI_PATH_PREFIX}/${appId}/${attemptId}" } + .getOrElse("") + } } /** diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index 15328d08b3b5..8cd3cd9746a3 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -1421,15 +1421,20 
@@ private object Client extends Logging {
   }

   /**
-   * Return whether the two file systems are the same.
+   * Return whether the two URIs represent the same file system.
    */
-  private def compareFs(srcFs: FileSystem, destFs: FileSystem): Boolean = {
-    val srcUri = srcFs.getUri()
-    val dstUri = destFs.getUri()
+  private[spark] def compareUri(srcUri: URI, dstUri: URI): Boolean = {
+    if (srcUri.getScheme() == null || srcUri.getScheme() != dstUri.getScheme()) {
       return false
     }
+    val srcAuthority = srcUri.getAuthority()
+    val dstAuthority = dstUri.getAuthority()
+    if (srcAuthority != null && !srcAuthority.equalsIgnoreCase(dstAuthority)) {
+      return false
+    }
+
     var srcHost = srcUri.getHost()
     var dstHost = dstUri.getHost()
@@ -1447,6 +1452,17 @@ private object Client extends Logging {
     }

     Objects.equal(srcHost, dstHost) && srcUri.getPort() == dstUri.getPort()
+
+  }
+
+  /**
+   * Return whether the two file systems are the same.
+   */
+  protected def compareFs(srcFs: FileSystem, destFs: FileSystem): Boolean = {
+    val srcUri = srcFs.getUri()
+    val dstUri = destFs.getUri()
+
+    compareUri(srcUri, dstUri)
   }

   /**
diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ApplicationMasterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ApplicationMasterSuite.scala
new file mode 100644
index 000000000000..695a82f3583e
--- /dev/null
+++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ApplicationMasterSuite.scala
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.spark.deploy.yarn + +import org.apache.hadoop.yarn.conf.YarnConfiguration + +import org.apache.spark.{SparkConf, SparkFunSuite} + +class ApplicationMasterSuite extends SparkFunSuite { + + test("history url with hadoop and spark substitutions") { + val host = "rm.host.com" + val port = 18080 + val sparkConf = new SparkConf() + + sparkConf.set("spark.yarn.historyServer.address", + "http://${hadoopconf-yarn.resourcemanager.hostname}:${spark.history.ui.port}") + val yarnConf = new YarnConfiguration() + yarnConf.set("yarn.resourcemanager.hostname", host) + val appId = "application_123_1" + val attemptId = appId + "_1" + + val shsAddr = ApplicationMaster + .getHistoryServerAddress(sparkConf, yarnConf, appId, attemptId) + + assert(shsAddr === s"http://${host}:${port}/history/${appId}/${attemptId}") + } +} diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala index 9d5f5eb62111..7fa597167f3f 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala @@ -357,6 +357,39 @@ class ClientSuite extends SparkFunSuite with Matchers { sparkConf.get(SECONDARY_JARS) should be (Some(Seq(new File(jar2.toURI).getName))) } + private val matching = Seq( + ("files URI match test1", "file:///file1", "file:///file2"), + ("files URI match test2", "file:///c:file1", "file://c:file2"), + ("files URI match test3", "file://host/file1", "file://host/file2"), + ("wasb URI match test", "wasb://bucket1@user", "wasb://bucket1@user/"), + ("hdfs URI match test", "hdfs:/path1", "hdfs:/path1") + ) + + matching.foreach { t => + test(t._1) { + assert(Client.compareUri(new URI(t._2), new URI(t._3)), + s"No match between ${t._2} and ${t._3}") + } + } + + private val unmatching = Seq( + ("files URI unmatch test1", "file:///file1", "file://host/file2"), + ("files URI unmatch test2", "file://host/file1", "file:///file2"), + ("files URI unmatch test3", "file://host/file1", "file://host2/file2"), + ("wasb URI unmatch test1", "wasb://bucket1@user", "wasb://bucket2@user/"), + ("wasb URI unmatch test2", "wasb://bucket1@user", "wasb://bucket1@user2/"), + ("s3 URI unmatch test", "s3a://user@pass:bucket1/", "s3a://user2@pass2:bucket1/"), + ("hdfs URI unmatch test1", "hdfs://namenode1/path1", "hdfs://namenode1:8080/path2"), + ("hdfs URI unmatch test2", "hdfs://namenode1:8020/path1", "hdfs://namenode1:8080/path2") + ) + + unmatching.foreach { t => + test(t._1) { + assert(!Client.compareUri(new URI(t._2), new URI(t._3)), + s"match between ${t._2} and ${t._3}") + } + } + object Fixtures { val knownDefYarnAppCP: Seq[String] = diff --git a/sbin/build-push-docker-images.sh b/sbin/build-push-docker-images.sh deleted file mode 100755 index b3137598692d..000000000000 --- a/sbin/build-push-docker-images.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# This script builds and pushes docker images when run from a release of Spark -# with Kubernetes support. - -declare -A path=( [spark-driver]=kubernetes/dockerfiles/driver/Dockerfile \ - [spark-executor]=kubernetes/dockerfiles/executor/Dockerfile \ - [spark-init]=kubernetes/dockerfiles/init-container/Dockerfile ) - -function build { - docker build -t spark-base -f kubernetes/dockerfiles/spark-base/Dockerfile . - for image in "${!path[@]}"; do - docker build -t ${REPO}/$image:${TAG} -f ${path[$image]} . - done -} - - -function push { - for image in "${!path[@]}"; do - docker push ${REPO}/$image:${TAG} - done -} - -function usage { - echo "This script must be run from a runnable distribution of Apache Spark." - echo "Usage: ./sbin/build-push-docker-images.sh -r -t build" - echo " ./sbin/build-push-docker-images.sh -r -t push" - echo "for example: ./sbin/build-push-docker-images.sh -r docker.io/myrepo -t v2.3.0 push" -} - -if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then - usage - exit 0 -fi - -while getopts r:t: option -do - case "${option}" - in - r) REPO=${OPTARG};; - t) TAG=${OPTARG};; - esac -done - -if [ -z "$REPO" ] || [ -z "$TAG" ]; then - usage -else - case "${@: -1}" in - build) build;; - push) push;; - *) usage;; - esac -fi diff --git a/scalastyle-config.xml b/scalastyle-config.xml index 7bdd3fac773a..e2fa5754afae 100644 --- a/scalastyle-config.xml +++ b/scalastyle-config.xml @@ -93,7 +93,7 @@ This file is divided into 3 sections: - + diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 9e2ced30407d..865d3b3f6ad4 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml @@ -134,7 +134,7 @@ org.scalatest scalatest-maven-plugin - -ea -Xmx4g -Xss4m -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=512m + -ea -Xmx4g -Xss4m -XX:ReservedCodeCacheSize=${CodeCacheSize} diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 6fe995f650d5..39d5e4ed5662 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -73,18 +73,22 @@ statement | ALTER DATABASE identifier SET DBPROPERTIES tablePropertyList #setDatabaseProperties | DROP DATABASE (IF EXISTS)? identifier (RESTRICT | CASCADE)? #dropDatabase | createTableHeader ('(' colTypeList ')')? tableProvider - (OPTIONS options=tablePropertyList)? - (PARTITIONED BY partitionColumnNames=identifierList)? - bucketSpec? locationSpec? - (COMMENT comment=STRING)? - (TBLPROPERTIES tableProps=tablePropertyList)? + ((OPTIONS options=tablePropertyList) | + (PARTITIONED BY partitionColumnNames=identifierList) | + bucketSpec | + locationSpec | + (COMMENT comment=STRING) | + (TBLPROPERTIES tableProps=tablePropertyList))* (AS? query)? #createTable | createTableHeader ('(' columns=colTypeList ')')? - (COMMENT comment=STRING)? - (PARTITIONED BY '(' partitionColumns=colTypeList ')')? - bucketSpec? 
skewSpec? - rowFormat? createFileFormat? locationSpec? - (TBLPROPERTIES tablePropertyList)? + ((COMMENT comment=STRING) | + (PARTITIONED BY '(' partitionColumns=colTypeList ')') | + bucketSpec | + skewSpec | + rowFormat | + createFileFormat | + locationSpec | + (TBLPROPERTIES tableProps=tablePropertyList))* (AS? query)? #createHiveTable | CREATE TABLE (IF NOT EXISTS)? target=tableIdentifier LIKE source=tableIdentifier locationSpec? #createTableLike @@ -137,7 +141,7 @@ statement (LIKE? pattern=STRING)? #showTables | SHOW TABLE EXTENDED ((FROM | IN) db=identifier)? LIKE pattern=STRING partitionSpec? #showTable - | SHOW DATABASES (LIKE pattern=STRING)? #showDatabases + | SHOW DATABASES (LIKE? pattern=STRING)? #showDatabases | SHOW TBLPROPERTIES table=tableIdentifier ('(' key=tablePropertyKey ')')? #showTblProperties | SHOW COLUMNS (FROM | IN) tableIdentifier diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java new file mode 100644 index 000000000000..f0f66bae245f --- /dev/null +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions.codegen; + +import org.apache.spark.unsafe.Platform; +import org.apache.spark.unsafe.array.ByteArrayMethods; +import org.apache.spark.unsafe.types.UTF8String; + +/** + * A helper class to write {@link UTF8String}s to an internal buffer and build the concatenated + * {@link UTF8String} at the end. + */ +public class UTF8StringBuilder { + + private static final int ARRAY_MAX = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH; + + private byte[] buffer; + private int cursor = Platform.BYTE_ARRAY_OFFSET; + + public UTF8StringBuilder() { + // Since initial buffer size is 16 in `StringBuilder`, we set the same size here + this.buffer = new byte[16]; + } + + // Grows the buffer by at least `neededSize` + private void grow(int neededSize) { + if (neededSize > ARRAY_MAX - totalSize()) { + throw new UnsupportedOperationException( + "Cannot grow internal buffer by size " + neededSize + " because the size after growing " + + "exceeds size limitation " + ARRAY_MAX); + } + final int length = totalSize() + neededSize; + if (buffer.length < length) { + int newLength = length < ARRAY_MAX / 2 ? 
length * 2 : ARRAY_MAX; + final byte[] tmp = new byte[newLength]; + Platform.copyMemory( + buffer, + Platform.BYTE_ARRAY_OFFSET, + tmp, + Platform.BYTE_ARRAY_OFFSET, + totalSize()); + buffer = tmp; + } + } + + private int totalSize() { + return cursor - Platform.BYTE_ARRAY_OFFSET; + } + + public void append(UTF8String value) { + grow(value.numBytes()); + value.writeToMemory(buffer, cursor); + cursor += value.numBytes(); + } + + public void append(String value) { + append(UTF8String.fromString(value)); + } + + public UTF8String build() { + return UTF8String.fromBytes(buffer, 0, totalSize()); + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 65040f1af4b0..9a4bf0075a17 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -63,6 +63,7 @@ object ScalaReflection extends ScalaReflection { private def dataTypeFor(tpe: `Type`): DataType = cleanUpReflectionObjects { tpe.dealias match { + case t if t <:< definitions.NullTpe => NullType case t if t <:< definitions.IntTpe => IntegerType case t if t <:< definitions.LongTpe => LongType case t if t <:< definitions.DoubleTpe => DoubleType @@ -712,6 +713,9 @@ object ScalaReflection extends ScalaReflection { /** Returns a catalyst DataType and its nullability for the given Scala Type using reflection. */ def schemaFor(tpe: `Type`): Schema = cleanUpReflectionObjects { tpe.dealias match { + // this must be the first case, since all objects in scala are instances of Null, therefore + // Null type would wrongly match the first of them, which is Option as of now + case t if t <:< definitions.NullTpe => Schema(NullType, nullable = true) case t if t.typeSymbol.annotations.exists(_.tree.tpe =:= typeOf[SQLUserDefinedType]) => val udt = getClassFromType(t).getAnnotation(classOf[SQLUserDefinedType]).udt().newInstance() Schema(udt, nullable = true) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlreadyExistException.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlreadyExistException.scala index 57f7a80bedc6..6d587abd8fd4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlreadyExistException.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlreadyExistException.scala @@ -31,7 +31,7 @@ class TableAlreadyExistsException(db: String, table: String) extends AnalysisException(s"Table or view '$table' already exists in database '$db'") class TempTableAlreadyExistsException(table: String) - extends AnalysisException(s"Temporary table '$table' already exists") + extends AnalysisException(s"Temporary view '$table' already exists") class PartitionAlreadyExistsException(db: String, table: String, spec: TablePartitionSpec) extends AnalysisException( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 6d294d48c0ee..35b35110e491 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -52,6 +52,7 @@ object SimpleAnalyzer extends Analyzer( /** * Provides a way to keep state during the analysis, this enables us to decouple the concerns * of analysis 
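The growth policy of the new UTF8StringBuilder (start at 16 bytes, double until a hard cap, copy on grow, concatenate appended UTF-8 byte sequences) can be reproduced without Spark's Platform/UTF8String helpers. The class below is a simplified stand-in, not the actual builder; the default cap is a placeholder for ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH:

import java.nio.charset.StandardCharsets
import java.util.Arrays

class ByteStringBuilder(arrayMax: Int = Int.MaxValue - 15) {
  private var buffer = new Array[Byte](16) // same initial size as java.lang.StringBuilder
  private var length = 0

  private def grow(needed: Int): Unit = {
    if (needed > arrayMax - length) {
      throw new UnsupportedOperationException(
        s"Cannot grow internal buffer by size $needed: limit is $arrayMax")
    }
    val required = length + needed
    if (buffer.length < required) {
      // double the requirement, but never past the cap
      val newLength = if (required < arrayMax / 2) required * 2 else arrayMax
      buffer = Arrays.copyOf(buffer, newLength)
    }
  }

  def append(s: String): Unit = {
    val bytes = s.getBytes(StandardCharsets.UTF_8)
    grow(bytes.length)
    System.arraycopy(bytes, 0, buffer, length, bytes.length)
    length += bytes.length
  }

  def build(): String = new String(buffer, 0, length, StandardCharsets.UTF_8)
}

// val b = new ByteStringBuilder(); b.append("["); b.append("a, b"); b.append("]"); b.build() == "[a, b]"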
environment from the catalog. + * The state that is kept here is per-query. * * Note this is thread local. * @@ -70,6 +71,8 @@ object AnalysisContext { } def get: AnalysisContext = value.get() + def reset(): Unit = value.remove() + private def set(context: AnalysisContext): Unit = value.set(context) def withAnalysisContext[A](database: Option[String])(f: => A): A = { @@ -95,6 +98,17 @@ class Analyzer( this(catalog, conf, conf.optimizerMaxIterations) } + override def execute(plan: LogicalPlan): LogicalPlan = { + AnalysisContext.reset() + try { + executeSameContext(plan) + } finally { + AnalysisContext.reset() + } + } + + private def executeSameContext(plan: LogicalPlan): LogicalPlan = super.execute(plan) + def resolver: Resolver = conf.resolver protected val fixedPoint = FixedPoint(maxIterations) @@ -176,7 +190,7 @@ class Analyzer( case With(child, relations) => substituteCTE(child, relations.foldLeft(Seq.empty[(String, LogicalPlan)]) { case (resolved, (name, relation)) => - resolved :+ name -> execute(substituteCTE(relation, resolved)) + resolved :+ name -> executeSameContext(substituteCTE(relation, resolved)) }) case other => other } @@ -600,7 +614,7 @@ class Analyzer( "avoid errors. Increase the value of spark.sql.view.maxNestedViewDepth to work " + "aroud this.") } - execute(child) + executeSameContext(child) } view.copy(child = newChild) case p @ SubqueryAlias(_, view: View) => @@ -1269,7 +1283,7 @@ class Analyzer( do { // Try to resolve the subquery plan using the regular analyzer. previous = current - current = execute(current) + current = executeSameContext(current) // Use the outer references to resolve the subquery plan if it isn't resolved yet. val i = plans.iterator @@ -1392,7 +1406,7 @@ class Analyzer( grouping, Alias(cond, "havingCondition")() :: Nil, child) - val resolvedOperator = execute(aggregatedCondition) + val resolvedOperator = executeSameContext(aggregatedCondition) def resolvedAggregateFilter = resolvedOperator .asInstanceOf[Aggregate] @@ -1450,7 +1464,8 @@ class Analyzer( val aliasedOrdering = unresolvedSortOrders.map(o => Alias(o.child, "aggOrder")()) val aggregatedOrdering = aggregate.copy(aggregateExpressions = aliasedOrdering) - val resolvedAggregate: Aggregate = execute(aggregatedOrdering).asInstanceOf[Aggregate] + val resolvedAggregate: Aggregate = + executeSameContext(aggregatedOrdering).asInstanceOf[Aggregate] val resolvedAliasedOrdering: Seq[Alias] = resolvedAggregate.aggregateExpressions.asInstanceOf[Seq[Alias]] diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala index a8100b9b24aa..ab63131b0757 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala @@ -21,6 +21,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.Literal._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -42,8 +43,10 @@ import org.apache.spark.sql.types._ * e1 / e2 p1 - s1 + s2 + max(6, s1 + p2 + 1) max(6, s1 + p2 + 1) * e1 % e2 min(p1-s1, p2-s2) + max(s1, s2) max(s1, s2) * e1 union e2 max(s1, s2) + max(p1-s1, p2-s2) max(s1, s2) - * sum(e1) p1 + 10 s1 - * avg(e1) p1 + 4 s1 + 4 + * + * When 
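The overridden execute above wraps the whole analysis run in a reset of the thread-local AnalysisContext, while recursive calls go through executeSameContext so nested plans share the context of the enclosing query. The reset-around-the-entry-point shape, reduced to a plain ThreadLocal (a schematic, not the Analyzer itself; all names here are invented):

// Per-run, thread-local state: the public entry point clears it before and after,
// internal recursion calls `run` directly and inherits the current context.
object RunContext {
  private val value = new ThreadLocal[Map[String, String]] {
    override def initialValue(): Map[String, String] = Map.empty
  }
  def get: Map[String, String] = value.get()
  def put(kv: (String, String)): Unit = value.set(value.get() + kv)
  def reset(): Unit = value.remove()
}

def entryPoint[A](run: => A): A = {
  RunContext.reset()            // never inherit state from a previous run
  try run
  finally RunContext.reset()    // never leak state into the next run
}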
`spark.sql.decimalOperations.allowPrecisionLoss` is set to true, if the precision / scale + * needed are out of the range of available values, the scale is reduced up to 6, in order to + * prevent the truncation of the integer part of the decimals. * * To implement the rules for fixed-precision types, we introduce casts to turn them to unlimited * precision, do the math on unlimited-precision numbers, then introduce casts back to the @@ -56,6 +59,7 @@ import org.apache.spark.sql.types._ * - INT gets turned into DECIMAL(10, 0) * - LONG gets turned into DECIMAL(20, 0) * - FLOAT and DOUBLE cause fixed-length decimals to turn into DOUBLE + * - Literals INT and LONG get turned into DECIMAL with the precision strictly needed by the value */ // scalastyle:on object DecimalPrecision extends TypeCoercionRule { @@ -93,41 +97,76 @@ object DecimalPrecision extends TypeCoercionRule { case e: BinaryArithmetic if e.left.isInstanceOf[PromotePrecision] => e case Add(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => - val dt = DecimalType.bounded(max(s1, s2) + max(p1 - s1, p2 - s2) + 1, max(s1, s2)) - CheckOverflow(Add(promotePrecision(e1, dt), promotePrecision(e2, dt)), dt) + val resultScale = max(s1, s2) + val resultType = if (SQLConf.get.decimalOperationsAllowPrecisionLoss) { + DecimalType.adjustPrecisionScale(max(p1 - s1, p2 - s2) + resultScale + 1, + resultScale) + } else { + DecimalType.bounded(max(p1 - s1, p2 - s2) + resultScale + 1, resultScale) + } + CheckOverflow(Add(promotePrecision(e1, resultType), promotePrecision(e2, resultType)), + resultType) case Subtract(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => - val dt = DecimalType.bounded(max(s1, s2) + max(p1 - s1, p2 - s2) + 1, max(s1, s2)) - CheckOverflow(Subtract(promotePrecision(e1, dt), promotePrecision(e2, dt)), dt) + val resultScale = max(s1, s2) + val resultType = if (SQLConf.get.decimalOperationsAllowPrecisionLoss) { + DecimalType.adjustPrecisionScale(max(p1 - s1, p2 - s2) + resultScale + 1, + resultScale) + } else { + DecimalType.bounded(max(p1 - s1, p2 - s2) + resultScale + 1, resultScale) + } + CheckOverflow(Subtract(promotePrecision(e1, resultType), promotePrecision(e2, resultType)), + resultType) case Multiply(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => - val resultType = DecimalType.bounded(p1 + p2 + 1, s1 + s2) + val resultType = if (SQLConf.get.decimalOperationsAllowPrecisionLoss) { + DecimalType.adjustPrecisionScale(p1 + p2 + 1, s1 + s2) + } else { + DecimalType.bounded(p1 + p2 + 1, s1 + s2) + } val widerType = widerDecimalType(p1, s1, p2, s2) CheckOverflow(Multiply(promotePrecision(e1, widerType), promotePrecision(e2, widerType)), resultType) case Divide(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => - var intDig = min(DecimalType.MAX_SCALE, p1 - s1 + s2) - var decDig = min(DecimalType.MAX_SCALE, max(6, s1 + p2 + 1)) - val diff = (intDig + decDig) - DecimalType.MAX_SCALE - if (diff > 0) { - decDig -= diff / 2 + 1 - intDig = DecimalType.MAX_SCALE - decDig + val resultType = if (SQLConf.get.decimalOperationsAllowPrecisionLoss) { + // Precision: p1 - s1 + s2 + max(6, s1 + p2 + 1) + // Scale: max(6, s1 + p2 + 1) + val intDig = p1 - s1 + s2 + val scale = max(DecimalType.MINIMUM_ADJUSTED_SCALE, s1 + p2 + 1) + val prec = intDig + scale + DecimalType.adjustPrecisionScale(prec, scale) + } else { + var intDig = min(DecimalType.MAX_SCALE, p1 - s1 + s2) + var decDig = min(DecimalType.MAX_SCALE, max(6, s1 + p2 + 1)) + val 
diff = (intDig + decDig) - DecimalType.MAX_SCALE + if (diff > 0) { + decDig -= diff / 2 + 1 + intDig = DecimalType.MAX_SCALE - decDig + } + DecimalType.bounded(intDig + decDig, decDig) } - val resultType = DecimalType.bounded(intDig + decDig, decDig) val widerType = widerDecimalType(p1, s1, p2, s2) CheckOverflow(Divide(promotePrecision(e1, widerType), promotePrecision(e2, widerType)), resultType) case Remainder(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => - val resultType = DecimalType.bounded(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2)) + val resultType = if (SQLConf.get.decimalOperationsAllowPrecisionLoss) { + DecimalType.adjustPrecisionScale(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2)) + } else { + DecimalType.bounded(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2)) + } // resultType may have lower precision, so we cast them into wider type first. val widerType = widerDecimalType(p1, s1, p2, s2) CheckOverflow(Remainder(promotePrecision(e1, widerType), promotePrecision(e2, widerType)), resultType) case Pmod(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => - val resultType = DecimalType.bounded(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2)) + val resultType = if (SQLConf.get.decimalOperationsAllowPrecisionLoss) { + DecimalType.adjustPrecisionScale(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2)) + } else { + DecimalType.bounded(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2)) + } // resultType may have lower precision, so we cast them into wider type first. val widerType = widerDecimalType(p1, s1, p2, s2) CheckOverflow(Pmod(promotePrecision(e1, widerType), promotePrecision(e2, widerType)), @@ -137,9 +176,6 @@ object DecimalPrecision extends TypeCoercionRule { e2 @ DecimalType.Expression(p2, s2)) if p1 != p2 || s1 != s2 => val resultType = widerDecimalType(p1, s1, p2, s2) b.makeCopy(Array(Cast(e1, resultType), Cast(e2, resultType))) - - // TODO: MaxOf, MinOf, etc might want other rules - // SUM and AVERAGE are handled by the implementations of those expressions } /** @@ -243,17 +279,35 @@ object DecimalPrecision extends TypeCoercionRule { // Promote integers inside a binary expression with fixed-precision decimals to decimals, // and fixed-precision decimals in an expression with floats / doubles to doubles case b @ BinaryOperator(left, right) if left.dataType != right.dataType => - (left.dataType, right.dataType) match { - case (t: IntegralType, DecimalType.Fixed(p, s)) => - b.makeCopy(Array(Cast(left, DecimalType.forType(t)), right)) - case (DecimalType.Fixed(p, s), t: IntegralType) => - b.makeCopy(Array(left, Cast(right, DecimalType.forType(t)))) - case (t, DecimalType.Fixed(p, s)) if isFloat(t) => - b.makeCopy(Array(left, Cast(right, DoubleType))) - case (DecimalType.Fixed(p, s), t) if isFloat(t) => - b.makeCopy(Array(Cast(left, DoubleType), right)) - case _ => - b + (left, right) match { + // Promote literal integers inside a binary expression with fixed-precision decimals to + // decimals. The precision and scale are the ones strictly needed by the integer value. + // Requiring more precision than necessary may lead to a useless loss of precision. + // Consider the following example: multiplying a column which is DECIMAL(38, 18) by 2. + // If we use the default precision and scale for the integer type, 2 is considered a + // DECIMAL(10, 0). 
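In the "allow precision loss" branches above, DecimalType.adjustPrecisionScale reduces the scale when a result no longer fits in 38 digits, keeping the integral part and never dropping the scale below 6. A standalone arithmetic sketch of that adjustment (assuming 38 as the maximum precision, as in DecimalType):

val MaxPrecision = 38
val MinimumAdjustedScale = 6

def adjust(precision: Int, scale: Int): (Int, Int) = {
  if (precision <= MaxPrecision) {
    (precision, scale)
  } else {
    val integralDigits = precision - scale
    val minScale = math.min(scale, MinimumAdjustedScale)
    // keep all integral digits; sacrifice fractional digits down to the minimum scale
    val adjustedScale = math.max(MaxPrecision - integralDigits, minScale)
    (MaxPrecision, adjustedScale)
  }
}

// Adding two DECIMAL(38, 10) values asks for (39, 10), adjusted to (38, 9):
//   adjust(39, 10) == (38, 9)
// Multiplying them asks for (77, 20); the integral part alone needs 57 digits,
// so the scale is clamped to the minimum:
//   adjust(77, 20) == (38, 6)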
According to the rules, the result would be DECIMAL(38 + 10 + 1, 18), + // which is out of range and therefore it will becomes DECIMAL(38, 7), leading to + // potentially loosing 11 digits of the fractional part. Using only the precision needed + // by the Literal, instead, the result would be DECIMAL(38 + 1 + 1, 18), which would + // become DECIMAL(38, 16), safely having a much lower precision loss. + case (l: Literal, r) if r.dataType.isInstanceOf[DecimalType] + && l.dataType.isInstanceOf[IntegralType] => + b.makeCopy(Array(Cast(l, DecimalType.fromLiteral(l)), r)) + case (l, r: Literal) if l.dataType.isInstanceOf[DecimalType] + && r.dataType.isInstanceOf[IntegralType] => + b.makeCopy(Array(l, Cast(r, DecimalType.fromLiteral(r)))) + // Promote integers inside a binary expression with fixed-precision decimals to decimals, + // and fixed-precision decimals in an expression with floats / doubles to doubles + case (l @ IntegralType(), r @ DecimalType.Expression(_, _)) => + b.makeCopy(Array(Cast(l, DecimalType.forType(l.dataType)), r)) + case (l @ DecimalType.Expression(_, _), r @ IntegralType()) => + b.makeCopy(Array(l, Cast(r, DecimalType.forType(r.dataType)))) + case (l, r @ DecimalType.Expression(_, _)) if isFloat(l.dataType) => + b.makeCopy(Array(l, Cast(r, DoubleType))) + case (l @ DecimalType.Expression(_, _), r) if isFloat(r.dataType) => + b.makeCopy(Array(Cast(l, DoubleType), r)) + case _ => b } } + } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 5ddb39822617..747016beb06e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -526,7 +526,15 @@ object FunctionRegistry { // Otherwise, find a constructor method that matches the number of arguments, and use that. val params = Seq.fill(expressions.size)(classOf[Expression]) val f = constructors.find(_.getParameterTypes.toSeq == params).getOrElse { - throw new AnalysisException(s"Invalid number of arguments for function $name") + val validParametersCount = constructors.map(_.getParameterCount).distinct.sorted + val expectedNumberOfParameters = if (validParametersCount.length == 1) { + validParametersCount.head.toString + } else { + validParametersCount.init.mkString("one of ", ", ", " and ") + + validParametersCount.last + } + throw new AnalysisException(s"Invalid number of arguments for function $name. " + + s"Expected: $expectedNumberOfParameters; Found: ${params.length}") } Try(f.newInstance(expressions : _*).asInstanceOf[Expression]) match { case Success(e) => e diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index e9436367c7e2..e8669c4637d0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -54,6 +54,7 @@ object TypeCoercion { BooleanEquality :: FunctionArgumentConversion :: ConcatCoercion(conf) :: + EltCoercion(conf) :: CaseWhenCoercion :: IfCoercion :: StackCoercion :: @@ -684,6 +685,34 @@ object TypeCoercion { } } + /** + * Coerces the types of [[Elt]] children to expected ones. 
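The literal-specific promotion above sizes the decimal to exactly the digits the integer value needs, instead of the type-wide defaults (DECIMAL(10, 0) for INT, DECIMAL(20, 0) for LONG). A minimal sketch of that sizing (an illustration of DecimalType.fromLiteral's effect, not its implementation):

import java.math.BigInteger

def decimalTypeForLiteral(v: Long): (Int, Int) =
  (BigInteger.valueOf(v).abs.toString.length, 0) // precision = digit count, scale = 0

// decimalTypeForLiteral(2L) == (1, 0), so DECIMAL(38, 18) * 2 asks for
// DECIMAL(38 + 1 + 1, 18) and settles at DECIMAL(38, 16) after adjustment,
// instead of DECIMAL(38 + 10 + 1, 18) -> DECIMAL(38, 7) with the default DECIMAL(10, 0).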
+ * + * If `spark.sql.function.eltOutputAsString` is false and all children types are binary, + * the expected types are binary. Otherwise, the expected ones are strings. + */ + case class EltCoercion(conf: SQLConf) extends TypeCoercionRule { + + override protected def coerceTypes(plan: LogicalPlan): LogicalPlan = plan transform { case p => + p transformExpressionsUp { + // Skip nodes if unresolved or not enough children + case c @ Elt(children) if !c.childrenResolved || children.size < 2 => c + case c @ Elt(children) => + val index = children.head + val newIndex = ImplicitTypeCasts.implicitCast(index, IntegerType).getOrElse(index) + val newInputs = if (conf.eltOutputAsString || + !children.tail.map(_.dataType).forall(_ == BinaryType)) { + children.tail.map { e => + ImplicitTypeCasts.implicitCast(e, StringType).getOrElse(e) + } + } else { + children.tail + } + c.copy(children = newIndex +: newInputs) + } + } + } + /** * Turns Add/Subtract of DateType/TimestampType/StringType and CalendarIntervalType * to TimeAdd/TimeSub diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 274d8813f16d..79b051670e9e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -206,6 +206,85 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String case DateType => buildCast[Int](_, d => UTF8String.fromString(DateTimeUtils.dateToString(d))) case TimestampType => buildCast[Long](_, t => UTF8String.fromString(DateTimeUtils.timestampToString(t, timeZone))) + case ArrayType(et, _) => + buildCast[ArrayData](_, array => { + val builder = new UTF8StringBuilder + builder.append("[") + if (array.numElements > 0) { + val toUTF8String = castToString(et) + if (!array.isNullAt(0)) { + builder.append(toUTF8String(array.get(0, et)).asInstanceOf[UTF8String]) + } + var i = 1 + while (i < array.numElements) { + builder.append(",") + if (!array.isNullAt(i)) { + builder.append(" ") + builder.append(toUTF8String(array.get(i, et)).asInstanceOf[UTF8String]) + } + i += 1 + } + } + builder.append("]") + builder.build() + }) + case MapType(kt, vt, _) => + buildCast[MapData](_, map => { + val builder = new UTF8StringBuilder + builder.append("[") + if (map.numElements > 0) { + val keyArray = map.keyArray() + val valueArray = map.valueArray() + val keyToUTF8String = castToString(kt) + val valueToUTF8String = castToString(vt) + builder.append(keyToUTF8String(keyArray.get(0, kt)).asInstanceOf[UTF8String]) + builder.append(" ->") + if (!valueArray.isNullAt(0)) { + builder.append(" ") + builder.append(valueToUTF8String(valueArray.get(0, vt)).asInstanceOf[UTF8String]) + } + var i = 1 + while (i < map.numElements) { + builder.append(", ") + builder.append(keyToUTF8String(keyArray.get(i, kt)).asInstanceOf[UTF8String]) + builder.append(" ->") + if (!valueArray.isNullAt(i)) { + builder.append(" ") + builder.append(valueToUTF8String(valueArray.get(i, vt)) + .asInstanceOf[UTF8String]) + } + i += 1 + } + } + builder.append("]") + builder.build() + }) + case StructType(fields) => + buildCast[InternalRow](_, row => { + val builder = new UTF8StringBuilder + builder.append("[") + if (row.numFields > 0) { + val st = fields.map(_.dataType) + val toUTF8StringFuncs = st.map(castToString) + if (!row.isNullAt(0)) { + builder.append(toUTF8StringFuncs(0)(row.get(0, 
st(0))).asInstanceOf[UTF8String]) + } + var i = 1 + while (i < row.numFields) { + builder.append(",") + if (!row.isNullAt(i)) { + builder.append(" ") + builder.append(toUTF8StringFuncs(i)(row.get(i, st(i))).asInstanceOf[UTF8String]) + } + i += 1 + } + } + builder.append("]") + builder.build() + }) + case pudt: PythonUserDefinedType => castToString(pudt.sqlType) + case udt: UserDefinedType[_] => + buildCast[Any](_, o => UTF8String.fromString(udt.deserialize(o).toString)) case _ => buildCast[Any](_, o => UTF8String.fromString(o.toString)) } @@ -597,6 +676,123 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String """ } + private def writeArrayToStringBuilder( + et: DataType, + array: String, + buffer: String, + ctx: CodegenContext): String = { + val elementToStringCode = castToStringCode(et, ctx) + val funcName = ctx.freshName("elementToString") + val elementToStringFunc = ctx.addNewFunction(funcName, + s""" + |private UTF8String $funcName(${ctx.javaType(et)} element) { + | UTF8String elementStr = null; + | ${elementToStringCode("element", "elementStr", null /* resultIsNull won't be used */)} + | return elementStr; + |} + """.stripMargin) + + val loopIndex = ctx.freshName("loopIndex") + s""" + |$buffer.append("["); + |if ($array.numElements() > 0) { + | if (!$array.isNullAt(0)) { + | $buffer.append($elementToStringFunc(${ctx.getValue(array, et, "0")})); + | } + | for (int $loopIndex = 1; $loopIndex < $array.numElements(); $loopIndex++) { + | $buffer.append(","); + | if (!$array.isNullAt($loopIndex)) { + | $buffer.append(" "); + | $buffer.append($elementToStringFunc(${ctx.getValue(array, et, loopIndex)})); + | } + | } + |} + |$buffer.append("]"); + """.stripMargin + } + + private def writeMapToStringBuilder( + kt: DataType, + vt: DataType, + map: String, + buffer: String, + ctx: CodegenContext): String = { + + def dataToStringFunc(func: String, dataType: DataType) = { + val funcName = ctx.freshName(func) + val dataToStringCode = castToStringCode(dataType, ctx) + ctx.addNewFunction(funcName, + s""" + |private UTF8String $funcName(${ctx.javaType(dataType)} data) { + | UTF8String dataStr = null; + | ${dataToStringCode("data", "dataStr", null /* resultIsNull won't be used */)} + | return dataStr; + |} + """.stripMargin) + } + + val keyToStringFunc = dataToStringFunc("keyToString", kt) + val valueToStringFunc = dataToStringFunc("valueToString", vt) + val loopIndex = ctx.freshName("loopIndex") + s""" + |$buffer.append("["); + |if ($map.numElements() > 0) { + | $buffer.append($keyToStringFunc(${ctx.getValue(s"$map.keyArray()", kt, "0")})); + | $buffer.append(" ->"); + | if (!$map.valueArray().isNullAt(0)) { + | $buffer.append(" "); + | $buffer.append($valueToStringFunc(${ctx.getValue(s"$map.valueArray()", vt, "0")})); + | } + | for (int $loopIndex = 1; $loopIndex < $map.numElements(); $loopIndex++) { + | $buffer.append(", "); + | $buffer.append($keyToStringFunc(${ctx.getValue(s"$map.keyArray()", kt, loopIndex)})); + | $buffer.append(" ->"); + | if (!$map.valueArray().isNullAt($loopIndex)) { + | $buffer.append(" "); + | $buffer.append($valueToStringFunc( + | ${ctx.getValue(s"$map.valueArray()", vt, loopIndex)})); + | } + | } + |} + |$buffer.append("]"); + """.stripMargin + } + + private def writeStructToStringBuilder( + st: Seq[DataType], + row: String, + buffer: String, + ctx: CodegenContext): String = { + val structToStringCode = st.zipWithIndex.map { case (ft, i) => + val fieldToStringCode = castToStringCode(ft, ctx) + val field = ctx.freshName("field") + val fieldStr 
= ctx.freshName("fieldStr") + s""" + |${if (i != 0) s"""$buffer.append(",");""" else ""} + |if (!$row.isNullAt($i)) { + | ${if (i != 0) s"""$buffer.append(" ");""" else ""} + | + | // Append $i field into the string buffer + | ${ctx.javaType(ft)} $field = ${ctx.getValue(row, ft, s"$i")}; + | UTF8String $fieldStr = null; + | ${fieldToStringCode(field, fieldStr, null /* resultIsNull won't be used */)} + | $buffer.append($fieldStr); + |} + """.stripMargin + } + + val writeStructCode = ctx.splitExpressions( + expressions = structToStringCode, + funcName = "fieldToString", + arguments = ("InternalRow", row) :: (classOf[UTF8StringBuilder].getName, buffer) :: Nil) + + s""" + |$buffer.append("["); + |$writeStructCode + |$buffer.append("]"); + """.stripMargin + } + private[this] def castToStringCode(from: DataType, ctx: CodegenContext): CastFunction = { from match { case BinaryType => @@ -608,6 +804,47 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String val tz = ctx.addReferenceObj("timeZone", timeZone) (c, evPrim, evNull) => s"""$evPrim = UTF8String.fromString( org.apache.spark.sql.catalyst.util.DateTimeUtils.timestampToString($c, $tz));""" + case ArrayType(et, _) => + (c, evPrim, evNull) => { + val buffer = ctx.freshName("buffer") + val bufferClass = classOf[UTF8StringBuilder].getName + val writeArrayElemCode = writeArrayToStringBuilder(et, c, buffer, ctx) + s""" + |$bufferClass $buffer = new $bufferClass(); + |$writeArrayElemCode; + |$evPrim = $buffer.build(); + """.stripMargin + } + case MapType(kt, vt, _) => + (c, evPrim, evNull) => { + val buffer = ctx.freshName("buffer") + val bufferClass = classOf[UTF8StringBuilder].getName + val writeMapElemCode = writeMapToStringBuilder(kt, vt, c, buffer, ctx) + s""" + |$bufferClass $buffer = new $bufferClass(); + |$writeMapElemCode; + |$evPrim = $buffer.build(); + """.stripMargin + } + case StructType(fields) => + (c, evPrim, evNull) => { + val row = ctx.freshName("row") + val buffer = ctx.freshName("buffer") + val bufferClass = classOf[UTF8StringBuilder].getName + val writeStructCode = writeStructToStringBuilder(fields.map(_.dataType), row, buffer, ctx) + s""" + |InternalRow $row = $c; + |$bufferClass $buffer = new $bufferClass(); + |$writeStructCode + |$evPrim = $buffer.build(); + """.stripMargin + } + case pudt: PythonUserDefinedType => castToStringCode(pudt.sqlType, ctx) + case udt: UserDefinedType[_] => + val udtRef = ctx.addReferenceObj("udt", udt) + (c, evPrim, evNull) => { + s"$evPrim = UTF8String.fromString($udtRef.deserialize($c).toString());" + } case _ => (c, evPrim, evNull) => s"$evPrim = UTF8String.fromString(String.valueOf($c));" } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala index 149ac265e6ed..a45854a3b514 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala @@ -296,8 +296,8 @@ object ApproximatePercentile { Ints.BYTES + Doubles.BYTES + Longs.BYTES + // length of summary.sampled Ints.BYTES + - // summary.sampled, Array[Stat(value: Double, g: Int, delta: Int)] - summaries.sampled.length * (Doubles.BYTES + Ints.BYTES + Ints.BYTES) + // summary.sampled, Array[Stat(value: Double, g: Long, delta: Long)] + summaries.sampled.length * (Doubles.BYTES + 
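Both the interpreted path and the generated code above render complex values in the same bracketed form: "[1, 2, 3]" for arrays, "[a -> 1, b -> 2]" for maps, with null entries left blank after the comma. The array convention in plain Scala (an illustration of the output format only, not Spark's code):

def arrayToString(values: Seq[Any]): String = {
  val sb = new StringBuilder("[")
  values.zipWithIndex.foreach { case (v, i) =>
    if (i > 0) sb.append(",")
    if (v != null) {
      if (i > 0) sb.append(" ")
      sb.append(v.toString)
    }
  }
  sb.append("]").toString
}

// arrayToString(Seq(1, 2, 3))    == "[1, 2, 3]"
// arrayToString(Seq(1, null, 3)) == "[1,, 3]"   // nulls render as an empty slot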
Longs.BYTES + Longs.BYTES) } final def serialize(obj: PercentileDigest): Array[Byte] = { @@ -312,8 +312,8 @@ object ApproximatePercentile { while (i < summary.sampled.length) { val stat = summary.sampled(i) buffer.putDouble(stat.value) - buffer.putInt(stat.g) - buffer.putInt(stat.delta) + buffer.putLong(stat.g) + buffer.putLong(stat.delta) i += 1 } buffer.array() @@ -330,8 +330,8 @@ object ApproximatePercentile { var i = 0 while (i < sampledLength) { val value = buffer.getDouble() - val g = buffer.getInt() - val delta = buffer.getInt() + val g = buffer.getLong() + val delta = buffer.getLong() sampled(i) = Stats(value, g, delta) i += 1 } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala index be5f5a73b5d4..febf7b0c96c2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala @@ -70,7 +70,7 @@ object GenerateUnsafeRowJoiner extends CodeGenerator[(StructType, StructType), U // --------------------- copy bitset from row 1 and row 2 --------------------------- // val copyBitset = Seq.tabulate(outputBitsetWords) { i => - val bits = if (bitset1Remainder > 0) { + val bits = if (bitset1Remainder > 0 && bitset2Words != 0) { if (i < bitset1Words - 1) { s"$getLong(obj1, offset1 + ${i * 8})" } else if (i == bitset1Words - 1) { @@ -152,7 +152,9 @@ object GenerateUnsafeRowJoiner extends CodeGenerator[(StructType, StructType), U } else { // Number of bytes to increase for the offset. Note that since in UnsafeRow we store the // offset in the upper 32 bit of the words, we can just shift the offset to the left by - // 32 and increment that amount in place. + // 32 and increment that amount in place. However, we need to handle the important special + // case of a null field, in which case the offset should be zero and should not have a + // shift added to it. val shift = if (i < schema1.size) { s"${(outputBitsetWords - bitset1Words + schema2.size) * 8}L" @@ -160,14 +162,55 @@ object GenerateUnsafeRowJoiner extends CodeGenerator[(StructType, StructType), U s"(${(outputBitsetWords - bitset2Words + schema1.size) * 8}L + numBytesVariableRow1)" } val cursor = offset + outputBitsetWords * 8 + i * 8 - s"$putLong(buf, $cursor, $getLong(buf, $cursor) + ($shift << 32));\n" + // UnsafeRow is a little underspecified, so in what follows we'll treat UnsafeRowWriter's + // output as a de-facto specification for the internal layout of data. + // + // Null-valued fields will always have a data offset of 0 because + // UnsafeRowWriter.setNullAt(ordinal) sets the null bit and stores 0 to in field's + // position in the fixed-length section of the row. As a result, we must NOT add + // `shift` to the offset for null fields. + // + // We could perform a null-check here by inspecting the null-tracking bitmap, but doing + // so could be expensive and will add significant bloat to the generated code. Instead, + // we'll rely on the invariant "stored offset == 0 for variable-length data type implies + // that the field's value is null." + // + // To establish that this invariant holds, we'll prove that a non-null field can never + // have a stored offset of 0. There are two cases to consider: + // + // 1. 
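The serialization fix above widens g and delta from 4-byte ints to 8-byte longs, and the size computation changes with it. A plain ByteBuffer round trip of the same layout (Stats and the sizing here are simplified stand-ins for the digest internals):

import java.nio.ByteBuffer

case class Stats(value: Double, g: Long, delta: Long)

def serialize(sampled: Array[Stats]): Array[Byte] = {
  // 4 bytes for the count, then (8 + 8 + 8) bytes per sample
  val buffer = ByteBuffer.allocate(java.lang.Integer.BYTES +
    sampled.length * (java.lang.Double.BYTES + 2 * java.lang.Long.BYTES))
  buffer.putInt(sampled.length)
  sampled.foreach { s =>
    buffer.putDouble(s.value)
    buffer.putLong(s.g)
    buffer.putLong(s.delta)
  }
  buffer.array()
}

def deserialize(bytes: Array[Byte]): Array[Stats] = {
  val buffer = ByteBuffer.wrap(bytes)
  Array.fill(buffer.getInt())(Stats(buffer.getDouble(), buffer.getLong(), buffer.getLong()))
}

// deserialize(serialize(Array(Stats(0.5, 3L, 4L)))) sameElements Array(Stats(0.5, 3L, 4L))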
The non-null field's data is of non-zero length: reading this field's value + // must read data from the variable-length section of the row, so the stored offset + // will actually be used in address calculation and must be correct. The offsets + // count bytes from the start of the UnsafeRow so these offsets will always be + // non-zero because the storage of the offsets themselves takes up space at the + // start of the row. + // 2. The non-null field's data is of zero length (i.e. its data is empty). In this + // case, we have to worry about the possibility that an arbitrary offset value was + // stored because we never actually read any bytes using this offset and therefore + // would not crash if it was incorrect. The variable-sized data writing paths in + // UnsafeRowWriter unconditionally calls setOffsetAndSize(ordinal, numBytes) with + // no special handling for the case where `numBytes == 0`. Internally, + // setOffsetAndSize computes the offset without taking the size into account. Thus + // the stored offset is the same non-zero offset that would be used if the field's + // dataSize was non-zero (and in (1) above we've shown that case behaves as we + // expect). + // + // Thus it is safe to perform `existingOffset != 0` checks here in the place of + // more expensive null-bit checks. + s""" + |existingOffset = $getLong(buf, $cursor); + |if (existingOffset != 0) { + | $putLong(buf, $cursor, existingOffset + ($shift << 32)); + |} + """.stripMargin } } val updateOffsets = ctx.splitExpressions( expressions = updateOffset, funcName = "copyBitsetFunc", - arguments = ("long", "numBytesVariableRow1") :: Nil) + arguments = ("long", "numBytesVariableRow1") :: Nil, + makeSplitFunction = (s: String) => "long existingOffset;\n" + s) // ------------------------ Finally, put everything together --------------------------- // val codeBody = s""" @@ -200,6 +243,7 @@ object GenerateUnsafeRowJoiner extends CodeGenerator[(StructType, StructType), U | $copyFixedLengthRow2 | $copyVariableLengthRow1 | $copyVariableLengthRow2 + | long existingOffset; | $updateOffsets | | out.pointTo(buf, sizeInBytes); diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 7a674ea7f4d7..424871f2047e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -23,6 +23,8 @@ import java.util.{Calendar, TimeZone} import scala.util.control.NonFatal +import org.apache.commons.lang3.StringEscapeUtils + import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodegenFallback, ExprCode} import org.apache.spark.sql.catalyst.util.DateTimeUtils @@ -1008,7 +1010,7 @@ case class FromUTCTimestamp(left: Expression, right: Expression) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") if (right.foldable) { - val tz = right.eval() + val tz = right.eval().asInstanceOf[UTF8String] if (tz == null) { ev.copy(code = s""" |boolean ${ev.isNull} = true; @@ -1017,8 +1019,9 @@ case class FromUTCTimestamp(left: Expression, right: Expression) } else { val tzClass = classOf[TimeZone].getName val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") + val escapedTz = 
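The invariant argued above is that a variable-length field's fixed-length slot packs (offset << 32) | size into one long, and that a null field stores 0, so the relocation must skip zero words. The rule in scalar form (an illustration, not the generated code):

def relocate(word: Long, extraBytes: Long): Long = {
  if (word == 0L) {
    0L                           // null field: keep the all-zero word
  } else {
    word + (extraBytes << 32)    // adjust only the offset half; the size stays intact
  }
}

// A field at offset 40 with size 8, moved 16 bytes further:
//   val moved = relocate((40L << 32) | 8L, 16L)
//   (moved >>> 32) == 56 && (moved & 0xFFFFFFFFL) == 8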
StringEscapeUtils.escapeJava(tz.toString) val tzTerm = ctx.addMutableState(tzClass, "tz", - v => s"""$v = $dtu.getTimeZone("$tz");""") + v => s"""$v = $dtu.getTimeZone("$escapedTz");""") val utcTerm = "tzUTC" ctx.addImmutableStateIfNotExists(tzClass, utcTerm, v => s"""$v = $dtu.getTimeZone("UTC");""") @@ -1185,7 +1188,7 @@ case class ToUTCTimestamp(left: Expression, right: Expression) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") if (right.foldable) { - val tz = right.eval() + val tz = right.eval().asInstanceOf[UTF8String] if (tz == null) { ev.copy(code = s""" |boolean ${ev.isNull} = true; @@ -1194,8 +1197,9 @@ case class ToUTCTimestamp(left: Expression, right: Expression) } else { val tzClass = classOf[TimeZone].getName val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") + val escapedTz = StringEscapeUtils.escapeJava(tz.toString) val tzTerm = ctx.addMutableState(tzClass, "tz", - v => s"""$v = $dtu.getTimeZone("$tz");""") + v => s"""$v = $dtu.getTimeZone("$escapedTz");""") val utcTerm = "tzUTC" ctx.addImmutableStateIfNotExists(tzClass, utcTerm, v => s"""$v = $dtu.getTimeZone("UTC");""") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index 383203a20983..cd176d941819 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -58,7 +58,7 @@ object Literal { case s: Short => Literal(s, ShortType) case s: String => Literal(UTF8String.fromString(s), StringType) case b: Boolean => Literal(b, BooleanType) - case d: BigDecimal => Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale)) + case d: BigDecimal => Literal(Decimal(d), DecimalType.fromBigDecimal(d)) case d: JavaBigDecimal => Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale())) case d: Decimal => Literal(d, DecimalType(Math.max(d.precision, d.scale), d.scale)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index b0da55a4a961..e004bfc6af47 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -58,7 +58,7 @@ case class Concat(children: Seq[Expression]) extends Expression { } else { val childTypes = children.map(_.dataType) if (childTypes.exists(tpe => !Seq(StringType, BinaryType).contains(tpe))) { - TypeCheckResult.TypeCheckFailure( + return TypeCheckResult.TypeCheckFailure( s"input to function $prettyName should have StringType or BinaryType, but it's " + childTypes.map(_.simpleString).mkString("[", ", ", "]")) } @@ -271,33 +271,45 @@ case class ConcatWs(children: Seq[Expression]) } } +/** + * An expression that returns the `n`-th input in given inputs. + * If all inputs are binary, `elt` returns an output as binary. Otherwise, it returns as string. + * If any input is null, `elt` returns null. + */ // scalastyle:off line.size.limit @ExpressionDescription( - usage = "_FUNC_(n, str1, str2, ...) - Returns the `n`-th string, e.g., returns `str2` when `n` is 2.", + usage = "_FUNC_(n, input1, input2, ...) 
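Both timestamp expressions now escape the timezone string before splicing it into the generated Java source, so a value containing quotes or backslashes cannot terminate the generated string literal. The same call outside codegen (commons-lang3; the surrounding snippet is illustrative):

import org.apache.commons.lang3.StringEscapeUtils

val tz = "America/New_York\") + injected"          // hostile or malformed timezone id
val escaped = StringEscapeUtils.escapeJava(tz)     // the embedded quote becomes an escaped quote
val fragment = s"""tz = dtu.getTimeZone("$escaped");"""
// `fragment` now compiles as a single string literal inside the generated method body.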
- Returns the `n`-th input, e.g., returns `input2` when `n` is 2.", examples = """ Examples: > SELECT _FUNC_(1, 'scala', 'java'); scala """) // scalastyle:on line.size.limit -case class Elt(children: Seq[Expression]) - extends Expression with ImplicitCastInputTypes { +case class Elt(children: Seq[Expression]) extends Expression { private lazy val indexExpr = children.head - private lazy val stringExprs = children.tail.toArray + private lazy val inputExprs = children.tail.toArray /** This expression is always nullable because it returns null if index is out of range. */ override def nullable: Boolean = true - override def dataType: DataType = StringType - - override def inputTypes: Seq[DataType] = IntegerType +: Seq.fill(children.size - 1)(StringType) + override def dataType: DataType = inputExprs.map(_.dataType).headOption.getOrElse(StringType) override def checkInputDataTypes(): TypeCheckResult = { if (children.size < 2) { TypeCheckResult.TypeCheckFailure("elt function requires at least two arguments") } else { - super[ImplicitCastInputTypes].checkInputDataTypes() + val (indexType, inputTypes) = (indexExpr.dataType, inputExprs.map(_.dataType)) + if (indexType != IntegerType) { + return TypeCheckResult.TypeCheckFailure(s"first input to function $prettyName should " + + s"have IntegerType, but it's $indexType") + } + if (inputTypes.exists(tpe => !Seq(StringType, BinaryType).contains(tpe))) { + return TypeCheckResult.TypeCheckFailure( + s"input to function $prettyName should have StringType or BinaryType, but it's " + + inputTypes.map(_.simpleString).mkString("[", ", ", "]")) + } + TypeUtils.checkForSameTypeInputExpr(inputTypes, s"function $prettyName") } } @@ -307,27 +319,27 @@ case class Elt(children: Seq[Expression]) null } else { val index = indexObj.asInstanceOf[Int] - if (index <= 0 || index > stringExprs.length) { + if (index <= 0 || index > inputExprs.length) { null } else { - stringExprs(index - 1).eval(input) + inputExprs(index - 1).eval(input) } } } override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val index = indexExpr.genCode(ctx) - val strings = stringExprs.map(_.genCode(ctx)) + val inputs = inputExprs.map(_.genCode(ctx)) val indexVal = ctx.freshName("index") val indexMatched = ctx.freshName("eltIndexMatched") - val stringVal = ctx.addMutableState(ctx.javaType(dataType), "stringVal") + val inputVal = ctx.addMutableState(ctx.javaType(dataType), "inputVal") - val assignStringValue = strings.zipWithIndex.map { case (eval, index) => + val assignInputValue = inputs.zipWithIndex.map { case (eval, index) => s""" |if ($indexVal == ${index + 1}) { | ${eval.code} - | $stringVal = ${eval.isNull} ? null : ${eval.value}; + | $inputVal = ${eval.isNull} ? 
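After this change elt keeps its 1-based indexing and null-on-out-of-range behaviour, but the value type follows the inputs: all-binary stays binary, anything else is coerced to string by the new EltCoercion rule. The runtime contract in plain Scala (a model, not the expression itself):

def elt[T >: Null](index: Int, inputs: T*): T =
  if (index <= 0 || index > inputs.length) null else inputs(index - 1)

// elt(2, "scala", "java") == "java"
// elt(0, "scala", "java") == null                      // out of range -> null
// elt(1, Array[Byte](1, 2), Array[Byte](3))            // byte arrays pass through unchanged
// (a null index also yields null in SQL; the model above omits that case)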
null : ${eval.value}; | $indexMatched = true; | continue; |} @@ -335,7 +347,7 @@ case class Elt(children: Seq[Expression]) } val codes = ctx.splitExpressionsWithCurrentInputs( - expressions = assignStringValue, + expressions = assignInputValue, funcName = "eltFunc", extraArguments = ("int", indexVal) :: Nil, returnType = ctx.JAVA_BOOLEAN, @@ -361,11 +373,11 @@ case class Elt(children: Seq[Expression]) |${index.code} |final int $indexVal = ${index.value}; |${ctx.JAVA_BOOLEAN} $indexMatched = false; - |$stringVal = null; + |$inputVal = null; |do { | $codes |} while (false); - |final UTF8String ${ev.value} = $stringVal; + |final ${ctx.javaType(dataType)} ${ev.value} = $inputVal; |final boolean ${ev.isNull} = ${ev.value} == null; """.stripMargin) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala index 025a388aacaa..b1672e7e2fca 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala @@ -40,10 +40,11 @@ private[sql] object CreateJacksonParser extends Serializable { } def text(jsonFactory: JsonFactory, record: Text): JsonParser = { - jsonFactory.createParser(record.getBytes, 0, record.getLength) + val bain = new ByteArrayInputStream(record.getBytes, 0, record.getLength) + jsonFactory.createParser(new InputStreamReader(bain, "UTF-8")) } def inputStream(jsonFactory: JsonFactory, record: InputStream): JsonParser = { - jsonFactory.createParser(record) + jsonFactory.createParser(new InputStreamReader(record, "UTF-8")) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 0d4b02c6e7d8..c794ba861932 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -795,7 +795,8 @@ object PushDownPredicate extends Rule[LogicalPlan] with PredicateHelper { project.copy(child = Filter(replaceAlias(condition, aliasMap), grandChild)) case filter @ Filter(condition, aggregate: Aggregate) - if aggregate.aggregateExpressions.forall(_.deterministic) => + if aggregate.aggregateExpressions.forall(_.deterministic) + && aggregate.groupingExpressions.nonEmpty => // Find all the aliased expressions in the aggregate list that don't include any actual // AggregateExpression, and create a map from the alias to the expression val aliasMap = AttributeMap(aggregate.aggregateExpressions.collect { @@ -1221,7 +1222,13 @@ object ReplaceDeduplicateWithAggregate extends Rule[LogicalPlan] { Alias(new First(attr).toAggregateExpression(), attr.name)(attr.exprId) } } - Aggregate(keys, aggCols, child) + // SPARK-22951: Physical aggregate operators distinguishes global aggregation and grouping + // aggregations by checking the number of grouping keys. The key difference here is that a + // global aggregation always returns at least one row even if there are no input rows. Here + // we append a literal when the grouping key list is empty so that the result aggregate + // operator is properly treated as a grouping aggregation. 
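CreateJacksonParser now hands Jackson a Reader with an explicit UTF-8 charset instead of raw bytes, so the parser no longer guesses the encoding. A standalone equivalent with jackson-core (variable names below are just for the example):

import java.io.{ByteArrayInputStream, InputStreamReader}
import java.nio.charset.StandardCharsets
import com.fasterxml.jackson.core.JsonFactory

val factory = new JsonFactory()
val bytes = """{"name": "värde"}""".getBytes(StandardCharsets.UTF_8)
val parser = factory.createParser(
  new InputStreamReader(new ByteArrayInputStream(bytes), StandardCharsets.UTF_8))
// parser.nextToken() etc. now decode the document with a fixed UTF-8 charset.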
+ val nonemptyKeys = if (keys.isEmpty) Literal(1) :: Nil else keys + Aggregate(nonemptyKeys, aggCols, child) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala index 7d830bbb7dc3..1c0b7bd80680 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala @@ -506,18 +506,21 @@ object NullPropagation extends Rule[LogicalPlan] { /** - * Propagate foldable expressions: * Replace attributes with aliases of the original foldable expressions if possible. - * Other optimizations will take advantage of the propagated foldable expressions. - * + * Other optimizations will take advantage of the propagated foldable expressions. For example, + * this rule can optimize * {{{ * SELECT 1.0 x, 'abc' y, Now() z ORDER BY x, y, 3 - * ==> SELECT 1.0 x, 'abc' y, Now() z ORDER BY 1.0, 'abc', Now() * }}} + * to + * {{{ + * SELECT 1.0 x, 'abc' y, Now() z ORDER BY 1.0, 'abc', Now() + * }}} + * and other rules can further optimize it and remove the ORDER BY operator. */ object FoldablePropagation extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = { - val foldableMap = AttributeMap(plan.flatMap { + var foldableMap = AttributeMap(plan.flatMap { case Project(projectList, _) => projectList.collect { case a: Alias if a.child.foldable => (a.toAttribute, a) } @@ -530,38 +533,44 @@ object FoldablePropagation extends Rule[LogicalPlan] { if (foldableMap.isEmpty) { plan } else { - var stop = false CleanupAliases(plan.transformUp { - // A leaf node should not stop the folding process (note that we are traversing up the - // tree, starting at the leaf nodes); so we are allowing it. - case l: LeafNode => - l - // We can only propagate foldables for a subset of unary nodes. - case u: UnaryNode if !stop && canPropagateFoldables(u) => + case u: UnaryNode if foldableMap.nonEmpty && canPropagateFoldables(u) => u.transformExpressions(replaceFoldable) - // Allow inner joins. We do not allow outer join, although its output attributes are - // derived from its children, they are actually different attributes: the output of outer - // join is not always picked from its children, but can also be null. + // Join derives the output attributes from its child while they are actually not the + // same attributes. For example, the output of outer join is not always picked from its + // children, but can also be null. We should exclude these miss-derived attributes when + // propagating the foldable expressions. // TODO(cloud-fan): It seems more reasonable to use new attributes as the output attributes // of outer join. - case j @ Join(_, _, Inner, _) if !stop => - j.transformExpressions(replaceFoldable) - - // We can fold the projections an expand holds. However expand changes the output columns - // and often reuses the underlying attributes; so we cannot assume that a column is still - // foldable after the expand has been applied. - // TODO(hvanhovell): Expand should use new attributes as the output attributes. 
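The SPARK-22951 comment above hinges on the difference between a global aggregation (always at least one output row) and a grouping aggregation (no rows for empty input). The same asymmetry with plain collections, as a rough analogy for why a constant grouping key is appended:

val empty = Seq.empty[Int]

val globalSum = empty.sum                                    // one answer (0) even with no input
val groupedSums = empty.groupBy(_ => 1).map { case (k, v) => k -> v.sum }
// groupedSums == Map()                                       // no groups, so no output rows

// Keeping at least one grouping key (the Literal(1) above) keeps the rewritten
// Deduplicate -> Aggregate in the grouping regime, so deduplicating an empty input stays empty.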
- case expand: Expand if !stop => - val newExpand = expand.copy(projections = expand.projections.map { projection => + case j @ Join(left, right, joinType, _) if foldableMap.nonEmpty => + val newJoin = j.transformExpressions(replaceFoldable) + val missDerivedAttrsSet: AttributeSet = AttributeSet(joinType match { + case _: InnerLike | LeftExistence(_) => Nil + case LeftOuter => right.output + case RightOuter => left.output + case FullOuter => left.output ++ right.output + }) + foldableMap = AttributeMap(foldableMap.baseMap.values.filterNot { + case (attr, _) => missDerivedAttrsSet.contains(attr) + }.toSeq) + newJoin + + // We can not replace the attributes in `Expand.output`. If there are other non-leaf + // operators that have the `output` field, we should put them here too. + case expand: Expand if foldableMap.nonEmpty => + expand.copy(projections = expand.projections.map { projection => projection.map(_.transform(replaceFoldable)) }) - stop = true - newExpand - case other => - stop = true + // For other plans, they are not safe to apply foldable propagation, and they should not + // propagate foldable expressions from children. + case other if foldableMap.nonEmpty => + val childrenOutputSet = AttributeSet(other.children.flatMap(_.output)) + foldableMap = AttributeMap(foldableMap.baseMap.values.filterNot { + case (attr, _) => childrenOutputSet.contains(attr) + }.toSeq) other }) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala index 2673bea648d0..709db6d8bec7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala @@ -369,13 +369,14 @@ object RewriteCorrelatedScalarSubquery extends Rule[LogicalPlan] { case ne => (ne.exprId, evalAggOnZeroTups(ne)) }.toMap - case _ => sys.error(s"Unexpected operator in scalar subquery: $lp") + case _ => + sys.error(s"Unexpected operator in scalar subquery: $lp") } val resultMap = evalPlan(plan) // By convention, the scalar subquery result is the leftmost field. - resultMap(plan.output.head.exprId) + resultMap.getOrElse(plan.output.head.exprId, None) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala index 9b127f91648e..89347f4b1f7b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala @@ -16,6 +16,8 @@ */ package org.apache.spark.sql.catalyst.parser +import java.util + import scala.collection.mutable.StringBuilder import org.antlr.v4.runtime.{ParserRuleContext, Token} @@ -39,6 +41,13 @@ object ParserUtils { throw new ParseException(s"Operation not allowed: $message", ctx) } + def checkDuplicateClauses[T]( + nodes: util.List[T], clauseName: String, ctx: ParserRuleContext): Unit = { + if (nodes.size() > 1) { + throw new ParseException(s"Found duplicate clauses: $clauseName", ctx) + } + } + /** Check if duplicate keys exist in a set of key-value pairs. 
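This parser helper pairs with the SqlBase.g4 change earlier in the patch: the CREATE TABLE clauses are now matched by a repeated alternative so they can appear in any order, and duplicates must be rejected explicitly. A minimal standalone version of the check (ParseException is replaced by a generic exception here):

def checkDuplicateClauses[T](nodes: java.util.List[T], clauseName: String): Unit = {
  if (nodes.size() > 1) {
    throw new IllegalArgumentException(s"Found duplicate clauses: $clauseName")
  }
}

// e.g. CREATE TABLE t (i INT) USING parquet COMMENT 'a' COMMENT 'b'
// produces two COMMENT contexts, so the statement is rejected.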
*/ def checkDuplicateKeys[T](keyPairs: Seq[(String, T)], ctx: ParserRuleContext): Unit = { keyPairs.groupBy(_._1).filter(_._2.size > 1).foreach { case (key, _) => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index a38458add7b5..c8ccd9bd0399 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -247,12 +247,15 @@ abstract class UnaryNode extends LogicalPlan { protected def getAliasedConstraints(projectList: Seq[NamedExpression]): Set[Expression] = { var allConstraints = child.constraints.asInstanceOf[Set[Expression]] projectList.foreach { + case a @ Alias(l: Literal, _) => + allConstraints += EqualTo(a.toAttribute, l) case a @ Alias(e, _) => // For every alias in `projectList`, replace the reference in constraints by its attribute. allConstraints ++= allConstraints.map(_ transform { case expr: Expression if expr.semanticEquals(e) => a.toAttribute }) + allConstraints += EqualNullSafe(e, a.toAttribute) case _ => // Don't change. } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/QueryPlanConstraints.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/QueryPlanConstraints.scala index b0f611fd38de..5c7b8e5b9788 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/QueryPlanConstraints.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/QueryPlanConstraints.scala @@ -94,25 +94,16 @@ trait QueryPlanConstraints { self: LogicalPlan => case _ => Seq.empty[Attribute] } - // Collect aliases from expressions of the whole tree rooted by the current QueryPlan node, so - // we may avoid producing recursive constraints. - private lazy val aliasMap: AttributeMap[Expression] = AttributeMap( - expressions.collect { - case a: Alias => (a.toAttribute, a.child) - } ++ children.flatMap(_.asInstanceOf[QueryPlanConstraints].aliasMap)) - // Note: the explicit cast is necessary, since Scala compiler fails to infer the type. - /** * Infers an additional set of constraints from a given set of equality constraints. * For e.g., if an operator has constraints of the form (`a = 5`, `a = b`), this returns an * additional constraint of the form `b = 5`. */ private def inferAdditionalConstraints(constraints: Set[Expression]): Set[Expression] = { - val aliasedConstraints = eliminateAliasedExpressionInConstraints(constraints) var inferredConstraints = Set.empty[Expression] - aliasedConstraints.foreach { + constraints.foreach { case eq @ EqualTo(l: Attribute, r: Attribute) => - val candidateConstraints = aliasedConstraints - eq + val candidateConstraints = constraints - eq inferredConstraints ++= replaceConstraints(candidateConstraints, l, r) inferredConstraints ++= replaceConstraints(candidateConstraints, r, l) case _ => // No inference @@ -120,30 +111,6 @@ trait QueryPlanConstraints { self: LogicalPlan => inferredConstraints -- constraints } - /** - * Replace the aliased expression in [[Alias]] with the alias name if both exist in constraints. - * Thus non-converging inference can be prevented. - * E.g. `Alias(b, f(a)), a = b` infers `f(a) = f(f(a))` without eliminating aliased expressions. - * Also, the size of constraints is reduced without losing any information. 
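Two pieces above work together: getAliasedConstraints now records an equality constraint directly for literal aliases (and an EqualNullSafe for general aliases), and the simplified inference step propagates equalities across attributes. A small self-contained model of that inference, over toy Expr classes rather than Catalyst's:

    // Simplified model of equality-constraint inference; not Spark's Catalyst API.
    object ConstraintInferenceSketch {
      sealed trait Expr
      case class Attr(name: String) extends Expr
      case class Lit(value: Int) extends Expr
      case class EqualTo(left: Expr, right: Expr) extends Expr

      // For every a = b between attributes, rewrite the other constraints with a -> b and b -> a.
      def inferAdditional(constraints: Set[Expr]): Set[Expr] = {
        var inferred = Set.empty[Expr]
        constraints.foreach {
          case eq @ EqualTo(l: Attr, r: Attr) =>
            val candidates = constraints - eq
            inferred ++= candidates.map(replace(_, l, r))
            inferred ++= candidates.map(replace(_, r, l))
          case _ =>
        }
        inferred -- constraints
      }

      private def replace(e: Expr, source: Attr, target: Attr): Expr = e match {
        case `source` => target
        case EqualTo(l, r) => EqualTo(replace(l, source, target), replace(r, source, target))
        case other => other
      }

      def main(args: Array[String]): Unit = {
        // A projection of 2 AS two contributes two = 2, and joining on a = two then
        // lets the inference step derive a = 2 for the other side.
        val constraints: Set[Expr] =
          Set(EqualTo(Attr("two"), Lit(2)), EqualTo(Attr("a"), Attr("two")))
        println(inferAdditional(constraints))  // contains EqualTo(Attr(a), Lit(2))
      }
    }

This is the behavior exercised by the new "constraints should be inferred from aliased literals" test near the end of this patch, where the optimizer adds IsNotNull('a) && 'a === 2 on the left side of the join.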
- * When the inferred filters are pushed down the operators that generate the alias, - * the alias names used in filters are replaced by the aliased expressions. - */ - private def eliminateAliasedExpressionInConstraints(constraints: Set[Expression]) - : Set[Expression] = { - val attributesInEqualTo = constraints.flatMap { - case EqualTo(l: Attribute, r: Attribute) => l :: r :: Nil - case _ => Nil - } - var aliasedConstraints = constraints - attributesInEqualTo.foreach { a => - if (aliasMap.contains(a)) { - val child = aliasMap.get(a).get - aliasedConstraints = replaceConstraints(aliasedConstraints, child, a) - } - } - aliasedConstraints - } - private def replaceConstraints( constraints: Set[Expression], source: Expression, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 95e099c340af..a4fca790dd08 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -903,6 +903,7 @@ case class Deduplicate( * This analysis barrier will be removed at the end of analysis stage. */ case class AnalysisBarrier(child: LogicalPlan) extends LeafNode { + override protected def innerChildren: Seq[LogicalPlan] = Seq(child) override def output: Seq[Attribute] = child.output override def isStreaming: Boolean = child.isStreaming override def doCanonicalize(): LogicalPlan = child.canonicalized diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala index e57c842ce2a3..0189bd73c56b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala @@ -30,18 +30,43 @@ import org.apache.spark.sql.types.{DataType, IntegerType} * - Intra-partition ordering of data: In this case the distribution describes guarantees made * about how tuples are distributed within a single partition. */ -sealed trait Distribution +sealed trait Distribution { + /** + * The required number of partitions for this distribution. If it's None, then any number of + * partitions is allowed for this distribution. + */ + def requiredNumPartitions: Option[Int] + + /** + * Creates a default partitioning for this distribution, which can satisfy this distribution while + * matching the given number of partitions. + */ + def createPartitioning(numPartitions: Int): Partitioning +} /** * Represents a distribution where no promises are made about co-location of data. */ -case object UnspecifiedDistribution extends Distribution +case object UnspecifiedDistribution extends Distribution { + override def requiredNumPartitions: Option[Int] = None + + override def createPartitioning(numPartitions: Int): Partitioning = { + throw new IllegalStateException("UnspecifiedDistribution does not have default partitioning.") + } +} /** * Represents a distribution that only has a single partition and all tuples of the dataset * are co-located. 
*/ -case object AllTuples extends Distribution +case object AllTuples extends Distribution { + override def requiredNumPartitions: Option[Int] = Some(1) + + override def createPartitioning(numPartitions: Int): Partitioning = { + assert(numPartitions == 1, "The default partitioning of AllTuples can only have 1 partition.") + SinglePartition + } +} /** * Represents data where tuples that share the same values for the `clustering` @@ -51,12 +76,41 @@ case object AllTuples extends Distribution */ case class ClusteredDistribution( clustering: Seq[Expression], - numPartitions: Option[Int] = None) extends Distribution { + requiredNumPartitions: Option[Int] = None) extends Distribution { require( clustering != Nil, "The clustering expressions of a ClusteredDistribution should not be Nil. " + "An AllTuples should be used to represent a distribution that only has " + "a single partition.") + + override def createPartitioning(numPartitions: Int): Partitioning = { + assert(requiredNumPartitions.isEmpty || requiredNumPartitions.get == numPartitions, + s"This ClusteredDistribution requires ${requiredNumPartitions.get} partitions, but " + + s"the actual number of partitions is $numPartitions.") + HashPartitioning(clustering, numPartitions) + } +} + +/** + * Represents data where tuples have been clustered according to the hash of the given + * `expressions`. The hash function is defined as `HashPartitioning.partitionIdExpression`, so only + * [[HashPartitioning]] can satisfy this distribution. + * + * This is a strictly stronger guarantee than [[ClusteredDistribution]]. Given a tuple and the + * number of partitions, this distribution strictly requires which partition the tuple should be in. + */ +case class HashClusteredDistribution(expressions: Seq[Expression]) extends Distribution { + require( + expressions != Nil, + "The expressions for hash of a HashPartitionedDistribution should not be Nil. " + + "An AllTuples should be used to represent a distribution that only has " + + "a single partition.") + + override def requiredNumPartitions: Option[Int] = None + + override def createPartitioning(numPartitions: Int): Partitioning = { + HashPartitioning(expressions, numPartitions) + } } /** @@ -73,46 +127,31 @@ case class OrderedDistribution(ordering: Seq[SortOrder]) extends Distribution { "An AllTuples should be used to represent a distribution that only has " + "a single partition.") - // TODO: This is not really valid... - def clustering: Set[Expression] = ordering.map(_.child).toSet + override def requiredNumPartitions: Option[Int] = None + + override def createPartitioning(numPartitions: Int): Partitioning = { + RangePartitioning(ordering, numPartitions) + } } /** * Represents data where tuples are broadcasted to every node. It is quite common that the * entire set of tuples is transformed into different data structure. */ -case class BroadcastDistribution(mode: BroadcastMode) extends Distribution +case class BroadcastDistribution(mode: BroadcastMode) extends Distribution { + override def requiredNumPartitions: Option[Int] = Some(1) + + override def createPartitioning(numPartitions: Int): Partitioning = { + assert(numPartitions == 1, + "The default partitioning of BroadcastDistribution can only have 1 partition.") + BroadcastPartitioning(mode) + } +} /** - * Describes how an operator's output is split across partitions. The `compatibleWith`, - * `guarantees`, and `satisfies` methods describe relationships between child partitionings, - * target partitionings, and [[Distribution]]s. 
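Taken together, the new Distribution members give the planner a uniform way to pick a partitioning: honor requiredNumPartitions when a distribution insists on a specific count, otherwise fall back to a configured default, then ask the distribution to build the matching partitioning. A compact standalone model of that contract (class names mirror the patch, but this is a sketch, not Spark's planner):

    // Self-contained sketch of the new Distribution API and how a planner might use it.
    object DistributionSketch {
      sealed trait Partitioning { def numPartitions: Int }
      case object SinglePartition extends Partitioning { val numPartitions = 1 }
      case class HashPartitioning(keys: Seq[String], numPartitions: Int) extends Partitioning
      case class RangePartitioning(ordering: Seq[String], numPartitions: Int) extends Partitioning

      sealed trait Distribution {
        def requiredNumPartitions: Option[Int]            // None means any count is acceptable
        def createPartitioning(numPartitions: Int): Partitioning
      }
      case object AllTuples extends Distribution {
        val requiredNumPartitions = Some(1)
        def createPartitioning(n: Int) = { require(n == 1); SinglePartition }
      }
      case class ClusteredDistribution(keys: Seq[String],
          requiredNumPartitions: Option[Int] = None) extends Distribution {
        def createPartitioning(n: Int) = HashPartitioning(keys, n)
      }
      case class OrderedDistribution(ordering: Seq[String]) extends Distribution {
        val requiredNumPartitions = None
        def createPartitioning(n: Int) = RangePartitioning(ordering, n)
      }

      // Roughly what an exchange planner does: use the required count if present,
      // otherwise the configured default number of shuffle partitions.
      def plannedPartitioning(required: Distribution, defaultNumPartitions: Int): Partitioning =
        required.createPartitioning(required.requiredNumPartitions.getOrElse(defaultNumPartitions))

      def main(args: Array[String]): Unit = {
        println(plannedPartitioning(ClusteredDistribution(Seq("a")), 200))  // HashPartitioning(List(a),200)
        println(plannedPartitioning(AllTuples, 200))                        // SinglePartition
      }
    }

UnspecifiedDistribution is the odd one out: it never forces a shuffle, which is why the real implementation throws instead of building a partitioning.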
These relations are described more precisely in - * their individual method docs, but at a high level: - * - * - `satisfies` is a relationship between partitionings and distributions. - * - `compatibleWith` is relationships between an operator's child output partitionings. - * - `guarantees` is a relationship between a child's existing output partitioning and a target - * output partitioning. - * - * Diagrammatically: - * - * +--------------+ - * | Distribution | - * +--------------+ - * ^ - * | - * satisfies - * | - * +--------------+ +--------------+ - * | Child | | Target | - * +----| Partitioning |----guarantees--->| Partitioning | - * | +--------------+ +--------------+ - * | ^ - * | | - * | compatibleWith - * | | - * +------------+ - * + * Describes how an operator's output is split across partitions. It has 2 major properties: + * 1. number of partitions. + * 2. if it can satisfy a given distribution. */ sealed trait Partitioning { /** Returns the number of partitions that the data is split across */ @@ -123,113 +162,35 @@ sealed trait Partitioning { * to satisfy the partitioning scheme mandated by the `required` [[Distribution]], * i.e. the current dataset does not need to be re-partitioned for the `required` * Distribution (it is possible that tuples within a partition need to be reorganized). - */ - def satisfies(required: Distribution): Boolean - - /** - * Returns true iff we can say that the partitioning scheme of this [[Partitioning]] - * guarantees the same partitioning scheme described by `other`. - * - * Compatibility of partitionings is only checked for operators that have multiple children - * and that require a specific child output [[Distribution]], such as joins. - * - * Intuitively, partitionings are compatible if they route the same partitioning key to the same - * partition. For instance, two hash partitionings are only compatible if they produce the same - * number of output partitionings and hash records according to the same hash function and - * same partitioning key schema. - * - * Put another way, two partitionings are compatible with each other if they satisfy all of the - * same distribution guarantees. - */ - def compatibleWith(other: Partitioning): Boolean - - /** - * Returns true iff we can say that the partitioning scheme of this [[Partitioning]] guarantees - * the same partitioning scheme described by `other`. If a `A.guarantees(B)`, then repartitioning - * the child's output according to `B` will be unnecessary. `guarantees` is used as a performance - * optimization to allow the exchange planner to avoid redundant repartitionings. By default, - * a partitioning only guarantees partitionings that are equal to itself (i.e. the same number - * of partitions, same strategy (range or hash), etc). - * - * In order to enable more aggressive optimization, this strict equality check can be relaxed. - * For example, say that the planner needs to repartition all of an operator's children so that - * they satisfy the [[AllTuples]] distribution. One way to do this is to repartition all children - * to have the [[SinglePartition]] partitioning. If one of the operator's children already happens - * to be hash-partitioned with a single partition then we do not need to re-shuffle this child; - * this repartitioning can be avoided if a single-partition [[HashPartitioning]] `guarantees` - * [[SinglePartition]]. - * - * The SinglePartition example given above is not particularly interesting; guarantees' real - * value occurs for more advanced partitioning strategies. 
SPARK-7871 will introduce a notion - * of null-safe partitionings, under which partitionings can specify whether rows whose - * partitioning keys contain null values will be grouped into the same partition or whether they - * will have an unknown / random distribution. If a partitioning does not require nulls to be - * clustered then a partitioning which _does_ cluster nulls will guarantee the null clustered - * partitioning. The converse is not true, however: a partitioning which clusters nulls cannot - * be guaranteed by one which does not cluster them. Thus, in general `guarantees` is not a - * symmetric relation. * - * Another way to think about `guarantees`: if `A.guarantees(B)`, then any partitioning of rows - * produced by `A` could have also been produced by `B`. + * By default a [[Partitioning]] can satisfy [[UnspecifiedDistribution]], and [[AllTuples]] if + * the [[Partitioning]] only have one partition. Implementations can overwrite this method with + * special logic. */ - def guarantees(other: Partitioning): Boolean = this == other -} - -object Partitioning { - def allCompatible(partitionings: Seq[Partitioning]): Boolean = { - // Note: this assumes transitivity - partitionings.sliding(2).map { - case Seq(a) => true - case Seq(a, b) => - if (a.numPartitions != b.numPartitions) { - assert(!a.compatibleWith(b) && !b.compatibleWith(a)) - false - } else { - a.compatibleWith(b) && b.compatibleWith(a) - } - }.forall(_ == true) - } -} - -case class UnknownPartitioning(numPartitions: Int) extends Partitioning { - override def satisfies(required: Distribution): Boolean = required match { + def satisfies(required: Distribution): Boolean = required match { case UnspecifiedDistribution => true + case AllTuples => numPartitions == 1 case _ => false } - - override def compatibleWith(other: Partitioning): Boolean = false - - override def guarantees(other: Partitioning): Boolean = false } +case class UnknownPartitioning(numPartitions: Int) extends Partitioning + /** * Represents a partitioning where rows are distributed evenly across output partitions * by starting from a random target partition number and distributing rows in a round-robin * fashion. This partitioning is used when implementing the DataFrame.repartition() operator. 
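The HashClusteredDistribution introduced above is strictly stronger than ClusteredDistribution, and the difference reduces to two checks over the hash keys (plain string equality stands in for semanticEquals below; illustrative only, not Spark's code):

    // Sketch of the two satisfiability checks over plain key names.
    object SatisfiesSketch {
      // ClusteredDistribution: the hash keys only need to be a subset of the required clustering,
      // and the partition count only matters when the distribution pins it down.
      def satisfiesClustered(hashKeys: Seq[String], requiredClustering: Seq[String],
          numPartitions: Int, requiredNumPartitions: Option[Int]): Boolean =
        hashKeys.forall(requiredClustering.contains) &&
          requiredNumPartitions.forall(_ == numPartitions)

      // HashClusteredDistribution: the keys must match pairwise, so two sides that both satisfy it
      // are guaranteed to route equal join keys to the same partition id.
      def satisfiesHashClustered(hashKeys: Seq[String], requiredKeys: Seq[String]): Boolean =
        hashKeys.length == requiredKeys.length &&
          hashKeys.zip(requiredKeys).forall { case (l, r) => l == r }

      def main(args: Array[String]): Unit = {
        println(satisfiesClustered(Seq("a"), Seq("a", "b"), 10, None))  // true
        println(satisfiesHashClustered(Seq("a"), Seq("a", "b")))        // false: keys would be mis-routed
      }
    }

With only the subset check, two join children hashed on the same keys in different orders could each satisfy the requirement yet send equal join keys to different partition ids, which is the scenario the SPARK-9785 test removed later in this patch was written around.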
*/ -case class RoundRobinPartitioning(numPartitions: Int) extends Partitioning { - override def satisfies(required: Distribution): Boolean = required match { - case UnspecifiedDistribution => true - case _ => false - } - - override def compatibleWith(other: Partitioning): Boolean = false - - override def guarantees(other: Partitioning): Boolean = false -} +case class RoundRobinPartitioning(numPartitions: Int) extends Partitioning case object SinglePartition extends Partitioning { val numPartitions = 1 override def satisfies(required: Distribution): Boolean = required match { case _: BroadcastDistribution => false - case ClusteredDistribution(_, desiredPartitions) => desiredPartitions.forall(_ == 1) + case ClusteredDistribution(_, Some(requiredNumPartitions)) => requiredNumPartitions == 1 case _ => true } - - override def compatibleWith(other: Partitioning): Boolean = other.numPartitions == 1 - - override def guarantees(other: Partitioning): Boolean = other.numPartitions == 1 } /** @@ -244,22 +205,19 @@ case class HashPartitioning(expressions: Seq[Expression], numPartitions: Int) override def nullable: Boolean = false override def dataType: DataType = IntegerType - override def satisfies(required: Distribution): Boolean = required match { - case UnspecifiedDistribution => true - case ClusteredDistribution(requiredClustering, desiredPartitions) => - expressions.forall(x => requiredClustering.exists(_.semanticEquals(x))) && - desiredPartitions.forall(_ == numPartitions) // if desiredPartitions = None, returns true - case _ => false - } - - override def compatibleWith(other: Partitioning): Boolean = other match { - case o: HashPartitioning => this.semanticEquals(o) - case _ => false - } - - override def guarantees(other: Partitioning): Boolean = other match { - case o: HashPartitioning => this.semanticEquals(o) - case _ => false + override def satisfies(required: Distribution): Boolean = { + super.satisfies(required) || { + required match { + case h: HashClusteredDistribution => + expressions.length == h.expressions.length && expressions.zip(h.expressions).forall { + case (l, r) => l.semanticEquals(r) + } + case ClusteredDistribution(requiredClustering, requiredNumPartitions) => + expressions.forall(x => requiredClustering.exists(_.semanticEquals(x))) && + (requiredNumPartitions.isEmpty || requiredNumPartitions.get == numPartitions) + case _ => false + } + } } /** @@ -288,25 +246,18 @@ case class RangePartitioning(ordering: Seq[SortOrder], numPartitions: Int) override def nullable: Boolean = false override def dataType: DataType = IntegerType - override def satisfies(required: Distribution): Boolean = required match { - case UnspecifiedDistribution => true - case OrderedDistribution(requiredOrdering) => - val minSize = Seq(requiredOrdering.size, ordering.size).min - requiredOrdering.take(minSize) == ordering.take(minSize) - case ClusteredDistribution(requiredClustering, desiredPartitions) => - ordering.map(_.child).forall(x => requiredClustering.exists(_.semanticEquals(x))) && - desiredPartitions.forall(_ == numPartitions) // if desiredPartitions = None, returns true - case _ => false - } - - override def compatibleWith(other: Partitioning): Boolean = other match { - case o: RangePartitioning => this.semanticEquals(o) - case _ => false - } - - override def guarantees(other: Partitioning): Boolean = other match { - case o: RangePartitioning => this.semanticEquals(o) - case _ => false + override def satisfies(required: Distribution): Boolean = { + super.satisfies(required) || { + required match 
{ + case OrderedDistribution(requiredOrdering) => + val minSize = Seq(requiredOrdering.size, ordering.size).min + requiredOrdering.take(minSize) == ordering.take(minSize) + case ClusteredDistribution(requiredClustering, requiredNumPartitions) => + ordering.map(_.child).forall(x => requiredClustering.exists(_.semanticEquals(x))) && + (requiredNumPartitions.isEmpty || requiredNumPartitions.get == numPartitions) + case _ => false + } + } } } @@ -347,20 +298,6 @@ case class PartitioningCollection(partitionings: Seq[Partitioning]) override def satisfies(required: Distribution): Boolean = partitionings.exists(_.satisfies(required)) - /** - * Returns true if any `partitioning` of this collection is compatible with - * the given [[Partitioning]]. - */ - override def compatibleWith(other: Partitioning): Boolean = - partitionings.exists(_.compatibleWith(other)) - - /** - * Returns true if any `partitioning` of this collection guarantees - * the given [[Partitioning]]. - */ - override def guarantees(other: Partitioning): Boolean = - partitionings.exists(_.guarantees(other)) - override def toString: String = { partitionings.map(_.toString).mkString("(", " or ", ")") } @@ -377,9 +314,4 @@ case class BroadcastPartitioning(mode: BroadcastMode) extends Partitioning { case BroadcastDistribution(m) if m == mode => true case _ => false } - - override def compatibleWith(other: Partitioning): Boolean = other match { - case BroadcastPartitioning(m) if m == mode => true - case _ => false - } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala index eb7941cf9e6a..b013add9c977 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala @@ -105,7 +105,7 @@ class QuantileSummaries( if (newSamples.isEmpty || (sampleIdx == sampled.length && opsIdx == sorted.length - 1)) { 0 } else { - math.floor(2 * relativeError * currentCount).toInt + math.floor(2 * relativeError * currentCount).toLong } val tuple = Stats(currentSample, 1, delta) @@ -192,10 +192,10 @@ class QuantileSummaries( } // Target rank - val rank = math.ceil(quantile * count).toInt + val rank = math.ceil(quantile * count).toLong val targetError = relativeError * count // Minimum rank at current sample - var minRank = 0 + var minRank = 0L var i = 0 while (i < sampled.length - 1) { val curSample = sampled(i) @@ -235,7 +235,7 @@ object QuantileSummaries { * @param g the minimum rank jump from the previous value's minimum rank * @param delta the maximum span of the rank. 
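The Int-to-Long widening in the rank and minRank computations above (and in the Stats fields just below) matters once a summary covers more rows than Int.MaxValue, because converting the Double rank to Int saturates. A quick standalone check with an illustrative row count:

    // Why the target rank and the Stats g/delta fields moved from Int to Long:
    // Double#toInt clamps at Int.MaxValue, so ranks beyond ~2.1 billion rows were wrong.
    object QuantileRankOverflow {
      def main(args: Array[String]): Unit = {
        val count = 5000000000L   // 5 billion observed rows
        val quantile = 0.5
        val asInt = math.ceil(quantile * count).toInt
        val asLong = math.ceil(quantile * count).toLong
        println(asInt)   // 2147483647  (clamped, wrong target rank)
        println(asLong)  // 2500000000
      }
    }

The same reasoning applies to g and delta, which accumulate counts on the same scale as the total row count.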
*/ - case class Stats(value: Double, g: Int, delta: Int) + case class Stats(value: Double, g: Long, delta: Long) private def compressImmut( currentSamples: IndexedSeq[Stats], diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 4f77c54a7af5..cc4f4bf33245 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -27,11 +27,13 @@ import scala.util.matching.Regex import org.apache.hadoop.fs.Path +import org.apache.spark.{SparkContext, SparkEnv} import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.network.util.ByteUnit import org.apache.spark.sql.catalyst.analysis.Resolver import org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator +import org.apache.spark.util.Utils //////////////////////////////////////////////////////////////////////////////////////////////////// // This file defines the configuration options for Spark SQL. @@ -70,7 +72,7 @@ object SQLConf { * Default config. Only used when there is no active SparkSession for the thread. * See [[get]] for more information. */ - private val fallbackConf = new ThreadLocal[SQLConf] { + private lazy val fallbackConf = new ThreadLocal[SQLConf] { override def initialValue: SQLConf = new SQLConf } @@ -247,7 +249,7 @@ object SQLConf { val CONSTRAINT_PROPAGATION_ENABLED = buildConf("spark.sql.constraintPropagation.enabled") .internal() .doc("When true, the query optimizer will infer and propagate data constraints in the query " + - "plan to optimize them. Constraint propagation can sometimes be computationally expensive" + + "plan to optimize them. Constraint propagation can sometimes be computationally expensive " + "for certain kinds of query plans (such as those with a large number of predicates and " + "aliases) which might negatively impact overall runtime.") .booleanConf @@ -261,6 +263,15 @@ object SQLConf { .booleanConf .createWithDefault(false) + val FILE_COMRESSION_FACTOR = buildConf("spark.sql.sources.fileCompressionFactor") + .internal() + .doc("When estimating the output data size of a table scan, multiply the file size with this " + + "factor as the estimated data size, in case the data is compressed in the file and lead to" + + " a heavily underestimated result.") + .doubleConf + .checkValue(_ > 0, "the value of fileDataSizeFactor must be larger than 0") + .createWithDefault(1.0) + val PARQUET_SCHEMA_MERGING_ENABLED = buildConf("spark.sql.parquet.mergeSchema") .doc("When true, the Parquet data source merges schemas collected from all data files, " + "otherwise the schema is picked from the summary file or a random data file " + @@ -323,11 +334,14 @@ object SQLConf { .createWithDefault(false) val PARQUET_COMPRESSION = buildConf("spark.sql.parquet.compression.codec") - .doc("Sets the compression codec use when writing Parquet files. Acceptable values include: " + - "uncompressed, snappy, gzip, lzo.") + .doc("Sets the compression codec used when writing Parquet files. If either `compression` or " + + "`parquet.compression` is specified in the table-specific options/properties, the " + + "precedence would be `compression`, `parquet.compression`, " + + "`spark.sql.parquet.compression.codec`. 
Acceptable values include: none, uncompressed, " + + "snappy, gzip, lzo.") .stringConf .transform(_.toLowerCase(Locale.ROOT)) - .checkValues(Set("uncompressed", "snappy", "gzip", "lzo")) + .checkValues(Set("none", "uncompressed", "snappy", "gzip", "lzo")) .createWithDefault("snappy") val PARQUET_FILTER_PUSHDOWN_ENABLED = buildConf("spark.sql.parquet.filterPushdown") @@ -364,8 +378,10 @@ object SQLConf { .createWithDefault(true) val ORC_COMPRESSION = buildConf("spark.sql.orc.compression.codec") - .doc("Sets the compression codec use when writing ORC files. Acceptable values include: " + - "none, uncompressed, snappy, zlib, lzo.") + .doc("Sets the compression codec used when writing ORC files. If either `compression` or " + + "`orc.compress` is specified in the table-specific options/properties, the precedence " + + "would be `compression`, `orc.compress`, `spark.sql.orc.compression.codec`." + + "Acceptable values include: none, uncompressed, snappy, zlib, lzo.") .stringConf .transform(_.toLowerCase(Locale.ROOT)) .checkValues(Set("none", "uncompressed", "snappy", "zlib", "lzo")) @@ -379,10 +395,22 @@ object SQLConf { .checkValues(Set("hive", "native")) .createWithDefault("native") + val ORC_VECTORIZED_READER_ENABLED = buildConf("spark.sql.orc.enableVectorizedReader") + .doc("Enables vectorized orc decoding.") + .booleanConf + .createWithDefault(true) + + val ORC_COPY_BATCH_TO_SPARK = buildConf("spark.sql.orc.copyBatchToSpark") + .doc("Whether or not to copy the ORC columnar batch to Spark columnar batch in the " + + "vectorized ORC reader.") + .internal() + .booleanConf + .createWithDefault(false) + val ORC_FILTER_PUSHDOWN_ENABLED = buildConf("spark.sql.orc.filterPushdown") .doc("When true, enable filter pushdown for ORC files.") .booleanConf - .createWithDefault(false) + .createWithDefault(true) val HIVE_VERIFY_PARTITION_PATH = buildConf("spark.sql.hive.verifyPartitionPath") .doc("When true, check all the partition paths under the table\'s root directory " + @@ -1036,6 +1064,16 @@ object SQLConf { .booleanConf .createWithDefault(true) + val DECIMAL_OPERATIONS_ALLOW_PREC_LOSS = + buildConf("spark.sql.decimalOperations.allowPrecisionLoss") + .internal() + .doc("When true (default), establishing the result type of an arithmetic operation " + + "happens according to Hive behavior and SQL ANSI 2011 specification, ie. rounding the " + + "decimal part of the result if an exact representation is not possible. Otherwise, NULL " + + "is returned in those cases, as previously.") + .booleanConf + .createWithDefault(true) + val SQL_STRING_REDACTION_PATTERN = ConfigBuilder("spark.sql.redaction.string.regex") .doc("Regex to decide which parts of strings produced by Spark contain sensitive " + @@ -1050,6 +1088,12 @@ object SQLConf { .booleanConf .createWithDefault(false) + val ELT_OUTPUT_AS_STRING = buildConf("spark.sql.function.eltOutputAsString") + .doc("When this option is set to false and all inputs are binary, `elt` returns " + + "an output as binary. Otherwise, it returns as a string. 
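At the user level, the codec precedence documented above can be exercised through the public writer API; the path and session setup below are placeholders:

    // Illustrates the documented precedence: a per-write `compression` option wins over
    // `parquet.compression`, which wins over the session-level codec.
    import org.apache.spark.sql.SparkSession

    object CompressionPrecedenceExample {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .appName("codec-precedence").master("local[*]").getOrCreate()
        spark.conf.set("spark.sql.parquet.compression.codec", "snappy")  // session default

        val df = spark.range(10).toDF("id")
        // This write uses gzip: the write option overrides the session configuration.
        df.write.mode("overwrite").option("compression", "gzip")
          .parquet("/tmp/codec-precedence-example")

        spark.stop()
      }
    }

The same pattern applies to ORC, where the table-level `orc.compress` property sits between the `compression` option and spark.sql.orc.compression.codec.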
") + .booleanConf + .createWithDefault(false) + val CONTINUOUS_STREAMING_EXECUTOR_QUEUE_SIZE = buildConf("spark.sql.streaming.continuous.executorQueueSize") .internal() @@ -1066,6 +1110,24 @@ object SQLConf { .timeConf(TimeUnit.MILLISECONDS) .createWithDefault(100) + object PartitionOverwriteMode extends Enumeration { + val STATIC, DYNAMIC = Value + } + + val PARTITION_OVERWRITE_MODE = + buildConf("spark.sql.sources.partitionOverwriteMode") + .doc("When INSERT OVERWRITE a partitioned data source table, we currently support 2 modes: " + + "static and dynamic. In static mode, Spark deletes all the partitions that match the " + + "partition specification(e.g. PARTITION(a=1,b)) in the INSERT statement, before " + + "overwriting. In dynamic mode, Spark doesn't delete partitions ahead, and only overwrite " + + "those partitions that have data written into it at runtime. By default we use static " + + "mode to keep the same behavior of Spark prior to 2.3. Note that this config doesn't " + + "affect Hive serde tables, as they are always overwritten with dynamic mode.") + .stringConf + .transform(_.toUpperCase(Locale.ROOT)) + .checkValues(PartitionOverwriteMode.values.map(_.toString)) + .createWithDefault(PartitionOverwriteMode.STATIC.toString) + object Deprecated { val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks" } @@ -1087,6 +1149,12 @@ object SQLConf { class SQLConf extends Serializable with Logging { import SQLConf._ + if (Utils.isTesting && SparkEnv.get != null) { + // assert that we're only accessing it on the driver. + assert(SparkEnv.get.executorId == SparkContext.DRIVER_IDENTIFIER, + "SQLConf should only be created and accessed on the driver.") + } + /** Only low degree of contention is expected for conf, thus NOT using ConcurrentHashMap. */ @transient protected[spark] val settings = java.util.Collections.synchronizedMap( new java.util.HashMap[String, String]()) @@ -1146,6 +1214,8 @@ class SQLConf extends Serializable with Logging { def orcCompressionCodec: String = getConf(ORC_COMPRESSION) + def orcVectorizedReaderEnabled: Boolean = getConf(ORC_VECTORIZED_READER_ENABLED) + def parquetCompressionCodec: String = getConf(PARQUET_COMPRESSION) def parquetVectorizedReaderEnabled: Boolean = getConf(PARQUET_VECTORIZED_READER_ENABLED) @@ -1204,6 +1274,8 @@ class SQLConf extends Serializable with Logging { def escapedStringLiterals: Boolean = getConf(ESCAPED_STRING_LITERALS) + def fileCompressionFactor: Double = getConf(FILE_COMRESSION_FACTOR) + def stringRedationPattern: Option[Regex] = SQL_STRING_REDACTION_PATTERN.readFrom(reader) /** @@ -1379,6 +1451,8 @@ class SQLConf extends Serializable with Logging { def replaceExceptWithFilter: Boolean = getConf(REPLACE_EXCEPT_WITH_FILTER) + def decimalOperationsAllowPrecisionLoss: Boolean = getConf(DECIMAL_OPERATIONS_ALLOW_PREC_LOSS) + def continuousStreamingExecutorQueueSize: Int = getConf(CONTINUOUS_STREAMING_EXECUTOR_QUEUE_SIZE) def continuousStreamingExecutorPollIntervalMs: Long = @@ -1386,6 +1460,11 @@ class SQLConf extends Serializable with Logging { def concatBinaryAsString: Boolean = getConf(CONCAT_BINARY_AS_STRING) + def eltOutputAsString: Boolean = getConf(ELT_OUTPUT_AS_STRING) + + def partitionOverwriteMode: PartitionOverwriteMode.Value = + PartitionOverwriteMode.withName(getConf(PARTITION_OVERWRITE_MODE)) + /** ********************** SQLConf functionality methods ************ */ /** Set Spark SQL configuration properties. 
*/ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala index 6e050c18b8ac..ef3b67c0d48d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DecimalType.scala @@ -23,7 +23,7 @@ import scala.reflect.runtime.universe.typeTag import org.apache.spark.annotation.InterfaceStability import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.expressions.{Expression, Literal} /** @@ -117,6 +117,7 @@ object DecimalType extends AbstractDataType { val MAX_SCALE = 38 val SYSTEM_DEFAULT: DecimalType = DecimalType(MAX_PRECISION, 18) val USER_DEFAULT: DecimalType = DecimalType(10, 0) + val MINIMUM_ADJUSTED_SCALE = 6 // The decimal types compatible with other numeric types private[sql] val ByteDecimal = DecimalType(3, 0) @@ -136,10 +137,52 @@ object DecimalType extends AbstractDataType { case DoubleType => DoubleDecimal } + private[sql] def fromLiteral(literal: Literal): DecimalType = literal.value match { + case v: Short => fromBigDecimal(BigDecimal(v)) + case v: Int => fromBigDecimal(BigDecimal(v)) + case v: Long => fromBigDecimal(BigDecimal(v)) + case _ => forType(literal.dataType) + } + + private[sql] def fromBigDecimal(d: BigDecimal): DecimalType = { + DecimalType(Math.max(d.precision, d.scale), d.scale) + } + private[sql] def bounded(precision: Int, scale: Int): DecimalType = { DecimalType(min(precision, MAX_PRECISION), min(scale, MAX_SCALE)) } + /** + * Scale adjustment implementation is based on Hive's one, which is itself inspired to + * SQLServer's one. In particular, when a result precision is greater than + * {@link #MAX_PRECISION}, the corresponding scale is reduced to prevent the integral part of a + * result from being truncated. + * + * This method is used only when `spark.sql.decimalOperations.allowPrecisionLoss` is set to true. + */ + private[sql] def adjustPrecisionScale(precision: Int, scale: Int): DecimalType = { + // Assumptions: + assert(precision >= scale) + assert(scale >= 0) + + if (precision <= MAX_PRECISION) { + // Adjustment only needed when we exceed max precision + DecimalType(precision, scale) + } else { + // Precision/scale exceed maximum precision. Result must be adjusted to MAX_PRECISION. 
+ val intDigits = precision - scale + // If original scale is less than MINIMUM_ADJUSTED_SCALE, use original scale value; otherwise + // preserve at least MINIMUM_ADJUSTED_SCALE fractional digits + val minScaleValue = Math.min(scale, MINIMUM_ADJUSTED_SCALE) + // The resulting scale is the maximum between what is available without causing a loss of + // digits for the integer part of the decimal and the minimum guaranteed scale, which is + // computed above + val adjustedScale = Math.max(MAX_PRECISION - intDigits, minScaleValue) + + DecimalType(MAX_PRECISION, adjustedScale) + } + } + override private[sql] def defaultConcreteType: DataType = SYSTEM_DEFAULT override private[sql] def acceptsType(other: DataType): Boolean = { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/PartitioningSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/PartitioningSuite.scala deleted file mode 100644 index 5b802ccc637d..000000000000 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/PartitioningSuite.scala +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
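A standalone rendering of the adjustPrecisionScale arithmetic above (MAX_PRECISION = 38, MINIMUM_ADJUSTED_SCALE = 6), matching the new expectations further down in DecimalPrecisionSuite such as Multiply(u, u) becoming decimal(38, 6):

    // Worked example of the scale adjustment; standalone, not Spark's DecimalType object.
    object AdjustPrecisionScaleExample {
      val MaxPrecision = 38
      val MinimumAdjustedScale = 6

      def adjust(precision: Int, scale: Int): (Int, Int) =
        if (precision <= MaxPrecision) (precision, scale)
        else {
          val intDigits = precision - scale
          val minScale = math.min(scale, MinimumAdjustedScale)
          (MaxPrecision, math.max(MaxPrecision - intDigits, minScale))
        }

      def main(args: Array[String]): Unit = {
        // Multiplying two decimal(38, 18) values has unbounded type decimal(77, 36);
        // keeping the 41 integral digits leaves no headroom, so the scale is cut to the 6-digit floor.
        println(adjust(77, 36))  // (38,6)
      }
    }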
- */ - -package org.apache.spark.sql.catalyst - -import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.catalyst.expressions.{InterpretedMutableProjection, Literal} -import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, HashPartitioning} - -class PartitioningSuite extends SparkFunSuite { - test("HashPartitioning compatibility should be sensitive to expression ordering (SPARK-9785)") { - val expressions = Seq(Literal(2), Literal(3)) - // Consider two HashPartitionings that have the same _set_ of hash expressions but which are - // created with different orderings of those expressions: - val partitioningA = HashPartitioning(expressions, 100) - val partitioningB = HashPartitioning(expressions.reverse, 100) - // These partitionings are not considered equal: - assert(partitioningA != partitioningB) - // However, they both satisfy the same clustered distribution: - val distribution = ClusteredDistribution(expressions) - assert(partitioningA.satisfies(distribution)) - assert(partitioningB.satisfies(distribution)) - // These partitionings compute different hashcodes for the same input row: - def computeHashCode(partitioning: HashPartitioning): Int = { - val hashExprProj = new InterpretedMutableProjection(partitioning.expressions, Seq.empty) - hashExprProj.apply(InternalRow.empty).hashCode() - } - assert(computeHashCode(partitioningA) != computeHashCode(partitioningB)) - // Thus, these partitionings are incompatible: - assert(!partitioningA.compatibleWith(partitioningB)) - assert(!partitioningB.compatibleWith(partitioningA)) - assert(!partitioningA.guarantees(partitioningB)) - assert(!partitioningB.guarantees(partitioningA)) - - // Just to be sure that we haven't cheated by having these methods always return false, - // check that identical partitionings are still compatible with and guarantee each other: - assert(partitioningA === partitioningA) - assert(partitioningA.guarantees(partitioningA)) - assert(partitioningA.compatibleWith(partitioningA)) - } -} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala index 23e866cdf491..8c3db48a01f1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala @@ -356,4 +356,13 @@ class ScalaReflectionSuite extends SparkFunSuite { assert(deserializerFor[Int].isInstanceOf[AssertNotNull]) assert(!deserializerFor[String].isInstanceOf[AssertNotNull]) } + + test("SPARK-23025: schemaFor should support Null type") { + val schema = schemaFor[(Int, Null)] + assert(schema === Schema( + StructType(Seq( + StructField("_1", IntegerType, nullable = false), + StructField("_2", NullType, nullable = true))), + nullable = true)) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index f4514205d3ae..cd8579584ead 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -408,8 +408,8 @@ class AnalysisSuite extends AnalysisTest with Matchers { assertExpressionType(sum(Divide(1.0, 2.0)), DoubleType) assertExpressionType(sum(Divide(1, 2.0f)), DoubleType) assertExpressionType(sum(Divide(1.0f, 2)), DoubleType) - 
assertExpressionType(sum(Divide(1, Decimal(2))), DecimalType(31, 11)) - assertExpressionType(sum(Divide(Decimal(1), 2)), DecimalType(31, 11)) + assertExpressionType(sum(Divide(1, Decimal(2))), DecimalType(22, 11)) + assertExpressionType(sum(Divide(Decimal(1), 2)), DecimalType(26, 6)) assertExpressionType(sum(Divide(Decimal(1), 2.0)), DoubleType) assertExpressionType(sum(Divide(1.0, Decimal(2.0))), DoubleType) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala index 60e46a9910a8..c86dc18dfa68 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala @@ -136,19 +136,19 @@ class DecimalPrecisionSuite extends AnalysisTest with BeforeAndAfter { test("maximum decimals") { for (expr <- Seq(d1, d2, i, u)) { - checkType(Add(expr, u), DecimalType.SYSTEM_DEFAULT) - checkType(Subtract(expr, u), DecimalType.SYSTEM_DEFAULT) + checkType(Add(expr, u), DecimalType(38, 17)) + checkType(Subtract(expr, u), DecimalType(38, 17)) } - checkType(Multiply(d1, u), DecimalType(38, 19)) - checkType(Multiply(d2, u), DecimalType(38, 20)) - checkType(Multiply(i, u), DecimalType(38, 18)) - checkType(Multiply(u, u), DecimalType(38, 36)) + checkType(Multiply(d1, u), DecimalType(38, 16)) + checkType(Multiply(d2, u), DecimalType(38, 14)) + checkType(Multiply(i, u), DecimalType(38, 7)) + checkType(Multiply(u, u), DecimalType(38, 6)) - checkType(Divide(u, d1), DecimalType(38, 18)) - checkType(Divide(u, d2), DecimalType(38, 19)) - checkType(Divide(u, i), DecimalType(38, 23)) - checkType(Divide(u, u), DecimalType(38, 18)) + checkType(Divide(u, d1), DecimalType(38, 17)) + checkType(Divide(u, d2), DecimalType(38, 16)) + checkType(Divide(u, i), DecimalType(38, 18)) + checkType(Divide(u, u), DecimalType(38, 6)) checkType(Remainder(d1, u), DecimalType(19, 18)) checkType(Remainder(d2, u), DecimalType(21, 18)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala index 3661530cd622..52a7ebdafd7c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala @@ -923,6 +923,60 @@ class TypeCoercionSuite extends AnalysisTest { } } + test("type coercion for Elt") { + val rule = TypeCoercion.EltCoercion(conf) + + ruleTest(rule, + Elt(Seq(Literal(1), Literal("ab"), Literal("cde"))), + Elt(Seq(Literal(1), Literal("ab"), Literal("cde")))) + ruleTest(rule, + Elt(Seq(Literal(1.toShort), Literal("ab"), Literal("cde"))), + Elt(Seq(Cast(Literal(1.toShort), IntegerType), Literal("ab"), Literal("cde")))) + ruleTest(rule, + Elt(Seq(Literal(2), Literal(null), Literal("abc"))), + Elt(Seq(Literal(2), Cast(Literal(null), StringType), Literal("abc")))) + ruleTest(rule, + Elt(Seq(Literal(2), Literal(1), Literal("234"))), + Elt(Seq(Literal(2), Cast(Literal(1), StringType), Literal("234")))) + ruleTest(rule, + Elt(Seq(Literal(3), Literal(1L), Literal(2.toByte), Literal(0.1))), + Elt(Seq(Literal(3), Cast(Literal(1L), StringType), Cast(Literal(2.toByte), StringType), + Cast(Literal(0.1), StringType)))) + ruleTest(rule, + Elt(Seq(Literal(2), Literal(true), Literal(0.1f), 
Literal(3.toShort))), + Elt(Seq(Literal(2), Cast(Literal(true), StringType), Cast(Literal(0.1f), StringType), + Cast(Literal(3.toShort), StringType)))) + ruleTest(rule, + Elt(Seq(Literal(1), Literal(1L), Literal(0.1))), + Elt(Seq(Literal(1), Cast(Literal(1L), StringType), Cast(Literal(0.1), StringType)))) + ruleTest(rule, + Elt(Seq(Literal(1), Literal(Decimal(10)))), + Elt(Seq(Literal(1), Cast(Literal(Decimal(10)), StringType)))) + ruleTest(rule, + Elt(Seq(Literal(1), Literal(BigDecimal.valueOf(10)))), + Elt(Seq(Literal(1), Cast(Literal(BigDecimal.valueOf(10)), StringType)))) + ruleTest(rule, + Elt(Seq(Literal(1), Literal(java.math.BigDecimal.valueOf(10)))), + Elt(Seq(Literal(1), Cast(Literal(java.math.BigDecimal.valueOf(10)), StringType)))) + ruleTest(rule, + Elt(Seq(Literal(2), Literal(new java.sql.Date(0)), Literal(new Timestamp(0)))), + Elt(Seq(Literal(2), Cast(Literal(new java.sql.Date(0)), StringType), + Cast(Literal(new Timestamp(0)), StringType)))) + + withSQLConf("spark.sql.function.eltOutputAsString" -> "true") { + ruleTest(rule, + Elt(Seq(Literal(1), Literal("123".getBytes), Literal("456".getBytes))), + Elt(Seq(Literal(1), Cast(Literal("123".getBytes), StringType), + Cast(Literal("456".getBytes), StringType)))) + } + + withSQLConf("spark.sql.function.eltOutputAsString" -> "false") { + ruleTest(rule, + Elt(Seq(Literal(1), Literal("123".getBytes), Literal("456".getBytes))), + Elt(Seq(Literal(1), Literal("123".getBytes), Literal("456".getBytes)))) + } + } + test("BooleanEquality type cast") { val be = TypeCoercion.BooleanEquality // Use something more than a literal to avoid triggering the simplification rules. diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala index 95c87ffa20cb..6abab0073cca 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala @@ -279,7 +279,7 @@ abstract class SessionCatalogSuite extends AnalysisTest { } } - test("create temp table") { + test("create temp view") { withBasicCatalog { catalog => val tempTable1 = Range(1, 10, 1, 10) val tempTable2 = Range(1, 20, 2, 10) @@ -288,11 +288,11 @@ abstract class SessionCatalogSuite extends AnalysisTest { assert(catalog.getTempView("tbl1") == Option(tempTable1)) assert(catalog.getTempView("tbl2") == Option(tempTable2)) assert(catalog.getTempView("tbl3").isEmpty) - // Temporary table already exists + // Temporary view already exists intercept[TempTableAlreadyExistsException] { catalog.createTempView("tbl1", tempTable1, overrideIfExists = false) } - // Temporary table already exists but we override it + // Temporary view already exists but we override it catalog.createTempView("tbl1", tempTable2, overrideIfExists = true) assert(catalog.getTempView("tbl1") == Option(tempTable2)) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala index 1dd040e4696a..5b25bdf907c3 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala @@ -853,4 +853,73 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { cast("2", LongType).genCode(ctx) 
assert(ctx.inlinedMutableStates.length == 0) } + + test("SPARK-22825 Cast array to string") { + val ret1 = cast(Literal.create(Array(1, 2, 3, 4, 5)), StringType) + checkEvaluation(ret1, "[1, 2, 3, 4, 5]") + val ret2 = cast(Literal.create(Array("ab", "cde", "f")), StringType) + checkEvaluation(ret2, "[ab, cde, f]") + val ret3 = cast(Literal.create(Array("ab", null, "c")), StringType) + checkEvaluation(ret3, "[ab,, c]") + val ret4 = cast(Literal.create(Array("ab".getBytes, "cde".getBytes, "f".getBytes)), StringType) + checkEvaluation(ret4, "[ab, cde, f]") + val ret5 = cast( + Literal.create(Array("2014-12-03", "2014-12-04", "2014-12-06").map(Date.valueOf)), + StringType) + checkEvaluation(ret5, "[2014-12-03, 2014-12-04, 2014-12-06]") + val ret6 = cast( + Literal.create(Array("2014-12-03 13:01:00", "2014-12-04 15:05:00").map(Timestamp.valueOf)), + StringType) + checkEvaluation(ret6, "[2014-12-03 13:01:00, 2014-12-04 15:05:00]") + val ret7 = cast(Literal.create(Array(Array(1, 2, 3), Array(4, 5))), StringType) + checkEvaluation(ret7, "[[1, 2, 3], [4, 5]]") + val ret8 = cast( + Literal.create(Array(Array(Array("a"), Array("b", "c")), Array(Array("d")))), + StringType) + checkEvaluation(ret8, "[[[a], [b, c]], [[d]]]") + } + + test("SPARK-22973 Cast map to string") { + val ret1 = cast(Literal.create(Map(1 -> "a", 2 -> "b", 3 -> "c")), StringType) + checkEvaluation(ret1, "[1 -> a, 2 -> b, 3 -> c]") + val ret2 = cast( + Literal.create(Map("1" -> "a".getBytes, "2" -> null, "3" -> "c".getBytes)), + StringType) + checkEvaluation(ret2, "[1 -> a, 2 ->, 3 -> c]") + val ret3 = cast( + Literal.create(Map( + 1 -> Date.valueOf("2014-12-03"), + 2 -> Date.valueOf("2014-12-04"), + 3 -> Date.valueOf("2014-12-05"))), + StringType) + checkEvaluation(ret3, "[1 -> 2014-12-03, 2 -> 2014-12-04, 3 -> 2014-12-05]") + val ret4 = cast( + Literal.create(Map( + 1 -> Timestamp.valueOf("2014-12-03 13:01:00"), + 2 -> Timestamp.valueOf("2014-12-04 15:05:00"))), + StringType) + checkEvaluation(ret4, "[1 -> 2014-12-03 13:01:00, 2 -> 2014-12-04 15:05:00]") + val ret5 = cast( + Literal.create(Map( + 1 -> Array(1, 2, 3), + 2 -> Array(4, 5, 6))), + StringType) + checkEvaluation(ret5, "[1 -> [1, 2, 3], 2 -> [4, 5, 6]]") + } + + test("SPARK-22981 Cast struct to string") { + val ret1 = cast(Literal.create((1, "a", 0.1)), StringType) + checkEvaluation(ret1, "[1, a, 0.1]") + val ret2 = cast(Literal.create(Tuple3[Int, String, String](1, null, "a")), StringType) + checkEvaluation(ret2, "[1,, a]") + val ret3 = cast(Literal.create( + (Date.valueOf("2014-12-03"), Timestamp.valueOf("2014-12-03 15:05:00"))), StringType) + checkEvaluation(ret3, "[2014-12-03, 2014-12-03 15:05:00]") + val ret4 = cast(Literal.create(((1, "a"), 5, 0.1)), StringType) + checkEvaluation(ret4, "[[1, a], 5, 0.1]") + val ret5 = cast(Literal.create((Seq(1, 2, 3), "a", 0.1)), StringType) + checkEvaluation(ret5, "[[1, 2, 3], a, 0.1]") + val ret6 = cast(Literal.create((1, Map(1 -> "a", 2 -> "b", 3 -> "c"))), StringType) + checkEvaluation(ret6, "[1, [1 -> a, 2 -> b, 3 -> c]]") + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala index 63f6ceeb21b9..786266a2c13c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala @@ -22,6 +22,7 @@ import 
java.text.SimpleDateFormat import java.util.{Calendar, Locale, TimeZone} import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._ import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.catalyst.util.DateTimeUtils.TimeZoneGMT @@ -791,6 +792,9 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { test(null, "UTC", null) test("2015-07-24 00:00:00", null, null) test(null, null, null) + // Test escaping of timezone + GenerateUnsafeProjection.generate( + ToUTCTimestamp(Literal(Timestamp.valueOf("2015-07-24 00:00:00")), Literal("\"quote")) :: Nil) } test("from_utc_timestamp") { @@ -811,5 +815,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { test(null, "UTC", null) test("2015-07-24 00:00:00", null, null) test(null, null, null) + // Test escaping of timezone + GenerateUnsafeProjection.generate(FromUTCTimestamp(Literal(0), Literal("\"quote")) :: Nil) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 54cde77176e2..97ddbeba2c5c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -51,6 +51,18 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Concat(strs.map(Literal.create(_, StringType))), strs.mkString, EmptyRow) } + test("SPARK-22771 Check Concat.checkInputDataTypes results") { + assert(Concat(Seq.empty[Expression]).checkInputDataTypes().isSuccess) + assert(Concat(Literal.create("a") :: Literal.create("b") :: Nil) + .checkInputDataTypes().isSuccess) + assert(Concat(Literal.create("a".getBytes) :: Literal.create("b".getBytes) :: Nil) + .checkInputDataTypes().isSuccess) + assert(Concat(Literal.create(1) :: Literal.create(2) :: Nil) + .checkInputDataTypes().isFailure) + assert(Concat(Literal.create("a") :: Literal.create("b".getBytes) :: Nil) + .checkInputDataTypes().isFailure) + } + test("concat_ws") { def testConcatWs(expected: String, sep: String, inputs: Any*): Unit = { val inputExprs = inputs.map { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerSuite.scala index f203f25ad10d..75c6beeb3215 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerSuite.scala @@ -22,8 +22,10 @@ import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.sql.RandomDataGenerator import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} -import org.apache.spark.sql.catalyst.expressions.UnsafeProjection +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.catalyst.expressions.{JoinedRow, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String /** * Test suite for [[GenerateUnsafeRowJoiner]]. 
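The Literal("\"quote") cases added to DateExpressionsSuite above exercise escaping of the timezone id when it is spliced into generated code; a toy splice shows what goes wrong without it (the strings below are illustrative, not Spark's codegen utilities):

    // An unescaped value embedded in generated Java source breaks the generated string literal.
    object CodegenEscapingSketch {
      def naiveSplice(tz: String): String = s"""ctx.timeZone = "$tz";"""
      def escapedSplice(tz: String): String = {
        val escaped = tz.replace("\\", "\\\\").replace("\"", "\\\"")
        s"""ctx.timeZone = "$escaped";"""
      }

      def main(args: Array[String]): Unit = {
        val tz = "\"quote"
        println(naiveSplice(tz))    // ctx.timeZone = ""quote";   <- not valid Java
        println(escapedSplice(tz))  // ctx.timeZone = "\"quote";
      }
    }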
@@ -45,6 +47,32 @@ class GenerateUnsafeRowJoinerSuite extends SparkFunSuite { testConcat(64, 64, fixed) } + test("rows with all empty strings") { + val schema = StructType(Seq( + StructField("f1", StringType), StructField("f2", StringType))) + val row: UnsafeRow = UnsafeProjection.create(schema).apply( + InternalRow(UTF8String.EMPTY_UTF8, UTF8String.EMPTY_UTF8)) + testConcat(schema, row, schema, row) + } + + test("rows with all empty int arrays") { + val schema = StructType(Seq( + StructField("f1", ArrayType(IntegerType)), StructField("f2", ArrayType(IntegerType)))) + val emptyIntArray = + ExpressionEncoder[Array[Int]]().resolveAndBind().toRow(Array.emptyIntArray).getArray(0) + val row: UnsafeRow = UnsafeProjection.create(schema).apply( + InternalRow(emptyIntArray, emptyIntArray)) + testConcat(schema, row, schema, row) + } + + test("alternating empty and non-empty strings") { + val schema = StructType(Seq( + StructField("f1", StringType), StructField("f2", StringType))) + val row: UnsafeRow = UnsafeProjection.create(schema).apply( + InternalRow(UTF8String.EMPTY_UTF8, UTF8String.fromString("foo"))) + testConcat(schema, row, schema, row) + } + test("randomized fix width types") { for (i <- 0 until 20) { testConcatOnce(Random.nextInt(100), Random.nextInt(100), fixed) @@ -94,27 +122,84 @@ class GenerateUnsafeRowJoinerSuite extends SparkFunSuite { val extRow2 = RandomDataGenerator.forType(schema2, nullable = false).get.apply() val row1 = converter1.apply(internalConverter1.apply(extRow1).asInstanceOf[InternalRow]) val row2 = converter2.apply(internalConverter2.apply(extRow2).asInstanceOf[InternalRow]) + testConcat(schema1, row1, schema2, row2) + } + + private def testConcat( + schema1: StructType, + row1: UnsafeRow, + schema2: StructType, + row2: UnsafeRow) { // Run the joiner. val mergedSchema = StructType(schema1 ++ schema2) val concater = GenerateUnsafeRowJoiner.create(schema1, schema2) - val output = concater.join(row1, row2) + val output: UnsafeRow = concater.join(row1, row2) + + // We'll also compare to an UnsafeRow produced with JoinedRow + UnsafeProjection. This ensures + // that unused space in the row (e.g. leftover bits in the null-tracking bitmap) is written + // correctly. + val expectedOutput: UnsafeRow = { + val joinedRowProjection = UnsafeProjection.create(mergedSchema) + val joined = new JoinedRow() + joinedRowProjection.apply(joined.apply(row1, row2)) + } // Test everything equals ... 
for (i <- mergedSchema.indices) { + val dataType = mergedSchema(i).dataType if (i < schema1.size) { assert(output.isNullAt(i) === row1.isNullAt(i)) if (!output.isNullAt(i)) { - assert(output.get(i, mergedSchema(i).dataType) === row1.get(i, mergedSchema(i).dataType)) + assert(output.get(i, dataType) === row1.get(i, dataType)) + assert(output.get(i, dataType) === expectedOutput.get(i, dataType)) } } else { assert(output.isNullAt(i) === row2.isNullAt(i - schema1.size)) if (!output.isNullAt(i)) { - assert(output.get(i, mergedSchema(i).dataType) === - row2.get(i - schema1.size, mergedSchema(i).dataType)) + assert(output.get(i, dataType) === row2.get(i - schema1.size, dataType)) + assert(output.get(i, dataType) === expectedOutput.get(i, dataType)) } } } + + + assert( + expectedOutput.getSizeInBytes == output.getSizeInBytes, + "output isn't same size in bytes as slow path") + + // Compare the UnsafeRows byte-by-byte so that we can print more useful debug information in + // case this assertion fails: + val actualBytes = output.getBaseObject.asInstanceOf[Array[Byte]] + .take(output.getSizeInBytes) + val expectedBytes = expectedOutput.getBaseObject.asInstanceOf[Array[Byte]] + .take(expectedOutput.getSizeInBytes) + + val bitsetWidth = UnsafeRow.calculateBitSetWidthInBytes(expectedOutput.numFields()) + val actualBitset = actualBytes.take(bitsetWidth) + val expectedBitset = expectedBytes.take(bitsetWidth) + assert(actualBitset === expectedBitset, "bitsets were not equal") + + val fixedLengthSize = expectedOutput.numFields() * 8 + val actualFixedLength = actualBytes.slice(bitsetWidth, bitsetWidth + fixedLengthSize) + val expectedFixedLength = expectedBytes.slice(bitsetWidth, bitsetWidth + fixedLengthSize) + if (actualFixedLength !== expectedFixedLength) { + actualFixedLength.grouped(8) + .zip(expectedFixedLength.grouped(8)) + .zip(mergedSchema.fields.toIterator) + .foreach { + case ((actual, expected), field) => + assert(actual === expected, s"Fixed length sections are not equal for field $field") + } + fail("Fixed length sections were not equal") + } + + val variableLengthStart = bitsetWidth + fixedLengthSize + val actualVariableLength = actualBytes.drop(variableLengthStart) + val expectedVariableLength = expectedBytes.drop(variableLengthStart) + assert(actualVariableLength === expectedVariableLength, "fixed length sections were not equal") + + assert(output.hashCode() == expectedOutput.hashCode(), "hash codes were not equal") } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala index 85a5e979f602..82a10254d846 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala @@ -809,6 +809,19 @@ class FilterPushdownSuite extends PlanTest { comparePlans(optimized, correctAnswer) } + test("aggregate: don't push filters if the aggregate has no grouping expressions") { + val originalQuery = LocalRelation.apply(testRelation.output, Seq.empty) + .select('a, 'b) + .groupBy()(count(1)) + .where(false) + + val optimized = Optimize.execute(originalQuery.analyze) + + val correctAnswer = originalQuery.analyze + + comparePlans(optimized, correctAnswer) + } + test("broadcast hint") { val originalQuery = ResolvedHint(testRelation) .where('a === 2L && 'b + Rand(10).as("rnd") === 3) diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FoldablePropagationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FoldablePropagationSuite.scala index dccb32f0379a..c28844642aed 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FoldablePropagationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FoldablePropagationSuite.scala @@ -147,8 +147,8 @@ class FoldablePropagationSuite extends PlanTest { test("Propagate in expand") { val c1 = Literal(1).as('a) val c2 = Literal(2).as('b) - val a1 = c1.toAttribute.withNullability(true) - val a2 = c2.toAttribute.withNullability(true) + val a1 = c1.toAttribute.newInstance().withNullability(true) + val a2 = c2.toAttribute.newInstance().withNullability(true) val expand = Expand( Seq(Seq(Literal(null), 'b), Seq('a, Literal(null))), Seq(a1, a2), @@ -161,4 +161,23 @@ class FoldablePropagationSuite extends PlanTest { val correctAnswer = correctExpand.where(a1.isNotNull).select(a1, a2).analyze comparePlans(optimized, correctAnswer) } + + test("Propagate above outer join") { + val left = LocalRelation('a.int).select('a, Literal(1).as('b)) + val right = LocalRelation('c.int).select('c, Literal(1).as('d)) + + val join = left.join( + right, + joinType = LeftOuter, + condition = Some('a === 'c && 'b === 'd)) + val query = join.select(('b + 3).as('res)).analyze + val optimized = Optimize.execute(query) + + val correctAnswer = left.join( + right, + joinType = LeftOuter, + condition = Some('a === 'c && Literal(1) === Literal(1))) + .select((Literal(1) + 3).as('res)).analyze + comparePlans(optimized, correctAnswer) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala index 5580f8604ec7..178c4b8c270a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala @@ -34,6 +34,7 @@ class InferFiltersFromConstraintsSuite extends PlanTest { PushDownPredicate, InferFiltersFromConstraints, CombineFilters, + SimplifyBinaryComparison, BooleanSimplification) :: Nil } @@ -160,64 +161,6 @@ class InferFiltersFromConstraintsSuite extends PlanTest { comparePlans(optimized, correctAnswer) } - test("inner join with alias: don't generate constraints for recursive functions") { - val t1 = testRelation.subquery('t1) - val t2 = testRelation.subquery('t2) - - // We should prevent `Coalese(a, b)` from recursively creating complicated constraints through - // the constraint inference procedure. - val originalQuery = t1.select('a, 'b.as('d), Coalesce(Seq('a, 'b)).as('int_col)) - // We hide an `Alias` inside the child's child's expressions, to cover the situation reported - // in [SPARK-20700]. 
- .select('int_col, 'd, 'a).as("t") - .join(t2, Inner, - Some("t.a".attr === "t2.a".attr - && "t.d".attr === "t2.a".attr - && "t.int_col".attr === "t2.a".attr)) - .analyze - val correctAnswer = t1 - .where(IsNotNull('a) && IsNotNull(Coalesce(Seq('a, 'a))) && IsNotNull(Coalesce(Seq('b, 'a))) - && IsNotNull('b) && IsNotNull(Coalesce(Seq('b, 'b))) && IsNotNull(Coalesce(Seq('a, 'b))) - && 'a === 'b && 'a === Coalesce(Seq('a, 'a)) && 'a === Coalesce(Seq('a, 'b)) - && 'a === Coalesce(Seq('b, 'a)) && 'b === Coalesce(Seq('a, 'b)) - && 'b === Coalesce(Seq('b, 'a)) && 'b === Coalesce(Seq('b, 'b))) - .select('a, 'b.as('d), Coalesce(Seq('a, 'b)).as('int_col)) - .select('int_col, 'd, 'a).as("t") - .join( - t2.where(IsNotNull('a) && IsNotNull(Coalesce(Seq('a, 'a))) && - 'a === Coalesce(Seq('a, 'a))), - Inner, - Some("t.a".attr === "t2.a".attr && "t.d".attr === "t2.a".attr - && "t.int_col".attr === "t2.a".attr)) - .analyze - val optimized = Optimize.execute(originalQuery) - comparePlans(optimized, correctAnswer) - } - - test("inner join with EqualTo expressions containing part of each other: don't generate " + - "constraints for recursive functions") { - val t1 = testRelation.subquery('t1) - val t2 = testRelation.subquery('t2) - - // We should prevent `c = Coalese(a, b)` and `a = Coalese(b, c)` from recursively creating - // complicated constraints through the constraint inference procedure. - val originalQuery = t1 - .select('a, 'b, 'c, Coalesce(Seq('b, 'c)).as('d), Coalesce(Seq('a, 'b)).as('e)) - .where('a === 'd && 'c === 'e) - .join(t2, Inner, Some("t1.a".attr === "t2.a".attr && "t1.c".attr === "t2.c".attr)) - .analyze - val correctAnswer = t1 - .where(IsNotNull('a) && IsNotNull('c) && 'a === Coalesce(Seq('b, 'c)) && - 'c === Coalesce(Seq('a, 'b))) - .select('a, 'b, 'c, Coalesce(Seq('b, 'c)).as('d), Coalesce(Seq('a, 'b)).as('e)) - .join(t2.where(IsNotNull('a) && IsNotNull('c)), - Inner, - Some("t1.a".attr === "t2.a".attr && "t1.c".attr === "t2.c".attr)) - .analyze - val optimized = Optimize.execute(originalQuery) - comparePlans(optimized, correctAnswer) - } - test("generate correct filters for alias that don't produce recursive constraints") { val t1 = testRelation.subquery('t1) @@ -236,4 +179,17 @@ class InferFiltersFromConstraintsSuite extends PlanTest { comparePlans(optimized, originalQuery) } } + + test("constraints should be inferred from aliased literals") { + val originalLeft = testRelation.subquery('left).as("left") + val optimizedLeft = testRelation.subquery('left).where(IsNotNull('a) && 'a === 2).as("left") + + val right = Project(Seq(Literal(2).as("two")), testRelation.subquery('right)).as("right") + val condition = Some("left.a".attr === "right.two".attr) + + val original = originalLeft.join(right, Inner, condition) + val correct = optimizedLeft.join(right, Inner, condition) + + comparePlans(Optimize.execute(original.analyze), correct.analyze) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceOperatorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceOperatorSuite.scala index 0fa1aaeb9e16..e9701ffd2c54 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceOperatorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceOperatorSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ 
-import org.apache.spark.sql.catalyst.expressions.{Alias, Not} +import org.apache.spark.sql.catalyst.expressions.{Alias, Literal, Not} import org.apache.spark.sql.catalyst.expressions.aggregate.First import org.apache.spark.sql.catalyst.plans.{LeftAnti, LeftSemi, PlanTest} import org.apache.spark.sql.catalyst.plans.logical._ @@ -198,6 +198,14 @@ class ReplaceOperatorSuite extends PlanTest { comparePlans(optimized, correctAnswer) } + test("add one grouping key if necessary when replace Deduplicate with Aggregate") { + val input = LocalRelation() + val query = Deduplicate(Seq.empty, input) // dropDuplicates() + val optimized = Optimize.execute(query.analyze) + val correctAnswer = Aggregate(Seq(Literal(1)), input.output, input) + comparePlans(optimized, correctAnswer) + } + test("don't replace streaming Deduplicate") { val input = LocalRelation(Seq('a.int, 'b.int), isStreaming = true) val attrA = input.output(0) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala index 2b9783a3295c..cb8a1fecb80a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala @@ -249,8 +249,8 @@ class ExpressionParserSuite extends PlanTest { assertEqual("foo(*) over (partition by a, b)", windowed(Seq('a, 'b))) assertEqual("foo(*) over (distribute by a, b)", windowed(Seq('a, 'b))) assertEqual("foo(*) over (cluster by a, b)", windowed(Seq('a, 'b))) - assertEqual("foo(*) over (order by a desc, b asc)", windowed(Seq.empty, Seq('a.desc, 'b.asc ))) - assertEqual("foo(*) over (sort by a desc, b asc)", windowed(Seq.empty, Seq('a.desc, 'b.asc ))) + assertEqual("foo(*) over (order by a desc, b asc)", windowed(Seq.empty, Seq('a.desc, 'b.asc))) + assertEqual("foo(*) over (sort by a desc, b asc)", windowed(Seq.empty, Seq('a.desc, 'b.asc))) assertEqual("foo(*) over (partition by a, b order by c)", windowed(Seq('a, 'b), Seq('c.asc))) assertEqual("foo(*) over (distribute by a, b sort by c)", windowed(Seq('a, 'b), Seq('c.asc))) @@ -263,21 +263,62 @@ class ExpressionParserSuite extends PlanTest { "sum(product + 1) over (partition by ((product / 2) + 1) order by 2)", WindowExpression('sum.function('product + 1), WindowSpecDefinition(Seq('product / 2 + 1), Seq(Literal(2).asc), UnspecifiedFrame))) + } + + test("range/rows window function expressions") { + val func = 'foo.function(star()) + def windowed( + partitioning: Seq[Expression] = Seq.empty, + ordering: Seq[SortOrder] = Seq.empty, + frame: WindowFrame = UnspecifiedFrame): Expression = { + WindowExpression(func, WindowSpecDefinition(partitioning, ordering, frame)) + } - // Range/Row val frameTypes = Seq(("rows", RowFrame), ("range", RangeFrame)) val boundaries = Seq( - ("10 preceding", -Literal(10), CurrentRow), + // No between combinations + ("unbounded preceding", UnboundedPreceding, CurrentRow), ("2147483648 preceding", -Literal(2147483648L), CurrentRow), + ("10 preceding", -Literal(10), CurrentRow), + ("3 + 1 preceding", -Add(Literal(3), Literal(1)), CurrentRow), + ("0 preceding", -Literal(0), CurrentRow), + ("current row", CurrentRow, CurrentRow), + ("0 following", Literal(0), CurrentRow), ("3 + 1 following", Add(Literal(3), Literal(1)), CurrentRow), - ("unbounded preceding", UnboundedPreceding, CurrentRow), + ("10 following", Literal(10), CurrentRow), + ("2147483649 following", 
Literal(2147483649L), CurrentRow), ("unbounded following", UnboundedFollowing, CurrentRow), // Will fail during analysis + + // Between combinations + ("between unbounded preceding and 5 following", + UnboundedPreceding, Literal(5)), + ("between unbounded preceding and 3 + 1 following", + UnboundedPreceding, Add(Literal(3), Literal(1))), + ("between unbounded preceding and 2147483649 following", + UnboundedPreceding, Literal(2147483649L)), ("between unbounded preceding and current row", UnboundedPreceding, CurrentRow), - ("between unbounded preceding and unbounded following", - UnboundedPreceding, UnboundedFollowing), + ("between 2147483648 preceding and current row", -Literal(2147483648L), CurrentRow), ("between 10 preceding and current row", -Literal(10), CurrentRow), + ("between 3 + 1 preceding and current row", -Add(Literal(3), Literal(1)), CurrentRow), + ("between 0 preceding and current row", -Literal(0), CurrentRow), + ("between current row and current row", CurrentRow, CurrentRow), + ("between current row and 0 following", CurrentRow, Literal(0)), ("between current row and 5 following", CurrentRow, Literal(5)), - ("between 10 preceding and 5 following", -Literal(10), Literal(5)) + ("between current row and 3 + 1 following", CurrentRow, Add(Literal(3), Literal(1))), + ("between current row and 2147483649 following", CurrentRow, Literal(2147483649L)), + ("between current row and unbounded following", CurrentRow, UnboundedFollowing), + ("between 2147483648 preceding and unbounded following", + -Literal(2147483648L), UnboundedFollowing), + ("between 10 preceding and unbounded following", + -Literal(10), UnboundedFollowing), + ("between 3 + 1 preceding and unbounded following", + -Add(Literal(3), Literal(1)), UnboundedFollowing), + ("between 0 preceding and unbounded following", -Literal(0), UnboundedFollowing), + + // Between partial and full range + ("between 10 preceding and 5 following", -Literal(10), Literal(5)), + ("between unbounded preceding and unbounded following", + UnboundedPreceding, UnboundedFollowing) ) frameTypes.foreach { case (frameTypeSql, frameType) => diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala index 866ff0d33cbb..a37e06d92264 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/ConstraintPropagationSuite.scala @@ -134,6 +134,8 @@ class ConstraintPropagationSuite extends SparkFunSuite with PlanTest { verifyConstraints(aliasedRelation.analyze.constraints, ExpressionSet(Seq(resolveColumn(aliasedRelation.analyze, "x") > 10, IsNotNull(resolveColumn(aliasedRelation.analyze, "x")), + resolveColumn(aliasedRelation.analyze, "b") <=> resolveColumn(aliasedRelation.analyze, "y"), + resolveColumn(aliasedRelation.analyze, "z") <=> resolveColumn(aliasedRelation.analyze, "x"), resolveColumn(aliasedRelation.analyze, "z") > 10, IsNotNull(resolveColumn(aliasedRelation.analyze, "z"))))) diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 93010c606cf4..f39bffe41766 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml @@ -195,7 +195,7 @@ org.scalatest scalatest-maven-plugin - -ea -Xmx4g -Xss4m -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=512m + -ea -Xmx4g -Xss4m 
-XX:ReservedCodeCacheSize=${CodeCacheSize} diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java new file mode 100644 index 000000000000..b6e792274da1 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java @@ -0,0 +1,251 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.orc; + +import java.math.BigDecimal; + +import org.apache.orc.storage.ql.exec.vector.*; + +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.Decimal; +import org.apache.spark.sql.types.TimestampType; +import org.apache.spark.unsafe.types.UTF8String; + +/** + * A column vector class wrapping Hive's ColumnVector. Because Spark ColumnarBatch only accepts + * Spark's vectorized.ColumnVector, this column vector is used to adapt Hive ColumnVector with + * Spark ColumnarVector. + */ +public class OrcColumnVector extends org.apache.spark.sql.vectorized.ColumnVector { + private ColumnVector baseData; + private LongColumnVector longData; + private DoubleColumnVector doubleData; + private BytesColumnVector bytesData; + private DecimalColumnVector decimalData; + private TimestampColumnVector timestampData; + private final boolean isTimestamp; + + private int batchSize; + + OrcColumnVector(DataType type, ColumnVector vector) { + super(type); + + if (type instanceof TimestampType) { + isTimestamp = true; + } else { + isTimestamp = false; + } + + baseData = vector; + if (vector instanceof LongColumnVector) { + longData = (LongColumnVector) vector; + } else if (vector instanceof DoubleColumnVector) { + doubleData = (DoubleColumnVector) vector; + } else if (vector instanceof BytesColumnVector) { + bytesData = (BytesColumnVector) vector; + } else if (vector instanceof DecimalColumnVector) { + decimalData = (DecimalColumnVector) vector; + } else if (vector instanceof TimestampColumnVector) { + timestampData = (TimestampColumnVector) vector; + } else { + throw new UnsupportedOperationException(); + } + } + + public void setBatchSize(int batchSize) { + this.batchSize = batchSize; + } + + @Override + public void close() { + + } + + @Override + public int numNulls() { + if (baseData.isRepeating) { + if (baseData.isNull[0]) { + return batchSize; + } else { + return 0; + } + } else if (baseData.noNulls) { + return 0; + } else { + int count = 0; + for (int i = 0; i < batchSize; i++) { + if (baseData.isNull[i]) count++; + } + return count; + } + } + + /* A helper method to get the row index in a column. */ + private int getRowIndex(int rowId) { + return baseData.isRepeating ? 
0 : rowId; + } + + @Override + public boolean isNullAt(int rowId) { + return baseData.isNull[getRowIndex(rowId)]; + } + + @Override + public boolean getBoolean(int rowId) { + return longData.vector[getRowIndex(rowId)] == 1; + } + + @Override + public boolean[] getBooleans(int rowId, int count) { + boolean[] res = new boolean[count]; + for (int i = 0; i < count; i++) { + res[i] = getBoolean(rowId + i); + } + return res; + } + + @Override + public byte getByte(int rowId) { + return (byte) longData.vector[getRowIndex(rowId)]; + } + + @Override + public byte[] getBytes(int rowId, int count) { + byte[] res = new byte[count]; + for (int i = 0; i < count; i++) { + res[i] = getByte(rowId + i); + } + return res; + } + + @Override + public short getShort(int rowId) { + return (short) longData.vector[getRowIndex(rowId)]; + } + + @Override + public short[] getShorts(int rowId, int count) { + short[] res = new short[count]; + for (int i = 0; i < count; i++) { + res[i] = getShort(rowId + i); + } + return res; + } + + @Override + public int getInt(int rowId) { + return (int) longData.vector[getRowIndex(rowId)]; + } + + @Override + public int[] getInts(int rowId, int count) { + int[] res = new int[count]; + for (int i = 0; i < count; i++) { + res[i] = getInt(rowId + i); + } + return res; + } + + @Override + public long getLong(int rowId) { + int index = getRowIndex(rowId); + if (isTimestamp) { + return timestampData.time[index] * 1000 + timestampData.nanos[index] / 1000; + } else { + return longData.vector[index]; + } + } + + @Override + public long[] getLongs(int rowId, int count) { + long[] res = new long[count]; + for (int i = 0; i < count; i++) { + res[i] = getLong(rowId + i); + } + return res; + } + + @Override + public float getFloat(int rowId) { + return (float) doubleData.vector[getRowIndex(rowId)]; + } + + @Override + public float[] getFloats(int rowId, int count) { + float[] res = new float[count]; + for (int i = 0; i < count; i++) { + res[i] = getFloat(rowId + i); + } + return res; + } + + @Override + public double getDouble(int rowId) { + return doubleData.vector[getRowIndex(rowId)]; + } + + @Override + public double[] getDoubles(int rowId, int count) { + double[] res = new double[count]; + for (int i = 0; i < count; i++) { + res[i] = getDouble(rowId + i); + } + return res; + } + + @Override + public int getArrayLength(int rowId) { + throw new UnsupportedOperationException(); + } + + @Override + public int getArrayOffset(int rowId) { + throw new UnsupportedOperationException(); + } + + @Override + public Decimal getDecimal(int rowId, int precision, int scale) { + BigDecimal data = decimalData.vector[getRowIndex(rowId)].getHiveDecimal().bigDecimalValue(); + return Decimal.apply(data, precision, scale); + } + + @Override + public UTF8String getUTF8String(int rowId) { + int index = getRowIndex(rowId); + BytesColumnVector col = bytesData; + return UTF8String.fromBytes(col.vector[index], col.start[index], col.length[index]); + } + + @Override + public byte[] getBinary(int rowId) { + int index = getRowIndex(rowId); + byte[] binary = new byte[bytesData.length[index]]; + System.arraycopy(bytesData.vector[index], bytesData.start[index], binary, 0, binary.length); + return binary; + } + + @Override + public org.apache.spark.sql.vectorized.ColumnVector arrayData() { + throw new UnsupportedOperationException(); + } + + @Override + public org.apache.spark.sql.vectorized.ColumnVector getChildColumn(int ordinal) { + throw new UnsupportedOperationException(); + } +} diff --git 
a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java new file mode 100644 index 000000000000..89bae4326e93 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java @@ -0,0 +1,558 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.orc; + +import java.io.IOException; +import java.util.stream.IntStream; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.RecordReader; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; +import org.apache.orc.OrcConf; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.apache.orc.TypeDescription; +import org.apache.orc.mapred.OrcInputFormat; +import org.apache.orc.storage.common.type.HiveDecimal; +import org.apache.orc.storage.ql.exec.vector.*; +import org.apache.orc.storage.serde2.io.HiveDecimalWritable; + +import org.apache.spark.memory.MemoryMode; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.execution.vectorized.ColumnVectorUtils; +import org.apache.spark.sql.execution.vectorized.OffHeapColumnVector; +import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector; +import org.apache.spark.sql.execution.vectorized.WritableColumnVector; +import org.apache.spark.sql.types.*; +import org.apache.spark.sql.vectorized.ColumnarBatch; + + +/** + * To support vectorization in WholeStageCodeGen, this reader returns ColumnarBatch. + * After creating, `initialize` and `initBatch` should be called sequentially. + */ +public class OrcColumnarBatchReader extends RecordReader { + // TODO: make this configurable. + private static final int CAPACITY = 4 * 1024; + + // Vectorized ORC Row Batch + private VectorizedRowBatch batch; + + /** + * The column IDs of the physical ORC file schema which are required by this reader. + * -1 means this required column doesn't exist in the ORC file. + */ + private int[] requestedColIds; + + // Record reader from ORC row batch. + private org.apache.orc.RecordReader recordReader; + + private StructField[] requiredFields; + + // The result columnar batch for vectorized execution by whole-stage codegen. + private ColumnarBatch columnarBatch; + + // Writable column vectors of the result columnar batch. + private WritableColumnVector[] columnVectors; + + // The wrapped ORC column vectors. It should be null if `copyToSpark` is true. 
+ private org.apache.spark.sql.vectorized.ColumnVector[] orcVectorWrappers; + + // The memory mode of the columnarBatch + private final MemoryMode MEMORY_MODE; + + // Whether or not to copy the ORC columnar batch to Spark columnar batch. + private final boolean copyToSpark; + + public OrcColumnarBatchReader(boolean useOffHeap, boolean copyToSpark) { + MEMORY_MODE = useOffHeap ? MemoryMode.OFF_HEAP : MemoryMode.ON_HEAP; + this.copyToSpark = copyToSpark; + } + + + @Override + public Void getCurrentKey() { + return null; + } + + @Override + public ColumnarBatch getCurrentValue() { + return columnarBatch; + } + + @Override + public float getProgress() throws IOException { + return recordReader.getProgress(); + } + + @Override + public boolean nextKeyValue() throws IOException { + return nextBatch(); + } + + @Override + public void close() throws IOException { + if (columnarBatch != null) { + columnarBatch.close(); + columnarBatch = null; + } + if (recordReader != null) { + recordReader.close(); + recordReader = null; + } + } + + /** + * Initialize ORC file reader and batch record reader. + * Please note that `initBatch` is needed to be called after this. + */ + @Override + public void initialize( + InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException { + FileSplit fileSplit = (FileSplit)inputSplit; + Configuration conf = taskAttemptContext.getConfiguration(); + Reader reader = OrcFile.createReader( + fileSplit.getPath(), + OrcFile.readerOptions(conf) + .maxLength(OrcConf.MAX_FILE_LENGTH.getLong(conf)) + .filesystem(fileSplit.getPath().getFileSystem(conf))); + Reader.Options options = + OrcInputFormat.buildOptions(conf, reader, fileSplit.getStart(), fileSplit.getLength()); + recordReader = reader.rows(options); + } + + /** + * Initialize columnar batch by setting required schema and partition information. + * With this information, this creates ColumnarBatch with the full schema. + */ + public void initBatch( + TypeDescription orcSchema, + int[] requestedColIds, + StructField[] requiredFields, + StructType partitionSchema, + InternalRow partitionValues) { + batch = orcSchema.createRowBatch(CAPACITY); + assert(!batch.selectedInUse); // `selectedInUse` should be initialized with `false`. + + this.requiredFields = requiredFields; + this.requestedColIds = requestedColIds; + assert(requiredFields.length == requestedColIds.length); + + StructType resultSchema = new StructType(requiredFields); + for (StructField f : partitionSchema.fields()) { + resultSchema = resultSchema.add(f); + } + + if (copyToSpark) { + if (MEMORY_MODE == MemoryMode.OFF_HEAP) { + columnVectors = OffHeapColumnVector.allocateColumns(CAPACITY, resultSchema); + } else { + columnVectors = OnHeapColumnVector.allocateColumns(CAPACITY, resultSchema); + } + + // Initialize the missing columns once. + for (int i = 0; i < requiredFields.length; i++) { + if (requestedColIds[i] == -1) { + columnVectors[i].putNulls(0, CAPACITY); + columnVectors[i].setIsConstant(); + } + } + + if (partitionValues.numFields() > 0) { + int partitionIdx = requiredFields.length; + for (int i = 0; i < partitionValues.numFields(); i++) { + ColumnVectorUtils.populate(columnVectors[i + partitionIdx], partitionValues, i); + columnVectors[i + partitionIdx].setIsConstant(); + } + } + + columnarBatch = new ColumnarBatch(columnVectors); + } else { + // Just wrap the ORC column vector instead of copying it to Spark column vector. 
+ orcVectorWrappers = new org.apache.spark.sql.vectorized.ColumnVector[resultSchema.length()]; + + for (int i = 0; i < requiredFields.length; i++) { + DataType dt = requiredFields[i].dataType(); + int colId = requestedColIds[i]; + // Initialize the missing columns once. + if (colId == -1) { + OnHeapColumnVector missingCol = new OnHeapColumnVector(CAPACITY, dt); + missingCol.putNulls(0, CAPACITY); + missingCol.setIsConstant(); + orcVectorWrappers[i] = missingCol; + } else { + orcVectorWrappers[i] = new OrcColumnVector(dt, batch.cols[colId]); + } + } + + if (partitionValues.numFields() > 0) { + int partitionIdx = requiredFields.length; + for (int i = 0; i < partitionValues.numFields(); i++) { + DataType dt = partitionSchema.fields()[i].dataType(); + OnHeapColumnVector partitionCol = new OnHeapColumnVector(CAPACITY, dt); + ColumnVectorUtils.populate(partitionCol, partitionValues, i); + partitionCol.setIsConstant(); + orcVectorWrappers[partitionIdx + i] = partitionCol; + } + } + + columnarBatch = new ColumnarBatch(orcVectorWrappers); + } + } + + /** + * Return true if there exists more data in the next batch. If exists, prepare the next batch + * by copying from ORC VectorizedRowBatch columns to Spark ColumnarBatch columns. + */ + private boolean nextBatch() throws IOException { + recordReader.nextBatch(batch); + int batchSize = batch.size; + if (batchSize == 0) { + return false; + } + columnarBatch.setNumRows(batchSize); + + if (!copyToSpark) { + for (int i = 0; i < requiredFields.length; i++) { + if (requestedColIds[i] != -1) { + ((OrcColumnVector) orcVectorWrappers[i]).setBatchSize(batchSize); + } + } + return true; + } + + for (WritableColumnVector vector : columnVectors) { + vector.reset(); + } + + for (int i = 0; i < requiredFields.length; i++) { + StructField field = requiredFields[i]; + WritableColumnVector toColumn = columnVectors[i]; + + if (requestedColIds[i] >= 0) { + ColumnVector fromColumn = batch.cols[requestedColIds[i]]; + + if (fromColumn.isRepeating) { + putRepeatingValues(batchSize, field, fromColumn, toColumn); + } else if (fromColumn.noNulls) { + putNonNullValues(batchSize, field, fromColumn, toColumn); + } else { + putValues(batchSize, field, fromColumn, toColumn); + } + } + } + return true; + } + + private void putRepeatingValues( + int batchSize, + StructField field, + ColumnVector fromColumn, + WritableColumnVector toColumn) { + if (fromColumn.isNull[0]) { + toColumn.putNulls(0, batchSize); + } else { + DataType type = field.dataType(); + if (type instanceof BooleanType) { + toColumn.putBooleans(0, batchSize, ((LongColumnVector)fromColumn).vector[0] == 1); + } else if (type instanceof ByteType) { + toColumn.putBytes(0, batchSize, (byte)((LongColumnVector)fromColumn).vector[0]); + } else if (type instanceof ShortType) { + toColumn.putShorts(0, batchSize, (short)((LongColumnVector)fromColumn).vector[0]); + } else if (type instanceof IntegerType || type instanceof DateType) { + toColumn.putInts(0, batchSize, (int)((LongColumnVector)fromColumn).vector[0]); + } else if (type instanceof LongType) { + toColumn.putLongs(0, batchSize, ((LongColumnVector)fromColumn).vector[0]); + } else if (type instanceof TimestampType) { + toColumn.putLongs(0, batchSize, + fromTimestampColumnVector((TimestampColumnVector)fromColumn, 0)); + } else if (type instanceof FloatType) { + toColumn.putFloats(0, batchSize, (float)((DoubleColumnVector)fromColumn).vector[0]); + } else if (type instanceof DoubleType) { + toColumn.putDoubles(0, batchSize, ((DoubleColumnVector)fromColumn).vector[0]); + } 
else if (type instanceof StringType || type instanceof BinaryType) { + BytesColumnVector data = (BytesColumnVector)fromColumn; + WritableColumnVector arrayData = toColumn.getChildColumn(0); + int size = data.vector[0].length; + arrayData.reserve(size); + arrayData.putBytes(0, size, data.vector[0], 0); + for (int index = 0; index < batchSize; index++) { + toColumn.putArray(index, 0, size); + } + } else if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType)type; + putDecimalWritables( + toColumn, + batchSize, + decimalType.precision(), + decimalType.scale(), + ((DecimalColumnVector)fromColumn).vector[0]); + } else { + throw new UnsupportedOperationException("Unsupported Data Type: " + type); + } + } + } + + private void putNonNullValues( + int batchSize, + StructField field, + ColumnVector fromColumn, + WritableColumnVector toColumn) { + DataType type = field.dataType(); + if (type instanceof BooleanType) { + long[] data = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + toColumn.putBoolean(index, data[index] == 1); + } + } else if (type instanceof ByteType) { + long[] data = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + toColumn.putByte(index, (byte)data[index]); + } + } else if (type instanceof ShortType) { + long[] data = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + toColumn.putShort(index, (short)data[index]); + } + } else if (type instanceof IntegerType || type instanceof DateType) { + long[] data = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + toColumn.putInt(index, (int)data[index]); + } + } else if (type instanceof LongType) { + toColumn.putLongs(0, batchSize, ((LongColumnVector)fromColumn).vector, 0); + } else if (type instanceof TimestampType) { + TimestampColumnVector data = ((TimestampColumnVector)fromColumn); + for (int index = 0; index < batchSize; index++) { + toColumn.putLong(index, fromTimestampColumnVector(data, index)); + } + } else if (type instanceof FloatType) { + double[] data = ((DoubleColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + toColumn.putFloat(index, (float)data[index]); + } + } else if (type instanceof DoubleType) { + toColumn.putDoubles(0, batchSize, ((DoubleColumnVector)fromColumn).vector, 0); + } else if (type instanceof StringType || type instanceof BinaryType) { + BytesColumnVector data = ((BytesColumnVector)fromColumn); + WritableColumnVector arrayData = toColumn.getChildColumn(0); + int totalNumBytes = IntStream.of(data.length).sum(); + arrayData.reserve(totalNumBytes); + for (int index = 0, pos = 0; index < batchSize; pos += data.length[index], index++) { + arrayData.putBytes(pos, data.length[index], data.vector[index], data.start[index]); + toColumn.putArray(index, pos, data.length[index]); + } + } else if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType)type; + DecimalColumnVector data = ((DecimalColumnVector)fromColumn); + if (decimalType.precision() > Decimal.MAX_LONG_DIGITS()) { + WritableColumnVector arrayData = toColumn.getChildColumn(0); + arrayData.reserve(batchSize * 16); + } + for (int index = 0; index < batchSize; index++) { + putDecimalWritable( + toColumn, + index, + decimalType.precision(), + decimalType.scale(), + data.vector[index]); + } + } else { + throw new UnsupportedOperationException("Unsupported Data Type: " + type); + } + } + + private void putValues( + int 
batchSize, + StructField field, + ColumnVector fromColumn, + WritableColumnVector toColumn) { + DataType type = field.dataType(); + if (type instanceof BooleanType) { + long[] vector = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putBoolean(index, vector[index] == 1); + } + } + } else if (type instanceof ByteType) { + long[] vector = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putByte(index, (byte)vector[index]); + } + } + } else if (type instanceof ShortType) { + long[] vector = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putShort(index, (short)vector[index]); + } + } + } else if (type instanceof IntegerType || type instanceof DateType) { + long[] vector = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putInt(index, (int)vector[index]); + } + } + } else if (type instanceof LongType) { + long[] vector = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putLong(index, vector[index]); + } + } + } else if (type instanceof TimestampType) { + TimestampColumnVector vector = ((TimestampColumnVector)fromColumn); + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putLong(index, fromTimestampColumnVector(vector, index)); + } + } + } else if (type instanceof FloatType) { + double[] vector = ((DoubleColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putFloat(index, (float)vector[index]); + } + } + } else if (type instanceof DoubleType) { + double[] vector = ((DoubleColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putDouble(index, vector[index]); + } + } + } else if (type instanceof StringType || type instanceof BinaryType) { + BytesColumnVector vector = (BytesColumnVector)fromColumn; + WritableColumnVector arrayData = toColumn.getChildColumn(0); + int totalNumBytes = IntStream.of(vector.length).sum(); + arrayData.reserve(totalNumBytes); + for (int index = 0, pos = 0; index < batchSize; pos += vector.length[index], index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + arrayData.putBytes(pos, vector.length[index], vector.vector[index], vector.start[index]); + toColumn.putArray(index, pos, vector.length[index]); + } + } + } else if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType)type; + HiveDecimalWritable[] vector = ((DecimalColumnVector)fromColumn).vector; + if (decimalType.precision() > Decimal.MAX_LONG_DIGITS()) { + WritableColumnVector arrayData = toColumn.getChildColumn(0); + arrayData.reserve(batchSize * 16); + } + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + putDecimalWritable( + toColumn, + index, + 
decimalType.precision(), + decimalType.scale(), + vector[index]); + } + } + } else { + throw new UnsupportedOperationException("Unsupported Data Type: " + type); + } + } + + /** + * Returns the number of micros since epoch from an element of TimestampColumnVector. + */ + private static long fromTimestampColumnVector(TimestampColumnVector vector, int index) { + return vector.time[index] * 1000L + vector.nanos[index] / 1000L; + } + + /** + * Put a `HiveDecimalWritable` to a `WritableColumnVector`. + */ + private static void putDecimalWritable( + WritableColumnVector toColumn, + int index, + int precision, + int scale, + HiveDecimalWritable decimalWritable) { + HiveDecimal decimal = decimalWritable.getHiveDecimal(); + Decimal value = + Decimal.apply(decimal.bigDecimalValue(), decimal.precision(), decimal.scale()); + value.changePrecision(precision, scale); + + if (precision <= Decimal.MAX_INT_DIGITS()) { + toColumn.putInt(index, (int) value.toUnscaledLong()); + } else if (precision <= Decimal.MAX_LONG_DIGITS()) { + toColumn.putLong(index, value.toUnscaledLong()); + } else { + byte[] bytes = value.toJavaBigDecimal().unscaledValue().toByteArray(); + WritableColumnVector arrayData = toColumn.getChildColumn(0); + arrayData.putBytes(index * 16, bytes.length, bytes, 0); + toColumn.putArray(index, index * 16, bytes.length); + } + } + + /** + * Put `HiveDecimalWritable`s to a `WritableColumnVector`. + */ + private static void putDecimalWritables( + WritableColumnVector toColumn, + int size, + int precision, + int scale, + HiveDecimalWritable decimalWritable) { + HiveDecimal decimal = decimalWritable.getHiveDecimal(); + Decimal value = + Decimal.apply(decimal.bigDecimalValue(), decimal.precision(), decimal.scale()); + value.changePrecision(precision, scale); + + if (precision <= Decimal.MAX_INT_DIGITS()) { + toColumn.putInts(0, size, (int) value.toUnscaledLong()); + } else if (precision <= Decimal.MAX_LONG_DIGITS()) { + toColumn.putLongs(0, size, value.toUnscaledLong()); + } else { + byte[] bytes = value.toJavaBigDecimal().unscaledValue().toByteArray(); + WritableColumnVector arrayData = toColumn.getChildColumn(0); + arrayData.reserve(bytes.length); + arrayData.putBytes(0, bytes.length, bytes, 0); + for (int index = 0; index < size; index++) { + toColumn.putArray(index, 0, bytes.length); + } + } + } +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java index 80c2f491b48c..e65cd252c3dd 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java @@ -170,7 +170,7 @@ public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptCont * Returns the list of files at 'path' recursively. This skips files that are ignored normally * by MapReduce. 
*/ - public static List listDirectory(File path) throws IOException { + public static List listDirectory(File path) { List result = new ArrayList<>(); if (path.isDirectory()) { for (File f: path.listFiles()) { @@ -231,7 +231,7 @@ protected void initialize(String path, List columns) throws IOException } @Override - public Void getCurrentKey() throws IOException, InterruptedException { + public Void getCurrentKey() { return null; } @@ -259,7 +259,7 @@ public ValuesReaderIntIterator(ValuesReader delegate) { } @Override - int nextInt() throws IOException { + int nextInt() { return delegate.readInteger(); } } @@ -279,15 +279,15 @@ int nextInt() throws IOException { protected static final class NullIntIterator extends IntIterator { @Override - int nextInt() throws IOException { return 0; } + int nextInt() { return 0; } } /** * Creates a reader for definition and repetition levels, returning an optimized one if * the levels are not needed. */ - protected static IntIterator createRLEIterator(int maxLevel, BytesInput bytes, - ColumnDescriptor descriptor) throws IOException { + protected static IntIterator createRLEIterator( + int maxLevel, BytesInput bytes, ColumnDescriptor descriptor) throws IOException { try { if (maxLevel == 0) return new NullIntIterator(); return new RLEIntIterator( diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java index 6c157e85d411..bb1b23611a7d 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java @@ -31,10 +31,10 @@ import org.apache.spark.memory.MemoryMode; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.execution.vectorized.ColumnVectorUtils; -import org.apache.spark.sql.execution.vectorized.ColumnarBatch; import org.apache.spark.sql.execution.vectorized.WritableColumnVector; import org.apache.spark.sql.execution.vectorized.OffHeapColumnVector; import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector; +import org.apache.spark.sql.vectorized.ColumnarBatch; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; @@ -50,6 +50,9 @@ * TODO: make this always return ColumnarBatches. */ public class VectorizedParquetRecordReader extends SpecificParquetRecordReaderBase { + // TODO: make this configurable. + private static final int CAPACITY = 4 * 1024; + /** * Batch of rows that we assemble and the current index we've returned. Every time this * batch is used up (batchIdx == numBatched), we populated the batch. 
@@ -152,7 +155,7 @@ public void close() throws IOException { } @Override - public boolean nextKeyValue() throws IOException, InterruptedException { + public boolean nextKeyValue() throws IOException { resultBatch(); if (returnColumnarBatch) return nextBatch(); @@ -165,13 +168,13 @@ public boolean nextKeyValue() throws IOException, InterruptedException { } @Override - public Object getCurrentValue() throws IOException, InterruptedException { + public Object getCurrentValue() { if (returnColumnarBatch) return columnarBatch; return columnarBatch.getRow(batchIdx - 1); } @Override - public float getProgress() throws IOException, InterruptedException { + public float getProgress() { return (float) rowsReturned / totalRowCount; } @@ -181,7 +184,7 @@ public float getProgress() throws IOException, InterruptedException { // Columns 0,1: data columns // Column 2: partitionValues[0] // Column 3: partitionValues[1] - public void initBatch( + private void initBatch( MemoryMode memMode, StructType partitionColumns, InternalRow partitionValues) { @@ -195,13 +198,12 @@ public void initBatch( } } - int capacity = ColumnarBatch.DEFAULT_BATCH_SIZE; if (memMode == MemoryMode.OFF_HEAP) { - columnVectors = OffHeapColumnVector.allocateColumns(capacity, batchSchema); + columnVectors = OffHeapColumnVector.allocateColumns(CAPACITY, batchSchema); } else { - columnVectors = OnHeapColumnVector.allocateColumns(capacity, batchSchema); + columnVectors = OnHeapColumnVector.allocateColumns(CAPACITY, batchSchema); } - columnarBatch = new ColumnarBatch(batchSchema, columnVectors, capacity); + columnarBatch = new ColumnarBatch(columnVectors); if (partitionColumns != null) { int partitionIdx = sparkSchema.fields().length; for (int i = 0; i < partitionColumns.fields().length; i++) { @@ -213,13 +215,13 @@ public void initBatch( // Initialize missing columns with nulls. for (int i = 0; i < missingColumns.length; i++) { if (missingColumns[i]) { - columnVectors[i].putNulls(0, columnarBatch.capacity()); + columnVectors[i].putNulls(0, CAPACITY); columnVectors[i].setIsConstant(); } } } - public void initBatch() { + private void initBatch() { initBatch(MEMORY_MODE, null, null); } @@ -248,11 +250,14 @@ public void enableReturningBatches() { * Advances to the next batch of rows. Returns false if there are no more. 
*/ public boolean nextBatch() throws IOException { - columnarBatch.reset(); + for (WritableColumnVector vector : columnVectors) { + vector.reset(); + } + columnarBatch.setNumRows(0); if (rowsReturned >= totalRowCount) return false; checkEndOfRowGroup(); - int num = (int) Math.min((long) columnarBatch.capacity(), totalCountLoadedSoFar - rowsReturned); + int num = (int) Math.min((long) CAPACITY, totalCountLoadedSoFar - rowsReturned); for (int i = 0; i < columnReaders.length; ++i) { if (columnReaders[i] == null) continue; columnReaders[i].readBatch(num, columnVectors[i]); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java index bc62bc43484e..5ee8cc8da230 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java @@ -28,6 +28,8 @@ import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.util.DateTimeUtils; import org.apache.spark.sql.types.*; +import org.apache.spark.sql.vectorized.ColumnarArray; +import org.apache.spark.sql.vectorized.ColumnarBatch; import org.apache.spark.unsafe.types.CalendarInterval; import org.apache.spark.unsafe.types.UTF8String; @@ -116,19 +118,19 @@ private static void appendValue(WritableColumnVector dst, DataType t, Object o) } } else { if (t == DataTypes.BooleanType) { - dst.appendBoolean(((Boolean)o).booleanValue()); + dst.appendBoolean((Boolean) o); } else if (t == DataTypes.ByteType) { - dst.appendByte(((Byte) o).byteValue()); + dst.appendByte((Byte) o); } else if (t == DataTypes.ShortType) { - dst.appendShort(((Short)o).shortValue()); + dst.appendShort((Short) o); } else if (t == DataTypes.IntegerType) { - dst.appendInt(((Integer)o).intValue()); + dst.appendInt((Integer) o); } else if (t == DataTypes.LongType) { - dst.appendLong(((Long)o).longValue()); + dst.appendLong((Long) o); } else if (t == DataTypes.FloatType) { - dst.appendFloat(((Float)o).floatValue()); + dst.appendFloat((Float) o); } else if (t == DataTypes.DoubleType) { - dst.appendDouble(((Double)o).doubleValue()); + dst.appendDouble((Double) o); } else if (t == DataTypes.StringType) { byte[] b =((String)o).getBytes(StandardCharsets.UTF_8); dst.appendByteArray(b, 0, b.length); @@ -190,7 +192,7 @@ private static void appendValue(WritableColumnVector dst, DataType t, Row src, i */ public static ColumnarBatch toBatch( StructType schema, MemoryMode memMode, Iterator row) { - int capacity = ColumnarBatch.DEFAULT_BATCH_SIZE; + int capacity = 4 * 1024; WritableColumnVector[] columnVectors; if (memMode == MemoryMode.OFF_HEAP) { columnVectors = OffHeapColumnVector.allocateColumns(capacity, schema); @@ -206,7 +208,7 @@ public static ColumnarBatch toBatch( } n++; } - ColumnarBatch batch = new ColumnarBatch(schema, columnVectors, capacity); + ColumnarBatch batch = new ColumnarBatch(columnVectors); batch.setNumRows(n); return batch; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java index 06602c147dfe..70057a9def6c 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java @@ -23,6 +23,10 @@ import 
org.apache.spark.sql.catalyst.expressions.GenericInternalRow; import org.apache.spark.sql.catalyst.util.MapData; import org.apache.spark.sql.types.*; +import org.apache.spark.sql.vectorized.ColumnarArray; +import org.apache.spark.sql.vectorized.ColumnarBatch; +import org.apache.spark.sql.vectorized.ColumnarRow; +import org.apache.spark.sql.vectorized.ColumnVector; import org.apache.spark.unsafe.types.CalendarInterval; import org.apache.spark.unsafe.types.UTF8String; diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index 5f6f125976e1..d2ae32b06f83 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -23,6 +23,7 @@ import org.apache.spark.sql.internal.SQLConf; import org.apache.spark.sql.types.*; +import org.apache.spark.sql.vectorized.ColumnVector; import org.apache.spark.unsafe.array.ByteArrayMethods; import org.apache.spark.unsafe.types.UTF8String; @@ -585,11 +586,11 @@ public final int appendArray(int length) { public final int appendStruct(boolean isNull) { if (isNull) { appendNull(); - for (ColumnVector c: childColumns) { + for (WritableColumnVector c: childColumns) { if (c.type instanceof StructType) { - ((WritableColumnVector) c).appendStruct(true); + c.appendStruct(true); } else { - ((WritableColumnVector) c).appendNull(); + c.appendNull(); } } } else { diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceV2Reader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceV2Reader.java index 95ee4a827832..f23c3842bf1b 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceV2Reader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceV2Reader.java @@ -38,7 +38,10 @@ * 2. Information Reporting. E.g., statistics reporting, ordering reporting, etc. * Names of these interfaces start with `SupportsReporting`. * 3. Special scans. E.g, columnar scan, unsafe row scan, etc. - * Names of these interfaces start with `SupportsScan`. + * Names of these interfaces start with `SupportsScan`. Note that a reader should only + * implement at most one of the special scans, if more than one special scans are implemented, + * only one of them would be respected, according to the priority list from high to low: + * {@link SupportsScanColumnarBatch}, {@link SupportsScanUnsafeRow}. * * If an exception was throw when applying any of these query optimizations, the action would fail * and no Spark job was submitted. diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanColumnarBatch.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanColumnarBatch.java new file mode 100644 index 000000000000..27cf3a77724f --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanColumnarBatch.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2.reader; + +import java.util.List; + +import org.apache.spark.annotation.InterfaceStability; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.vectorized.ColumnarBatch; + +/** + * A mix-in interface for {@link DataSourceV2Reader}. Data source readers can implement this + * interface to output {@link ColumnarBatch} and make the scan faster. + */ +@InterfaceStability.Evolving +public interface SupportsScanColumnarBatch extends DataSourceV2Reader { + @Override + default List> createReadTasks() { + throw new IllegalStateException( + "createReadTasks not supported by default within SupportsScanColumnarBatch."); + } + + /** + * Similar to {@link DataSourceV2Reader#createReadTasks()}, but returns columnar data in batches. + */ + List> createBatchReadTasks(); + + /** + * Returns true if the concrete data source reader can read data in batch according to the scan + * properties like required columns, pushes filters, etc. It's possible that the implementation + * can only support some certain columns with certain types. Users can overwrite this method and + * {@link #createReadTasks()} to fallback to normal read path under some conditions. + */ + default boolean enableBatchRead() { + return true; + } +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanUnsafeRow.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanUnsafeRow.java index b90ec880dc85..2d3ad0eee65f 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanUnsafeRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanUnsafeRow.java @@ -35,7 +35,7 @@ public interface SupportsScanUnsafeRow extends DataSourceV2Reader { @Override default List> createReadTasks() { throw new IllegalStateException( - "createReadTasks should not be called with SupportsScanUnsafeRow."); + "createReadTasks not supported by default within SupportsScanUnsafeRow"); } /** diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousReadSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousReadSupport.java index 3136cee1f655..9a93a806b0ef 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousReadSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousReadSupport.java @@ -19,6 +19,7 @@ import java.util.Optional; +import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.sources.v2.DataSourceV2; import org.apache.spark.sql.sources.v2.DataSourceV2Options; import org.apache.spark.sql.sources.v2.streaming.reader.ContinuousReader; @@ -28,6 +29,7 @@ * A mix-in interface for {@link DataSourceV2}. Data sources can implement this interface to * provide data reading ability for continuous stream processing. */ +@InterfaceStability.Evolving public interface ContinuousReadSupport extends DataSourceV2 { /** * Creates a {@link ContinuousReader} to scan the data from this data source. 
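As a rough illustration of how the SupportsScanColumnarBatch mix-in added above is intended to be used, the following Scala sketch shows a toy reader that returns its data as a single ColumnarBatch built from an OnHeapColumnVector, the same batch-construction pattern the new ORC reader in this patch uses. This example is not part of the patch: the IntRangeBatchReader and IntRangeBatchReadTask names are invented, the DataSourceV2/ReadSupport registration needed to actually plug such a reader into a query is omitted, and it assumes the ReadTask/DataReader interface names as they exist at this point in the V2 reader API's evolution.

    import java.util.{ArrayList => JArrayList, List => JList}

    import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector
    import org.apache.spark.sql.sources.v2.reader.{DataReader, ReadTask, SupportsScanColumnarBatch}
    import org.apache.spark.sql.types.{IntegerType, StructType}
    import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch}

    // Toy reader: one partition holding the integers [start, end), emitted as one columnar batch.
    class IntRangeBatchReader(start: Int, end: Int) extends SupportsScanColumnarBatch {

      override def readSchema(): StructType = new StructType().add("i", IntegerType)

      // SupportsScanColumnarBatch has the highest priority among the special scans, so this is
      // the method that gets called; createReadTasks() keeps its throwing default implementation.
      override def createBatchReadTasks(): JList[ReadTask[ColumnarBatch]] = {
        val tasks = new JArrayList[ReadTask[ColumnarBatch]]()
        tasks.add(new IntRangeBatchReadTask(start, end))
        tasks
      }
    }

    // The task doubles as its own DataReader and returns exactly one batch.
    class IntRangeBatchReadTask(start: Int, end: Int)
      extends ReadTask[ColumnarBatch] with DataReader[ColumnarBatch] {

      private var consumed = false

      override def createDataReader(): DataReader[ColumnarBatch] = this

      override def next(): Boolean = {
        if (consumed) false else { consumed = true; true }
      }

      override def get(): ColumnarBatch = {
        val numRows = end - start
        val col = new OnHeapColumnVector(numRows, IntegerType)
        (0 until numRows).foreach(i => col.putInt(i, start + i))
        val batch = new ColumnarBatch(Array[ColumnVector](col))
        batch.setNumRows(numRows)
        batch
      }

      override def close(): Unit = {}
    }

A reader that can only vectorize certain schemas would additionally override enableBatchRead() to return false for the unsupported cases and implement createReadTasks() as the row-based fallback, which is the escape hatch the interface's Javadoc describes.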
diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousDataReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousDataReader.java index ca9a290e97a0..3f13a4dbf579 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousDataReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousDataReader.java @@ -17,11 +17,13 @@ package org.apache.spark.sql.sources.v2.streaming.reader; +import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.sources.v2.reader.DataReader; /** * A variation on {@link DataReader} for use with streaming in continuous processing mode. */ +@InterfaceStability.Evolving public interface ContinuousDataReader extends DataReader { /** * Get the offset of the current record, or the start offset if no records have been read. diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousReader.java index f0b205869ed6..745f1ce50244 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousReader.java @@ -17,6 +17,7 @@ package org.apache.spark.sql.sources.v2.streaming.reader; +import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.execution.streaming.BaseStreamingSource; import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader; @@ -27,11 +28,15 @@ * interface to allow reading in a continuous processing mode stream. * * Implementations must ensure each read task output is a {@link ContinuousDataReader}. + * + * Note: This class currently extends {@link BaseStreamingSource} to maintain compatibility with + * DataSource V1 APIs. This extension will be removed once we get rid of V1 completely. */ +@InterfaceStability.Evolving public interface ContinuousReader extends BaseStreamingSource, DataSourceV2Reader { /** - * Merge offsets coming from {@link ContinuousDataReader} instances in each partition to - * a single global offset. + * Merge partitioned offsets coming from {@link ContinuousDataReader} instances for each + * partition to a single global offset. */ Offset mergeOffsets(PartitionOffset[] offsets); diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/MicroBatchReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/MicroBatchReader.java index 70ff75680603..02f37cebc748 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/MicroBatchReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/MicroBatchReader.java @@ -17,6 +17,7 @@ package org.apache.spark.sql.sources.v2.streaming.reader; +import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader; import org.apache.spark.sql.execution.streaming.BaseStreamingSource; @@ -25,7 +26,11 @@ /** * A mix-in interface for {@link DataSourceV2Reader}. Data source readers can implement this * interface to indicate they allow micro-batch streaming reads. + * + * Note: This class currently extends {@link BaseStreamingSource} to maintain compatibility with + * DataSource V1 APIs. This extension will be removed once we get rid of V1 completely. 
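Again purely as an illustration (the names below are hypothetical, and the `getOffset()` signature is assumed from the javadoc above): a minimal `ContinuousDataReader` that serves rows from an in-memory buffer and reports how far it has read.

```scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousDataReader, PartitionOffset}

// Hypothetical per-partition offset reported by the reader below.
case class BufferPosition(partition: Int, position: Long) extends PartitionOffset

// Minimal sketch: a real continuous reader would block in next() waiting for new data
// instead of terminating when the buffer is exhausted.
class BufferedContinuousReader(partition: Int, values: Array[Int])
    extends ContinuousDataReader[Row] {

  private var pos = -1

  override def next(): Boolean = { pos += 1; pos < values.length }

  override def get(): Row = Row(values(pos))

  // Reports the start offset (-1) until the first record has been read.
  override def getOffset(): PartitionOffset = BufferPosition(partition, pos)

  override def close(): Unit = ()
}
```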
*/ +@InterfaceStability.Evolving public interface MicroBatchReader extends DataSourceV2Reader, BaseStreamingSource { /** * Set the desired offset range for read tasks created from this reader. Read tasks will diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/Offset.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/Offset.java index 60b87f2ac075..abba3e7188b1 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/Offset.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/Offset.java @@ -17,12 +17,20 @@ package org.apache.spark.sql.sources.v2.streaming.reader; +import org.apache.spark.annotation.InterfaceStability; + /** - * An abstract representation of progress through a [[MicroBatchReader]] or [[ContinuousReader]]. - * During execution, Offsets provided by the data source implementation will be logged and used as - * restart checkpoints. Sources should provide an Offset implementation which they can use to - * reconstruct the stream position where the offset was taken. + * An abstract representation of progress through a {@link MicroBatchReader} or + * {@link ContinuousReader}. + * During execution, offsets provided by the data source implementation will be logged and used as + * restart checkpoints. Each source should provide an offset implementation which the source can use + * to reconstruct a position in the stream up to which data has been seen/processed. + * + * Note: This class currently extends {@link org.apache.spark.sql.execution.streaming.Offset} to + * maintain compatibility with DataSource V1 APIs. This extension will be removed once we + * get rid of V1 completely. */ +@InterfaceStability.Evolving public abstract class Offset extends org.apache.spark.sql.execution.streaming.Offset { /** * A JSON-serialized representation of an Offset that is @@ -37,7 +45,7 @@ public abstract class Offset extends org.apache.spark.sql.execution.streaming.Of /** * Equality based on JSON string representation. We leverage the * JSON representation for normalization between the Offset's - * in memory and on disk representations. + * in deserialized and serialized representations. */ @Override public boolean equals(Object obj) { diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/PartitionOffset.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/PartitionOffset.java index eca0085c8a8c..4688b85f49f5 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/PartitionOffset.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/PartitionOffset.java @@ -19,11 +19,14 @@ import java.io.Serializable; +import org.apache.spark.annotation.InterfaceStability; + /** * Used for per-partition offsets in continuous processing. ContinuousReader implementations will * provide a method to merge these into a global Offset. * * These offsets must be serializable. 
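To make the `Offset` / `PartitionOffset` contract concrete, a hedged sketch (none of these classes exist in Spark): a per-partition position, a global offset whose JSON form is what gets checkpointed and compared for equality, and the kind of fold a `ContinuousReader.mergeOffsets` implementation might perform.

```scala
import org.apache.spark.sql.sources.v2.streaming.reader.{Offset, PartitionOffset}

// Hypothetical position within a single input partition.
case class PartitionPosition(partition: Int, position: Long) extends PartitionOffset

// Hypothetical global offset: one position per partition. The JSON string is what is
// written to the checkpoint log and used to reconstruct the stream position on restart;
// two offsets are equal exactly when their JSON representations match.
case class GlobalPosition(positions: Map[Int, Long]) extends Offset {
  override def json(): String =
    positions.toSeq.sorted
      .map { case (partition, position) => s""""$partition":$position""" }
      .mkString("{", ",", "}")
}

// How a ContinuousReader.mergeOffsets implementation might fold per-partition offsets
// into a single global offset (assumes every offset is a PartitionPosition).
def mergeOffsets(offsets: Array[PartitionOffset]): Offset =
  GlobalPosition(offsets.map { case PartitionPosition(p, pos) => p -> pos }.toMap)
```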
*/ +@InterfaceStability.Evolving public interface PartitionOffset extends Serializable { } diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java index fc37b9a516f8..f1ef41142316 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceV2Writer.java @@ -22,11 +22,16 @@ import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.sources.v2.DataSourceV2Options; import org.apache.spark.sql.sources.v2.WriteSupport; +import org.apache.spark.sql.streaming.OutputMode; import org.apache.spark.sql.types.StructType; /** * A data source writer that is returned by - * {@link WriteSupport#createWriter(String, StructType, SaveMode, DataSourceV2Options)}. + * {@link WriteSupport#createWriter(String, StructType, SaveMode, DataSourceV2Options)}/ + * {@link org.apache.spark.sql.sources.v2.streaming.MicroBatchWriteSupport#createMicroBatchWriter( + * String, long, StructType, OutputMode, DataSourceV2Options)}/ + * {@link org.apache.spark.sql.sources.v2.streaming.ContinuousWriteSupport#createContinuousWriter( + * String, StructType, OutputMode, DataSourceV2Options)}. * It can mix in various writing optimization interfaces to speed up the data saving. The actual * writing logic is delegated to {@link DataWriter}. * diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java similarity index 93% rename from sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java rename to sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java index af5673e26a50..bfd1b4cb0ef1 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.execution.vectorized; +package org.apache.spark.sql.vectorized; import org.apache.arrow.vector.*; import org.apache.arrow.vector.complex.*; @@ -34,11 +34,7 @@ public final class ArrowColumnVector extends ColumnVector { private ArrowColumnVector[] childColumns; private void ensureAccessible(int index) { - int valueCount = accessor.getValueCount(); - if (index < 0 || index >= valueCount) { - throw new IndexOutOfBoundsException( - String.format("index: %d, valueCount: %d", index, valueCount)); - } + ensureAccessible(index, 1); } private void ensureAccessible(int index, int count) { @@ -64,20 +60,12 @@ public void close() { accessor.close(); } - // - // APIs dealing with nulls - // - @Override public boolean isNullAt(int rowId) { ensureAccessible(rowId); return accessor.isNullAt(rowId); } - // - // APIs dealing with Booleans - // - @Override public boolean getBoolean(int rowId) { ensureAccessible(rowId); @@ -94,10 +82,6 @@ public boolean[] getBooleans(int rowId, int count) { return array; } - // - // APIs dealing with Bytes - // - @Override public byte getByte(int rowId) { ensureAccessible(rowId); @@ -114,10 +98,6 @@ public byte[] getBytes(int rowId, int count) { return array; } - // - // APIs dealing with Shorts - // - @Override public short getShort(int rowId) { ensureAccessible(rowId); @@ -134,10 +114,6 @@ public short[] getShorts(int rowId, int count) { return array; } - // - // APIs dealing with Ints - // - @Override public int getInt(int rowId) { ensureAccessible(rowId); @@ -154,10 +130,6 @@ public int[] getInts(int rowId, int count) { return array; } - // - // APIs dealing with Longs - // - @Override public long getLong(int rowId) { ensureAccessible(rowId); @@ -174,10 +146,6 @@ public long[] getLongs(int rowId, int count) { return array; } - // - // APIs dealing with floats - // - @Override public float getFloat(int rowId) { ensureAccessible(rowId); @@ -194,10 +162,6 @@ public float[] getFloats(int rowId, int count) { return array; } - // - // APIs dealing with doubles - // - @Override public double getDouble(int rowId) { ensureAccessible(rowId); @@ -214,10 +178,6 @@ public double[] getDoubles(int rowId, int count) { return array; } - // - // APIs dealing with Arrays - // - @Override public int getArrayLength(int rowId) { ensureAccessible(rowId); @@ -230,45 +190,27 @@ public int getArrayOffset(int rowId) { return accessor.getArrayOffset(rowId); } - // - // APIs dealing with Decimals - // - @Override public Decimal getDecimal(int rowId, int precision, int scale) { ensureAccessible(rowId); return accessor.getDecimal(rowId, precision, scale); } - // - // APIs dealing with UTF8Strings - // - @Override public UTF8String getUTF8String(int rowId) { ensureAccessible(rowId); return accessor.getUTF8String(rowId); } - // - // APIs dealing with Binaries - // - @Override public byte[] getBinary(int rowId) { ensureAccessible(rowId); return accessor.getBinary(rowId); } - /** - * Returns the data for the underlying array. - */ @Override public ArrowColumnVector arrayData() { return childColumns[0]; } - /** - * Returns the ordinal's child data column. 
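Since `ArrowColumnVector` now lives in the public `org.apache.spark.sql.vectorized` package, here is a short sketch of the wrapping pattern that `ArrowConverters` uses later in this patch. The helper name is made up, and it assumes an already-populated Arrow `VectorSchemaRoot`.

```scala
import scala.collection.JavaConverters._

import org.apache.arrow.vector.VectorSchemaRoot
import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector}

// Wrap each Arrow field vector in an ArrowColumnVector and expose the result row-wise.
def toColumnarBatch(root: VectorSchemaRoot): ColumnarBatch = {
  val columns: Array[ColumnVector] = root.getFieldVectors.asScala.map { vector =>
    new ArrowColumnVector(vector).asInstanceOf[ColumnVector]
  }.toArray
  val batch = new ColumnarBatch(columns)
  batch.setNumRows(root.getRowCount)
  batch
}
```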
- */ @Override public ArrowColumnVector getChildColumn(int ordinal) { return childColumns[ordinal]; } @@ -305,8 +247,8 @@ public ArrowColumnVector(ValueVector vector) { childColumns = new ArrowColumnVector[1]; childColumns[0] = new ArrowColumnVector(listVector.getDataVector()); - } else if (vector instanceof MapVector) { - MapVector mapVector = (MapVector) vector; + } else if (vector instanceof NullableMapVector) { + NullableMapVector mapVector = (NullableMapVector) vector; accessor = new StructAccessor(mapVector); childColumns = new ArrowColumnVector[mapVector.size()]; @@ -611,9 +553,17 @@ final int getArrayOffset(int rowId) { } } + /** + * Any call to "get" method will throw UnsupportedOperationException. + * + * Access struct values in a ArrowColumnVector doesn't use this accessor. Instead, it uses + * getStruct() method defined in the parent class. Any call to "get" method in this class is a + * bug in the code. + * + */ private static class StructAccessor extends ArrowVectorAccessor { - StructAccessor(MapVector vector) { + StructAccessor(NullableMapVector vector) { super(vector); } } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java similarity index 79% rename from sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java rename to sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java index dc7c1269bedd..d1196e1299fe 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.sql.execution.vectorized; +package org.apache.spark.sql.vectorized; import org.apache.spark.sql.catalyst.util.MapData; import org.apache.spark.sql.types.DataType; @@ -22,24 +22,31 @@ import org.apache.spark.unsafe.types.UTF8String; /** - * This class represents in-memory values of a column and provides the main APIs to access the data. - * It supports all the types and contains get APIs as well as their batched versions. The batched - * versions are considered to be faster and preferable whenever possible. + * An interface representing in-memory columnar data in Spark. This interface defines the main APIs + * to access the data, as well as their batched versions. The batched versions are considered to be + * faster and preferable whenever possible. * - * To handle nested schemas, ColumnVector has two types: Arrays and Structs. In both cases these - * columns have child columns. All of the data are stored in the child columns and the parent column - * only contains nullability. In the case of Arrays, the lengths and offsets are saved in the child - * column and are encoded identically to INTs. + * Most of the APIs take the rowId as a parameter. This is the batch local 0-based row id for values + * in this ColumnVector. * - * Maps are just a special case of a two field struct. + * ColumnVector supports all the data types including nested types. To handle nested types, + * ColumnVector can have children and is a tree structure. For struct type, it stores the actual + * data of each field in the corresponding child ColumnVector, and only stores null information in + * the parent ColumnVector. 
For array type, it stores the actual array elements in the child + * ColumnVector, and stores null information, array offsets and lengths in the parent ColumnVector. * - * Most of the APIs take the rowId as a parameter. This is the batch local 0-based row id for values - * in the current batch. + * ColumnVector is expected to be reused during the entire data loading process, to avoid allocating + * memory again and again. + * + * ColumnVector is meant to maximize CPU efficiency but not to minimize storage footprint. + * Implementations should prefer computing efficiency over storage efficiency when design the + * format. Since it is expected to reuse the ColumnVector instance while loading data, the storage + * footprint is negligible. */ public abstract class ColumnVector implements AutoCloseable { /** - * Returns the data type of this column. + * Returns the data type of this column vector. */ public final DataType dataType() { return type; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarArray.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java similarity index 95% rename from sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarArray.java rename to sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java index cbc39d1d0aec..0d89a52e7a4f 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarArray.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.sql.execution.vectorized; +package org.apache.spark.sql.vectorized; import org.apache.spark.sql.catalyst.util.ArrayData; import org.apache.spark.sql.catalyst.util.MapData; @@ -23,8 +23,7 @@ import org.apache.spark.unsafe.types.UTF8String; /** - * Array abstraction in {@link ColumnVector}. The instance of this class is intended - * to be reused, callers should copy the data out if it needs to be stored. + * Array abstraction in {@link ColumnVector}. */ public final class ColumnarArray extends ArrayData { // The data for this array. This array contains elements from @@ -33,7 +32,7 @@ public final class ColumnarArray extends ArrayData { private final int offset; private final int length; - ColumnarArray(ColumnVector data, int offset, int length) { + public ColumnarArray(ColumnVector data, int offset, int length) { this.data = data; this.offset = offset; this.length = length; diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatch.java similarity index 60% rename from sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java rename to sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatch.java index a9d09aa67972..4dc826cf60c1 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatch.java @@ -14,32 +14,19 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.spark.sql.execution.vectorized; +package org.apache.spark.sql.vectorized; import java.util.*; import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.types.StructType; +import org.apache.spark.sql.execution.vectorized.MutableColumnarRow; /** - * This class is the in memory representation of rows as they are streamed through operators. It - * is designed to maximize CPU efficiency and not storage footprint. Since it is expected that - * each operator allocates one of these objects, the storage footprint on the task is negligible. - * - * The layout is a columnar with values encoded in their native format. Each RowBatch contains - * a horizontal partitioning of the data, split into columns. - * - * The ColumnarBatch supports either on heap or offheap modes with (mostly) the identical API. - * - * TODO: - * - There are many TODOs for the existing APIs. They should throw a not implemented exception. - * - Compaction: The batch and columns should be able to compact based on a selection vector. + * This class wraps multiple ColumnVectors as a row-wise table. It provides a row view of this + * batch so that Spark can access the data row by row. Instance of it is meant to be reused during + * the entire data loading process. */ public final class ColumnarBatch { - public static final int DEFAULT_BATCH_SIZE = 4 * 1024; - - private final StructType schema; - private final int capacity; private int numRows; private final ColumnVector[] columns; @@ -57,7 +44,7 @@ public void close() { } /** - * Returns an iterator over the rows in this batch. This skips rows that are filtered out. + * Returns an iterator over the rows in this batch. */ public Iterator rowIterator() { final int maxRows = numRows; @@ -87,22 +74,9 @@ public void remove() { } /** - * Resets the batch for writing. - */ - public void reset() { - for (int i = 0; i < numCols(); ++i) { - if (columns[i] instanceof WritableColumnVector) { - ((WritableColumnVector) columns[i]).reset(); - } - } - this.numRows = 0; - } - - /** - * Sets the number of rows that are valid. + * Sets the number of rows in this batch. */ public void setNumRows(int numRows) { - assert(numRows <= this.capacity); this.numRows = numRows; } @@ -116,16 +90,6 @@ public void setNumRows(int numRows) { */ public int numRows() { return numRows; } - /** - * Returns the schema that makes up this batch. - */ - public StructType schema() { return schema; } - - /** - * Returns the max capacity (in number of rows) for this batch. - */ - public int capacity() { return capacity; } - /** * Returns the column at `ordinal`. 
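The slimmed-down `ColumnarBatch` is now just a row-wise view over a set of `ColumnVector`s. A small usage sketch follows; the writable vectors are still internal classes and are used here only to have something to wrap.

```scala
import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector
import org.apache.spark.sql.types._
import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}

// Allocate writable vectors for a two-column schema, fill a few rows, then iterate
// over them row by row through the ColumnarBatch view.
val schema = new StructType().add("id", IntegerType).add("value", DoubleType)
val vectors = OnHeapColumnVector.allocateColumns(4, schema)
(0 until 4).foreach { i =>
  vectors(0).putInt(i, i)
  vectors(1).putDouble(i, i * 1.5)
}

val batch = new ColumnarBatch(vectors.asInstanceOf[Array[ColumnVector]])
batch.setNumRows(4)

val it = batch.rowIterator()
while (it.hasNext) {
  val row = it.next()
  println(s"id=${row.getInt(0)} value=${row.getDouble(1)}")
}
```

Note that the schema and capacity are no longer carried by the batch itself; callers that need them keep them alongside, which is what the updated call sites later in this patch do.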
*/ @@ -140,10 +104,8 @@ public InternalRow getRow(int rowId) { return row; } - public ColumnarBatch(StructType schema, ColumnVector[] columns, int capacity) { - this.schema = schema; + public ColumnarBatch(ColumnVector[] columns) { this.columns = columns; - this.capacity = capacity; this.row = new MutableColumnarRow(columns); } } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarRow.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java similarity index 96% rename from sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarRow.java rename to sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java index 8bb33ed5b78c..3c6656dec77c 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.sql.execution.vectorized; +package org.apache.spark.sql.vectorized; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; @@ -24,8 +24,7 @@ import org.apache.spark.unsafe.types.UTF8String; /** - * Row abstraction in {@link ColumnVector}. The instance of this class is intended - * to be reused, callers should copy the data out if it needs to be stored. + * Row abstraction in {@link ColumnVector}. */ public final class ColumnarRow extends InternalRow { // The data for this row. @@ -34,7 +33,7 @@ public final class ColumnarRow extends InternalRow { private final int rowId; private final int numFields; - ColumnarRow(ColumnVector data, int rowId) { + public ColumnarRow(ColumnVector data, int rowId) { assert (data.dataType() instanceof StructType); this.data = data; this.rowId = rowId; diff --git a/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister index 6cdfe2fae564..0259c774bbf4 100644 --- a/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister +++ b/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister @@ -7,3 +7,4 @@ org.apache.spark.sql.execution.datasources.text.TextFileFormat org.apache.spark.sql.execution.streaming.ConsoleSinkProvider org.apache.spark.sql.execution.streaming.TextSocketSourceProvider org.apache.spark.sql.execution.streaming.RateSourceProvider +org.apache.spark.sql.execution.streaming.sources.RateSourceProviderV2 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index e8d683a578f3..b714a46b5f78 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -191,6 +191,9 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { ds = ds.asInstanceOf[DataSourceV2], conf = sparkSession.sessionState.conf)).asJava) + // Streaming also uses the data source V2 API. So it may be that the data source implements + // v2, but has no v2 implementation for batch reads. In that case, we fall back to loading + // the dataframe as a v1 source. 
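From the user's point of view the fallback noted above is transparent: code like the following keeps working even if the source class implements `DataSourceV2` but only provides streaming support and no batch `ReadSupport`. The format name is a placeholder.

```scala
// "com.example.source" stands in for a hypothetical DataSourceV2 implementation that
// only supports streaming reads; the batch entry point silently falls back to the V1 path.
val df = spark.read.format("com.example.source").load()
```

The symmetric change in `DataFrameWriter.save` (next hunk) does the same for batch writes.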
val reader = (ds, userSpecifiedSchema) match { case (ds: ReadSupportWithSchema, Some(schema)) => ds.createReader(schema, options) @@ -208,23 +211,30 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } reader - case _ => - throw new AnalysisException(s"$cls does not support data reading.") + case _ => null // fall back to v1 } - Dataset.ofRows(sparkSession, DataSourceV2Relation(reader)) + if (reader == null) { + loadV1Source(paths: _*) + } else { + Dataset.ofRows(sparkSession, DataSourceV2Relation(reader)) + } } else { - // Code path for data source v1. - sparkSession.baseRelationToDataFrame( - DataSource.apply( - sparkSession, - paths = paths, - userSpecifiedSchema = userSpecifiedSchema, - className = source, - options = extraOptions.toMap).resolveRelation()) + loadV1Source(paths: _*) } } + private def loadV1Source(paths: String*) = { + // Code path for data source v1. + sparkSession.baseRelationToDataFrame( + DataSource.apply( + sparkSession, + paths = paths, + userSpecifiedSchema = userSpecifiedSchema, + className = source, + options = extraOptions.toMap).resolveRelation()) + } + /** * Construct a `DataFrame` representing the database table accessible via JDBC URL * url named table and connection properties. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 3304f368e105..97f12ff625c4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -255,17 +255,24 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { } } - case _ => throw new AnalysisException(s"$cls does not support data writing.") + // Streaming also uses the data source V2 API. So it may be that the data source implements + // v2, but has no v2 implementation for batch writes. In that case, we fall back to saving + // as though it's a V1 source. + case _ => saveToV1Source() } } else { - // Code path for data source v1. - runCommand(df.sparkSession, "save") { - DataSource( - sparkSession = df.sparkSession, - className = source, - partitionColumns = partitioningColumns.getOrElse(Nil), - options = extraOptions.toMap).planForWriting(mode, AnalysisBarrier(df.logicalPlan)) - } + saveToV1Source() + } + } + + private def saveToV1Source(): Unit = { + // Code path for data source v1. + runCommand(df.sparkSession, "save") { + DataSource( + sparkSession = df.sparkSession, + className = source, + partitionColumns = partitioningColumns.getOrElse(Nil), + options = extraOptions.toMap).planForWriting(mode, AnalysisBarrier(df.logicalPlan)) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 77e571272920..912f411fa384 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -237,13 +237,20 @@ class Dataset[T] private[sql]( private[sql] def showString( _numRows: Int, truncate: Int = 20, vertical: Boolean = false): String = { val numRows = _numRows.max(0).min(Int.MaxValue - 1) - val takeResult = toDF().take(numRows + 1) + val newDf = toDF() + val castCols = newDf.logicalPlan.output.map { col => + // Since binary types in top-level schema fields have a specific format to print, + // so we do not cast them to strings here. 
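The `showString` change above pushes string rendering into Spark SQL's CAST rules for everything except top-level binary columns. A quick spark-shell style illustration (assumes an active `SparkSession` named `spark`):

```scala
import spark.implicits._

val df = Seq((Array[Byte](1, 2), Seq(1, 2), java.sql.Date.valueOf("2018-01-01")))
  .toDF("b", "arr", "d")

// "arr" and "d" are now rendered via CAST(... AS STRING), so show() matches what
// SELECT CAST(col AS STRING) would print; the binary column "b" keeps the hex
// rendering produced in showString itself (e.g. "[01 02]").
df.show()
```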
+ if (col.dataType == BinaryType) { + Column(col) + } else { + Column(col).cast(StringType) + } + } + val takeResult = newDf.select(castCols: _*).take(numRows + 1) val hasMoreData = takeResult.length > numRows val data = takeResult.take(numRows) - lazy val timeZone = - DateTimeUtils.getTimeZone(sparkSession.sessionState.conf.sessionLocalTimeZone) - // For array values, replace Seq and Array with square brackets // For cells that are beyond `truncate` characters, replace it with the // first `truncate-3` and "..." @@ -252,12 +259,6 @@ class Dataset[T] private[sql]( val str = cell match { case null => "null" case binary: Array[Byte] => binary.map("%02X".format(_)).mkString("[", " ", "]") - case array: Array[_] => array.mkString("[", ", ", "]") - case seq: Seq[_] => seq.mkString("[", ", ", "]") - case d: Date => - DateTimeUtils.dateToString(DateTimeUtils.fromJavaDate(d)) - case ts: Timestamp => - DateTimeUtils.timestampToString(DateTimeUtils.fromJavaTimestamp(ts), timeZone) case _ => cell.toString } if (truncate > 0 && str.length > truncate) { @@ -1902,7 +1903,7 @@ class Dataset[T] private[sql]( /** * Returns a new Dataset containing rows in this Dataset but not in another Dataset. - * This is equivalent to `EXCEPT` in SQL. + * This is equivalent to `EXCEPT DISTINCT` in SQL. * * @note Equality checking is performed directly on the encoded representation of the data * and thus is not affected by a custom `equals` function defined on `T`. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 272eb844226d..734573ba31f7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -742,7 +742,10 @@ class SparkSession private( private[sql] def applySchemaToPythonRDD( rdd: RDD[Array[Any]], schema: StructType): DataFrame = { - val rowRdd = rdd.map(r => python.EvaluatePython.fromJava(r, schema).asInstanceOf[InternalRow]) + val rowRdd = rdd.mapPartitions { iter => + val fromJava = python.EvaluatePython.makeFromJava(schema) + iter.map(r => fromJava(r).asInstanceOf[InternalRow]) + } internalCreateDataFrame(rowRdd, schema) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala index dc2468a721e4..f94baef39dfa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import java.lang.reflect.{ParameterizedType, Type} +import java.lang.reflect.ParameterizedType import scala.reflect.runtime.universe.TypeTag import scala.util.Try @@ -110,29 +110,29 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends /* register 0-22 were generated by this script - (0 to 22).map { x => + (0 to 22).foreach { x => val types = (1 to x).foldRight("RT")((i, s) => {s"A$i, $s"}) - val typeTags = (1 to x).map(i => s"A${i}: TypeTag").foldLeft("RT: TypeTag")(_ + ", " + _) + val typeTags = (1 to x).map(i => s"A$i: TypeTag").foldLeft("RT: TypeTag")(_ + ", " + _) val inputTypes = (1 to x).foldRight("Nil")((i, s) => {s"ScalaReflection.schemaFor[A$i].dataType :: $s"}) println(s""" - /** - * Registers a deterministic Scala closure of ${x} arguments as user-defined function (UDF). - * @tparam RT return type of UDF. 
- * @since 1.3.0 - */ - def register[$typeTags](name: String, func: Function$x[$types]): UserDefinedFunction = { - val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputTypes = Try($inputTypes).toOption - def builder(e: Seq[Expression]) = if (e.length == $x) { - ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable, udfDeterministic = true) - } else { - throw new AnalysisException("Invalid number of arguments for function " + name + - ". Expected: $x; Found: " + e.length) - } - functionRegistry.createOrReplaceTempFunction(name, builder) - val udf = UserDefinedFunction(func, dataType, inputTypes).withName(name) - if (nullable) udf else udf.asNonNullable() - }""") + |/** + | * Registers a deterministic Scala closure of $x arguments as user-defined function (UDF). + | * @tparam RT return type of UDF. + | * @since 1.3.0 + | */ + |def register[$typeTags](name: String, func: Function$x[$types]): UserDefinedFunction = { + | val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] + | val inputTypes = Try($inputTypes).toOption + | def builder(e: Seq[Expression]) = if (e.length == $x) { + | ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable, udfDeterministic = true) + | } else { + | throw new AnalysisException("Invalid number of arguments for function " + name + + | ". Expected: $x; Found: " + e.length) + | } + | functionRegistry.createOrReplaceTempFunction(name, builder) + | val udf = UserDefinedFunction(func, dataType, inputTypes).withName(name) + | if (nullable) udf else udf.asNonNullable() + |}""".stripMargin) } (0 to 22).foreach { i => @@ -144,7 +144,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends val funcCall = if (i == 0) "() => func" else "func" println(s""" |/** - | * Register a user-defined function with ${i} arguments. + | * Register a deterministic Java UDF$i instance as user-defined function (UDF). | * @since $version | */ |def register(name: String, f: UDF$i[$extTypeArgs], returnType: DataType): Unit = { @@ -689,7 +689,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 0 arguments. + * Register a deterministic Java UDF0 instance as user-defined function (UDF). * @since 2.3.0 */ def register(name: String, f: UDF0[_], returnType: DataType): Unit = { @@ -704,7 +704,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 1 arguments. + * Register a deterministic Java UDF1 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF1[_, _], returnType: DataType): Unit = { @@ -719,7 +719,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 2 arguments. + * Register a deterministic Java UDF2 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF2[_, _, _], returnType: DataType): Unit = { @@ -734,7 +734,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 3 arguments. + * Register a deterministic Java UDF3 instance as user-defined function (UDF). 
* @since 1.3.0 */ def register(name: String, f: UDF3[_, _, _, _], returnType: DataType): Unit = { @@ -749,7 +749,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 4 arguments. + * Register a deterministic Java UDF4 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF4[_, _, _, _, _], returnType: DataType): Unit = { @@ -764,7 +764,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 5 arguments. + * Register a deterministic Java UDF5 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF5[_, _, _, _, _, _], returnType: DataType): Unit = { @@ -779,7 +779,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 6 arguments. + * Register a deterministic Java UDF6 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF6[_, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -794,7 +794,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 7 arguments. + * Register a deterministic Java UDF7 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF7[_, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -809,7 +809,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 8 arguments. + * Register a deterministic Java UDF8 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF8[_, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -824,7 +824,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 9 arguments. + * Register a deterministic Java UDF9 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF9[_, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -839,7 +839,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 10 arguments. + * Register a deterministic Java UDF10 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF10[_, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -854,7 +854,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 11 arguments. + * Register a deterministic Java UDF11 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF11[_, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -869,7 +869,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 12 arguments. + * Register a deterministic Java UDF12 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF12[_, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -884,7 +884,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 13 arguments. + * Register a deterministic Java UDF13 instance as user-defined function (UDF). 
* @since 1.3.0 */ def register(name: String, f: UDF13[_, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -899,7 +899,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 14 arguments. + * Register a deterministic Java UDF14 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF14[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -914,7 +914,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 15 arguments. + * Register a deterministic Java UDF15 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -929,7 +929,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 16 arguments. + * Register a deterministic Java UDF16 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -944,7 +944,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 17 arguments. + * Register a deterministic Java UDF17 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -959,7 +959,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 18 arguments. + * Register a deterministic Java UDF18 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -974,7 +974,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 19 arguments. + * Register a deterministic Java UDF19 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -989,7 +989,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 20 arguments. + * Register a deterministic Java UDF20 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -1004,7 +1004,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 21 arguments. + * Register a deterministic Java UDF21 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -1019,7 +1019,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 22 arguments. + * Register a deterministic Java UDF22 instance as user-defined function (UDF). 
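For reference, the Java-UDF overloads documented above are used like this from Scala (the function name and logic are arbitrary); per the updated doc strings, such UDFs are registered as deterministic.

```scala
import org.apache.spark.sql.api.java.UDF2
import org.apache.spark.sql.types.IntegerType

// Register a Java UDF instance with an explicit return type, then use it from SQL.
spark.udf.register("plus", new UDF2[Int, Int, Int] {
  override def call(a: Int, b: Int): Int = a + b
}, IntegerType)

spark.sql("SELECT plus(1, 2)").show()
```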
* @since 1.3.0 */ def register(name: String, f: UDF22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala index 782cec5e292b..dd68df968669 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala @@ -17,21 +17,24 @@ package org.apache.spark.sql.execution -import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.catalyst.expressions.{BoundReference, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.execution.metric.SQLMetrics -import org.apache.spark.sql.execution.vectorized.{ColumnarBatch, ColumnVector} import org.apache.spark.sql.types.DataType +import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} /** - * Helper trait for abstracting scan functionality using - * [[org.apache.spark.sql.execution.vectorized.ColumnarBatch]]es. + * Helper trait for abstracting scan functionality using [[ColumnarBatch]]es. */ private[sql] trait ColumnarBatchScan extends CodegenSupport { def vectorTypes: Option[Seq[String]] = None + protected def supportsBatch: Boolean = true + + protected def needsUnsafeRowConversion: Boolean = true + override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "scanTime" -> SQLMetrics.createTimingMetric(sparkContext, "scan time")) @@ -71,7 +74,14 @@ private[sql] trait ColumnarBatchScan extends CodegenSupport { // PhysicalRDD always just has one input val input = ctx.addMutableState("scala.collection.Iterator", "input", v => s"$v = inputs[0];") + if (supportsBatch) { + produceBatches(ctx, input) + } else { + produceRows(ctx, input) + } + } + private def produceBatches(ctx: CodegenContext, input: String): String = { // metrics val numOutputRows = metricTerm(ctx, "numOutputRows") val scanTimeMetric = metricTerm(ctx, "scanTime") @@ -137,4 +147,25 @@ private[sql] trait ColumnarBatchScan extends CodegenSupport { """.stripMargin } + private def produceRows(ctx: CodegenContext, input: String): String = { + val numOutputRows = metricTerm(ctx, "numOutputRows") + val row = ctx.freshName("row") + + ctx.INPUT_ROW = row + ctx.currentVars = null + // Always provide `outputVars`, so that the framework can help us build unsafe row if the input + // row is not unsafe row, i.e. `needsUnsafeRowConversion` is true. 
+ val outputVars = output.zipWithIndex.map { case (a, i) => + BoundReference(i, a.dataType, a.nullable).genCode(ctx) + } + val inputRow = if (needsUnsafeRowConversion) null else row + s""" + |while ($input.hasNext()) { + | InternalRow $row = (InternalRow) $input.next(); + | $numOutputRows.add(1); + | ${consume(ctx, outputVars, inputRow).trim} + | if (shouldStop()) return; + |} + """.stripMargin + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index d1ff82c7c06b..7c7d79c2bbd7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -164,13 +164,15 @@ case class FileSourceScanExec( override val tableIdentifier: Option[TableIdentifier]) extends DataSourceScanExec with ColumnarBatchScan { - val supportsBatch: Boolean = relation.fileFormat.supportBatch( + override val supportsBatch: Boolean = relation.fileFormat.supportBatch( relation.sparkSession, StructType.fromAttributes(output)) - val needsUnsafeRowConversion: Boolean = if (relation.fileFormat.isInstanceOf[ParquetSource]) { - SparkSession.getActiveSession.get.sessionState.conf.parquetVectorizedReaderEnabled - } else { - false + override val needsUnsafeRowConversion: Boolean = { + if (relation.fileFormat.isInstanceOf[ParquetSource]) { + SparkSession.getActiveSession.get.sessionState.conf.parquetVectorizedReaderEnabled + } else { + false + } } override def vectorTypes: Option[Seq[String]] = @@ -346,33 +348,6 @@ case class FileSourceScanExec( override val nodeNamePrefix: String = "File" - override protected def doProduce(ctx: CodegenContext): String = { - if (supportsBatch) { - return super.doProduce(ctx) - } - val numOutputRows = metricTerm(ctx, "numOutputRows") - // PhysicalRDD always just has one input - val input = ctx.addMutableState("scala.collection.Iterator", "input", v => s"$v = inputs[0];") - val row = ctx.freshName("row") - - ctx.INPUT_ROW = row - ctx.currentVars = null - // Always provide `outputVars`, so that the framework can help us build unsafe row if the input - // row is not unsafe row, i.e. `needsUnsafeRowConversion` is true. - val outputVars = output.zipWithIndex.map{ case (a, i) => - BoundReference(i, a.dataType, a.nullable).genCode(ctx) - } - val inputRow = if (needsUnsafeRowConversion) null else row - s""" - |while ($input.hasNext()) { - | InternalRow $row = (InternalRow) $input.next(); - | $numOutputRows.add(1); - | ${consume(ctx, outputVars, inputRow).trim} - | if (shouldStop()) return; - |} - """.stripMargin - } - /** * Create an RDD for bucketed reads. * The non-bucketed variant of this function is [[createNonBucketedReadRDD]]. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala index 787c1cfbfb3d..398758a3331b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala @@ -94,7 +94,21 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ /** Specifies how data is partitioned across different nodes in the cluster. */ def outputPartitioning: Partitioning = UnknownPartitioning(0) // TODO: WRONG WIDTH! - /** Specifies any partition requirements on the input data for this operator. 
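The hunk that follows replaces this one-line doc with a much more precise contract, including the co-partitioning guarantee for operators that require distributions for more than one child. As a purely hypothetical sketch of an operator leaning on that guarantee (none of these names are in the patch, and `HashClusteredDistribution` is assumed from the new scaladoc):

```scala
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.catalyst.plans.physical.{Distribution, HashClusteredDistribution}
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}

// Hypothetical two-child operator: both children must be hash-partitioned on their keys,
// so their result RDDs can be zipped partition by partition.
case class ExampleCoGroupExec(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    left: SparkPlan,
    right: SparkPlan) extends BinaryExecNode {

  override def output: Seq[Attribute] = left.output ++ right.output

  override def requiredChildDistribution: Seq[Distribution] =
    HashClusteredDistribution(leftKeys) :: HashClusteredDistribution(rightKeys) :: Nil

  // Not a real join: partitions are only concatenated here to show that zipping is safe,
  // because rows with equal key values land in partitions with the same index.
  override protected def doExecute(): RDD[InternalRow] =
    left.execute().zipPartitions(right.execute()) { (l, r) => l ++ r }
}
```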
*/ + /** + * Specifies the data distribution requirements of all the children for this operator. By default + * it's [[UnspecifiedDistribution]] for each child, which means each child can have any + * distribution. + * + * If an operator overwrites this method, and specifies distribution requirements(excluding + * [[UnspecifiedDistribution]] and [[BroadcastDistribution]]) for more than one child, Spark + * guarantees that the outputs of these children will have same number of partitions, so that the + * operator can safely zip partitions of these children's result RDDs. Some operators can leverage + * this guarantee to satisfy some interesting requirement, e.g., non-broadcast joins can specify + * HashClusteredDistribution(a,b) for its left child, and specify HashClusteredDistribution(c,d) + * for its right child, then it's guaranteed that left and right child are co-partitioned by + * a,b/c,d, which means tuples of same value are in the partitions of same index, e.g., + * (a=1,b=2) and (c=1,d=2) are both in the second partition of left and right child. + */ def requiredChildDistribution: Seq[Distribution] = Seq.fill(children.size)(UnspecifiedDistribution) @@ -337,8 +351,9 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ if (buf.isEmpty) { numPartsToTry = partsScanned * limitScaleUpFactor } else { - // the left side of max is >=1 whenever partsScanned >= 2 - numPartsToTry = Math.max((1.5 * n * partsScanned / buf.size).toInt - partsScanned, 1) + val left = n - buf.size + // As left > 0, numPartsToTry is always >= 1 + numPartsToTry = Math.ceil(1.5 * left * partsScanned / buf.size).toInt numPartsToTry = Math.min(numPartsToTry, partsScanned * limitScaleUpFactor) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 29b584b55972..d3cfd2a1ffbf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -383,16 +383,19 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { * {{{ * CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db_name.]table_name * USING table_provider - * [OPTIONS table_property_list] - * [PARTITIONED BY (col_name, col_name, ...)] - * [CLUSTERED BY (col_name, col_name, ...) - * [SORTED BY (col_name [ASC|DESC], ...)] - * INTO num_buckets BUCKETS - * ] - * [LOCATION path] - * [COMMENT table_comment] - * [TBLPROPERTIES (property_name=property_value, ...)] + * create_table_clauses * [[AS] select_statement]; + * + * create_table_clauses (order insensitive): + * [OPTIONS table_property_list] + * [PARTITIONED BY (col_name, col_name, ...)] + * [CLUSTERED BY (col_name, col_name, ...) + * [SORTED BY (col_name [ASC|DESC], ...)] + * INTO num_buckets BUCKETS + * ] + * [LOCATION path] + * [COMMENT table_comment] + * [TBLPROPERTIES (property_name=property_value, ...)] * }}} */ override def visitCreateTable(ctx: CreateTableContext): LogicalPlan = withOrigin(ctx) { @@ -400,6 +403,14 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { if (external) { operationNotAllowed("CREATE EXTERNAL TABLE ... 
USING", ctx) } + + checkDuplicateClauses(ctx.TBLPROPERTIES, "TBLPROPERTIES", ctx) + checkDuplicateClauses(ctx.OPTIONS, "OPTIONS", ctx) + checkDuplicateClauses(ctx.PARTITIONED, "PARTITIONED BY", ctx) + checkDuplicateClauses(ctx.COMMENT, "COMMENT", ctx) + checkDuplicateClauses(ctx.bucketSpec(), "CLUSTERED BY", ctx) + checkDuplicateClauses(ctx.locationSpec, "LOCATION", ctx) + val options = Option(ctx.options).map(visitPropertyKeyValues).getOrElse(Map.empty) val provider = ctx.tableProvider.qualifiedName.getText val schema = Option(ctx.colTypeList()).map(createSchema) @@ -408,9 +419,9 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { .map(visitIdentifierList(_).toArray) .getOrElse(Array.empty[String]) val properties = Option(ctx.tableProps).map(visitPropertyKeyValues).getOrElse(Map.empty) - val bucketSpec = Option(ctx.bucketSpec()).map(visitBucketSpec) + val bucketSpec = ctx.bucketSpec().asScala.headOption.map(visitBucketSpec) - val location = Option(ctx.locationSpec).map(visitLocationSpec) + val location = ctx.locationSpec.asScala.headOption.map(visitLocationSpec) val storage = DataSource.buildStorageFormatFromOptions(options) if (location.isDefined && storage.locationUri.isDefined) { @@ -1087,13 +1098,16 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { * {{{ * CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name * [(col1[:] data_type [COMMENT col_comment], ...)] - * [COMMENT table_comment] - * [PARTITIONED BY (col2[:] data_type [COMMENT col_comment], ...)] - * [ROW FORMAT row_format] - * [STORED AS file_format] - * [LOCATION path] - * [TBLPROPERTIES (property_name=property_value, ...)] + * create_table_clauses * [AS select_statement]; + * + * create_table_clauses (order insensitive): + * [COMMENT table_comment] + * [PARTITIONED BY (col2[:] data_type [COMMENT col_comment], ...)] + * [ROW FORMAT row_format] + * [STORED AS file_format] + * [LOCATION path] + * [TBLPROPERTIES (property_name=property_value, ...)] * }}} */ override def visitCreateHiveTable(ctx: CreateHiveTableContext): LogicalPlan = withOrigin(ctx) { @@ -1104,15 +1118,23 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { "CREATE TEMPORARY TABLE is not supported yet. " + "Please use CREATE TEMPORARY VIEW as an alternative.", ctx) } - if (ctx.skewSpec != null) { + if (ctx.skewSpec.size > 0) { operationNotAllowed("CREATE TABLE ... 
SKEWED BY", ctx) } + checkDuplicateClauses(ctx.TBLPROPERTIES, "TBLPROPERTIES", ctx) + checkDuplicateClauses(ctx.PARTITIONED, "PARTITIONED BY", ctx) + checkDuplicateClauses(ctx.COMMENT, "COMMENT", ctx) + checkDuplicateClauses(ctx.bucketSpec(), "CLUSTERED BY", ctx) + checkDuplicateClauses(ctx.createFileFormat, "STORED AS/BY", ctx) + checkDuplicateClauses(ctx.rowFormat, "ROW FORMAT", ctx) + checkDuplicateClauses(ctx.locationSpec, "LOCATION", ctx) + val dataCols = Option(ctx.columns).map(visitColTypeList).getOrElse(Nil) val partitionCols = Option(ctx.partitionColumns).map(visitColTypeList).getOrElse(Nil) - val properties = Option(ctx.tablePropertyList).map(visitPropertyKeyValues).getOrElse(Map.empty) + val properties = Option(ctx.tableProps).map(visitPropertyKeyValues).getOrElse(Map.empty) val selectQuery = Option(ctx.query).map(plan) - val bucketSpec = Option(ctx.bucketSpec()).map(visitBucketSpec) + val bucketSpec = ctx.bucketSpec().asScala.headOption.map(visitBucketSpec) // Note: Hive requires partition columns to be distinct from the schema, so we need // to include the partition columns here explicitly @@ -1120,12 +1142,12 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { // Storage format val defaultStorage = HiveSerDe.getDefaultStorage(conf) - validateRowFormatFileFormat(ctx.rowFormat, ctx.createFileFormat, ctx) - val fileStorage = Option(ctx.createFileFormat).map(visitCreateFileFormat) + validateRowFormatFileFormat(ctx.rowFormat.asScala, ctx.createFileFormat.asScala, ctx) + val fileStorage = ctx.createFileFormat.asScala.headOption.map(visitCreateFileFormat) .getOrElse(CatalogStorageFormat.empty) - val rowStorage = Option(ctx.rowFormat).map(visitRowFormat) + val rowStorage = ctx.rowFormat.asScala.headOption.map(visitRowFormat) .getOrElse(CatalogStorageFormat.empty) - val location = Option(ctx.locationSpec).map(visitLocationSpec) + val location = ctx.locationSpec.asScala.headOption.map(visitLocationSpec) // If we are creating an EXTERNAL table, then the LOCATION field is required if (external && location.isEmpty) { operationNotAllowed("CREATE EXTERNAL TABLE must be accompanied by LOCATION", ctx) @@ -1180,7 +1202,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { ctx) } - val hasStorageProperties = (ctx.createFileFormat != null) || (ctx.rowFormat != null) + val hasStorageProperties = (ctx.createFileFormat.size != 0) || (ctx.rowFormat.size != 0) if (conf.convertCTAS && !hasStorageProperties) { // At here, both rowStorage.serdeProperties and fileStorage.serdeProperties // are empty Maps. @@ -1366,6 +1388,15 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { } } + private def validateRowFormatFileFormat( + rowFormatCtx: Seq[RowFormatContext], + createFileFormatCtx: Seq[CreateFileFormatContext], + parentCtx: ParserRuleContext): Unit = { + if (rowFormatCtx.size == 1 && createFileFormatCtx.size == 1) { + validateRowFormatFileFormat(rowFormatCtx.head, createFileFormatCtx.head, parentCtx) + } + } + /** * Create or replace a view. This creates a [[CreateViewCommand]] command. 
* diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index 9a6f1c6dfa6a..ce3c68810f3b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} -import org.apache.spark.sql.execution.vectorized.{ColumnarRow, MutableColumnarRow} +import org.apache.spark.sql.execution.vectorized.MutableColumnarRow import org.apache.spark.sql.types.{DecimalType, StringType, StructType} import org.apache.spark.unsafe.KVIterator import org.apache.spark.util.Utils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala index 0380ee8b09d6..eb48584d0c1e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala @@ -20,8 +20,9 @@ package org.apache.spark.sql.execution.aggregate import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext -import org.apache.spark.sql.execution.vectorized.{ColumnarBatch, MutableColumnarRow, OnHeapColumnVector} +import org.apache.spark.sql.execution.vectorized.{MutableColumnarRow, OnHeapColumnVector} import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.ColumnarBatch /** * This is a helper class to generate an append-only vectorized hash map that can act as a 'cache' @@ -93,7 +94,7 @@ class VectorizedHashMapGenerator( | | public $generatedClassName() { | vectors = ${classOf[OnHeapColumnVector].getName}.allocateColumns(capacity, schema); - | batch = new ${classOf[ColumnarBatch].getName}(schema, vectors, capacity); + | batch = new ${classOf[ColumnarBatch].getName}(vectors); | | // Generates a projection to return the aggregate buffer only. 
| ${classOf[OnHeapColumnVector].getName}[] aggBufferVectors = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala index bcfc41243026..7487564ed64d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala @@ -32,8 +32,8 @@ import org.apache.spark.TaskContext import org.apache.spark.api.java.JavaRDD import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector} import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector} import org.apache.spark.util.Utils @@ -175,7 +175,7 @@ private[sql] object ArrowConverters { new ArrowColumnVector(vector).asInstanceOf[ColumnVector] }.toArray - val batch = new ColumnarBatch(schemaRead, columns, root.getRowCount) + val batch = new ColumnarBatch(columns) batch.setNumRows(root.getRowCount) batch.rowIterator().asScala } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala index 3e73393b1285..28b3875505cd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala @@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partition import org.apache.spark.sql.execution.{ColumnarBatchScan, LeafExecNode, WholeStageCodegenExec} import org.apache.spark.sql.execution.vectorized._ import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} case class InMemoryTableScanExec( @@ -48,9 +49,9 @@ case class InMemoryTableScanExec( /** * If true, get data from ColumnVector in ColumnarBatch, which are generally faster. 
- * If false, get data from UnsafeRow build from ColumnVector + * If false, get data from UnsafeRow build from CachedBatch */ - override val supportCodegen: Boolean = { + override val supportsBatch: Boolean = { // In the initial implementation, for ease of review // support only primitive data types and # of fields is less than wholeStageMaxNumFields relation.schema.fields.forall(f => f.dataType match { @@ -60,6 +61,8 @@ case class InMemoryTableScanExec( }) && !WholeStageCodegenExec.isTooManyFields(conf, relation.schema) } + override protected def needsUnsafeRowConversion: Boolean = false + private val columnIndices = attributes.map(a => relation.output.map(o => o.exprId).indexOf(a.exprId)).toArray @@ -75,11 +78,10 @@ case class InMemoryTableScanExec( } else { OffHeapColumnVector.allocateColumns(rowCount, columnarBatchSchema) } - val columnarBatch = new ColumnarBatch( - columnarBatchSchema, columnVectors.asInstanceOf[Array[ColumnVector]], rowCount) + val columnarBatch = new ColumnarBatch(columnVectors.asInstanceOf[Array[ColumnVector]]) columnarBatch.setNumRows(rowCount) - for (i <- 0 until attributes.length) { + for (i <- attributes.indices) { ColumnAccessor.decompress( cachedColumnarBatch.buffers(columnIndices(i)), columnarBatch.column(i).asInstanceOf[WritableColumnVector], @@ -89,14 +91,56 @@ case class InMemoryTableScanExec( columnarBatch } - override def inputRDDs(): Seq[RDD[InternalRow]] = { - assert(supportCodegen) + private lazy val inputRDD: RDD[InternalRow] = { val buffers = filteredCachedBatches() - // HACK ALERT: This is actually an RDD[ColumnarBatch]. - // We're taking advantage of Scala's type erasure here to pass these batches along. - Seq(buffers.map(createAndDecompressColumn(_)).asInstanceOf[RDD[InternalRow]]) + if (supportsBatch) { + // HACK ALERT: This is actually an RDD[ColumnarBatch]. + // We're taking advantage of Scala's type erasure here to pass these batches along. + buffers.map(createAndDecompressColumn).asInstanceOf[RDD[InternalRow]] + } else { + val numOutputRows = longMetric("numOutputRows") + + if (enableAccumulatorsForTest) { + readPartitions.setValue(0) + readBatches.setValue(0) + } + + // Using these variables here to avoid serialization of entire objects (if referenced + // directly) within the map Partitions closure. + val relOutput: AttributeSeq = relation.output + + filteredCachedBatches().mapPartitionsInternal { cachedBatchIterator => + // Find the ordinals and data types of the requested columns. 
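// The hunk above rebuilds a ColumnarBatch from the cached, compressed column buffers: allocate
// writable vectors, wrap them with the new single-argument ColumnarBatch constructor (the
// schema and capacity arguments are gone), decompress into them, then set the row count.
// A condensed, self-contained sketch of that shape, assuming on-heap vectors and a made-up
// one-column schema; in the real code ColumnAccessor.decompress fills the vectors.
import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch}

def emptyBatchSketch(rowCount: Int): ColumnarBatch = {
  val schema = StructType(StructField("id", IntegerType) :: Nil)
  val vectors = OnHeapColumnVector.allocateColumns(rowCount, schema)
  val batch = new ColumnarBatch(vectors.asInstanceOf[Array[ColumnVector]])
  batch.setNumRows(rowCount)   // callers must fill the vectors before exposing rows
  batch
}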
+ val (requestedColumnIndices, requestedColumnDataTypes) = + attributes.map { a => + relOutput.indexOf(a.exprId) -> a.dataType + }.unzip + + // update SQL metrics + val withMetrics = cachedBatchIterator.map { batch => + if (enableAccumulatorsForTest) { + readBatches.add(1) + } + numOutputRows += batch.numRows + batch + } + + val columnTypes = requestedColumnDataTypes.map { + case udt: UserDefinedType[_] => udt.sqlType + case other => other + }.toArray + val columnarIterator = GenerateColumnAccessor.generate(columnTypes) + columnarIterator.initialize(withMetrics, columnTypes, requestedColumnIndices.toArray) + if (enableAccumulatorsForTest && columnarIterator.hasNext) { + readPartitions.add(1) + } + columnarIterator + } + } } + override def inputRDDs(): Seq[RDD[InternalRow]] = Seq(inputRDD) + override def output: Seq[Attribute] = attributes private def updateAttribute(expr: Expression): Expression = { @@ -184,7 +228,7 @@ case class InMemoryTableScanExec( } } - lazy val enableAccumulators: Boolean = + lazy val enableAccumulatorsForTest: Boolean = sqlContext.getConf("spark.sql.inMemoryTableScanStatistics.enable", "false").toBoolean // Accumulators used for testing purposes @@ -229,43 +273,10 @@ case class InMemoryTableScanExec( } protected override def doExecute(): RDD[InternalRow] = { - val numOutputRows = longMetric("numOutputRows") - - if (enableAccumulators) { - readPartitions.setValue(0) - readBatches.setValue(0) - } - - // Using these variables here to avoid serialization of entire objects (if referenced directly) - // within the map Partitions closure. - val relOutput: AttributeSeq = relation.output - - filteredCachedBatches().mapPartitionsInternal { cachedBatchIterator => - // Find the ordinals and data types of the requested columns. - val (requestedColumnIndices, requestedColumnDataTypes) = - attributes.map { a => - relOutput.indexOf(a.exprId) -> a.dataType - }.unzip - - // update SQL metrics - val withMetrics = cachedBatchIterator.map { batch => - if (enableAccumulators) { - readBatches.add(1) - } - numOutputRows += batch.numRows - batch - } - - val columnTypes = requestedColumnDataTypes.map { - case udt: UserDefinedType[_] => udt.sqlType - case other => other - }.toArray - val columnarIterator = GenerateColumnAccessor.generate(columnTypes) - columnarIterator.initialize(withMetrics, columnTypes, requestedColumnIndices.toArray) - if (enableAccumulators && columnarIterator.hasNext) { - readPartitions.add(1) - } - columnarIterator + if (supportsBatch) { + WholeStageCodegenExec(this).execute() + } else { + inputRDD } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala index 1a0d67fc71fb..c27048626c8e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala @@ -116,8 +116,8 @@ object CommandUtils extends Logging { oldStats: Option[CatalogStatistics], newTotalSize: BigInt, newRowCount: Option[BigInt]): Option[CatalogStatistics] = { - val oldTotalSize = oldStats.map(_.sizeInBytes.toLong).getOrElse(-1L) - val oldRowCount = oldStats.flatMap(_.rowCount.map(_.toLong)).getOrElse(-1L) + val oldTotalSize = oldStats.map(_.sizeInBytes).getOrElse(BigInt(-1)) + val oldRowCount = oldStats.flatMap(_.rowCount).getOrElse(BigInt(-1)) var newStats: Option[CatalogStatistics] = None if (newTotalSize >= 0 && newTotalSize != oldTotalSize) { newStats = 
Some(CatalogStatistics(sizeInBytes = newTotalSize)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala index 8731ee88f87f..835ce9846247 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala @@ -26,7 +26,7 @@ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.rdd.{InputFileBlockHolder, RDD} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.vectorized.ColumnarBatch +import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.NextIterator /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala index 89d8a85a9cbd..6b3463852977 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala @@ -82,7 +82,11 @@ case class HadoopFsRelation( } } - override def sizeInBytes: Long = location.sizeInBytes + override def sizeInBytes: Long = { + val compressionFactor = sqlContext.conf.fileCompressionFactor + (location.sizeInBytes * compressionFactor).toLong + } + override def inputFiles: Array[String] = location.inputFiles } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index ad24e280d942..dd7ef0d15c14 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.command._ +import org.apache.spark.sql.internal.SQLConf.PartitionOverwriteMode import org.apache.spark.sql.util.SchemaUtils /** @@ -89,13 +90,19 @@ case class InsertIntoHadoopFsRelationCommand( } val pathExists = fs.exists(qualifiedOutputPath) - // If we are appending data to an existing dir. - val isAppend = pathExists && (mode == SaveMode.Append) + + val enableDynamicOverwrite = + sparkSession.sessionState.conf.partitionOverwriteMode == PartitionOverwriteMode.DYNAMIC + // This config only makes sense when we are overwriting a partitioned dataset with dynamic + // partition columns. 
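// dynamicPartitionOverwrite (computed just below) is true only when the session-level mode is
// DYNAMIC, the save mode is Overwrite, and at least one partition column is left dynamic
// (staticPartitions.size < partitionColumns.length). A rough user-facing sketch of what that
// enables; the config key is assumed to match SQLConf's PartitionOverwriteMode entry and the
// output path is made up.
import org.apache.spark.sql.SparkSession

def overwriteOnePartitionSketch(spark: SparkSession): Unit = {
  spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
  val updates = spark.range(10).selectExpr("id", "'2018-01-01' as dt")
  // Only the dt=2018-01-01 directory is rewritten; other partitions under the path are kept.
  updates.write
    .mode("overwrite")
    .partitionBy("dt")
    .parquet("/tmp/events")
}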
+ val dynamicPartitionOverwrite = enableDynamicOverwrite && mode == SaveMode.Overwrite && + staticPartitions.size < partitionColumns.length val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = java.util.UUID.randomUUID().toString, - outputPath = outputPath.toString) + outputPath = outputPath.toString, + dynamicPartitionOverwrite = dynamicPartitionOverwrite) val doInsertion = (mode, pathExists) match { case (SaveMode.ErrorIfExists, true) => @@ -103,6 +110,9 @@ case class InsertIntoHadoopFsRelationCommand( case (SaveMode.Overwrite, true) => if (ifPartitionNotExists && matchingPartitions.nonEmpty) { false + } else if (dynamicPartitionOverwrite) { + // For dynamic partition overwrite, do not delete partition directories ahead. + true } else { deleteMatchingPartitions(fs, qualifiedOutputPath, customPartitionLocations, committer) true @@ -126,7 +136,9 @@ case class InsertIntoHadoopFsRelationCommand( catalogTable.get.identifier, newPartitions.toSeq.map(p => (p, None)), ifNotExists = true).run(sparkSession) } - if (mode == SaveMode.Overwrite) { + // For dynamic partition overwrite, we never remove partitions but only update existing + // ones. + if (mode == SaveMode.Overwrite && !dynamicPartitionOverwrite) { val deletedPartitions = initialMatchingPartitions.toSet -- updatedPartitions if (deletedPartitions.nonEmpty) { AlterTableDropPartitionCommand( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 40825a1f724b..39c594a9bc61 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -29,11 +29,15 @@ import org.apache.spark.sql.internal.SQLConf * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual * Hadoop output committer using an option specified in SQLConf. 
*/ -class SQLHadoopMapReduceCommitProtocol(jobId: String, path: String) - extends HadoopMapReduceCommitProtocol(jobId, path) with Serializable with Logging { +class SQLHadoopMapReduceCommitProtocol( + jobId: String, + path: String, + dynamicPartitionOverwrite: Boolean = false) + extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) + with Serializable with Logging { override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { - var committer = context.getOutputFormatClass.newInstance().getOutputCommitter(context) + var committer = super.setupCommitter(context) val configuration = context.getConfiguration val clazz = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala index f7471cd7debc..2dd314d16534 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala @@ -38,6 +38,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.util.SerializableConfiguration @@ -118,6 +119,13 @@ class OrcFileFormat } } + override def supportBatch(sparkSession: SparkSession, schema: StructType): Boolean = { + val conf = sparkSession.sessionState.conf + conf.orcVectorizedReaderEnabled && conf.wholeStageEnabled && + schema.length <= conf.wholeStageMaxNumFields && + schema.forall(_.dataType.isInstanceOf[AtomicType]) + } + override def isSplitable( sparkSession: SparkSession, options: Map[String, String], @@ -139,6 +147,12 @@ class OrcFileFormat } } + val resultSchema = StructType(requiredSchema.fields ++ partitionSchema.fields) + val sqlConf = sparkSession.sessionState.conf + val enableOffHeapColumnVector = sqlConf.offHeapColumnVectorEnabled + val enableVectorizedReader = supportBatch(sparkSession, resultSchema) + val copyToSpark = sparkSession.sessionState.conf.getConf(SQLConf.ORC_COPY_BATCH_TO_SPARK) + val broadcastedConf = sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) val isCaseSensitive = sparkSession.sessionState.conf.caseSensitiveAnalysis @@ -146,8 +160,14 @@ class OrcFileFormat (file: PartitionedFile) => { val conf = broadcastedConf.value.value + val filePath = new Path(new URI(file.filePath)) + + val fs = filePath.getFileSystem(conf) + val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) + val reader = OrcFile.createReader(filePath, readerOptions) + val requestedColIdsOrEmptyFile = OrcUtils.requestedColumnIds( - isCaseSensitive, dataSchema, requiredSchema, new Path(new URI(file.filePath)), conf) + isCaseSensitive, dataSchema, requiredSchema, reader, conf) if (requestedColIdsOrEmptyFile.isEmpty) { Iterator.empty @@ -155,29 +175,46 @@ class OrcFileFormat val requestedColIds = requestedColIdsOrEmptyFile.get assert(requestedColIds.length == requiredSchema.length, "[BUG] requested column IDs do not match required schema") - conf.set(OrcConf.INCLUDE_COLUMNS.getAttribute, + val taskConf = new Configuration(conf) + taskConf.set(OrcConf.INCLUDE_COLUMNS.getAttribute, requestedColIds.filter(_ != -1).sorted.mkString(",")) 
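// supportBatch above gates the new vectorized ORC path: the ORC vectorized reader and
// whole-stage codegen must both be enabled, the result schema (data plus partition columns)
// must stay under the whole-stage field limit, and every column must be an atomic type.
// A standalone restatement of that predicate, approximating the AtomicType check with an
// explicit whitelist so the sketch compiles outside the sql package:
import org.apache.spark.sql.types._

def canReadOrcAsBatchesSketch(
    resultSchema: StructType,
    vectorizedEnabled: Boolean,
    wholeStageEnabled: Boolean,
    maxFields: Int): Boolean = {
  def isAtomic(dt: DataType): Boolean = dt match {
    case BooleanType | ByteType | ShortType | IntegerType | LongType | FloatType | DoubleType |
         DateType | TimestampType | StringType | BinaryType => true
    case _: DecimalType => true
    case _ => false
  }
  vectorizedEnabled && wholeStageEnabled &&
    resultSchema.length <= maxFields &&
    resultSchema.forall(f => isAtomic(f.dataType))
}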
- val fileSplit = - new FileSplit(new Path(new URI(file.filePath)), file.start, file.length, Array.empty) + val fileSplit = new FileSplit(filePath, file.start, file.length, Array.empty) val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) - val taskAttemptContext = new TaskAttemptContextImpl(conf, attemptId) - - val orcRecordReader = new OrcInputFormat[OrcStruct] - .createRecordReader(fileSplit, taskAttemptContext) - val iter = new RecordReaderIterator[OrcStruct](orcRecordReader) - Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => iter.close())) - - val fullSchema = requiredSchema.toAttributes ++ partitionSchema.toAttributes - val unsafeProjection = GenerateUnsafeProjection.generate(fullSchema, fullSchema) - val deserializer = new OrcDeserializer(dataSchema, requiredSchema, requestedColIds) - - if (partitionSchema.length == 0) { - iter.map(value => unsafeProjection(deserializer.deserialize(value))) + val taskAttemptContext = new TaskAttemptContextImpl(taskConf, attemptId) + + val taskContext = Option(TaskContext.get()) + if (enableVectorizedReader) { + val batchReader = new OrcColumnarBatchReader( + enableOffHeapColumnVector && taskContext.isDefined, copyToSpark) + batchReader.initialize(fileSplit, taskAttemptContext) + batchReader.initBatch( + reader.getSchema, + requestedColIds, + requiredSchema.fields, + partitionSchema, + file.partitionValues) + + val iter = new RecordReaderIterator(batchReader) + Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => iter.close())) + iter.asInstanceOf[Iterator[InternalRow]] } else { - val joinedRow = new JoinedRow() - iter.map(value => - unsafeProjection(joinedRow(deserializer.deserialize(value), file.partitionValues))) + val orcRecordReader = new OrcInputFormat[OrcStruct] + .createRecordReader(fileSplit, taskAttemptContext) + val iter = new RecordReaderIterator[OrcStruct](orcRecordReader) + Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => iter.close())) + + val fullSchema = requiredSchema.toAttributes ++ partitionSchema.toAttributes + val unsafeProjection = GenerateUnsafeProjection.generate(fullSchema, fullSchema) + val deserializer = new OrcDeserializer(dataSchema, requiredSchema, requestedColIds) + + if (partitionSchema.length == 0) { + iter.map(value => unsafeProjection(deserializer.deserialize(value))) + } else { + val joinedRow = new JoinedRow() + iter.map(value => + unsafeProjection(joinedRow(deserializer.deserialize(value), file.partitionValues))) + } } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala index b03ee06d04a1..460194ba61c8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala @@ -21,8 +21,9 @@ import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} -import org.apache.orc.{OrcFile, TypeDescription} +import org.apache.orc.{OrcFile, Reader, TypeDescription} +import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession @@ -50,23 +51,35 @@ object OrcUtils extends Logging { paths } - def readSchema(file: Path, conf: Configuration): Option[TypeDescription] = { + def readSchema(file: Path, conf: Configuration, 
ignoreCorruptFiles: Boolean) + : Option[TypeDescription] = { val fs = file.getFileSystem(conf) val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) - val reader = OrcFile.createReader(file, readerOptions) - val schema = reader.getSchema - if (schema.getFieldNames.size == 0) { - None - } else { - Some(schema) + try { + val reader = OrcFile.createReader(file, readerOptions) + val schema = reader.getSchema + if (schema.getFieldNames.size == 0) { + None + } else { + Some(schema) + } + } catch { + case e: org.apache.orc.FileFormatException => + if (ignoreCorruptFiles) { + logWarning(s"Skipped the footer in the corrupted file: $file", e) + None + } else { + throw new SparkException(s"Could not read footer for file: $file", e) + } } } def readSchema(sparkSession: SparkSession, files: Seq[FileStatus]) : Option[StructType] = { + val ignoreCorruptFiles = sparkSession.sessionState.conf.ignoreCorruptFiles val conf = sparkSession.sessionState.newHadoopConf() // TODO: We need to support merge schema. Please see SPARK-11412. - files.map(_.getPath).flatMap(readSchema(_, conf)).headOption.map { schema => + files.map(_.getPath).flatMap(readSchema(_, conf, ignoreCorruptFiles)).headOption.map { schema => logDebug(s"Reading schema from file $files, got Hive schema string: $schema") CatalystSqlParser.parseDataType(schema.toString).asInstanceOf[StructType] } @@ -80,11 +93,8 @@ object OrcUtils extends Logging { isCaseSensitive: Boolean, dataSchema: StructType, requiredSchema: StructType, - file: Path, + reader: Reader, conf: Configuration): Option[Array[Int]] = { - val fs = file.getFileSystem(conf) - val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) - val reader = OrcFile.createReader(file, readerOptions) val orcFieldNames = reader.getSchema.getFieldNames.asScala if (orcFieldNames.isEmpty) { // SPARK-8501: Some old empty ORC files always have an empty schema stored in their footer. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala index 772d4565de54..ef67ea7d17ce 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources.parquet import java.util.Locale +import org.apache.parquet.hadoop.ParquetOutputFormat import org.apache.parquet.hadoop.metadata.CompressionCodecName import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap @@ -42,8 +43,15 @@ private[parquet] class ParquetOptions( * Acceptable values are defined in [[shortParquetCompressionCodecNames]]. */ val compressionCodecClassName: String = { - val codecName = parameters.getOrElse("compression", - sqlConf.parquetCompressionCodec).toLowerCase(Locale.ROOT) + // `compression`, `parquet.compression`(i.e., ParquetOutputFormat.COMPRESSION), and + // `spark.sql.parquet.compression.codec` + // are in order of precedence from highest to lowest. 
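// The lookup added below resolves the codec with a three-level precedence: the writer option
// "compression" wins, then the Parquet-native key "parquet.compression"
// (ParquetOutputFormat.COMPRESSION), then the session default
// spark.sql.parquet.compression.codec. A sketch of what that means for callers; the output
// path is made up.
import org.apache.spark.sql.SparkSession

def codecPrecedenceSketch(spark: SparkSession): Unit = {
  spark.conf.set("spark.sql.parquet.compression.codec", "snappy") // session default, lowest
  spark.range(100).write
    .option("parquet.compression", "gzip") // overrides the session default
    .option("compression", "none")         // highest precedence: this codec is used
    .parquet("/tmp/codec-demo")
}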
+ val parquetCompressionConf = parameters.get(ParquetOutputFormat.COMPRESSION) + val codecName = parameters + .get("compression") + .orElse(parquetCompressionConf) + .getOrElse(sqlConf.parquetCompressionCodec) + .toLowerCase(Locale.ROOT) if (!shortParquetCompressionCodecNames.contains(codecName)) { val availableCodecs = shortParquetCompressionCodecNames.keys.map(_.toLowerCase(Locale.ROOT)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala index 5f30be5ed4af..ac104d7cd0cb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala @@ -18,19 +18,19 @@ package org.apache.spark.sql.execution.datasources.v2 import scala.collection.JavaConverters._ +import scala.reflect.ClassTag import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.sources.v2.reader.ReadTask -class DataSourceRDDPartition(val index: Int, val readTask: ReadTask[UnsafeRow]) +class DataSourceRDDPartition[T : ClassTag](val index: Int, val readTask: ReadTask[T]) extends Partition with Serializable -class DataSourceRDD( +class DataSourceRDD[T: ClassTag]( sc: SparkContext, - @transient private val readTasks: java.util.List[ReadTask[UnsafeRow]]) - extends RDD[UnsafeRow](sc, Nil) { + @transient private val readTasks: java.util.List[ReadTask[T]]) + extends RDD[T](sc, Nil) { override protected def getPartitions: Array[Partition] = { readTasks.asScala.zipWithIndex.map { @@ -38,10 +38,10 @@ class DataSourceRDD( }.toArray } - override def compute(split: Partition, context: TaskContext): Iterator[UnsafeRow] = { - val reader = split.asInstanceOf[DataSourceRDDPartition].readTask.createDataReader() + override def compute(split: Partition, context: TaskContext): Iterator[T] = { + val reader = split.asInstanceOf[DataSourceRDDPartition[T]].readTask.createDataReader() context.addTaskCompletionListener(_ => reader.close()) - val iter = new Iterator[UnsafeRow] { + val iter = new Iterator[T] { private[this] var valuePrepared = false override def hasNext: Boolean = { @@ -51,7 +51,7 @@ class DataSourceRDD( valuePrepared } - override def next(): UnsafeRow = { + override def next(): T = { if (!hasNext) { throw new java.util.NoSuchElementException("End of stream") } @@ -63,6 +63,6 @@ class DataSourceRDD( } override def getPreferredLocations(split: Partition): Seq[String] = { - split.asInstanceOf[DataSourceRDDPartition].readTask.preferredLocations() + split.asInstanceOf[DataSourceRDDPartition[T]].readTask.preferredLocations() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala index 7eb99a645001..cba20dd90200 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala @@ -35,6 +35,16 @@ case class DataSourceV2Relation( } } +/** + * A specialization of DataSourceV2Relation with the streaming bit set to true. Otherwise identical + * to the non-streaming relation. 
+ */ +class StreamingDataSourceV2Relation( + fullOutput: Seq[AttributeReference], + reader: DataSourceV2Reader) extends DataSourceV2Relation(fullOutput, reader) { + override def isStreaming: Boolean = true +} + object DataSourceV2Relation { def apply(reader: DataSourceV2Reader): DataSourceV2Relation = { new DataSourceV2Relation(reader.readSchema().toAttributes, reader) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala index 49c506bc560c..beb66738732b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala @@ -24,10 +24,8 @@ import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.execution.LeafExecNode -import org.apache.spark.sql.execution.metric.SQLMetrics -import org.apache.spark.sql.execution.streaming.StreamExecution -import org.apache.spark.sql.execution.streaming.continuous.{ContinuousDataSourceRDD, ContinuousExecution, EpochCoordinatorRef, SetReaderPartitions} +import org.apache.spark.sql.execution.{ColumnarBatchScan, LeafExecNode, WholeStageCodegenExec} +import org.apache.spark.sql.execution.streaming.continuous._ import org.apache.spark.sql.sources.v2.reader._ import org.apache.spark.sql.sources.v2.streaming.reader.ContinuousReader import org.apache.spark.sql.types.StructType @@ -37,40 +35,57 @@ import org.apache.spark.sql.types.StructType */ case class DataSourceV2ScanExec( fullOutput: Seq[AttributeReference], - @transient reader: DataSourceV2Reader) extends LeafExecNode with DataSourceReaderHolder { + @transient reader: DataSourceV2Reader) + extends LeafExecNode with DataSourceReaderHolder with ColumnarBatchScan { override def canEqual(other: Any): Boolean = other.isInstanceOf[DataSourceV2ScanExec] - override def references: AttributeSet = AttributeSet.empty + override def producedAttributes: AttributeSet = AttributeSet(fullOutput) - override lazy val metrics = Map( - "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) + private lazy val readTasks: java.util.List[ReadTask[UnsafeRow]] = reader match { + case r: SupportsScanUnsafeRow => r.createUnsafeRowReadTasks() + case _ => + reader.createReadTasks().asScala.map { + new RowToUnsafeRowReadTask(_, reader.readSchema()): ReadTask[UnsafeRow] + }.asJava + } - override protected def doExecute(): RDD[InternalRow] = { - val readTasks: java.util.List[ReadTask[UnsafeRow]] = reader match { - case r: SupportsScanUnsafeRow => r.createUnsafeRowReadTasks() - case _ => - reader.createReadTasks().asScala.map { - new RowToUnsafeRowReadTask(_, reader.readSchema()): ReadTask[UnsafeRow] - }.asJava - } + private lazy val inputRDD: RDD[InternalRow] = reader match { + case r: SupportsScanColumnarBatch if r.enableBatchRead() => + assert(!reader.isInstanceOf[ContinuousReader], + "continuous stream reader does not support columnar read yet.") + new DataSourceRDD(sparkContext, r.createBatchReadTasks()).asInstanceOf[RDD[InternalRow]] + + case _: ContinuousReader => + EpochCoordinatorRef.get( + sparkContext.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), + sparkContext.env) + 
.askSync[Unit](SetReaderPartitions(readTasks.size())) + new ContinuousDataSourceRDD(sparkContext, sqlContext, readTasks) + .asInstanceOf[RDD[InternalRow]] + + case _ => + new DataSourceRDD(sparkContext, readTasks).asInstanceOf[RDD[InternalRow]] + } - val inputRDD = reader match { - case _: ContinuousReader => - EpochCoordinatorRef.get( - sparkContext.getLocalProperty(ContinuousExecution.RUN_ID_KEY), sparkContext.env) - .askSync[Unit](SetReaderPartitions(readTasks.size())) + override def inputRDDs(): Seq[RDD[InternalRow]] = Seq(inputRDD) - new ContinuousDataSourceRDD(sparkContext, sqlContext, readTasks) + override val supportsBatch: Boolean = reader match { + case r: SupportsScanColumnarBatch if r.enableBatchRead() => true + case _ => false + } - case _ => - new DataSourceRDD(sparkContext, readTasks) - } + override protected def needsUnsafeRowConversion: Boolean = false - val numOutputRows = longMetric("numOutputRows") - inputRDD.asInstanceOf[RDD[InternalRow]].map { r => - numOutputRows += 1 - r + override protected def doExecute(): RDD[InternalRow] = { + if (supportsBatch) { + WholeStageCodegenExec(this).execute() + } else { + val numOutputRows = longMetric("numOutputRows") + inputRDD.map { r => + numOutputRows += 1 + r + } } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala index f0bdf84bb7a8..3dbdae7b4df9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala @@ -64,7 +64,8 @@ case class WriteToDataSourceV2Exec(writer: DataSourceV2Writer, query: SparkPlan) val runTask = writer match { case w: ContinuousWriter => EpochCoordinatorRef.get( - sparkContext.getLocalProperty(ContinuousExecution.RUN_ID_KEY), sparkContext.env) + sparkContext.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), + sparkContext.env) .askSync[Unit](SetWriterPartitions(rdd.getNumPartitions)) (context: TaskContext, iter: Iterator[InternalRow]) => @@ -81,9 +82,11 @@ case class WriteToDataSourceV2Exec(writer: DataSourceV2Writer, query: SparkPlan) (index, message: WriterCommitMessage) => messages(index) = message ) - logInfo(s"Data source writer $writer is committing.") - writer.commit(messages) - logInfo(s"Data source writer $writer committed.") + if (!writer.isInstanceOf[ContinuousWriter]) { + logInfo(s"Data source writer $writer is committing.") + writer.commit(messages) + logInfo(s"Data source writer $writer committed.") + } } catch { case _: InterruptedException if writer.isInstanceOf[ContinuousWriter] => // Interruption is how continuous queries are ended, so accept and ignore the exception. 
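// In DataSourceV2ScanExec above, the input RDD is now chosen from the reader's capabilities:
// columnar-batch readers feed whole-stage codegen directly, continuous readers go through the
// epoch-coordinated ContinuousDataSourceRDD, and everything else falls back to row-based
// ReadTasks. A toy restatement of that dispatch with illustrative stand-in types, just to make
// the three-way split explicit:
sealed trait ReaderSketch
case class ColumnarReaderSketch(batchesEnabled: Boolean) extends ReaderSketch
case object ContinuousReaderSketch extends ReaderSketch
case object RowReaderSketch extends ReaderSketch

def planScanSketch(reader: ReaderSketch): String = reader match {
  case ColumnarReaderSketch(true) => "DataSourceRDD of ColumnarBatch, wrapped in WholeStageCodegenExec"
  case ContinuousReaderSketch => "ContinuousDataSourceRDD, partitions registered with the EpochCoordinator"
  case _ => "DataSourceRDD of UnsafeRow, one row at a time"
}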
@@ -133,7 +136,7 @@ object DataWritingSparkTask extends Logging { iter: Iterator[InternalRow]): WriterCommitMessage = { val dataWriter = writeTask.createDataWriter(context.partitionId(), context.attemptNumber()) val epochCoordinator = EpochCoordinatorRef.get( - context.getLocalProperty(ContinuousExecution.RUN_ID_KEY), + context.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), SparkEnv.get) val currentMsg: WriterCommitMessage = null var currentEpoch = context.getLocalProperty(ContinuousExecution.START_EPOCH_KEY).toLong diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala index c8e236be28b4..e3d28388c547 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala @@ -46,23 +46,6 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { if (minNumPostShufflePartitions > 0) Some(minNumPostShufflePartitions) else None } - /** - * Given a required distribution, returns a partitioning that satisfies that distribution. - * @param requiredDistribution The distribution that is required by the operator - * @param numPartitions Used when the distribution doesn't require a specific number of partitions - */ - private def createPartitioning( - requiredDistribution: Distribution, - numPartitions: Int): Partitioning = { - requiredDistribution match { - case AllTuples => SinglePartition - case ClusteredDistribution(clustering, desiredPartitions) => - HashPartitioning(clustering, desiredPartitions.getOrElse(numPartitions)) - case OrderedDistribution(ordering) => RangePartitioning(ordering, numPartitions) - case dist => sys.error(s"Do not know how to satisfy distribution $dist") - } - } - /** * Adds [[ExchangeCoordinator]] to [[ShuffleExchangeExec]]s if adaptive query execution is enabled * and partitioning schemes of these [[ShuffleExchangeExec]]s support [[ExchangeCoordinator]]. @@ -88,8 +71,9 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { // shuffle data when we have more than one children because data generated by // these children may not be partitioned in the same way. // Please see the comment in withCoordinator for more details. - val supportsDistribution = - requiredChildDistributions.forall(_.isInstanceOf[ClusteredDistribution]) + val supportsDistribution = requiredChildDistributions.forall { dist => + dist.isInstanceOf[ClusteredDistribution] || dist.isInstanceOf[HashClusteredDistribution] + } children.length > 1 && supportsDistribution } @@ -142,8 +126,7 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { // // It will be great to introduce a new Partitioning to represent the post-shuffle // partitions when one post-shuffle partition includes multiple pre-shuffle partitions. 
- val targetPartitioning = - createPartitioning(distribution, defaultNumPreShufflePartitions) + val targetPartitioning = distribution.createPartitioning(defaultNumPreShufflePartitions) assert(targetPartitioning.isInstanceOf[HashPartitioning]) ShuffleExchangeExec(targetPartitioning, child, Some(coordinator)) } @@ -162,71 +145,56 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { assert(requiredChildDistributions.length == children.length) assert(requiredChildOrderings.length == children.length) - // Ensure that the operator's children satisfy their output distribution requirements: + // Ensure that the operator's children satisfy their output distribution requirements. children = children.zip(requiredChildDistributions).map { case (child, distribution) if child.outputPartitioning.satisfies(distribution) => child case (child, BroadcastDistribution(mode)) => BroadcastExchangeExec(mode, child) case (child, distribution) => - ShuffleExchangeExec(createPartitioning(distribution, defaultNumPreShufflePartitions), child) + val numPartitions = distribution.requiredNumPartitions + .getOrElse(defaultNumPreShufflePartitions) + ShuffleExchangeExec(distribution.createPartitioning(numPartitions), child) } - // If the operator has multiple children and specifies child output distributions (e.g. join), - // then the children's output partitionings must be compatible: - def requireCompatiblePartitioning(distribution: Distribution): Boolean = distribution match { - case UnspecifiedDistribution => false - case BroadcastDistribution(_) => false + // Get the indexes of children which have specified distribution requirements and need to have + // same number of partitions. + val childrenIndexes = requiredChildDistributions.zipWithIndex.filter { + case (UnspecifiedDistribution, _) => false + case (_: BroadcastDistribution, _) => false case _ => true - } - if (children.length > 1 - && requiredChildDistributions.exists(requireCompatiblePartitioning) - && !Partitioning.allCompatible(children.map(_.outputPartitioning))) { + }.map(_._2) - // First check if the existing partitions of the children all match. This means they are - // partitioned by the same partitioning into the same number of partitions. In that case, - // don't try to make them match `defaultPartitions`, just use the existing partitioning. - val maxChildrenNumPartitions = children.map(_.outputPartitioning.numPartitions).max - val useExistingPartitioning = children.zip(requiredChildDistributions).forall { - case (child, distribution) => - child.outputPartitioning.guarantees( - createPartitioning(distribution, maxChildrenNumPartitions)) + val childrenNumPartitions = + childrenIndexes.map(children(_).outputPartitioning.numPartitions).toSet + + if (childrenNumPartitions.size > 1) { + // Get the number of partitions which is explicitly required by the distributions. + val requiredNumPartitions = { + val numPartitionsSet = childrenIndexes.flatMap { + index => requiredChildDistributions(index).requiredNumPartitions + }.toSet + assert(numPartitionsSet.size <= 1, + s"$operator have incompatible requirements of the number of partitions for its children") + numPartitionsSet.headOption } - children = if (useExistingPartitioning) { - // We do not need to shuffle any child's output. - children - } else { - // We need to shuffle at least one child's output. - // Now, we will determine the number of partitions that will be used by created - // partitioning schemes. 
- val numPartitions = { - // Let's see if we need to shuffle all child's outputs when we use - // maxChildrenNumPartitions. - val shufflesAllChildren = children.zip(requiredChildDistributions).forall { - case (child, distribution) => - !child.outputPartitioning.guarantees( - createPartitioning(distribution, maxChildrenNumPartitions)) - } - // If we need to shuffle all children, we use defaultNumPreShufflePartitions as the - // number of partitions. Otherwise, we use maxChildrenNumPartitions. - if (shufflesAllChildren) defaultNumPreShufflePartitions else maxChildrenNumPartitions - } + val targetNumPartitions = requiredNumPartitions.getOrElse(childrenNumPartitions.max) - children.zip(requiredChildDistributions).map { - case (child, distribution) => - val targetPartitioning = createPartitioning(distribution, numPartitions) - if (child.outputPartitioning.guarantees(targetPartitioning)) { - child - } else { - child match { - // If child is an exchange, we replace it with - // a new one having targetPartitioning. - case ShuffleExchangeExec(_, c, _) => ShuffleExchangeExec(targetPartitioning, c) - case _ => ShuffleExchangeExec(targetPartitioning, child) - } + children = children.zip(requiredChildDistributions).zipWithIndex.map { + case ((child, distribution), index) if childrenIndexes.contains(index) => + if (child.outputPartitioning.numPartitions == targetNumPartitions) { + child + } else { + val defaultPartitioning = distribution.createPartitioning(targetNumPartitions) + child match { + // If child is an exchange, we replace it with a new one having defaultPartitioning. + case ShuffleExchangeExec(_, c, _) => ShuffleExchangeExec(defaultPartitioning, c) + case _ => ShuffleExchangeExec(defaultPartitioning, child) + } } - } + + case ((child, _), _) => child } } @@ -324,10 +292,10 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { } def apply(plan: SparkPlan): SparkPlan = plan.transformUp { - case operator @ ShuffleExchangeExec(partitioning, child, _) => - child.children match { - case ShuffleExchangeExec(childPartitioning, baseChild, _)::Nil => - if (childPartitioning.guarantees(partitioning)) child else operator + // TODO: remove this after we create a physical operator for `RepartitionByExpression`. 
+ case operator @ ShuffleExchangeExec(upper: HashPartitioning, child, _) => + child.outputPartitioning match { + case lower: HashPartitioning if upper.semanticEquals(lower) => child case _ => operator } case operator: SparkPlan => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoinExec.scala index 66e8031bb519..897a4dae39f3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoinExec.scala @@ -46,7 +46,7 @@ case class ShuffledHashJoinExec( "avgHashProbe" -> SQLMetrics.createAverageMetric(sparkContext, "avg hash probe")) override def requiredChildDistribution: Seq[Distribution] = - ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil + HashClusteredDistribution(leftKeys) :: HashClusteredDistribution(rightKeys) :: Nil private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = { val buildDataSize = longMetric("buildDataSize") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala index 94405410cce9..2de2f30eb05d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala @@ -78,7 +78,7 @@ case class SortMergeJoinExec( } override def requiredChildDistribution: Seq[Distribution] = - ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil + HashClusteredDistribution(leftKeys) :: HashClusteredDistribution(rightKeys) :: Nil override def outputOrdering: Seq[SortOrder] = joinType match { // For inner join, orders of both sides keys should be kept. 
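// The EnsureRequirements rewrite above no longer builds a partitioning per child on its own;
// each Distribution produces one via createPartitioning, and when a multi-child operator (for
// example a join whose sides now require HashClusteredDistribution) sees children with
// different partition counts, it settles on a single target: the count explicitly required by
// a distribution if there is one, otherwise the maximum across the children. That selection in
// isolation, as a minimal sketch:
def chooseTargetNumPartitionsSketch(
    requiredCounts: Seq[Option[Int]],  // requiredNumPartitions of each relevant distribution
    childCounts: Seq[Int]): Int = {    // outputPartitioning.numPartitions of each child
  val explicit = requiredCounts.flatten.toSet
  require(explicit.size <= 1, "children have incompatible required partition counts")
  explicit.headOption.getOrElse(childCounts.max)
}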
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala index d1bd8a707686..03d1bbf2ab88 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala @@ -456,7 +456,7 @@ case class CoGroupExec( right: SparkPlan) extends BinaryExecNode with ObjectProducerExec { override def requiredChildDistribution: Seq[Distribution] = - ClusteredDistribution(leftGroup) :: ClusteredDistribution(rightGroup) :: Nil + HashClusteredDistribution(leftGroup) :: HashClusteredDistribution(rightGroup) :: Nil override def requiredChildOrdering: Seq[Seq[SortOrder]] = leftGroup.map(SortOrder(_, Ascending)) :: rightGroup.map(SortOrder(_, Ascending)) :: Nil diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala index c06bc7b66ff3..47b146f076b6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala @@ -74,8 +74,7 @@ case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], chi schema: StructType, context: TaskContext): Iterator[InternalRow] = { - val schemaOut = StructType.fromAttributes(output.drop(child.output.length).zipWithIndex - .map { case (attr, i) => attr.withName(s"_$i") }) + val outputTypes = output.drop(child.output.length).map(_.dataType) // DO NOT use iter.grouped(). See BatchIterator. val batchIter = if (batchSize > 0) new BatchIterator(iter, batchSize) else Iterator(iter) @@ -90,8 +89,9 @@ case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], chi private var currentIter = if (columnarBatchIter.hasNext) { val batch = columnarBatchIter.next() - assert(schemaOut.equals(batch.schema), - s"Invalid schema from pandas_udf: expected $schemaOut, got ${batch.schema}") + val actualDataTypes = (0 until batch.numCols()).map(i => batch.column(i).dataType()) + assert(outputTypes == actualDataTypes, "Invalid schema from pandas_udf: " + + s"expected ${outputTypes.mkString(", ")}, got ${actualDataTypes.mkString(", ")}") batch.rowIterator.asScala } else { Iterator.empty diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala index 5cc8ed353565..5fcdcddca7d5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala @@ -30,8 +30,8 @@ import org.apache.spark._ import org.apache.spark.api.python._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.arrow.{ArrowUtils, ArrowWriter} -import org.apache.spark.sql.execution.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector} import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector} import org.apache.spark.util.Utils /** @@ -138,7 +138,7 @@ class ArrowPythonRunner( if (reader != null && batchLoaded) { batchLoaded = reader.loadNextBatch() if (batchLoaded) { - val batch = new ColumnarBatch(schema, vectors, root.getRowCount) + val batch = new ColumnarBatch(vectors) batch.setNumRows(root.getRowCount) batch } else { diff 
--git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExec.scala index 26ee25f633ea..f4d83e8dc7c2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExec.scala @@ -79,16 +79,19 @@ case class BatchEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], chi } else { StructType(udfs.map(u => StructField("", u.dataType, u.nullable))) } + + val fromJava = EvaluatePython.makeFromJava(resultType) + outputIterator.flatMap { pickedResult => val unpickledBatch = unpickle.loads(pickedResult) unpickledBatch.asInstanceOf[java.util.ArrayList[Any]].asScala }.map { result => if (udfs.length == 1) { // fast path for single UDF - mutableRow(0) = EvaluatePython.fromJava(result, resultType) + mutableRow(0) = fromJava(result) mutableRow } else { - EvaluatePython.fromJava(result, resultType).asInstanceOf[InternalRow] + fromJava(result).asInstanceOf[InternalRow] } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala index 9bbfa6018ba7..520afad28764 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala @@ -83,82 +83,134 @@ object EvaluatePython { } /** - * Converts `obj` to the type specified by the data type, or returns null if the type of obj is - * unexpected. Because Python doesn't enforce the type. + * Make a converter that converts `obj` to the type specified by the data type, or returns + * null if the type of obj is unexpected. Because Python doesn't enforce the type. 
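// The change below replaces the per-value fromJava(obj, dataType) match with
// makeFromJava(dataType), which walks the schema once and returns a reusable Any => Any
// closure, so the nested DataType dispatch is no longer repeated for every row coming back
// from Python. A trimmed sketch of the idea covering just two leaf types; unexpected inputs
// fall through to null, mirroring the lenient behaviour described above.
import org.apache.spark.sql.types.{DataType, IntegerType, StringType}
import org.apache.spark.unsafe.types.UTF8String

def makeConverterSketch(dataType: DataType): Any => Any = dataType match {
  case IntegerType => {
    case null => null
    case c: Byte => c.toInt
    case c: Short => c.toInt
    case c: Int => c
    case c: Long => c.toInt
    case _ => null
  }
  case StringType => {
    case null => null
    case other => UTF8String.fromString(other.toString)
  }
  case _ => (_: Any) => null   // sketch only; the real converter covers every DataType
}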
*/ - def fromJava(obj: Any, dataType: DataType): Any = (obj, dataType) match { - case (null, _) => null - - case (c: Boolean, BooleanType) => c + def makeFromJava(dataType: DataType): Any => Any = dataType match { + case BooleanType => (obj: Any) => nullSafeConvert(obj) { + case b: Boolean => b + } - case (c: Byte, ByteType) => c - case (c: Short, ByteType) => c.toByte - case (c: Int, ByteType) => c.toByte - case (c: Long, ByteType) => c.toByte + case ByteType => (obj: Any) => nullSafeConvert(obj) { + case c: Byte => c + case c: Short => c.toByte + case c: Int => c.toByte + case c: Long => c.toByte + } - case (c: Byte, ShortType) => c.toShort - case (c: Short, ShortType) => c - case (c: Int, ShortType) => c.toShort - case (c: Long, ShortType) => c.toShort + case ShortType => (obj: Any) => nullSafeConvert(obj) { + case c: Byte => c.toShort + case c: Short => c + case c: Int => c.toShort + case c: Long => c.toShort + } - case (c: Byte, IntegerType) => c.toInt - case (c: Short, IntegerType) => c.toInt - case (c: Int, IntegerType) => c - case (c: Long, IntegerType) => c.toInt + case IntegerType => (obj: Any) => nullSafeConvert(obj) { + case c: Byte => c.toInt + case c: Short => c.toInt + case c: Int => c + case c: Long => c.toInt + } - case (c: Byte, LongType) => c.toLong - case (c: Short, LongType) => c.toLong - case (c: Int, LongType) => c.toLong - case (c: Long, LongType) => c + case LongType => (obj: Any) => nullSafeConvert(obj) { + case c: Byte => c.toLong + case c: Short => c.toLong + case c: Int => c.toLong + case c: Long => c + } - case (c: Float, FloatType) => c - case (c: Double, FloatType) => c.toFloat + case FloatType => (obj: Any) => nullSafeConvert(obj) { + case c: Float => c + case c: Double => c.toFloat + } - case (c: Float, DoubleType) => c.toDouble - case (c: Double, DoubleType) => c + case DoubleType => (obj: Any) => nullSafeConvert(obj) { + case c: Float => c.toDouble + case c: Double => c + } - case (c: java.math.BigDecimal, dt: DecimalType) => Decimal(c, dt.precision, dt.scale) + case dt: DecimalType => (obj: Any) => nullSafeConvert(obj) { + case c: java.math.BigDecimal => Decimal(c, dt.precision, dt.scale) + } - case (c: Int, DateType) => c + case DateType => (obj: Any) => nullSafeConvert(obj) { + case c: Int => c + } - case (c: Long, TimestampType) => c - // Py4J serializes values between MIN_INT and MAX_INT as Ints, not Longs - case (c: Int, TimestampType) => c.toLong + case TimestampType => (obj: Any) => nullSafeConvert(obj) { + case c: Long => c + // Py4J serializes values between MIN_INT and MAX_INT as Ints, not Longs + case c: Int => c.toLong + } - case (c, StringType) => UTF8String.fromString(c.toString) + case StringType => (obj: Any) => nullSafeConvert(obj) { + case _ => UTF8String.fromString(obj.toString) + } - case (c: String, BinaryType) => c.getBytes(StandardCharsets.UTF_8) - case (c, BinaryType) if c.getClass.isArray && c.getClass.getComponentType.getName == "byte" => c + case BinaryType => (obj: Any) => nullSafeConvert(obj) { + case c: String => c.getBytes(StandardCharsets.UTF_8) + case c if c.getClass.isArray && c.getClass.getComponentType.getName == "byte" => c + } - case (c: java.util.List[_], ArrayType(elementType, _)) => - new GenericArrayData(c.asScala.map { e => fromJava(e, elementType)}.toArray) + case ArrayType(elementType, _) => + val elementFromJava = makeFromJava(elementType) - case (c, ArrayType(elementType, _)) if c.getClass.isArray => - new GenericArrayData(c.asInstanceOf[Array[_]].map(e => fromJava(e, elementType))) + (obj: Any) => 
nullSafeConvert(obj) { + case c: java.util.List[_] => + new GenericArrayData(c.asScala.map { e => elementFromJava(e) }.toArray) + case c if c.getClass.isArray => + new GenericArrayData(c.asInstanceOf[Array[_]].map(e => elementFromJava(e))) + } - case (javaMap: java.util.Map[_, _], MapType(keyType, valueType, _)) => - ArrayBasedMapData( - javaMap, - (key: Any) => fromJava(key, keyType), - (value: Any) => fromJava(value, valueType)) + case MapType(keyType, valueType, _) => + val keyFromJava = makeFromJava(keyType) + val valueFromJava = makeFromJava(valueType) + + (obj: Any) => nullSafeConvert(obj) { + case javaMap: java.util.Map[_, _] => + ArrayBasedMapData( + javaMap, + (key: Any) => keyFromJava(key), + (value: Any) => valueFromJava(value)) + } - case (c, StructType(fields)) if c.getClass.isArray => - val array = c.asInstanceOf[Array[_]] - if (array.length != fields.length) { - throw new IllegalStateException( - s"Input row doesn't have expected number of values required by the schema. " + - s"${fields.length} fields are required while ${array.length} values are provided." - ) + case StructType(fields) => + val fieldsFromJava = fields.map(f => makeFromJava(f.dataType)).toArray + + (obj: Any) => nullSafeConvert(obj) { + case c if c.getClass.isArray => + val array = c.asInstanceOf[Array[_]] + if (array.length != fields.length) { + throw new IllegalStateException( + s"Input row doesn't have expected number of values required by the schema. " + + s"${fields.length} fields are required while ${array.length} values are provided." + ) + } + + val row = new GenericInternalRow(fields.length) + var i = 0 + while (i < fields.length) { + row(i) = fieldsFromJava(i)(array(i)) + i += 1 + } + row } - new GenericInternalRow(array.zip(fields).map { - case (e, f) => fromJava(e, f.dataType) - }) - case (_, udt: UserDefinedType[_]) => fromJava(obj, udt.sqlType) + case udt: UserDefinedType[_] => makeFromJava(udt.sqlType) + + case other => (obj: Any) => nullSafeConvert(other)(PartialFunction.empty) + } - // all other unexpected type should be null, or we will have runtime exception - // TODO(davies): we could improve this by try to cast the object to expected type - case (c, _) => null + private def nullSafeConvert(input: Any)(f: PartialFunction[Any, Any]): Any = { + if (input == null) { + null + } else { + f.applyOrElse(input, { + // all other unexpected type should be null, or we will have runtime exception + // TODO(davies): we could improve this by try to cast the object to expected type + _: Any => null + }) + } } private val module = "pyspark.sql.types" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala index 0debd7db8475..8c016abc5b64 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala @@ -166,7 +166,7 @@ class FileStreamSource( val newDataSource = DataSource( sparkSession, - paths = files.map(_.path), + paths = files.map(f => new Path(new URI(f.path)).toString), userSpecifiedSchema = Some(schema), partitionColumns = partitionColumns, className = fileFormatClassName, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetricsReporter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetricsReporter.scala index b84e6ce64c61..66b11ecddf23 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetricsReporter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetricsReporter.scala @@ -17,15 +17,11 @@ package org.apache.spark.sql.execution.streaming -import java.{util => ju} - -import scala.collection.mutable - import com.codahale.metrics.{Gauge, MetricRegistry} import org.apache.spark.internal.Logging import org.apache.spark.metrics.source.{Source => CodahaleSource} -import org.apache.spark.util.Clock +import org.apache.spark.sql.streaming.StreamingQueryProgress /** * Serves metrics from a [[org.apache.spark.sql.streaming.StreamingQuery]] to @@ -39,14 +35,17 @@ class MetricsReporter( // Metric names should not have . in them, so that all the metrics of a query are identified // together in Ganglia as a single metric group - registerGauge("inputRate-total", () => stream.lastProgress.inputRowsPerSecond) - registerGauge("processingRate-total", () => stream.lastProgress.processedRowsPerSecond) - registerGauge("latency", () => stream.lastProgress.durationMs.get("triggerExecution").longValue()) - - private def registerGauge[T](name: String, f: () => T)(implicit num: Numeric[T]): Unit = { + registerGauge("inputRate-total", _.inputRowsPerSecond, 0.0) + registerGauge("processingRate-total", _.processedRowsPerSecond, 0.0) + registerGauge("latency", _.durationMs.get("triggerExecution").longValue(), 0L) + + private def registerGauge[T]( + name: String, + f: StreamingQueryProgress => T, + default: T): Unit = { synchronized { metricRegistry.register(name, new Gauge[T] { - override def getValue: T = f() + override def getValue: T = Option(stream.lastProgress).map(f).getOrElse(default) }) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala index 9a7a13fcc580..7c3804547b73 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala @@ -17,6 +17,9 @@ package org.apache.spark.sql.execution.streaming +import java.util.Optional + +import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, Map => MutableMap} import org.apache.spark.sql.{Dataset, SparkSession} @@ -24,7 +27,10 @@ import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, CurrentBatchTimestamp, CurrentDate, CurrentTimestamp} import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.execution.SQLExecution -import org.apache.spark.sql.sources.v2.streaming.MicroBatchReadSupport +import org.apache.spark.sql.execution.datasources.v2.{StreamingDataSourceV2Relation, WriteToDataSourceV2} +import org.apache.spark.sql.sources.v2.DataSourceV2Options +import org.apache.spark.sql.sources.v2.streaming.{MicroBatchReadSupport, MicroBatchWriteSupport} +import org.apache.spark.sql.sources.v2.streaming.reader.{MicroBatchReader, Offset => OffsetV2} import org.apache.spark.sql.streaming.{OutputMode, ProcessingTime, Trigger} import org.apache.spark.util.{Clock, Utils} @@ -33,10 +39,11 @@ class MicroBatchExecution( name: String, checkpointRoot: String, analyzedPlan: LogicalPlan, - sink: Sink, + sink: BaseStreamingSink, trigger: Trigger, triggerClock: Clock, outputMode: OutputMode, + extraOptions: Map[String, String], 
deleteCheckpointOnStop: Boolean) extends StreamExecution( sparkSession, name, checkpointRoot, analyzedPlan, sink, @@ -57,6 +64,13 @@ class MicroBatchExecution( var nextSourceId = 0L val toExecutionRelationMap = MutableMap[StreamingRelation, StreamingExecutionRelation]() val v2ToExecutionRelationMap = MutableMap[StreamingRelationV2, StreamingExecutionRelation]() + // We transform each distinct streaming relation into a StreamingExecutionRelation, keeping a + // map as we go to ensure each identical relation gets the same StreamingExecutionRelation + // object. For each microbatch, the StreamingExecutionRelation will be replaced with a logical + // plan for the data within that batch. + // Note that we have to use the previous `output` as attributes in StreamingExecutionRelation, + // since the existing logical plan has already used those attributes. The per-microbatch + // transformation is responsible for replacing attributes with their final values. val _logicalPlan = analyzedPlan.transform { case streamingRelation@StreamingRelation(dataSource, _, output) => toExecutionRelationMap.getOrElseUpdate(streamingRelation, { @@ -64,19 +78,29 @@ class MicroBatchExecution( val metadataPath = s"$resolvedCheckpointRoot/sources/$nextSourceId" val source = dataSource.createSource(metadataPath) nextSourceId += 1 - // We still need to use the previous `output` instead of `source.schema` as attributes in - // "df.logicalPlan" has already used attributes of the previous `output`. StreamingExecutionRelation(source, output)(sparkSession) }) - case s @ StreamingRelationV2(v2DataSource, _, _, output, v1DataSource) - if !v2DataSource.isInstanceOf[MicroBatchReadSupport] => + case s @ StreamingRelationV2(source: MicroBatchReadSupport, _, options, output, _) => v2ToExecutionRelationMap.getOrElseUpdate(s, { // Materialize source to avoid creating it in every batch val metadataPath = s"$resolvedCheckpointRoot/sources/$nextSourceId" - val source = v1DataSource.createSource(metadataPath) + val reader = source.createMicroBatchReader( + Optional.empty(), // user specified schema + metadataPath, + new DataSourceV2Options(options.asJava)) + nextSourceId += 1 + StreamingExecutionRelation(reader, output)(sparkSession) + }) + case s @ StreamingRelationV2(_, sourceName, _, output, v1Relation) => + v2ToExecutionRelationMap.getOrElseUpdate(s, { + // Materialize source to avoid creating it in every batch + val metadataPath = s"$resolvedCheckpointRoot/sources/$nextSourceId" + if (v1Relation.isEmpty) { + throw new UnsupportedOperationException( + s"Data source $sourceName does not support microbatch processing.") + } + val source = v1Relation.get.dataSource.createSource(metadataPath) nextSourceId += 1 - // We still need to use the previous `output` instead of `source.schema` as attributes in - // "df.logicalPlan" has already used attributes of the previous `output`. StreamingExecutionRelation(source, output)(sparkSession) }) } @@ -187,12 +211,11 @@ class MicroBatchExecution( * batch will be executed before getOffset is called again. */ availableOffsets.foreach { case (source: Source, end: Offset) => - if (committedOffsets.get(source).map(_ != end).getOrElse(true)) { - val start = committedOffsets.get(source) - source.getBatch(start, end) - } + val start = committedOffsets.get(source) + source.getBatch(start, end) case nonV1Tuple => - throw new IllegalStateException(s"Unexpected V2 source in $nonV1Tuple") + // The V2 API does not have the same edge case requiring getBatch to be called + // here, so we do nothing here. 
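The relation rewriting above hinges on getOrElseUpdate: each distinct streaming relation is materialized into an execution relation exactly once, and repeated occurrences in the plan resolve to that same cached object. A minimal, self-contained sketch of that memoization pattern follows; Relation and ExecRelation are stand-ins for the Spark classes, not real APIs.

import scala.collection.mutable

case class Relation(name: String)                 // stand-in for StreamingRelation
case class ExecRelation(name: String, id: Long)   // stand-in for StreamingExecutionRelation

object RelationMemoExample {
  def main(args: Array[String]): Unit = {
    var nextSourceId = 0L
    val cache = mutable.Map[Relation, ExecRelation]()

    // Simulate a plan that mentions the same relation twice plus a distinct one.
    val plan = Seq(Relation("kafka-a"), Relation("kafka-a"), Relation("rate"))

    val transformed = plan.map { r =>
      // getOrElseUpdate materializes the execution relation once per distinct input,
      // so identical relations share the same object (and the same source id).
      cache.getOrElseUpdate(r, {
        val exec = ExecRelation(r.name, nextSourceId)
        nextSourceId += 1
        exec
      })
    }

    assert(transformed(0) eq transformed(1))        // same cached instance
    assert(transformed(0).id != transformed(2).id)  // distinct relations get distinct ids
    println(transformed)
  }
}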
} currentBatchId = latestCommittedBatchId + 1 committedOffsets ++= availableOffsets @@ -236,14 +259,27 @@ class MicroBatchExecution( val hasNewData = { awaitProgressLock.lock() try { - val latestOffsets: Map[Source, Option[Offset]] = uniqueSources.map { + // Generate a map from each unique source to the next available offset. + val latestOffsets: Map[BaseStreamingSource, Option[Offset]] = uniqueSources.map { case s: Source => updateStatusMessage(s"Getting offsets from $s") reportTimeTaken("getOffset") { (s, s.getOffset) } + case s: MicroBatchReader => + updateStatusMessage(s"Getting offsets from $s") + reportTimeTaken("getOffset") { + // Once v1 streaming source execution is gone, we can refactor this away. + // For now, we set the range here to get the source to infer the available end offset, + // get that offset, and then set the range again when we later execute. + s.setOffsetRange( + toJava(availableOffsets.get(s).map(off => s.deserializeOffset(off.json))), + Optional.empty()) + + (s, Some(s.getEndOffset)) + } }.toMap - availableOffsets ++= latestOffsets.filter { case (s, o) => o.nonEmpty }.mapValues(_.get) + availableOffsets ++= latestOffsets.filter { case (_, o) => o.nonEmpty }.mapValues(_.get) if (dataAvailable) { true @@ -317,6 +353,8 @@ class MicroBatchExecution( if (prevBatchOff.isDefined) { prevBatchOff.get.toStreamProgress(sources).foreach { case (src: Source, off) => src.commit(off) + case (reader: MicroBatchReader, off) => + reader.commit(reader.deserializeOffset(off.json)) } } else { throw new IllegalStateException(s"batch $currentBatchId doesn't exist") @@ -357,7 +395,16 @@ class MicroBatchExecution( s"DataFrame returned by getBatch from $source did not have isStreaming=true\n" + s"${batch.queryExecution.logical}") logDebug(s"Retrieving data from $source: $current -> $available") - Some(source -> batch) + Some(source -> batch.logicalPlan) + case (reader: MicroBatchReader, available) + if committedOffsets.get(reader).map(_ != available).getOrElse(true) => + val current = committedOffsets.get(reader).map(off => reader.deserializeOffset(off.json)) + reader.setOffsetRange( + toJava(current), + Optional.of(available.asInstanceOf[OffsetV2])) + logDebug(s"Retrieving data from $reader: $current -> $available") + Some(reader -> + new StreamingDataSourceV2Relation(reader.readSchema().toAttributes, reader)) case _ => None } } @@ -365,15 +412,14 @@ class MicroBatchExecution( // A list of attributes that will need to be updated. val replacements = new ArrayBuffer[(Attribute, Attribute)] // Replace sources in the logical plan with data that has arrived since the last batch. - val withNewSources = logicalPlan transform { + val newBatchesPlan = logicalPlan transform { case StreamingExecutionRelation(source, output) => - newData.get(source).map { data => - val newPlan = data.logicalPlan - assert(output.size == newPlan.output.size, + newData.get(source).map { dataPlan => + assert(output.size == dataPlan.output.size, s"Invalid batch: ${Utils.truncatedString(output, ",")} != " + - s"${Utils.truncatedString(newPlan.output, ",")}") - replacements ++= output.zip(newPlan.output) - newPlan + s"${Utils.truncatedString(dataPlan.output, ",")}") + replacements ++= output.zip(dataPlan.output) + dataPlan }.getOrElse { LocalRelation(output, isStreaming = true) } @@ -381,7 +427,7 @@ class MicroBatchExecution( // Rewire the plan to use the new attributes that were returned by the source. 
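The v2 reader calls above (setOffsetRange, getEndOffset) go through java.util.Optional rather than Scala's Option, which is why the small toJava helper exists at the bottom of this file. A standalone sketch of that Option/Optional round trip, with illustrative names that are not Spark APIs:

import java.util.Optional

object OptionOptionalExample {
  // Mirrors the toJava helper: absent Options become Optional.empty.
  def toJava[T >: Null](opt: Option[T]): Optional[T] =
    Optional.ofNullable(opt.orNull)

  def toScala[T](opt: Optional[T]): Option[T] =
    if (opt.isPresent) Some(opt.get) else None

  def main(args: Array[String]): Unit = {
    val someOffset: Option[String] = Some("""{"offset": 5}""")
    val noOffset: Option[String] = None

    // Shapes like setOffsetRange(toJava(start), Optional.empty()) rely on this conversion.
    println(toJava(someOffset))                 // Optional[{"offset": 5}]
    println(toJava(noOffset))                   // Optional.empty
    println(toScala(Optional.of("x")))          // Some(x)
    println(toScala(Optional.empty[String]()))  // None
  }
}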
val replacementMap = AttributeMap(replacements) - val triggerLogicalPlan = withNewSources transformAllExpressions { + val newAttributePlan = newBatchesPlan transformAllExpressions { case a: Attribute if replacementMap.contains(a) => replacementMap(a).withMetadata(a.metadata) case ct: CurrentTimestamp => @@ -392,6 +438,20 @@ class MicroBatchExecution( cd.dataType, cd.timeZoneId) } + val triggerLogicalPlan = sink match { + case _: Sink => newAttributePlan + case s: MicroBatchWriteSupport => + val writer = s.createMicroBatchWriter( + s"$runId", + currentBatchId, + newAttributePlan.schema, + outputMode, + new DataSourceV2Options(extraOptions.asJava)) + assert(writer.isPresent, "microbatch writer must always be present") + WriteToDataSourceV2(writer.get, newAttributePlan) + case _ => throw new IllegalArgumentException(s"unknown sink type for $sink") + } + reportTimeTaken("queryPlanning") { lastExecution = new IncrementalExecution( sparkSessionToRunBatch, @@ -409,7 +469,12 @@ class MicroBatchExecution( reportTimeTaken("addBatch") { SQLExecution.withNewExecutionId(sparkSessionToRunBatch, lastExecution) { - sink.addBatch(currentBatchId, nextBatch) + sink match { + case s: Sink => s.addBatch(currentBatchId, nextBatch) + case s: MicroBatchWriteSupport => + // This doesn't accumulate any data - it just forces execution of the microbatch writer. + nextBatch.collect() + } } } @@ -421,4 +486,8 @@ class MicroBatchExecution( awaitProgressLock.unlock() } } + + private def toJava(scalaOption: Option[OffsetV2]): Optional[OffsetV2] = { + Optional.ofNullable(scalaOption.orNull) + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala index 1c9043613cb6..d1e5be9c1276 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala @@ -53,7 +53,7 @@ trait ProgressReporter extends Logging { protected def triggerClock: Clock protected def logicalPlan: LogicalPlan protected def lastExecution: QueryExecution - protected def newData: Map[BaseStreamingSource, DataFrame] + protected def newData: Map[BaseStreamingSource, LogicalPlan] protected def availableOffsets: StreamProgress protected def committedOffsets: StreamProgress protected def sources: Seq[BaseStreamingSource] @@ -225,8 +225,8 @@ trait ProgressReporter extends Logging { // // 3. For each source, we sum the metrics of the associated execution plan leaves. 
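The metrics comment above ("for each source, we sum the metrics of the associated execution plan leaves") reduces to grouping plan leaves by their originating source and summing a per-leaf counter. A simplified sketch with plain maps; Leaf and Src are placeholders, not the Spark plan or source types.

object PerSourceMetricsExample {
  case class Leaf(id: Int)       // placeholder for an execution-plan leaf
  case class Src(name: String)   // placeholder for a streaming source

  def main(args: Array[String]): Unit = {
    // Which leaf came from which source (built from newData in the real code).
    val leafToSource: Map[Leaf, Src] =
      Map(Leaf(1) -> Src("kafka"), Leaf(2) -> Src("kafka"), Leaf(3) -> Src("rate"))

    // numOutputRows reported by each leaf after the batch ran.
    val leafRows: Map[Leaf, Long] = Map(Leaf(1) -> 10L, Leaf(2) -> 5L, Leaf(3) -> 7L)

    // Sum the metrics of the leaves associated with each source.
    val rowsPerSource: Map[Src, Long] =
      leafToSource.toSeq
        .groupBy { case (_, src) => src }
        .map { case (src, leaves) =>
          src -> leaves.map { case (leaf, _) => leafRows.getOrElse(leaf, 0L) }.sum
        }

    println(rowsPerSource)  // Map(Src(kafka) -> 15, Src(rate) -> 7)
  }
}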
// - val logicalPlanLeafToSource = newData.flatMap { case (source, df) => - df.logicalPlan.collectLeaves().map { leaf => leaf -> source } + val logicalPlanLeafToSource = newData.flatMap { case (source, logicalPlan) => + logicalPlan.collectLeaves().map { leaf => leaf -> source } } val allLogicalPlanLeaves = lastExecution.logical.collectLeaves() // includes non-streaming val allExecPlanLeaves = lastExecution.executedPlan.collectLeaves() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala index d02cf882b61a..66eb0169ac1e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala @@ -29,12 +29,12 @@ import org.apache.spark.network.util.JavaUtils import org.apache.spark.sql.{AnalysisException, DataFrame, SQLContext} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} -import org.apache.spark.sql.execution.streaming.continuous.ContinuousRateStreamReader -import org.apache.spark.sql.execution.streaming.sources.RateStreamV2Reader +import org.apache.spark.sql.execution.streaming.continuous.RateStreamContinuousReader +import org.apache.spark.sql.execution.streaming.sources.RateStreamMicroBatchReader import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider} import org.apache.spark.sql.sources.v2._ -import org.apache.spark.sql.sources.v2.streaming.ContinuousReadSupport -import org.apache.spark.sql.sources.v2.streaming.reader.ContinuousReader +import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, MicroBatchReadSupport} +import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousReader, MicroBatchReader} import org.apache.spark.sql.types._ import org.apache.spark.util.{ManualClock, SystemClock} @@ -112,7 +112,7 @@ class RateSourceProvider extends StreamSourceProvider with DataSourceRegister schema: Optional[StructType], checkpointLocation: String, options: DataSourceV2Options): ContinuousReader = { - new ContinuousRateStreamReader(options) + new RateStreamContinuousReader(options) } override def shortName(): String = "rate" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index 3e76bf7b7ca8..e7982d7880ce 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -163,7 +163,7 @@ abstract class StreamExecution( var lastExecution: IncrementalExecution = _ /** Holds the most recent input data for each source. */ - protected var newData: Map[BaseStreamingSource, DataFrame] = _ + protected var newData: Map[BaseStreamingSource, LogicalPlan] = _ @volatile protected var streamDeathCause: StreamingQueryException = null @@ -418,11 +418,17 @@ abstract class StreamExecution( * Blocks the current thread until processing for data from the given `source` has reached at * least the given `Offset`. This method is intended for use primarily when writing tests. 
*/ - private[sql] def awaitOffset(source: Source, newOffset: Offset): Unit = { + private[sql] def awaitOffset(sourceIndex: Int, newOffset: Offset): Unit = { assertAwaitThread() def notDone = { val localCommittedOffsets = committedOffsets - !localCommittedOffsets.contains(source) || localCommittedOffsets(source) != newOffset + if (sources == null) { + // sources might not be initialized yet + false + } else { + val source = sources(sourceIndex) + !localCommittedOffsets.contains(source) || localCommittedOffsets(source) != newOffset + } } while (notDone) { @@ -436,7 +442,7 @@ abstract class StreamExecution( awaitProgressLock.unlock() } } - logDebug(s"Unblocked at $newOffset for $source") + logDebug(s"Unblocked at $newOffset for ${sources(sourceIndex)}") } /** A flag to indicate that a batch has completed with no new data available. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala index a9d50e3a112e..a0ee683a895d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala @@ -61,7 +61,7 @@ case class StreamingRelation(dataSource: DataSource, sourceName: String, output: * [[org.apache.spark.sql.catalyst.plans.logical.LogicalPlan]]. */ case class StreamingExecutionRelation( - source: Source, + source: BaseStreamingSource, output: Seq[Attribute])(session: SparkSession) extends LeafNode { @@ -92,7 +92,7 @@ case class StreamingRelationV2( sourceName: String, extraOptions: Map[String, String], output: Seq[Attribute], - v1DataSource: DataSource)(session: SparkSession) + v1Relation: Option[StreamingRelation])(session: SparkSession) extends LeafNode { override def isStreaming: Boolean = true override def toString: String = sourceName diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Triggers.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Triggers.scala index 271bc4da99c0..19e3e55cb282 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Triggers.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Triggers.scala @@ -21,7 +21,7 @@ import org.apache.spark.annotation.{Experimental, InterfaceStability} import org.apache.spark.sql.streaming.Trigger /** - * A [[Trigger]] that process only one batch of data in a streaming query then terminates + * A [[Trigger]] that processes only one batch of data in a streaming query then terminates * the query. 
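The one-shot trigger whose doc comment is corrected above is exposed in the Scala API as Trigger.Once(), alongside Trigger.ProcessingTime. A hedged usage sketch against the built-in rate source; the master setting and app name are illustrative.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.Trigger

object TriggerOnceExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("TriggerOnceExample")
      .getOrCreate()

    val rates = spark.readStream.format("rate").load()

    // Trigger.Once() processes a single micro-batch and then stops the query;
    // Trigger.ProcessingTime("5 seconds") would instead poll periodically.
    val query = rates.writeStream
      .format("console")
      .outputMode("append")
      .trigger(Trigger.Once())
      .start()

    query.awaitTermination()
    spark.stop()
  }
}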
*/ @Experimental diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala index 71eaabe273fe..f2aa3259731d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala @@ -17,58 +17,44 @@ package org.apache.spark.sql.execution.streaming -import org.apache.spark.internal.Logging -import org.apache.spark.sql.{DataFrame, SaveMode, SQLContext} -import org.apache.spark.sql.execution.SQLExecution -import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, StreamSinkProvider} +import java.util.Optional + +import org.apache.spark.sql._ +import org.apache.spark.sql.execution.streaming.sources.{ConsoleContinuousWriter, ConsoleMicroBatchWriter, ConsoleWriter} +import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister} +import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} +import org.apache.spark.sql.sources.v2.streaming.{ContinuousWriteSupport, MicroBatchWriteSupport} +import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter +import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType -class ConsoleSink(options: Map[String, String]) extends Sink with Logging { - // Number of rows to display, by default 20 rows - private val numRowsToShow = options.get("numRows").map(_.toInt).getOrElse(20) - - // Truncate the displayed data if it is too long, by default it is true - private val isTruncated = options.get("truncate").map(_.toBoolean).getOrElse(true) - - // Track the batch id - private var lastBatchId = -1L - - override def addBatch(batchId: Long, data: DataFrame): Unit = synchronized { - val batchIdStr = if (batchId <= lastBatchId) { - s"Rerun batch: $batchId" - } else { - lastBatchId = batchId - s"Batch: $batchId" - } - - // scalastyle:off println - println("-------------------------------------------") - println(batchIdStr) - println("-------------------------------------------") - // scalastyle:off println - data.sparkSession.createDataFrame( - data.sparkSession.sparkContext.parallelize(data.collect()), data.schema) - .show(numRowsToShow, isTruncated) - } - - override def toString(): String = s"ConsoleSink[numRows=$numRowsToShow, truncate=$isTruncated]" -} - case class ConsoleRelation(override val sqlContext: SQLContext, data: DataFrame) extends BaseRelation { override def schema: StructType = data.schema } -class ConsoleSinkProvider extends StreamSinkProvider +class ConsoleSinkProvider extends DataSourceV2 + with MicroBatchWriteSupport + with ContinuousWriteSupport with DataSourceRegister with CreatableRelationProvider { - def createSink( - sqlContext: SQLContext, - parameters: Map[String, String], - partitionColumns: Seq[String], - outputMode: OutputMode): Sink = { - new ConsoleSink(parameters) + + override def createMicroBatchWriter( + queryId: String, + batchId: Long, + schema: StructType, + mode: OutputMode, + options: DataSourceV2Options): Optional[DataSourceV2Writer] = { + Optional.of(new ConsoleMicroBatchWriter(batchId, schema, options)) + } + + override def createContinuousWriter( + queryId: String, + schema: StructType, + mode: OutputMode, + options: DataSourceV2Options): Optional[ContinuousWriter] = { + Optional.of(new ConsoleContinuousWriter(schema, 
options)) } def createRelation( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala index d79e4bd65f56..cd7065f5e660 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala @@ -52,9 +52,14 @@ class ContinuousDataSourceRDD( } override def compute(split: Partition, context: TaskContext): Iterator[UnsafeRow] = { - val reader = split.asInstanceOf[DataSourceRDDPartition].readTask.createDataReader() + // If attempt number isn't 0, this is a task retry, which we don't support. + if (context.attemptNumber() != 0) { + throw new ContinuousTaskRetryException() + } + + val reader = split.asInstanceOf[DataSourceRDDPartition[UnsafeRow]].readTask.createDataReader() - val runId = context.getLocalProperty(ContinuousExecution.RUN_ID_KEY) + val coordinatorId = context.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY) // This queue contains two types of messages: // * (null, null) representing an epoch boundary. @@ -63,7 +68,7 @@ class ContinuousDataSourceRDD( val epochPollFailed = new AtomicBoolean(false) val epochPollExecutor = ThreadUtils.newDaemonSingleThreadScheduledExecutor( - s"epoch-poll--${runId}--${context.partitionId()}") + s"epoch-poll--$coordinatorId--${context.partitionId()}") val epochPollRunnable = new EpochPollRunnable(queue, context, epochPollFailed) epochPollExecutor.scheduleWithFixedDelay( epochPollRunnable, 0, epochPollIntervalMs, TimeUnit.MILLISECONDS) @@ -77,12 +82,11 @@ class ContinuousDataSourceRDD( dataReaderThread.start() context.addTaskCompletionListener(_ => { - reader.close() dataReaderThread.interrupt() epochPollExecutor.shutdown() }) - val epochEndpoint = EpochCoordinatorRef.get(runId, SparkEnv.get) + val epochEndpoint = EpochCoordinatorRef.get(coordinatorId, SparkEnv.get) new Iterator[UnsafeRow] { private val POLL_TIMEOUT_MS = 1000 @@ -132,7 +136,7 @@ class ContinuousDataSourceRDD( } override def getPreferredLocations(split: Partition): Seq[String] = { - split.asInstanceOf[DataSourceRDDPartition].readTask.preferredLocations() + split.asInstanceOf[DataSourceRDDPartition[UnsafeRow]].readTask.preferredLocations() } } @@ -146,7 +150,7 @@ class EpochPollRunnable( private[continuous] var failureReason: Throwable = _ private val epochEndpoint = EpochCoordinatorRef.get( - context.getLocalProperty(ContinuousExecution.RUN_ID_KEY), SparkEnv.get) + context.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), SparkEnv.get) private var currentEpoch = context.getLocalProperty(ContinuousExecution.START_EPOCH_KEY).toLong override def run(): Unit = { @@ -173,10 +177,11 @@ class DataReaderThread( failedFlag: AtomicBoolean) extends Thread( s"continuous-reader--${context.partitionId()}--" + - s"${context.getLocalProperty(ContinuousExecution.RUN_ID_KEY)}") { + s"${context.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY)}") { private[continuous] var failureReason: Throwable = _ override def run(): Unit = { + TaskContext.setTaskContext(context) val baseReader = ContinuousDataSourceRDD.getBaseReader(reader) try { while (!context.isInterrupted && !context.isCompleted()) { @@ -201,6 +206,8 @@ class DataReaderThread( failedFlag.set(true) // Don't rethrow the exception in this thread. 
It's not needed, and the default Spark // exception handler will kill the executor. + } finally { + reader.close() } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index 2843ab13bde2..462e7d9721d2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -17,17 +17,19 @@ package org.apache.spark.sql.execution.streaming.continuous +import java.util.UUID import java.util.concurrent.TimeUnit +import java.util.function.UnaryOperator import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, Map => MutableMap} -import org.apache.spark.SparkEnv +import org.apache.spark.{SparkEnv, SparkException} import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, CurrentBatchTimestamp, CurrentDate, CurrentTimestamp} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SQLExecution -import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, WriteToDataSourceV2} +import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, StreamingDataSourceV2Relation, WriteToDataSourceV2} import org.apache.spark.sql.execution.streaming.{ContinuousExecutionRelation, StreamingRelationV2, _} import org.apache.spark.sql.sources.v2.DataSourceV2Options import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, ContinuousWriteSupport} @@ -52,13 +54,13 @@ class ContinuousExecution( sparkSession, name, checkpointRoot, analyzedPlan, sink, trigger, triggerClock, outputMode, deleteCheckpointOnStop) { - @volatile protected var continuousSources: Seq[ContinuousReader] = Seq.empty + @volatile protected var continuousSources: Seq[ContinuousReader] = Seq() override protected def sources: Seq[BaseStreamingSource] = continuousSources - override lazy val logicalPlan: LogicalPlan = { - assert(queryExecutionThread eq Thread.currentThread, - "logicalPlan must be initialized in StreamExecutionThread " + - s"but the current thread was ${Thread.currentThread}") + // For use only in test harnesses. + private[sql] var currentEpochCoordinatorId: String = _ + + override val logicalPlan: LogicalPlan = { val toExecutionRelationMap = MutableMap[StreamingRelationV2, ContinuousExecutionRelation]() analyzedPlan.transform { case r @ StreamingRelationV2( @@ -67,7 +69,7 @@ class ContinuousExecution( ContinuousExecutionRelation(source, extraReaderOptions, output)(sparkSession) }) case StreamingRelationV2(_, sourceName, _, _, _) => - throw new AnalysisException( + throw new UnsupportedOperationException( s"Data source $sourceName does not support continuous processing.") } } @@ -78,15 +80,17 @@ class ContinuousExecution( } override protected def runActivatedStream(sparkSessionForStream: SparkSession): Unit = { - do { - try { - runContinuous(sparkSessionForStream) - } catch { - case _: InterruptedException if state.get().equals(RECONFIGURING) => - // swallow exception and run again - state.set(ACTIVE) + val stateUpdate = new UnaryOperator[State] { + override def apply(s: State) = s match { + // If we ended the query to reconfigure, reset the state to active. 
+ case RECONFIGURING => ACTIVE + case _ => s } - } while (state.get() == ACTIVE) + } + + do { + runContinuous(sparkSessionForStream) + } while (state.updateAndGet(stateUpdate) == ACTIVE) } /** @@ -120,12 +124,16 @@ class ContinuousExecution( } committedOffsets = nextOffsets.toStreamProgress(sources) - // Forcibly align commit and offset logs by slicing off any spurious offset logs from - // a previous run. We can't allow commits to an epoch that a previous run reached but - // this run has not. - offsetLog.purgeAfter(latestEpochId) + // Get to an epoch ID that has definitely never been sent to a sink before. Since sink + // commit happens between offset log write and commit log write, this means an epoch ID + // which is not in the offset log. + val (latestOffsetEpoch, _) = offsetLog.getLatest().getOrElse { + throw new IllegalStateException( + s"Offset log had no latest element. This shouldn't be possible because nextOffsets is" + + s"an element.") + } + currentBatchId = latestOffsetEpoch + 1 - currentBatchId = latestEpochId + 1 logDebug(s"Resuming at epoch $currentBatchId with committed offsets $committedOffsets") nextOffsets case None => @@ -174,7 +182,7 @@ class ContinuousExecution( val loggedOffset = offsets.offsets(0) val realOffset = loggedOffset.map(off => reader.deserializeOffset(off.json)) reader.setOffset(java.util.Optional.ofNullable(realOffset.orNull)) - DataSourceV2Relation(newOutput, reader) + new StreamingDataSourceV2Relation(newOutput, reader) } // Rewire the plan to use the new attributes that were returned by the source. @@ -210,28 +218,30 @@ class ContinuousExecution( lastExecution.executedPlan // Force the lazy generation of execution plan } - sparkSession.sparkContext.setLocalProperty( + sparkSessionForQuery.sparkContext.setLocalProperty( ContinuousExecution.START_EPOCH_KEY, currentBatchId.toString) - sparkSession.sparkContext.setLocalProperty( - ContinuousExecution.RUN_ID_KEY, runId.toString) + // Add another random ID on top of the run ID, to distinguish epoch coordinators across + // reconfigurations. + val epochCoordinatorId = s"$runId--${UUID.randomUUID}" + currentEpochCoordinatorId = epochCoordinatorId + sparkSessionForQuery.sparkContext.setLocalProperty( + ContinuousExecution.EPOCH_COORDINATOR_ID_KEY, epochCoordinatorId) // Use the parent Spark session for the endpoint since it's where this query ID is registered. val epochEndpoint = EpochCoordinatorRef.create( - writer.get(), reader, this, currentBatchId, sparkSession, SparkEnv.get) + writer.get(), reader, this, epochCoordinatorId, currentBatchId, sparkSession, SparkEnv.get) val epochUpdateThread = new Thread(new Runnable { override def run: Unit = { try { triggerExecutor.execute(() => { startTrigger() - if (reader.needsReconfiguration()) { - state.set(RECONFIGURING) + if (reader.needsReconfiguration() && state.compareAndSet(ACTIVE, RECONFIGURING)) { stopSources() if (queryExecutionThread.isAlive) { sparkSession.sparkContext.cancelJobGroup(runId.toString) queryExecutionThread.interrupt() - // No need to join - this thread is about to end anyway. 
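The restart loop above uses AtomicReference.updateAndGet with a UnaryOperator so the RECONFIGURING-to-ACTIVE reset and the "keep looping?" check happen as one atomic step. A standalone sketch of that pattern; the State values here are stand-ins for the query states.

import java.util.concurrent.atomic.AtomicReference
import java.util.function.UnaryOperator

object StateLoopExample {
  sealed trait State
  case object ACTIVE extends State
  case object RECONFIGURING extends State
  case object TERMINATED extends State

  def main(args: Array[String]): Unit = {
    val state = new AtomicReference[State](ACTIVE)
    var runs = 0

    // If a run ended in order to reconfigure, flip back to ACTIVE; otherwise keep the state.
    val stateUpdate = new UnaryOperator[State] {
      override def apply(s: State): State = s match {
        case RECONFIGURING => ACTIVE
        case other => other
      }
    }

    do {
      runs += 1
      // Pretend the first run ends because of a reconfiguration, the second one terminates.
      if (runs == 1) state.compareAndSet(ACTIVE, RECONFIGURING)
      else state.compareAndSet(ACTIVE, TERMINATED)
    } while (state.updateAndGet(stateUpdate) == ACTIVE)

    println(s"ran $runs times, final state = ${state.get()}")  // ran 2 times, final state = TERMINATED
  }
}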
} false } else if (isActive) { @@ -259,6 +269,7 @@ class ContinuousExecution( sparkSessionForQuery, lastExecution)(lastExecution.toRdd) } } finally { + epochEndpoint.askSync[Unit](StopContinuousExecutionWrites) SparkEnv.get.rpcEnv.stop(epochEndpoint) epochUpdateThread.interrupt() @@ -273,17 +284,22 @@ class ContinuousExecution( epoch: Long, reader: ContinuousReader, partitionOffsets: Seq[PartitionOffset]): Unit = { assert(continuousSources.length == 1, "only one continuous source supported currently") - if (partitionOffsets.contains(null)) { - // If any offset is null, that means the corresponding partition hasn't seen any data yet, so - // there's nothing meaningful to add to the offset log. - } val globalOffset = reader.mergeOffsets(partitionOffsets.toArray) - synchronized { - if (queryExecutionThread.isAlive) { - offsetLog.add(epoch, OffsetSeq.fill(globalOffset)) - } else { - return - } + val oldOffset = synchronized { + offsetLog.add(epoch, OffsetSeq.fill(globalOffset)) + offsetLog.get(epoch - 1) + } + + // If offset hasn't changed since last epoch, there's been no new data. + if (oldOffset.contains(OffsetSeq.fill(globalOffset))) { + noNewData = true + } + + awaitProgressLock.lock() + try { + awaitProgressLockCondition.signalAll() + } finally { + awaitProgressLock.unlock() } } @@ -346,5 +362,5 @@ class ContinuousExecution( object ContinuousExecution { val START_EPOCH_KEY = "__continuous_start_epoch" - val RUN_ID_KEY = "__run_id" + val EPOCH_COORDINATOR_ID_KEY = "__epoch_coordinator_id" } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala index c9aa78a5a2e2..b4b21e7d2052 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala @@ -32,10 +32,10 @@ import org.apache.spark.sql.sources.v2.reader._ import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousDataReader, ContinuousReader, Offset, PartitionOffset} import org.apache.spark.sql.types.{LongType, StructField, StructType, TimestampType} -case class ContinuousRateStreamPartitionOffset( +case class RateStreamPartitionOffset( partition: Int, currentValue: Long, currentTimeMs: Long) extends PartitionOffset -class ContinuousRateStreamReader(options: DataSourceV2Options) +class RateStreamContinuousReader(options: DataSourceV2Options) extends ContinuousReader { implicit val defaultFormats: DefaultFormats = DefaultFormats @@ -48,7 +48,7 @@ class ContinuousRateStreamReader(options: DataSourceV2Options) override def mergeOffsets(offsets: Array[PartitionOffset]): Offset = { assert(offsets.length == numPartitions) val tuples = offsets.map { - case ContinuousRateStreamPartitionOffset(i, currVal, nextRead) => + case RateStreamPartitionOffset(i, currVal, nextRead) => (i, ValueRunTimeMsPair(currVal, nextRead)) } RateStreamOffset(Map(tuples: _*)) @@ -86,7 +86,7 @@ class ContinuousRateStreamReader(options: DataSourceV2Options) val start = partitionStartMap(i) // Have each partition advance by numPartitions each row, with starting points staggered // by their partition index. 
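The staggering scheme described in that comment gives partition i the values i, i + numPartitions, i + 2 * numPartitions, and so on, so the partitions jointly cover the value range without overlap. A tiny arithmetic sketch with no Spark dependency:

object StaggeredPartitionValuesExample {
  def main(args: Array[String]): Unit = {
    val numPartitions = 3
    val rowsPerPartition = 5

    // Each partition starts at its own index and advances by numPartitions per row.
    val perPartition: Map[Int, Seq[Long]] =
      (0 until numPartitions).map { i =>
        i -> (0 until rowsPerPartition).map(step => (i + step * numPartitions).toLong)
      }.toMap

    perPartition.toSeq.sortBy(_._1).foreach { case (p, values) =>
      println(s"partition $p -> ${values.mkString(", ")}")
    }
    // partition 0 -> 0, 3, 6, 9, 12
    // partition 1 -> 1, 4, 7, 10, 13
    // partition 2 -> 2, 5, 8, 11, 14
  }
}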
- RateStreamReadTask( + RateStreamContinuousReadTask( start.value, start.runTimeMs, i, @@ -101,7 +101,7 @@ class ContinuousRateStreamReader(options: DataSourceV2Options) } -case class RateStreamReadTask( +case class RateStreamContinuousReadTask( startValue: Long, startTimeMs: Long, partitionIndex: Int, @@ -109,10 +109,11 @@ case class RateStreamReadTask( rowsPerSecond: Double) extends ReadTask[Row] { override def createDataReader(): DataReader[Row] = - new RateStreamDataReader(startValue, startTimeMs, partitionIndex, increment, rowsPerSecond) + new RateStreamContinuousDataReader( + startValue, startTimeMs, partitionIndex, increment, rowsPerSecond) } -class RateStreamDataReader( +class RateStreamContinuousDataReader( startValue: Long, startTimeMs: Long, partitionIndex: Int, @@ -151,5 +152,5 @@ class RateStreamDataReader( override def close(): Unit = {} override def getOffset(): PartitionOffset = - ContinuousRateStreamPartitionOffset(partitionIndex, currentValue, nextReadTime) + RateStreamPartitionOffset(partitionIndex, currentValue, nextReadTime) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTaskRetryException.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTaskRetryException.scala new file mode 100644 index 000000000000..e0a6f6dd50bb --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTaskRetryException.scala @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming.continuous + +import org.apache.spark.SparkException + +/** + * An exception thrown when a continuous processing task runs with a nonzero attempt ID. + */ +class ContinuousTaskRetryException + extends SparkException("Continuous execution does not support task retry", null) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala index 98017c3ac6a3..90b3584aa043 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala @@ -39,6 +39,15 @@ private[continuous] sealed trait EpochCoordinatorMessage extends Serializable */ private[sql] case object IncrementAndGetEpoch extends EpochCoordinatorMessage +/** + * The RpcEndpoint stop() will wait to clear out the message queue before terminating the + * object. 
This can lead to a race condition where the query restarts at epoch n, a new + * EpochCoordinator starts at epoch n, and then the old epoch coordinator commits epoch n + 1. + * The framework doesn't provide a handle to wait on the message queue, so we use a synchronous + * message to stop any writes to the ContinuousExecution object. + */ +private[sql] case object StopContinuousExecutionWrites extends EpochCoordinatorMessage + // Init messages /** * Set the reader and writer partition counts. Tasks may not be started until the coordinator @@ -70,7 +79,7 @@ private[sql] case class ReportPartitionOffset( /** Helper object used to create reference to [[EpochCoordinator]]. */ private[sql] object EpochCoordinatorRef extends Logging { - private def endpointName(runId: String) = s"EpochCoordinator-$runId" + private def endpointName(id: String) = s"EpochCoordinator-$id" /** * Create a reference to a new [[EpochCoordinator]]. @@ -79,18 +88,19 @@ private[sql] object EpochCoordinatorRef extends Logging { writer: ContinuousWriter, reader: ContinuousReader, query: ContinuousExecution, + epochCoordinatorId: String, startEpoch: Long, session: SparkSession, env: SparkEnv): RpcEndpointRef = synchronized { val coordinator = new EpochCoordinator( writer, reader, query, startEpoch, session, env.rpcEnv) - val ref = env.rpcEnv.setupEndpoint(endpointName(query.runId.toString()), coordinator) + val ref = env.rpcEnv.setupEndpoint(endpointName(epochCoordinatorId), coordinator) logInfo("Registered EpochCoordinator endpoint") ref } - def get(runId: String, env: SparkEnv): RpcEndpointRef = synchronized { - val rpcEndpointRef = RpcUtils.makeDriverRef(endpointName(runId), env.conf, env.rpcEnv) + def get(id: String, env: SparkEnv): RpcEndpointRef = synchronized { + val rpcEndpointRef = RpcUtils.makeDriverRef(endpointName(id), env.conf, env.rpcEnv) logDebug("Retrieved existing EpochCoordinator endpoint") rpcEndpointRef } @@ -116,6 +126,8 @@ private[continuous] class EpochCoordinator( override val rpcEnv: RpcEnv) extends ThreadSafeRpcEndpoint with Logging { + private var queryWritesStopped: Boolean = false + private var numReaderPartitions: Int = _ private var numWriterPartitions: Int = _ @@ -147,12 +159,16 @@ private[continuous] class EpochCoordinator( partitionCommits.remove(k) } for (k <- partitionOffsets.keys.filter { case (e, _) => e < epoch }) { - partitionCommits.remove(k) + partitionOffsets.remove(k) } } } override def receive: PartialFunction[Any, Unit] = { + // If we just drop these messages, we won't do any writes to the query. The lame duck tasks + // won't shed errors or anything. 
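The guard at the top of receive is effectively a poison-pill flag: once the synchronous stop message has been handled, every later message from lame-duck tasks is silently dropped. A minimal sketch of that handler shape using a plain PartialFunction, deliberately not tied to Spark's RPC framework:

object StopWritesGuardExample {
  sealed trait Message
  case class CommitPartitionEpoch(partition: Int, epoch: Long) extends Message
  case object StopWrites extends Message

  class Coordinator {
    private var writesStopped = false
    private var commitsSeen = 0

    val receive: PartialFunction[Message, Unit] = {
      // Once writes are stopped, drop everything else on the floor.
      case _ if writesStopped => ()
      case StopWrites =>
        writesStopped = true
      case CommitPartitionEpoch(partition, epoch) =>
        commitsSeen += 1
        println(s"commit from partition $partition at epoch $epoch")
    }

    def commits: Int = commitsSeen
  }

  def main(args: Array[String]): Unit = {
    val c = new Coordinator
    c.receive(CommitPartitionEpoch(0, 1))   // handled
    c.receive(StopWrites)                   // flips the flag
    c.receive(CommitPartitionEpoch(1, 2))   // dropped by the guard
    println(s"commits seen: ${c.commits}")  // commits seen: 1
  }
}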
+ case _ if queryWritesStopped => () + case CommitPartitionEpoch(partitionId, epoch, message) => logDebug(s"Got commit from partition $partitionId at epoch $epoch: $message") if (!partitionCommits.isDefinedAt((epoch, partitionId))) { @@ -188,5 +204,9 @@ private[continuous] class EpochCoordinator( case SetWriterPartitions(numPartitions) => numWriterPartitions = numPartitions context.reply(()) + + case StopContinuousExecutionWrites => + queryWritesStopped = true + context.reply(()) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala index 3041d4d703cb..509a69dd922f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala @@ -119,9 +119,15 @@ case class MemoryStream[A : Encoder](id: Int, sqlContext: SQLContext) val newBlocks = synchronized { val sliceStart = startOrdinal - lastOffsetCommitted.offset.toInt - 1 val sliceEnd = endOrdinal - lastOffsetCommitted.offset.toInt - 1 + assert(sliceStart <= sliceEnd, s"sliceStart: $sliceStart sliceEnd: $sliceEnd") batches.slice(sliceStart, sliceEnd) } + if (newBlocks.isEmpty) { + return sqlContext.internalCreateDataFrame( + sqlContext.sparkContext.emptyRDD, schema, isStreaming = true) + } + logDebug(generateDebugString(newBlocks, startOrdinal, endOrdinal)) newBlocks diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala new file mode 100644 index 000000000000..6fb61dff6004 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.streaming.sources + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.sources.v2.DataSourceV2Options +import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter +import org.apache.spark.sql.sources.v2.writer.{DataSourceV2Writer, DataWriterFactory, WriterCommitMessage} +import org.apache.spark.sql.types.StructType + +/** Common methods used to create writes for the the console sink */ +trait ConsoleWriter extends Logging { + + def options: DataSourceV2Options + + // Number of rows to display, by default 20 rows + protected val numRowsToShow = options.getInt("numRows", 20) + + // Truncate the displayed data if it is too long, by default it is true + protected val isTruncated = options.getBoolean("truncate", true) + + assert(SparkSession.getActiveSession.isDefined) + protected val spark = SparkSession.getActiveSession.get + + def createWriterFactory(): DataWriterFactory[Row] = PackedRowWriterFactory + + def abort(messages: Array[WriterCommitMessage]): Unit = {} + + protected def printRows( + commitMessages: Array[WriterCommitMessage], + schema: StructType, + printMessage: String): Unit = { + val rows = commitMessages.collect { + case PackedRowCommitMessage(rows) => rows + }.flatten + + // scalastyle:off println + println("-------------------------------------------") + println(printMessage) + println("-------------------------------------------") + // scalastyle:off println + spark + .createDataFrame(spark.sparkContext.parallelize(rows), schema) + .show(numRowsToShow, isTruncated) + } +} + + +/** + * A [[DataSourceV2Writer]] that collects results from a micro-batch query to the driver and + * prints them in the console. Created by + * [[org.apache.spark.sql.execution.streaming.ConsoleSinkProvider]]. + * + * This sink should not be used for production, as it requires sending all rows to the driver + * and does not support recovery. + */ +class ConsoleMicroBatchWriter(batchId: Long, schema: StructType, val options: DataSourceV2Options) + extends DataSourceV2Writer with ConsoleWriter { + + override def commit(messages: Array[WriterCommitMessage]): Unit = { + printRows(messages, schema, s"Batch: $batchId") + } + + override def toString(): String = { + s"ConsoleMicroBatchWriter[numRows=$numRowsToShow, truncate=$isTruncated]" + } +} + + +/** + * A [[DataSourceV2Writer]] that collects results from a continuous query to the driver and + * prints them in the console. Created by + * [[org.apache.spark.sql.execution.streaming.ConsoleSinkProvider]]. + * + * This sink should not be used for production, as it requires sending all rows to the driver + * and does not support recovery. 
+ */ +class ConsoleContinuousWriter(schema: StructType, val options: DataSourceV2Options) + extends ContinuousWriter with ConsoleWriter { + + override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = { + printRows(messages, schema, s"Continuous processing epoch $epochId") + } + + override def toString(): String = { + s"ConsoleContinuousWriter[numRows=$numRowsToShow, truncate=$isTruncated]" + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala new file mode 100644 index 000000000000..9282ba05bdb7 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming.sources + +import scala.collection.mutable + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory, WriterCommitMessage} + +/** + * A simple [[DataWriterFactory]] whose tasks just pack rows into the commit message for delivery + * to a [[org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer]] on the driver. + * + * Note that, because it sends all rows to the driver, this factory will generally be unsuitable + * for production-quality sinks. It's intended for use in tests. + */ +case object PackedRowWriterFactory extends DataWriterFactory[Row] { + def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[Row] = { + new PackedRowDataWriter() + } +} + +/** + * Commit message for a [[PackedRowDataWriter]], containing all the rows written in the most + * recent interval. + */ +case class PackedRowCommitMessage(rows: Array[Row]) extends WriterCommitMessage + +/** + * A simple [[DataWriter]] that just sends all the rows it's received as a commit message. 
+ */ +class PackedRowDataWriter() extends DataWriter[Row] with Logging { + private val data = mutable.Buffer[Row]() + + override def write(row: Row): Unit = data.append(row) + + override def commit(): PackedRowCommitMessage = { + val msg = PackedRowCommitMessage(data.toArray) + data.clear() + msg + } + + override def abort(): Unit = data.clear() +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala index 97bada08bcd2..c0ed12cec25e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala @@ -28,17 +28,38 @@ import org.json4s.jackson.Serialization import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.streaming.{RateStreamOffset, ValueRunTimeMsPair} -import org.apache.spark.sql.sources.v2.DataSourceV2Options +import org.apache.spark.sql.sources.DataSourceRegister +import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} import org.apache.spark.sql.sources.v2.reader._ +import org.apache.spark.sql.sources.v2.streaming.MicroBatchReadSupport import org.apache.spark.sql.sources.v2.streaming.reader.{MicroBatchReader, Offset} import org.apache.spark.sql.types.{LongType, StructField, StructType, TimestampType} -import org.apache.spark.util.SystemClock +import org.apache.spark.util.{ManualClock, SystemClock} -class RateStreamV2Reader(options: DataSourceV2Options) +/** + * This is a temporary register as we build out v2 migration. Microbatch read support should + * be implemented in the same register as v1. + */ +class RateSourceProviderV2 extends DataSourceV2 with MicroBatchReadSupport with DataSourceRegister { + override def createMicroBatchReader( + schema: Optional[StructType], + checkpointLocation: String, + options: DataSourceV2Options): MicroBatchReader = { + new RateStreamMicroBatchReader(options) + } + + override def shortName(): String = "ratev2" +} + +class RateStreamMicroBatchReader(options: DataSourceV2Options) extends MicroBatchReader { implicit val defaultFormats: DefaultFormats = DefaultFormats - val clock = new SystemClock + val clock = { + // The option to use a manual clock is provided only for unit testing purposes. + if (options.get("useManualClock").orElse("false").toBoolean) new ManualClock + else new SystemClock + } private val numPartitions = options.get(RateStreamSourceV2.NUM_PARTITIONS).orElse("5").toInt @@ -111,7 +132,7 @@ class RateStreamV2Reader(options: DataSourceV2Options) val packedRows = mutable.ListBuffer[(Long, Long)]() var outVal = startVal + numPartitions - var outTimeMs = startTimeMs + msPerPartitionBetweenRows + var outTimeMs = startTimeMs while (outVal <= endVal) { packedRows.append((outTimeMs, outVal)) outVal += numPartitions diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala index d8adbe7bee13..73a105266e1c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala @@ -99,7 +99,7 @@ class SQLAppStatusListener( // Reset the metrics tracking object for the new attempt. 
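The useManualClock switch introduced above exists so tests can drive time deterministically instead of reading the wall clock. A self-contained sketch of that pattern with a tiny Clock abstraction; these classes are stand-ins for Spark's internal SystemClock/ManualClock, not the real ones.

object ClockToggleExample {
  trait Clock { def getTimeMillis(): Long }

  class SystemClock extends Clock {
    override def getTimeMillis(): Long = System.currentTimeMillis()
  }

  class ManualClock(private var now: Long = 0L) extends Clock {
    override def getTimeMillis(): Long = now
    def advance(ms: Long): Unit = { now += ms }
  }

  // Mirrors options.get("useManualClock").orElse("false").toBoolean in spirit.
  def makeClock(options: Map[String, String]): Clock =
    if (options.getOrElse("useManualClock", "false").toBoolean) new ManualClock()
    else new SystemClock()

  def main(args: Array[String]): Unit = {
    val testClock = makeClock(Map("useManualClock" -> "true"))
    testClock match {
      case m: ManualClock =>
        println(m.getTimeMillis())   // 0
        m.advance(1000)
        println(m.getTimeMillis())   // 1000
      case _ => ()
    }

    val prodClock = makeClock(Map.empty)
    println(prodClock.getTimeMillis() > 0)  // true
  }
}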
Option(stageMetrics.get(event.stageInfo.stageId)).foreach { metrics => metrics.taskMetrics.clear() - metrics.attemptId = event.stageInfo.attemptId + metrics.attemptId = event.stageInfo.attemptNumber } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala index 03b654f83052..40a058d2cadd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala @@ -66,6 +66,7 @@ case class UserDefinedFunction protected[sql] ( * * @since 1.3.0 */ + @scala.annotation.varargs def apply(exprs: Column*): Column = { Column(ScalaUDF( f, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 530a525a01de..0d11682d80a3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -24,6 +24,7 @@ import scala.util.Try import scala.util.control.NonFatal import org.apache.spark.annotation.InterfaceStability +import org.apache.spark.sql.api.java._ import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.analysis.{Star, UnresolvedFunction} import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder @@ -32,7 +33,6 @@ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical.{HintInfo, ResolvedHint} import org.apache.spark.sql.execution.SparkSqlParser import org.apache.spark.sql.expressions.UserDefinedFunction -import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -3254,42 +3254,66 @@ object functions { */ def map_values(e: Column): Column = withExpr { MapValues(e.expr) } - ////////////////////////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////////////////////////// - // scalastyle:off line.size.limit // scalastyle:off parameter.number /* Use the following code to generate: - (0 to 10).map { x => + + (0 to 10).foreach { x => val types = (1 to x).foldRight("RT")((i, s) => {s"A$i, $s"}) val typeTags = (1 to x).map(i => s"A$i: TypeTag").foldLeft("RT: TypeTag")(_ + ", " + _) val inputTypes = (1 to x).foldRight("Nil")((i, s) => {s"ScalaReflection.schemaFor(typeTag[A$i]).dataType :: $s"}) println(s""" - /** - * Defines a deterministic user-defined function of ${x} arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. - * - * @group udf_funcs - * @since 1.3.0 - */ - def udf[$typeTags](f: Function$x[$types]): UserDefinedFunction = { - val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputTypes = Try($inputTypes).toOption - val udf = UserDefinedFunction(f, dataType, inputTypes) - if (nullable) udf else udf.asNonNullable() - }""") + |/** + | * Defines a Scala closure of $x arguments as user-defined function (UDF). + | * The data types are automatically inferred based on the Scala closure's + | * signature. By default the returned UDF is deterministic. 
To change it to + | * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. + | * + | * @group udf_funcs + | * @since 1.3.0 + | */ + |def udf[$typeTags](f: Function$x[$types]): UserDefinedFunction = { + | val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] + | val inputTypes = Try($inputTypes).toOption + | val udf = UserDefinedFunction(f, dataType, inputTypes) + | if (nullable) udf else udf.asNonNullable() + |}""".stripMargin) + } + + (0 to 10).foreach { i => + val extTypeArgs = (0 to i).map(_ => "_").mkString(", ") + val anyTypeArgs = (0 to i).map(_ => "Any").mkString(", ") + val anyCast = s".asInstanceOf[UDF$i[$anyTypeArgs]]" + val anyParams = (1 to i).map(_ => "_: Any").mkString(", ") + val funcCall = if (i == 0) "() => func" else "func" + println(s""" + |/** + | * Defines a Java UDF$i instance as user-defined function (UDF). + | * The caller must specify the output data type, and there is no automatic input type coercion. + | * By default the returned UDF is deterministic. To change it to nondeterministic, call the + | * API `UserDefinedFunction.asNondeterministic()`. + | * + | * @group udf_funcs + | * @since 2.3.0 + | */ + |def udf(f: UDF$i[$extTypeArgs], returnType: DataType): UserDefinedFunction = { + | val func = f$anyCast.call($anyParams) + | UserDefinedFunction($funcCall, returnType, inputTypes = None) + |}""".stripMargin) } */ + ////////////////////////////////////////////////////////////////////////////////////////////// + // Scala UDF functions + ////////////////////////////////////////////////////////////////////////////////////////////// + /** - * Defines a deterministic user-defined function of 0 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 0 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3302,10 +3326,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 1 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 1 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3318,10 +3342,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 2 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 2 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. 
To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3334,10 +3358,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 3 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 3 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3350,10 +3374,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 4 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 4 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3366,10 +3390,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 5 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 5 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3382,10 +3406,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 6 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 6 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3398,10 +3422,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 7 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 7 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. 
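A minimal usage sketch of the typed `udf` overloads documented above: the input and return types are inferred from the closure, and determinism is opted out of explicitly. The DataFrame `df` and its column names are illustrative, not part of this change.

    import org.apache.spark.sql.functions.{col, udf}

    // Input and return types are inferred from the closure's signature.
    val plusOne = udf((x: Int) => x + 1)

    // Deterministic by default; opt out when the closure is not a pure function.
    val jittered = udf((x: Int) => x + scala.util.Random.nextInt(10)).asNondeterministic()

    df.select(plusOne(col("value")), jittered(col("value")))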
* * @group udf_funcs * @since 1.3.0 @@ -3414,10 +3438,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 8 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 8 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3430,10 +3454,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 9 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 9 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3446,10 +3470,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 10 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 10 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3461,13 +3485,172 @@ object functions { if (nullable) udf else udf.asNonNullable() } + ////////////////////////////////////////////////////////////////////////////////////////////// + // Java UDF functions + ////////////////////////////////////////////////////////////////////////////////////////////// + + /** + * Defines a Java UDF0 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF0[_], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF0[Any]].call() + UserDefinedFunction(() => func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF1 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF1[_, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF1[Any, Any]].call(_: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF2 instance as user-defined function (UDF). 
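The `UDF0` through `UDF10` overloads introduced here take an instance of the Java interface plus an explicit return type. A sketch of calling the `UDF2` variant, written in Scala for consistency with the rest of these notes; `df` and its columns are illustrative placeholders:

    import org.apache.spark.sql.api.java.UDF2
    import org.apache.spark.sql.functions.{col, udf}
    import org.apache.spark.sql.types.DataTypes

    // No automatic input type coercion: the return type is given explicitly.
    val keyedValue = udf(new UDF2[Integer, String, String] {
      override def call(i: Integer, s: String): String = i.toString + s
    }, DataTypes.StringType)

    df.select(keyedValue(col("key"), col("value")))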
+ * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF2[_, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF2[Any, Any, Any]].call(_: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF3 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF3[_, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF3[Any, Any, Any, Any]].call(_: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF4 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF4[_, _, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF4[Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF5 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF5[_, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF5[Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF6 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF6[_, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF6[Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF7 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. 
+ * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF7[_, _, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF7[Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF8 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF8[_, _, _, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF8[Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF9 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF9[_, _, _, _, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF9[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF10 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF10[_, _, _, _, _, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF10[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + // scalastyle:on parameter.number // scalastyle:on line.size.limit /** * Defines a deterministic user-defined function (UDF) using a Scala closure. For this variant, * the caller must specify the output data type, and there is no automatic input type coercion. - * To change a UDF to nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. 
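The untyped variant documented here pairs a Scala closure with an explicit return `DataType` and performs no input coercion; a brief sketch:

    import org.apache.spark.sql.functions.udf
    import org.apache.spark.sql.types.LongType

    // The output type must be supplied because nothing is inferred from the closure,
    // and the inputs arrive with their runtime types.
    val square = udf((x: Long) => x * x, LongType)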
* * @param f A closure in Scala * @param dataType The output data type of the UDF diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala index b9515ec7bca2..dac463641cfa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala @@ -73,6 +73,7 @@ object HiveSerDe { val key = source.toLowerCase(Locale.ROOT) match { case s if s.startsWith("org.apache.spark.sql.parquet") => "parquet" case s if s.startsWith("org.apache.spark.sql.orc") => "orc" + case s if s.startsWith("org.apache.spark.sql.hive.orc") => "orc" case s if s.equals("orcfile") => "orc" case s if s.equals("parquetfile") => "parquet" case s if s.equals("avrofile") => "avro" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index 2e92beecf2c1..52f2e2639cd8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.streaming -import java.util.Locale +import java.util.{Locale, Optional} import scala.collection.JavaConverters._ @@ -27,8 +27,9 @@ import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset, SparkSession import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming.{StreamingRelation, StreamingRelationV2} +import org.apache.spark.sql.sources.StreamSourceProvider import org.apache.spark.sql.sources.v2.DataSourceV2Options -import org.apache.spark.sql.sources.v2.streaming.ContinuousReadSupport +import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, MicroBatchReadSupport} import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils @@ -166,19 +167,31 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo userSpecifiedSchema = userSpecifiedSchema, className = source, options = extraOptions.toMap) + val v1Relation = ds match { + case _: StreamSourceProvider => Some(StreamingRelation(v1DataSource)) + case _ => None + } ds match { + case s: MicroBatchReadSupport => + val tempReader = s.createMicroBatchReader( + Optional.ofNullable(userSpecifiedSchema.orNull), + Utils.createTempDir(namePrefix = s"temporaryReader").getCanonicalPath, + options) + Dataset.ofRows( + sparkSession, + StreamingRelationV2( + s, source, extraOptions.toMap, + tempReader.readSchema().toAttributes, v1Relation)(sparkSession)) case s: ContinuousReadSupport => val tempReader = s.createContinuousReader( - java.util.Optional.ofNullable(userSpecifiedSchema.orNull), + Optional.ofNullable(userSpecifiedSchema.orNull), Utils.createTempDir(namePrefix = s"temporaryReader").getCanonicalPath, options) - // Generate the V1 node to catch errors thrown within generation. - StreamingRelation(v1DataSource) Dataset.ofRows( sparkSession, StreamingRelationV2( s, source, extraOptions.toMap, - tempReader.readSchema().toAttributes, v1DataSource)(sparkSession)) + tempReader.readSchema().toAttributes, v1Relation)(sparkSession)) case _ => // Code path for data source v1. 
Dataset.ofRows(sparkSession, StreamingRelation(v1DataSource)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index db588ae282f3..d24f0ddeab4d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous.ContinuousTrigger import org.apache.spark.sql.execution.streaming.sources.{MemoryPlanV2, MemorySinkV2} +import org.apache.spark.sql.sources.v2.streaming.{ContinuousWriteSupport, MicroBatchWriteSupport} /** * Interface used to write a streaming `Dataset` to external storage systems (e.g. file systems, @@ -279,18 +280,27 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { useTempCheckpointLocation = true, trigger = trigger) } else { - val dataSource = - DataSource( - df.sparkSession, - className = source, - options = extraOptions.toMap, - partitionColumns = normalizedParCols.getOrElse(Nil)) + val ds = DataSource.lookupDataSource(source, df.sparkSession.sessionState.conf) + val sink = (ds.newInstance(), trigger) match { + case (w: ContinuousWriteSupport, _: ContinuousTrigger) => w + case (_, _: ContinuousTrigger) => throw new UnsupportedOperationException( + s"Data source $source does not support continuous writing") + case (w: MicroBatchWriteSupport, _) => w + case _ => + val ds = DataSource( + df.sparkSession, + className = source, + options = extraOptions.toMap, + partitionColumns = normalizedParCols.getOrElse(Nil)) + ds.createSink(outputMode) + } + df.sparkSession.sessionState.streamingQueryManager.startQuery( extraOptions.get("queryName"), extraOptions.get("checkpointLocation"), df, extraOptions.toMap, - dataSource.createSink(outputMode), + sink, outputMode, useTempCheckpointLocation = source == "console", recoverFromCheckpointLocation = true, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala index b508f4406138..4b27e0d4ef47 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala @@ -29,10 +29,10 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, DataFrame, SparkSession} import org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution +import org.apache.spark.sql.execution.streaming.continuous.{ContinuousExecution, ContinuousTrigger} import org.apache.spark.sql.execution.streaming.state.StateStoreCoordinatorRef import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.streaming.ContinuousWriteSupport +import org.apache.spark.sql.sources.v2.streaming.{ContinuousWriteSupport, MicroBatchWriteSupport} import org.apache.spark.util.{Clock, SystemClock, Utils} /** @@ -240,31 +240,35 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo "is not supported in streaming DataFrames/Datasets and will be disabled.") } - sink match { - case v1Sink: Sink => - new 
StreamingQueryWrapper(new MicroBatchExecution( + (sink, trigger) match { + case (v2Sink: ContinuousWriteSupport, trigger: ContinuousTrigger) => + UnsupportedOperationChecker.checkForContinuous(analyzedPlan, outputMode) + new StreamingQueryWrapper(new ContinuousExecution( sparkSession, userSpecifiedName.orNull, checkpointLocation, analyzedPlan, - v1Sink, + v2Sink, trigger, triggerClock, outputMode, + extraOptions, deleteCheckpointOnStop)) - case v2Sink: ContinuousWriteSupport => - UnsupportedOperationChecker.checkForContinuous(analyzedPlan, outputMode) - new StreamingQueryWrapper(new ContinuousExecution( + case (_: MicroBatchWriteSupport, _) | (_: Sink, _) => + new StreamingQueryWrapper(new MicroBatchExecution( sparkSession, userSpecifiedName.orNull, checkpointLocation, analyzedPlan, - v2Sink, + sink, trigger, triggerClock, outputMode, extraOptions, deleteCheckpointOnStop)) + case (_: ContinuousWriteSupport, t) if !t.isInstanceOf[ContinuousTrigger] => + throw new AnalysisException( + "Sink only supports continuous writes, but a continuous trigger was not specified.") } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala b/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala index a73e4272950a..8bab7e1c5876 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala @@ -34,6 +34,8 @@ private[sql] class ExamplePoint(val x: Double, val y: Double) extends Serializab case that: ExamplePoint => this.x == that.x && this.y == that.y case _ => false } + + override def toString(): String = s"($x, $y)" } /** diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java index b007093dad84..69a2904f5f3f 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java @@ -36,6 +36,7 @@ import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.expressions.UserDefinedFunction; import org.apache.spark.sql.test.TestSparkSession; import org.apache.spark.sql.types.*; import org.apache.spark.util.sketch.BloomFilter; @@ -455,4 +456,15 @@ public void testCircularReferenceBean() { CircularReference1Bean bean = new CircularReference1Bean(); spark.createDataFrame(Arrays.asList(bean), CircularReference1Bean.class); } + + @Test + public void testUDF() { + UserDefinedFunction foo = udf((Integer i, String s) -> i.toString() + s, DataTypes.StringType); + Dataset df = spark.table("testData").select(foo.apply(col("key"), col("value"))); + String[] result = df.collectAsList().stream().map(row -> row.getString(0)) + .toArray(String[]::new); + String[] expected = spark.table("testData").collectAsList().stream() + .map(row -> row.get(0).toString() + row.getString(1)).toArray(String[]::new); + Assert.assertArrayEquals(expected, result); + } } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java new file mode 100644 index 000000000000..a5d77a90ece4 --- /dev/null +++ b/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaBatchDataSourceV2.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor 
license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package test.org.apache.spark.sql.sources.v2; + +import java.io.IOException; +import java.util.List; + +import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector; +import org.apache.spark.sql.sources.v2.DataSourceV2; +import org.apache.spark.sql.sources.v2.DataSourceV2Options; +import org.apache.spark.sql.sources.v2.ReadSupport; +import org.apache.spark.sql.sources.v2.reader.*; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.sql.vectorized.ColumnVector; +import org.apache.spark.sql.vectorized.ColumnarBatch; + + +public class JavaBatchDataSourceV2 implements DataSourceV2, ReadSupport { + + class Reader implements DataSourceV2Reader, SupportsScanColumnarBatch { + private final StructType schema = new StructType().add("i", "int").add("j", "int"); + + @Override + public StructType readSchema() { + return schema; + } + + @Override + public List> createBatchReadTasks() { + return java.util.Arrays.asList(new JavaBatchReadTask(0, 50), new JavaBatchReadTask(50, 90)); + } + } + + static class JavaBatchReadTask implements ReadTask, DataReader { + private int start; + private int end; + + private static final int BATCH_SIZE = 20; + + private OnHeapColumnVector i; + private OnHeapColumnVector j; + private ColumnarBatch batch; + + JavaBatchReadTask(int start, int end) { + this.start = start; + this.end = end; + } + + @Override + public DataReader createDataReader() { + this.i = new OnHeapColumnVector(BATCH_SIZE, DataTypes.IntegerType); + this.j = new OnHeapColumnVector(BATCH_SIZE, DataTypes.IntegerType); + ColumnVector[] vectors = new ColumnVector[2]; + vectors[0] = i; + vectors[1] = j; + this.batch = new ColumnarBatch(vectors); + return this; + } + + @Override + public boolean next() { + i.reset(); + j.reset(); + int count = 0; + while (start < end && count < BATCH_SIZE) { + i.putInt(count, start); + j.putInt(count, -start); + start += 1; + count += 1; + } + + if (count == 0) { + return false; + } else { + batch.setNumRows(count); + return true; + } + } + + @Override + public ColumnarBatch get() { + return batch; + } + + @Override + public void close() throws IOException { + batch.close(); + } + } + + + @Override + public DataSourceV2Reader createReader(DataSourceV2Options options) { + return new Reader(); + } +} diff --git a/sql/core/src/test/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/sql/core/src/test/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister index c6973bf41d34..a0b25b4e8236 100644 --- a/sql/core/src/test/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister +++ b/sql/core/src/test/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister @@ -5,3 +5,11 @@ org.apache.spark.sql.sources.FakeSourceFour 
org.apache.fakesource.FakeExternalSourceOne org.apache.fakesource.FakeExternalSourceTwo org.apache.fakesource.FakeExternalSourceThree +org.apache.spark.sql.streaming.sources.FakeReadMicroBatchOnly +org.apache.spark.sql.streaming.sources.FakeReadContinuousOnly +org.apache.spark.sql.streaming.sources.FakeReadBothModes +org.apache.spark.sql.streaming.sources.FakeReadNeitherMode +org.apache.spark.sql.streaming.sources.FakeWriteMicroBatchOnly +org.apache.spark.sql.streaming.sources.FakeWriteContinuousOnly +org.apache.spark.sql.streaming.sources.FakeWriteBothModes +org.apache.spark.sql.streaming.sources.FakeWriteNeitherMode diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql index 1e1384549a41..c5070b734d52 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql @@ -60,3 +60,12 @@ SELECT a, COUNT(1) FROM testData WHERE false GROUP BY a; -- Aggregate with empty input and empty GroupBy expressions. SELECT COUNT(1) FROM testData WHERE false; SELECT 1 FROM (SELECT COUNT(1) FROM testData WHERE false) t; + +-- Aggregate with empty GroupBy expressions and filter on top +SELECT 1 from ( + SELECT 1 AS z, + MIN(a.x) + FROM (select 1 as x) a + WHERE false +) b +where b.z != b.z diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql index fb0d07fbdace..1661209093fc 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql @@ -173,6 +173,16 @@ WHERE t1a = (SELECT max(t2a) HAVING count(*) >= 0) OR t1i > '2014-12-31'; +-- TC 02.03.01 +SELECT t1a +FROM t1 +WHERE t1a = (SELECT max(t2a) + FROM t2 + WHERE t2c = t1c + GROUP BY t2c + HAVING count(*) >= 1) +OR t1i > '2014-12-31'; + -- TC 02.04 -- t1 on the right of an outer join -- can be reduced to inner join diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalArithmeticOperations.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalArithmeticOperations.sql index c8e108ac2c45..c6d8a49d4b93 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalArithmeticOperations.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalArithmeticOperations.sql @@ -22,6 +22,51 @@ select a / b from t; select a % b from t; select pmod(a, b) from t; +-- tests for decimals handling in operations +create table decimals_test(id int, a decimal(38,18), b decimal(38,18)) using parquet; + +insert into decimals_test values(1, 100.0, 999.0), (2, 12345.123, 12345.123), + (3, 0.1234567891011, 1234.1), (4, 123456789123456789.0, 1.123456789123456789); + +-- test decimal operations +select id, a+b, a-b, a*b, a/b from decimals_test order by id; + +-- test operations between decimals and constants +select id, a*10, b/10 from decimals_test order by id; + +-- test operations on constants +select 10.3 * 3.0; +select 10.3000 * 3.0; +select 10.30000 * 30.0; +select 10.300000000000000000 * 3.000000000000000000; +select 10.300000000000000000 * 3.0000000000000000000; + +-- arithmetic operations causing an overflow return NULL +select (5e36 + 0.1) + 5e36; +select (-4e36 - 0.1) - 7e36; +select 12345678901234567890.0 * 
12345678901234567890.0; +select 1e35 / 0.1; + +-- arithmetic operations causing a precision loss are truncated +select 123456789123456789.1234567890 * 1.123456789123456789; +select 0.001 / 9876543210987654321098765432109876543.2 + +-- return NULL instead of rounding, according to old Spark versions' behavior +set spark.sql.decimalOperations.allowPrecisionLoss=false; + +-- test decimal operations +select id, a+b, a-b, a*b, a/b from decimals_test order by id; + +-- test operations between decimals and constants +select id, a*10, b/10 from decimals_test order by id; + +-- test operations on constants +select 10.3 * 3.0; +select 10.3000 * 3.0; +select 10.30000 * 30.0; +select 10.300000000000000000 * 3.000000000000000000; +select 10.300000000000000000 * 3.0000000000000000000; + -- arithmetic operations causing an overflow return NULL select (5e36 + 0.1) + 5e36; select (-4e36 - 0.1) - 7e36; @@ -31,3 +76,5 @@ select 1e35 / 0.1; -- arithmetic operations causing a precision loss return NULL select 123456789123456789.1234567890 * 1.123456789123456789; select 0.001 / 9876543210987654321098765432109876543.2 + +drop table decimals_test; diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/elt.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/elt.sql new file mode 100644 index 000000000000..717616f91db0 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/elt.sql @@ -0,0 +1,44 @@ +-- Mixed inputs (output type is string) +SELECT elt(2, col1, col2, col3, col4, col5) col +FROM ( + SELECT + 'prefix_' col1, + id col2, + string(id + 1) col3, + encode(string(id + 2), 'utf-8') col4, + CAST(id AS DOUBLE) col5 + FROM range(10) +); + +SELECT elt(3, col1, col2, col3, col4) col +FROM ( + SELECT + string(id) col1, + string(id + 1) col2, + encode(string(id + 2), 'utf-8') col3, + encode(string(id + 3), 'utf-8') col4 + FROM range(10) +); + +-- turn on eltOutputAsString +set spark.sql.function.eltOutputAsString=true; + +SELECT elt(1, col1, col2) col +FROM ( + SELECT + encode(string(id), 'utf-8') col1, + encode(string(id + 1), 'utf-8') col2 + FROM range(10) +); + +-- turn off eltOutputAsString +set spark.sql.function.eltOutputAsString=false; + +-- Elt binary inputs (output type is binary) +SELECT elt(2, col1, col2) col +FROM ( + SELECT + encode(string(id), 'utf-8') col1, + encode(string(id + 1), 'utf-8') col2 + FROM range(10) +); diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out index 986bb01c13fe..c1abc6dff754 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 25 +-- Number of queries: 26 -- !query 0 @@ -227,3 +227,17 @@ SELECT 1 FROM (SELECT COUNT(1) FROM testData WHERE false) t struct<1:int> -- !query 24 output 1 + + +-- !query 25 +SELECT 1 from ( + SELECT 1 AS z, + MIN(a.x) + FROM (select 1 as x) a + WHERE false +) b +where b.z != b.z +-- !query 25 schema +struct<1:int> +-- !query 25 output + diff --git a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out index d9dc728a18e8..581dddc89d0b 100644 --- a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out @@ -129,7 +129,7 @@ select to_json() struct<> 
-- !query 12 output org.apache.spark.sql.AnalysisException -Invalid number of arguments for function to_json; line 1 pos 7 +Invalid number of arguments for function to_json. Expected: one of 1, 2 and 3; Found: 0; line 1 pos 7 -- !query 13 @@ -225,7 +225,7 @@ select from_json() struct<> -- !query 21 output org.apache.spark.sql.AnalysisException -Invalid number of arguments for function from_json; line 1 pos 7 +Invalid number of arguments for function from_json. Expected: one of 2, 3 and 4; Found: 0; line 1 pos 7 -- !query 22 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out index 8b29300e71f9..a2b86db3e4f4 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 26 +-- Number of queries: 29 -- !query 0 @@ -293,6 +293,21 @@ val1d -- !query 19 +SELECT t1a +FROM t1 +WHERE t1a = (SELECT max(t2a) + FROM t2 + WHERE t2c = t1c + GROUP BY t2c + HAVING count(*) >= 1) +OR t1i > '2014-12-31' +-- !query 19 schema +struct +-- !query 19 output +val1c +val1d + +-- !query 22 SELECT count(t1a) FROM t1 RIGHT JOIN t2 ON t1d = t2d @@ -300,13 +315,13 @@ WHERE t1a < (SELECT max(t2a) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 19 schema +-- !query 22 schema struct --- !query 19 output +-- !query 22 output 7 --- !query 20 +-- !query 23 SELECT t1a FROM t1 WHERE t1b <= (SELECT max(t2b) @@ -317,14 +332,14 @@ AND t1b >= (SELECT min(t2b) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 20 schema +-- !query 23 schema struct --- !query 20 output +-- !query 23 output val1b val1c --- !query 21 +-- !query 24 SELECT t1a FROM t1 WHERE t1a <= (SELECT max(t2a) @@ -338,14 +353,14 @@ WHERE t1a >= (SELECT min(t2a) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 21 schema +-- !query 24 schema struct --- !query 21 output +-- !query 24 output val1b val1c --- !query 22 +-- !query 25 SELECT t1a FROM t1 WHERE t1a <= (SELECT max(t2a) @@ -359,9 +374,9 @@ WHERE t1a >= (SELECT min(t2a) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 22 schema +-- !query 25 schema struct --- !query 22 output +-- !query 25 output val1a val1a val1b @@ -372,7 +387,7 @@ val1d val1d --- !query 23 +-- !query 26 SELECT t1a FROM t1 WHERE t1a <= (SELECT max(t2a) @@ -386,16 +401,16 @@ WHERE t1a >= (SELECT min(t2a) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 23 schema +-- !query 26 schema struct --- !query 23 output +-- !query 26 output val1a val1b val1c val1d --- !query 24 +-- !query 27 SELECT t1a FROM t1 WHERE t1a <= (SELECT max(t2a) @@ -409,13 +424,13 @@ WHERE t1a >= (SELECT min(t2a) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 24 schema +-- !query 27 schema struct --- !query 24 output +-- !query 27 output val1a --- !query 25 +-- !query 28 SELECT t1a FROM t1 GROUP BY t1a, t1c @@ -423,8 +438,8 @@ HAVING max(t1b) <= (SELECT max(t2b) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 25 schema +-- !query 28 schema struct --- !query 25 output +-- !query 28 output val1b val1c diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalArithmeticOperations.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalArithmeticOperations.sql.out index ce02f6adc456..4d70fe19d539 100644 
--- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalArithmeticOperations.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalArithmeticOperations.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 10 +-- Number of queries: 32 -- !query 0 @@ -35,48 +35,257 @@ NULL -- !query 4 -select (5e36 + 0.1) + 5e36 +create table decimals_test(id int, a decimal(38,18), b decimal(38,18)) using parquet -- !query 4 schema -struct<(CAST((CAST(5E+36 AS DECIMAL(38,1)) + CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) + CAST(5E+36 AS DECIMAL(38,1))):decimal(38,1)> +struct<> -- !query 4 output -NULL + -- !query 5 -select (-4e36 - 0.1) - 7e36 +insert into decimals_test values(1, 100.0, 999.0), (2, 12345.123, 12345.123), + (3, 0.1234567891011, 1234.1), (4, 123456789123456789.0, 1.123456789123456789) -- !query 5 schema -struct<(CAST((CAST(-4E+36 AS DECIMAL(38,1)) - CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) - CAST(7E+36 AS DECIMAL(38,1))):decimal(38,1)> +struct<> -- !query 5 output -NULL + -- !query 6 -select 12345678901234567890.0 * 12345678901234567890.0 +select id, a+b, a-b, a*b, a/b from decimals_test order by id -- !query 6 schema -struct<(12345678901234567890.0 * 12345678901234567890.0):decimal(38,2)> +struct -- !query 6 output -NULL +1 1099 -899 99900 0.1001 +2 24690.246 0 152402061.885129 1 +3 1234.2234567891011 -1233.9765432108989 152.358023 0.0001 +4 123456789123456790.12345678912345679 123456789123456787.87654321087654321 138698367904130467.515623 109890109097814272.043109 -- !query 7 -select 1e35 / 0.1 +select id, a*10, b/10 from decimals_test order by id -- !query 7 schema -struct<(CAST(1E+35 AS DECIMAL(37,1)) / CAST(0.1 AS DECIMAL(37,1))):decimal(38,3)> +struct -- !query 7 output -NULL +1 1000 99.9 +2 123451.23 1234.5123 +3 1.234567891011 123.41 +4 1234567891234567890 0.112345678912345679 -- !query 8 -select 123456789123456789.1234567890 * 1.123456789123456789 +select 10.3 * 3.0 -- !query 8 schema -struct<(CAST(123456789123456789.1234567890 AS DECIMAL(36,18)) * CAST(1.123456789123456789 AS DECIMAL(36,18))):decimal(38,28)> +struct<(CAST(10.3 AS DECIMAL(3,1)) * CAST(3.0 AS DECIMAL(3,1))):decimal(6,2)> -- !query 8 output -NULL +30.9 -- !query 9 -select 0.001 / 9876543210987654321098765432109876543.2 +select 10.3000 * 3.0 -- !query 9 schema -struct<(CAST(0.001 AS DECIMAL(38,3)) / CAST(9876543210987654321098765432109876543.2 AS DECIMAL(38,3))):decimal(38,37)> +struct<(CAST(10.3000 AS DECIMAL(6,4)) * CAST(3.0 AS DECIMAL(6,4))):decimal(9,5)> -- !query 9 output +30.9 + + +-- !query 10 +select 10.30000 * 30.0 +-- !query 10 schema +struct<(CAST(10.30000 AS DECIMAL(7,5)) * CAST(30.0 AS DECIMAL(7,5))):decimal(11,6)> +-- !query 10 output +309 + + +-- !query 11 +select 10.300000000000000000 * 3.000000000000000000 +-- !query 11 schema +struct<(CAST(10.300000000000000000 AS DECIMAL(20,18)) * CAST(3.000000000000000000 AS DECIMAL(20,18))):decimal(38,34)> +-- !query 11 output +30.9 + + +-- !query 12 +select 10.300000000000000000 * 3.0000000000000000000 +-- !query 12 schema +struct<(CAST(10.300000000000000000 AS DECIMAL(21,19)) * CAST(3.0000000000000000000 AS DECIMAL(21,19))):decimal(38,34)> +-- !query 12 output +30.9 + + +-- !query 13 +select (5e36 + 0.1) + 5e36 +-- !query 13 schema +struct<(CAST((CAST(5E+36 AS DECIMAL(38,1)) + CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) + CAST(5E+36 AS DECIMAL(38,1))):decimal(38,1)> +-- !query 13 output +NULL + + +-- !query 14 +select (-4e36 - 0.1) - 7e36 +-- !query 14 schema 
+struct<(CAST((CAST(-4E+36 AS DECIMAL(38,1)) - CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) - CAST(7E+36 AS DECIMAL(38,1))):decimal(38,1)> +-- !query 14 output +NULL + + +-- !query 15 +select 12345678901234567890.0 * 12345678901234567890.0 +-- !query 15 schema +struct<(12345678901234567890.0 * 12345678901234567890.0):decimal(38,2)> +-- !query 15 output NULL + + +-- !query 16 +select 1e35 / 0.1 +-- !query 16 schema +struct<(CAST(1E+35 AS DECIMAL(37,1)) / CAST(0.1 AS DECIMAL(37,1))):decimal(38,6)> +-- !query 16 output +NULL + + +-- !query 17 +select 123456789123456789.1234567890 * 1.123456789123456789 +-- !query 17 schema +struct<(CAST(123456789123456789.1234567890 AS DECIMAL(36,18)) * CAST(1.123456789123456789 AS DECIMAL(36,18))):decimal(38,18)> +-- !query 17 output +138698367904130467.654320988515622621 + + +-- !query 18 +select 0.001 / 9876543210987654321098765432109876543.2 + +set spark.sql.decimalOperations.allowPrecisionLoss=false +-- !query 18 schema +struct<> +-- !query 18 output +org.apache.spark.sql.catalyst.parser.ParseException + +mismatched input 'spark' expecting (line 3, pos 4) + +== SQL == +select 0.001 / 9876543210987654321098765432109876543.2 + +set spark.sql.decimalOperations.allowPrecisionLoss=false +----^^^ + + +-- !query 19 +select id, a+b, a-b, a*b, a/b from decimals_test order by id +-- !query 19 schema +struct +-- !query 19 output +1 1099 -899 99900 0.1001 +2 24690.246 0 152402061.885129 1 +3 1234.2234567891011 -1233.9765432108989 152.358023 0.0001 +4 123456789123456790.12345678912345679 123456789123456787.87654321087654321 138698367904130467.515623 109890109097814272.043109 + + +-- !query 20 +select id, a*10, b/10 from decimals_test order by id +-- !query 20 schema +struct +-- !query 20 output +1 1000 99.9 +2 123451.23 1234.5123 +3 1.234567891011 123.41 +4 1234567891234567890 0.112345678912345679 + + +-- !query 21 +select 10.3 * 3.0 +-- !query 21 schema +struct<(CAST(10.3 AS DECIMAL(3,1)) * CAST(3.0 AS DECIMAL(3,1))):decimal(6,2)> +-- !query 21 output +30.9 + + +-- !query 22 +select 10.3000 * 3.0 +-- !query 22 schema +struct<(CAST(10.3000 AS DECIMAL(6,4)) * CAST(3.0 AS DECIMAL(6,4))):decimal(9,5)> +-- !query 22 output +30.9 + + +-- !query 23 +select 10.30000 * 30.0 +-- !query 23 schema +struct<(CAST(10.30000 AS DECIMAL(7,5)) * CAST(30.0 AS DECIMAL(7,5))):decimal(11,6)> +-- !query 23 output +309 + + +-- !query 24 +select 10.300000000000000000 * 3.000000000000000000 +-- !query 24 schema +struct<(CAST(10.300000000000000000 AS DECIMAL(20,18)) * CAST(3.000000000000000000 AS DECIMAL(20,18))):decimal(38,34)> +-- !query 24 output +30.9 + + +-- !query 25 +select 10.300000000000000000 * 3.0000000000000000000 +-- !query 25 schema +struct<(CAST(10.300000000000000000 AS DECIMAL(21,19)) * CAST(3.0000000000000000000 AS DECIMAL(21,19))):decimal(38,34)> +-- !query 25 output +30.9 + + +-- !query 26 +select (5e36 + 0.1) + 5e36 +-- !query 26 schema +struct<(CAST((CAST(5E+36 AS DECIMAL(38,1)) + CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) + CAST(5E+36 AS DECIMAL(38,1))):decimal(38,1)> +-- !query 26 output +NULL + + +-- !query 27 +select (-4e36 - 0.1) - 7e36 +-- !query 27 schema +struct<(CAST((CAST(-4E+36 AS DECIMAL(38,1)) - CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) - CAST(7E+36 AS DECIMAL(38,1))):decimal(38,1)> +-- !query 27 output +NULL + + +-- !query 28 +select 12345678901234567890.0 * 12345678901234567890.0 +-- !query 28 schema +struct<(12345678901234567890.0 * 12345678901234567890.0):decimal(38,2)> +-- !query 28 output +NULL + + +-- !query 29 +select 1e35 / 0.1 +-- !query 29 
schema +struct<(CAST(1E+35 AS DECIMAL(37,1)) / CAST(0.1 AS DECIMAL(37,1))):decimal(38,6)> +-- !query 29 output +NULL + + +-- !query 30 +select 123456789123456789.1234567890 * 1.123456789123456789 +-- !query 30 schema +struct<(CAST(123456789123456789.1234567890 AS DECIMAL(36,18)) * CAST(1.123456789123456789 AS DECIMAL(36,18))):decimal(38,18)> +-- !query 30 output +138698367904130467.654320988515622621 + + +-- !query 31 +select 0.001 / 9876543210987654321098765432109876543.2 + +drop table decimals_test +-- !query 31 schema +struct<> +-- !query 31 output +org.apache.spark.sql.catalyst.parser.ParseException + +mismatched input 'table' expecting (line 3, pos 5) + +== SQL == +select 0.001 / 9876543210987654321098765432109876543.2 + +drop table decimals_test +-----^^^ diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalPrecision.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalPrecision.sql.out index ebc8201ed5a1..6ee7f59d6987 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalPrecision.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalPrecision.sql.out @@ -2329,7 +2329,7 @@ struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) / CAST(C -- !query 280 SELECT cast(1 as bigint) / cast(1 as decimal(20, 0)) FROM t -- !query 280 schema -struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) / CAST(1 AS DECIMAL(20,0))):decimal(38,19)> +struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) / CAST(1 AS DECIMAL(20,0))):decimal(38,18)> -- !query 280 output 1 @@ -2661,7 +2661,7 @@ struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) / CAST(CAST(CAST(1 AS BI -- !query 320 SELECT cast(1 as decimal(20, 0)) / cast(1 as bigint) FROM t -- !query 320 schema -struct<(CAST(1 AS DECIMAL(20,0)) / CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):decimal(38,19)> +struct<(CAST(1 AS DECIMAL(20,0)) / CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):decimal(38,18)> -- !query 320 output 1 diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/elt.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/elt.sql.out new file mode 100644 index 000000000000..b62e1b682604 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/elt.sql.out @@ -0,0 +1,115 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 6 + + +-- !query 0 +SELECT elt(2, col1, col2, col3, col4, col5) col +FROM ( + SELECT + 'prefix_' col1, + id col2, + string(id + 1) col3, + encode(string(id + 2), 'utf-8') col4, + CAST(id AS DOUBLE) col5 + FROM range(10) +) +-- !query 0 schema +struct +-- !query 0 output +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 + + +-- !query 1 +SELECT elt(3, col1, col2, col3, col4) col +FROM ( + SELECT + string(id) col1, + string(id + 1) col2, + encode(string(id + 2), 'utf-8') col3, + encode(string(id + 3), 'utf-8') col4 + FROM range(10) +) +-- !query 1 schema +struct +-- !query 1 output +10 +11 +2 +3 +4 +5 +6 +7 +8 +9 + + +-- !query 2 +set spark.sql.function.eltOutputAsString=true +-- !query 2 schema +struct +-- !query 2 output +spark.sql.function.eltOutputAsString true + + +-- !query 3 +SELECT elt(1, col1, col2) col +FROM ( + SELECT + encode(string(id), 'utf-8') col1, + encode(string(id + 1), 'utf-8') col2 + FROM range(10) +) +-- !query 3 schema +struct +-- !query 3 output +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 + + +-- !query 4 +set spark.sql.function.eltOutputAsString=false +-- !query 4 schema +struct +-- !query 4 output 
+spark.sql.function.eltOutputAsString false + + +-- !query 5 +SELECT elt(2, col1, col2) col +FROM ( + SELECT + encode(string(id), 'utf-8') col1, + encode(string(id + 1), 'utf-8') col2 + FROM range(10) +) +-- !query 5 schema +struct +-- !query 5 output +1 +10 +2 +3 +4 +5 +6 +7 +8 +9 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala index 06848e4d2b29..e7776e36702a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql import scala.util.Random +import org.apache.spark.sql.catalyst.expressions.{Alias, Literal} +import org.apache.spark.sql.catalyst.expressions.aggregate.Count import org.apache.spark.sql.execution.WholeStageCodegenExec import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, ObjectHashAggregateExec, SortAggregateExec} import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec @@ -27,7 +29,7 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.test.SQLTestData.DecimalData -import org.apache.spark.sql.types.{Decimal, DecimalType} +import org.apache.spark.sql.types.DecimalType case class Fact(date: Int, hour: Int, minute: Int, room_name: String, temp: Double) @@ -456,7 +458,6 @@ class DataFrameAggregateSuite extends QueryTest with SharedSQLContext { test("null moments") { val emptyTableData = Seq.empty[(Int, Int)].toDF("a", "b") - checkAnswer( emptyTableData.agg(variance('a), var_samp('a), var_pop('a), skewness('a), kurtosis('a)), Row(null, null, null, null, null)) @@ -666,4 +667,23 @@ class DataFrameAggregateSuite extends QueryTest with SharedSQLContext { assert(exchangePlans.length == 1) } } + + Seq(true, false).foreach { codegen => + test("SPARK-22951: dropDuplicates on empty dataFrames should produce correct aggregate " + + s"results when codegen is enabled: $codegen") { + withSQLConf((SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, codegen.toString)) { + // explicit global aggregations + val emptyAgg = Map.empty[String, String] + checkAnswer(spark.emptyDataFrame.agg(emptyAgg), Seq(Row())) + checkAnswer(spark.emptyDataFrame.groupBy().agg(emptyAgg), Seq(Row())) + checkAnswer(spark.emptyDataFrame.groupBy().agg(count("*")), Seq(Row(0))) + checkAnswer(spark.emptyDataFrame.dropDuplicates().agg(emptyAgg), Seq(Row())) + checkAnswer(spark.emptyDataFrame.dropDuplicates().groupBy().agg(emptyAgg), Seq(Row())) + checkAnswer(spark.emptyDataFrame.dropDuplicates().groupBy().agg(count("*")), Seq(Row(0))) + + // global aggregation is converted to grouping aggregation: + assert(spark.emptyDataFrame.dropDuplicates().count() == 0) + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala index 46b21c3b64a2..5169d2b5fc6b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala @@ -260,6 +260,14 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { assert(res2(1).isEmpty) } + // SPARK-22957: check for 32bit overflow when computing rank. + // ignored - takes 4 minutes to run. 
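The ignored test that follows drives `stat.approxQuantile` over three billion rows to exercise the rank computation near the Int boundary; the same call at a small scale, for orientation (the range size and probabilities are illustrative):

    // Approximate 80th and 90th percentiles of the "id" column; the last argument
    // is the acceptable relative error.
    val quantiles = spark.range(1000L).stat.approxQuantile("id", Array(0.8, 0.9), 0.05)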
+ ignore("approx quantile 4: test for Int overflow") { + val res = spark.range(3000000000L).stat.approxQuantile("id", Array(0.8, 0.9), 0.05) + assert(res(0) > 2200000000.0) + assert(res(1) > 2200000000.0) + } + test("crosstab") { withSQLConf(SQLConf.SUPPORT_QUOTED_REGEX_COLUMN_NAME.key -> "false") { val rng = new Random() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 5e4c1a6a484f..33707080c130 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -1255,6 +1255,34 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { assert(testData.select($"*").showString(1, vertical = true) === expectedAnswer) } + test("SPARK-23023 Cast rows to strings in showString") { + val df1 = Seq(Seq(1, 2, 3, 4)).toDF("a") + assert(df1.showString(10) === + s"""+------------+ + || a| + |+------------+ + ||[1, 2, 3, 4]| + |+------------+ + |""".stripMargin) + val df2 = Seq(Map(1 -> "a", 2 -> "b")).toDF("a") + assert(df2.showString(10) === + s"""+----------------+ + || a| + |+----------------+ + ||[1 -> a, 2 -> b]| + |+----------------+ + |""".stripMargin) + val df3 = Seq(((1, "a"), 0), ((2, "b"), 0)).toDF("a", "b") + assert(df3.showString(10) === + s"""+------+---+ + || a| b| + |+------+---+ + ||[1, a]| 0| + ||[2, b]| 0| + |+------+---+ + |""".stripMargin) + } + test("SPARK-7327 show with empty dataFrame") { val expectedAnswer = """+---+-----+ ||key|value| diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala new file mode 100644 index 000000000000..0ee9b0edc02b --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala @@ -0,0 +1,405 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.sql.{Date, Timestamp} + +import org.apache.spark.sql.expressions.Window +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.unsafe.types.CalendarInterval + +/** + * Window frame testing for DataFrame API. 
+ */ +class DataFrameWindowFramesSuite extends QueryTest with SharedSQLContext { + import testImplicits._ + + test("lead/lag with empty data frame") { + val df = Seq.empty[(Int, String)].toDF("key", "value") + val window = Window.partitionBy($"key").orderBy($"value") + + checkAnswer( + df.select( + lead("value", 1).over(window), + lag("value", 1).over(window)), + Nil) + } + + test("lead/lag with positive offset") { + val df = Seq((1, "1"), (2, "2"), (1, "3"), (2, "4")).toDF("key", "value") + val window = Window.partitionBy($"key").orderBy($"value") + + checkAnswer( + df.select( + $"key", + lead("value", 1).over(window), + lag("value", 1).over(window)), + Row(1, "3", null) :: Row(1, null, "1") :: Row(2, "4", null) :: Row(2, null, "2") :: Nil) + } + + test("reverse lead/lag with positive offset") { + val df = Seq((1, "1"), (2, "2"), (1, "3"), (2, "4")).toDF("key", "value") + val window = Window.partitionBy($"key").orderBy($"value".desc) + + checkAnswer( + df.select( + $"key", + lead("value", 1).over(window), + lag("value", 1).over(window)), + Row(1, "1", null) :: Row(1, null, "3") :: Row(2, "2", null) :: Row(2, null, "4") :: Nil) + } + + test("lead/lag with negative offset") { + val df = Seq((1, "1"), (2, "2"), (1, "3"), (2, "4")).toDF("key", "value") + val window = Window.partitionBy($"key").orderBy($"value") + + checkAnswer( + df.select( + $"key", + lead("value", -1).over(window), + lag("value", -1).over(window)), + Row(1, null, "3") :: Row(1, "1", null) :: Row(2, null, "4") :: Row(2, "2", null) :: Nil) + } + + test("reverse lead/lag with negative offset") { + val df = Seq((1, "1"), (2, "2"), (1, "3"), (2, "4")).toDF("key", "value") + val window = Window.partitionBy($"key").orderBy($"value".desc) + + checkAnswer( + df.select( + $"key", + lead("value", -1).over(window), + lag("value", -1).over(window)), + Row(1, null, "1") :: Row(1, "3", null) :: Row(2, null, "2") :: Row(2, "4", null) :: Nil) + } + + test("lead/lag with default value") { + val default = "n/a" + val df = Seq((1, "1"), (2, "2"), (1, "3"), (2, "4"), (2, "5")).toDF("key", "value") + val window = Window.partitionBy($"key").orderBy($"value") + + checkAnswer( + df.select( + $"key", + lead("value", 2, default).over(window), + lag("value", 2, default).over(window), + lead("value", -2, default).over(window), + lag("value", -2, default).over(window)), + Row(1, default, default, default, default) :: Row(1, default, default, default, default) :: + Row(2, "5", default, default, "5") :: Row(2, default, "2", "2", default) :: + Row(2, default, default, default, default) :: Nil) + } + + test("rows/range between with empty data frame") { + val df = Seq.empty[(String, Int)].toDF("key", "value") + val window = Window.partitionBy($"key").orderBy($"value") + + checkAnswer( + df.select( + 'key, + first("value").over( + window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)), + first("value").over( + window.rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing))), + Nil) + } + + test("rows between should accept int/long values as boundary") { + val df = Seq((1L, "1"), (1L, "1"), (2147483650L, "1"), (3L, "2"), (2L, "1"), (2147483650L, "2")) + .toDF("key", "value") + + checkAnswer( + df.select( + $"key", + count("key").over( + Window.partitionBy($"value").orderBy($"key").rowsBetween(0, 2147483647))), + Seq(Row(1, 3), Row(1, 4), Row(2, 2), Row(3, 2), Row(2147483650L, 1), Row(2147483650L, 1)) + ) + + val e = intercept[AnalysisException]( + df.select( + $"key", + count("key").over( + 
Window.partitionBy($"value").orderBy($"key").rowsBetween(0, 2147483648L)))) + assert(e.message.contains("Boundary end is not a valid integer: 2147483648")) + } + + test("range between should accept at most one ORDER BY expression when unbounded") { + val df = Seq((1, 1)).toDF("key", "value") + val window = Window.orderBy($"key", $"value") + + checkAnswer( + df.select( + $"key", + min("key").over( + window.rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing))), + Seq(Row(1, 1)) + ) + + val e1 = intercept[AnalysisException]( + df.select( + min("key").over(window.rangeBetween(Window.unboundedPreceding, 1)))) + assert(e1.message.contains("A range window frame with value boundaries cannot be used in a " + + "window specification with multiple order by expressions")) + + val e2 = intercept[AnalysisException]( + df.select( + min("key").over(window.rangeBetween(-1, Window.unboundedFollowing)))) + assert(e2.message.contains("A range window frame with value boundaries cannot be used in a " + + "window specification with multiple order by expressions")) + + val e3 = intercept[AnalysisException]( + df.select( + min("key").over(window.rangeBetween(-1, 1)))) + assert(e3.message.contains("A range window frame with value boundaries cannot be used in a " + + "window specification with multiple order by expressions")) + } + + test("range between should accept numeric values only when bounded") { + val df = Seq("non_numeric").toDF("value") + val window = Window.orderBy($"value") + + checkAnswer( + df.select( + $"value", + min("value").over( + window.rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing))), + Row("non_numeric", "non_numeric") :: Nil) + + val e1 = intercept[AnalysisException]( + df.select( + min("value").over(window.rangeBetween(Window.unboundedPreceding, 1)))) + assert(e1.message.contains("The data type of the upper bound 'string' " + + "does not match the expected data type")) + + val e2 = intercept[AnalysisException]( + df.select( + min("value").over(window.rangeBetween(-1, Window.unboundedFollowing)))) + assert(e2.message.contains("The data type of the lower bound 'string' " + + "does not match the expected data type")) + + val e3 = intercept[AnalysisException]( + df.select( + min("value").over(window.rangeBetween(-1, 1)))) + assert(e3.message.contains("The data type of the lower bound 'string' " + + "does not match the expected data type")) + } + + test("range between should accept int/long values as boundary") { + val df = Seq((1L, "1"), (1L, "1"), (2147483650L, "1"), (3L, "2"), (2L, "1"), (2147483650L, "2")) + .toDF("key", "value") + + checkAnswer( + df.select( + $"key", + count("key").over( + Window.partitionBy($"value").orderBy($"key").rangeBetween(0, 2147483648L))), + Seq(Row(1, 3), Row(1, 3), Row(2, 2), Row(3, 2), Row(2147483650L, 1), Row(2147483650L, 1)) + ) + checkAnswer( + df.select( + $"key", + count("key").over( + Window.partitionBy($"value").orderBy($"key").rangeBetween(-2147483649L, 0))), + Seq(Row(1, 2), Row(1, 2), Row(2, 3), Row(2147483650L, 2), Row(2147483650L, 4), Row(3, 1)) + ) + + def dt(date: String): Date = Date.valueOf(date) + + val df2 = Seq((dt("2017-08-01"), "1"), (dt("2017-08-01"), "1"), (dt("2020-12-31"), "1"), + (dt("2017-08-03"), "2"), (dt("2017-08-02"), "1"), (dt("2020-12-31"), "2")) + .toDF("key", "value") + val window = Window.partitionBy($"value").orderBy($"key").rangeBetween(lit(0), lit(2)) + + checkAnswer( + df2.select( + $"key", + count("key").over(window)), + Seq(Row(dt("2017-08-01"), 3), Row(dt("2017-08-01"), 3), 
Row(dt("2020-12-31"), 1), + Row(dt("2017-08-03"), 1), Row(dt("2017-08-02"), 1), Row(dt("2020-12-31"), 1)) + ) + } + + test("range between should accept double values as boundary") { + val df = Seq((1.0D, "1"), (1.0D, "1"), (100.001D, "1"), (3.3D, "2"), (2.02D, "1"), + (100.001D, "2")).toDF("key", "value") + val window = Window.partitionBy($"value").orderBy($"key").rangeBetween(currentRow, lit(2.5D)) + + checkAnswer( + df.select( + $"key", + count("key").over(window)), + Seq(Row(1.0, 3), Row(1.0, 3), Row(100.001, 1), Row(3.3, 1), Row(2.02, 1), Row(100.001, 1)) + ) + } + + test("range between should accept interval values as boundary") { + def ts(timestamp: Long): Timestamp = new Timestamp(timestamp * 1000) + + val df = Seq((ts(1501545600), "1"), (ts(1501545600), "1"), (ts(1609372800), "1"), + (ts(1503000000), "2"), (ts(1502000000), "1"), (ts(1609372800), "2")) + .toDF("key", "value") + val window = Window.partitionBy($"value").orderBy($"key") + .rangeBetween(currentRow, lit(CalendarInterval.fromString("interval 23 days 4 hours"))) + + checkAnswer( + df.select( + $"key", + count("key").over(window)), + Seq(Row(ts(1501545600), 3), Row(ts(1501545600), 3), Row(ts(1609372800), 1), + Row(ts(1503000000), 1), Row(ts(1502000000), 1), Row(ts(1609372800), 1)) + ) + } + + test("unbounded rows/range between with aggregation") { + val df = Seq(("one", 1), ("two", 2), ("one", 3), ("two", 4)).toDF("key", "value") + val window = Window.partitionBy($"key").orderBy($"value") + + checkAnswer( + df.select( + 'key, + sum("value").over(window. + rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)), + sum("value").over(window. + rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing))), + Row("one", 4, 4) :: Row("one", 4, 4) :: Row("two", 6, 6) :: Row("two", 6, 6) :: Nil) + } + + test("unbounded preceding/following rows between with aggregation") { + val df = Seq((1, "1"), (2, "2"), (2, "3"), (1, "3"), (3, "2"), (4, "3")).toDF("key", "value") + val window = Window.partitionBy($"value").orderBy($"key") + + checkAnswer( + df.select( + $"key", + last("key").over( + window.rowsBetween(Window.currentRow, Window.unboundedFollowing)), + last("key").over( + window.rowsBetween(Window.unboundedPreceding, Window.currentRow))), + Row(1, 1, 1) :: Row(2, 3, 2) :: Row(3, 3, 3) :: Row(1, 4, 1) :: Row(2, 4, 2) :: + Row(4, 4, 4) :: Nil) + } + + test("reverse unbounded preceding/following rows between with aggregation") { + val df = Seq((1, "1"), (2, "2"), (2, "3"), (1, "3"), (3, "2"), (4, "3")).toDF("key", "value") + val window = Window.partitionBy($"value").orderBy($"key".desc) + + checkAnswer( + df.select( + $"key", + last("key").over( + window.rowsBetween(Window.currentRow, Window.unboundedFollowing)), + last("key").over( + window.rowsBetween(Window.unboundedPreceding, Window.currentRow))), + Row(1, 1, 1) :: Row(3, 2, 3) :: Row(2, 2, 2) :: Row(4, 1, 4) :: Row(2, 1, 2) :: + Row(1, 1, 1) :: Nil) + } + + test("unbounded preceding/following range between with aggregation") { + val df = Seq((5, "1"), (5, "2"), (4, "2"), (6, "2"), (3, "1"), (2, "2")).toDF("key", "value") + val window = Window.partitionBy("value").orderBy("key") + + checkAnswer( + df.select( + $"key", + avg("key").over(window.rangeBetween(Window.unboundedPreceding, 1)) + .as("avg_key1"), + avg("key").over(window.rangeBetween(Window.currentRow, Window.unboundedFollowing)) + .as("avg_key2")), + Row(3, 3.0d, 4.0d) :: Row(5, 4.0d, 5.0d) :: Row(2, 2.0d, 17.0d / 4.0d) :: + Row(4, 11.0d / 3.0d, 5.0d) :: Row(5, 17.0d / 4.0d, 11.0d / 2.0d) :: + Row(6, 
17.0d / 4.0d, 6.0d) :: Nil) + } + + // This is here to illustrate the fact that reverse order also reverses offsets. + test("reverse preceding/following range between with aggregation") { + val df = Seq(1, 2, 4, 3, 2, 1).toDF("value") + val window = Window.orderBy($"value".desc) + + checkAnswer( + df.select( + $"value", + sum($"value").over(window.rangeBetween(Window.unboundedPreceding, 1)), + sum($"value").over(window.rangeBetween(1, Window.unboundedFollowing))), + Row(1, 13, null) :: Row(2, 13, 2) :: Row(4, 7, 9) :: Row(3, 11, 6) :: + Row(2, 13, 2) :: Row(1, 13, null) :: Nil) + } + + test("sliding rows between with aggregation") { + val df = Seq((1, "1"), (2, "1"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value") + val window = Window.partitionBy($"value").orderBy($"key").rowsBetween(-1, 2) + + checkAnswer( + df.select( + $"key", + avg("key").over(window)), + Row(1, 4.0d / 3.0d) :: Row(1, 4.0d / 3.0d) :: Row(2, 3.0d / 2.0d) :: Row(2, 2.0d) :: + Row(2, 2.0d) :: Nil) + } + + test("reverse sliding rows between with aggregation") { + val df = Seq((1, "1"), (2, "1"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value") + val window = Window.partitionBy($"value").orderBy($"key".desc).rowsBetween(-1, 2) + + checkAnswer( + df.select( + $"key", + avg("key").over(window)), + Row(1, 1.0d) :: Row(1, 4.0d / 3.0d) :: Row(2, 4.0d / 3.0d) :: Row(2, 2.0d) :: + Row(2, 2.0d) :: Nil) + } + + test("sliding range between with aggregation") { + val df = Seq((1, "1"), (1, "1"), (3, "1"), (2, "2"), (2, "1"), (2, "2")).toDF("key", "value") + val window = Window.partitionBy($"value").orderBy($"key").rangeBetween(-1, 1) + + checkAnswer( + df.select( + $"key", + avg("key").over(window)), + Row(1, 4.0d / 3.0d) :: Row(1, 4.0d / 3.0d) :: Row(2, 7.0d / 4.0d) :: Row(3, 5.0d / 2.0d) :: + Row(2, 2.0d) :: Row(2, 2.0d) :: Nil) + } + + test("reverse sliding range between with aggregation") { + val df = Seq( + (1, "Thin", "Cell Phone", 6000), + (2, "Normal", "Tablet", 1500), + (3, "Mini", "Tablet", 5500), + (4, "Ultra thin", "Cell Phone", 5500), + (5, "Very thin", "Cell Phone", 6000), + (6, "Big", "Tablet", 2500), + (7, "Bendable", "Cell Phone", 3000), + (8, "Foldable", "Cell Phone", 3000), + (9, "Pro", "Tablet", 4500), + (10, "Pro2", "Tablet", 6500)). + toDF("id", "product", "category", "revenue") + val window = Window.partitionBy($"category").orderBy($"revenue".desc). 
+ rangeBetween(-2000L, 1000L) + + checkAnswer( + df.select( + $"id", + avg($"revenue").over(window).cast("int")), + Row(1, 5833) :: Row(2, 2000) :: Row(3, 5500) :: + Row(4, 5833) :: Row(5, 5833) :: Row(6, 2833) :: + Row(7, 3000) :: Row(8, 3000) :: Row(9, 5500) :: + Row(10, 6000) :: Nil) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala index 01c988ecc372..281147835abd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala @@ -55,56 +55,6 @@ class DataFrameWindowFunctionsSuite extends QueryTest with SharedSQLContext { Row(1, "1") :: Row(2, "2") :: Row(null, null) :: Row(null, null) :: Nil) } - test("Window.rowsBetween") { - val df = Seq(("one", 1), ("two", 2)).toDF("key", "value") - // Running (cumulative) sum - checkAnswer( - df.select('key, sum("value").over( - Window.rowsBetween(Window.unboundedPreceding, Window.currentRow))), - Row("one", 1) :: Row("two", 3) :: Nil - ) - } - - test("lead") { - val df = Seq((1, "1"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value") - df.createOrReplaceTempView("window_table") - - checkAnswer( - df.select( - lead("value", 1).over(Window.partitionBy($"key").orderBy($"value"))), - Row("1") :: Row(null) :: Row("2") :: Row(null) :: Nil) - } - - test("lag") { - val df = Seq((1, "1"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value") - df.createOrReplaceTempView("window_table") - - checkAnswer( - df.select( - lag("value", 1).over(Window.partitionBy($"key").orderBy($"value"))), - Row(null) :: Row("1") :: Row(null) :: Row("2") :: Nil) - } - - test("lead with default value") { - val df = Seq((1, "1"), (1, "1"), (2, "2"), (1, "1"), - (2, "2"), (1, "1"), (2, "2")).toDF("key", "value") - df.createOrReplaceTempView("window_table") - checkAnswer( - df.select( - lead("value", 2, "n/a").over(Window.partitionBy("key").orderBy("value"))), - Seq(Row("1"), Row("1"), Row("n/a"), Row("n/a"), Row("2"), Row("n/a"), Row("n/a"))) - } - - test("lag with default value") { - val df = Seq((1, "1"), (1, "1"), (2, "2"), (1, "1"), - (2, "2"), (1, "1"), (2, "2")).toDF("key", "value") - df.createOrReplaceTempView("window_table") - checkAnswer( - df.select( - lag("value", 2, "n/a").over(Window.partitionBy($"key").orderBy($"value"))), - Seq(Row("n/a"), Row("n/a"), Row("1"), Row("1"), Row("n/a"), Row("n/a"), Row("2"))) - } - test("rank functions in unspecific window") { val df = Seq((1, "1"), (2, "2"), (1, "2"), (2, "2")).toDF("key", "value") df.createOrReplaceTempView("window_table") @@ -136,199 +86,6 @@ class DataFrameWindowFunctionsSuite extends QueryTest with SharedSQLContext { assert(e.message.contains("requires window to be ordered")) } - test("aggregation and rows between") { - val df = Seq((1, "1"), (2, "1"), (2, "2"), (1, "1"), (2, "2")).toDF("key", "value") - df.createOrReplaceTempView("window_table") - checkAnswer( - df.select( - avg("key").over(Window.partitionBy($"value").orderBy($"key").rowsBetween(-1, 2))), - Seq(Row(4.0d / 3.0d), Row(4.0d / 3.0d), Row(3.0d / 2.0d), Row(2.0d), Row(2.0d))) - } - - test("aggregation and range between") { - val df = Seq((1, "1"), (1, "1"), (3, "1"), (2, "2"), (2, "1"), (2, "2")).toDF("key", "value") - df.createOrReplaceTempView("window_table") - checkAnswer( - df.select( - avg("key").over(Window.partitionBy($"value").orderBy($"key").rangeBetween(-1, 1))), - Seq(Row(4.0d / 3.0d), Row(4.0d / 
3.0d), Row(7.0d / 4.0d), Row(5.0d / 2.0d), - Row(2.0d), Row(2.0d))) - } - - test("row between should accept integer values as boundary") { - val df = Seq((1L, "1"), (1L, "1"), (2147483650L, "1"), - (3L, "2"), (2L, "1"), (2147483650L, "2")) - .toDF("key", "value") - df.createOrReplaceTempView("window_table") - checkAnswer( - df.select( - $"key", - count("key").over( - Window.partitionBy($"value").orderBy($"key").rowsBetween(0, 2147483647))), - Seq(Row(1, 3), Row(1, 4), Row(2, 2), Row(3, 2), Row(2147483650L, 1), Row(2147483650L, 1)) - ) - - val e = intercept[AnalysisException]( - df.select( - $"key", - count("key").over( - Window.partitionBy($"value").orderBy($"key").rowsBetween(0, 2147483648L)))) - assert(e.message.contains("Boundary end is not a valid integer: 2147483648")) - } - - test("range between should accept int/long values as boundary") { - val df = Seq((1L, "1"), (1L, "1"), (2147483650L, "1"), - (3L, "2"), (2L, "1"), (2147483650L, "2")) - .toDF("key", "value") - df.createOrReplaceTempView("window_table") - checkAnswer( - df.select( - $"key", - count("key").over( - Window.partitionBy($"value").orderBy($"key").rangeBetween(0, 2147483648L))), - Seq(Row(1, 3), Row(1, 3), Row(2, 2), Row(3, 2), Row(2147483650L, 1), Row(2147483650L, 1)) - ) - checkAnswer( - df.select( - $"key", - count("key").over( - Window.partitionBy($"value").orderBy($"key").rangeBetween(-2147483649L, 0))), - Seq(Row(1, 2), Row(1, 2), Row(2, 3), Row(2147483650L, 2), Row(2147483650L, 4), Row(3, 1)) - ) - - def dt(date: String): Date = Date.valueOf(date) - - val df2 = Seq((dt("2017-08-01"), "1"), (dt("2017-08-01"), "1"), (dt("2020-12-31"), "1"), - (dt("2017-08-03"), "2"), (dt("2017-08-02"), "1"), (dt("2020-12-31"), "2")) - .toDF("key", "value") - checkAnswer( - df2.select( - $"key", - count("key").over( - Window.partitionBy($"value").orderBy($"key").rangeBetween(lit(0), lit(2)))), - Seq(Row(dt("2017-08-01"), 3), Row(dt("2017-08-01"), 3), Row(dt("2020-12-31"), 1), - Row(dt("2017-08-03"), 1), Row(dt("2017-08-02"), 1), Row(dt("2020-12-31"), 1)) - ) - } - - test("range between should accept double values as boundary") { - val df = Seq((1.0D, "1"), (1.0D, "1"), (100.001D, "1"), - (3.3D, "2"), (2.02D, "1"), (100.001D, "2")) - .toDF("key", "value") - df.createOrReplaceTempView("window_table") - checkAnswer( - df.select( - $"key", - count("key").over( - Window.partitionBy($"value").orderBy($"key") - .rangeBetween(currentRow, lit(2.5D)))), - Seq(Row(1.0, 3), Row(1.0, 3), Row(100.001, 1), Row(3.3, 1), Row(2.02, 1), Row(100.001, 1)) - ) - } - - test("range between should accept interval values as boundary") { - def ts(timestamp: Long): Timestamp = new Timestamp(timestamp * 1000) - - val df = Seq((ts(1501545600), "1"), (ts(1501545600), "1"), (ts(1609372800), "1"), - (ts(1503000000), "2"), (ts(1502000000), "1"), (ts(1609372800), "2")) - .toDF("key", "value") - df.createOrReplaceTempView("window_table") - checkAnswer( - df.select( - $"key", - count("key").over( - Window.partitionBy($"value").orderBy($"key") - .rangeBetween(currentRow, - lit(CalendarInterval.fromString("interval 23 days 4 hours"))))), - Seq(Row(ts(1501545600), 3), Row(ts(1501545600), 3), Row(ts(1609372800), 1), - Row(ts(1503000000), 1), Row(ts(1502000000), 1), Row(ts(1609372800), 1)) - ) - } - - test("aggregation and rows between with unbounded") { - val df = Seq((1, "1"), (2, "2"), (2, "3"), (1, "3"), (3, "2"), (4, "3")).toDF("key", "value") - df.createOrReplaceTempView("window_table") - checkAnswer( - df.select( - $"key", - last("key").over( - 
Window.partitionBy($"value").orderBy($"key") - .rowsBetween(Window.currentRow, Window.unboundedFollowing)), - last("key").over( - Window.partitionBy($"value").orderBy($"key") - .rowsBetween(Window.unboundedPreceding, Window.currentRow)), - last("key").over(Window.partitionBy($"value").orderBy($"key").rowsBetween(-1, 1))), - Seq(Row(1, 1, 1, 1), Row(2, 3, 2, 3), Row(3, 3, 3, 3), Row(1, 4, 1, 2), Row(2, 4, 2, 4), - Row(4, 4, 4, 4))) - } - - test("aggregation and range between with unbounded") { - val df = Seq((5, "1"), (5, "2"), (4, "2"), (6, "2"), (3, "1"), (2, "2")).toDF("key", "value") - df.createOrReplaceTempView("window_table") - checkAnswer( - df.select( - $"key", - last("value").over( - Window.partitionBy($"value").orderBy($"key").rangeBetween(-2, -1)) - .equalTo("2") - .as("last_v"), - avg("key").over(Window.partitionBy("value").orderBy("key").rangeBetween(Long.MinValue, 1)) - .as("avg_key1"), - avg("key").over(Window.partitionBy("value").orderBy("key").rangeBetween(0, Long.MaxValue)) - .as("avg_key2"), - avg("key").over(Window.partitionBy("value").orderBy("key").rangeBetween(-1, 0)) - .as("avg_key3") - ), - Seq(Row(3, null, 3.0d, 4.0d, 3.0d), - Row(5, false, 4.0d, 5.0d, 5.0d), - Row(2, null, 2.0d, 17.0d / 4.0d, 2.0d), - Row(4, true, 11.0d / 3.0d, 5.0d, 4.0d), - Row(5, true, 17.0d / 4.0d, 11.0d / 2.0d, 4.5d), - Row(6, true, 17.0d / 4.0d, 6.0d, 11.0d / 2.0d))) - } - - test("reverse sliding range frame") { - val df = Seq( - (1, "Thin", "Cell Phone", 6000), - (2, "Normal", "Tablet", 1500), - (3, "Mini", "Tablet", 5500), - (4, "Ultra thin", "Cell Phone", 5500), - (5, "Very thin", "Cell Phone", 6000), - (6, "Big", "Tablet", 2500), - (7, "Bendable", "Cell Phone", 3000), - (8, "Foldable", "Cell Phone", 3000), - (9, "Pro", "Tablet", 4500), - (10, "Pro2", "Tablet", 6500)). - toDF("id", "product", "category", "revenue") - val window = Window. - partitionBy($"category"). - orderBy($"revenue".desc). - rangeBetween(-2000L, 1000L) - checkAnswer( - df.select( - $"id", - avg($"revenue").over(window).cast("int")), - Row(1, 5833) :: Row(2, 2000) :: Row(3, 5500) :: - Row(4, 5833) :: Row(5, 5833) :: Row(6, 2833) :: - Row(7, 3000) :: Row(8, 3000) :: Row(9, 5500) :: - Row(10, 6000) :: Nil) - } - - // This is here to illustrate the fact that reverse order also reverses offsets. - test("reverse unbounded range frame") { - val df = Seq(1, 2, 4, 3, 2, 1). - map(Tuple1.apply). - toDF("value") - val window = Window.orderBy($"value".desc) - checkAnswer( - df.select( - $"value", - sum($"value").over(window.rangeBetween(Long.MinValue, 1)), - sum($"value").over(window.rangeBetween(1, Long.MaxValue))), - Row(1, 13, null) :: Row(2, 13, 2) :: Row(4, 7, 9) :: - Row(3, 11, 6) :: Row(2, 13, 2) :: Row(1, 13, null) :: Nil) - } - test("statistical functions") { val df = Seq(("a", 1), ("a", 1), ("a", 2), ("a", 2), ("b", 4), ("b", 3), ("b", 2)). 
toDF("key", "value") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index d535896723bd..49c59cf695dc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -958,12 +958,12 @@ class DatasetSuite extends QueryTest with SharedSQLContext { ).toDS() val expected = - """+-------+ - || f| - |+-------+ - ||[foo,1]| - ||[bar,2]| - |+-------+ + """+--------+ + || f| + |+--------+ + ||[foo, 1]| + ||[bar, 2]| + |+--------+ |""".stripMargin checkShowString(ds, expected) @@ -1441,6 +1441,11 @@ class DatasetSuite extends QueryTest with SharedSQLContext { assert(e.getCause.isInstanceOf[NullPointerException]) } } + + test("SPARK-23025: Add support for null type in scala reflection") { + val data = Seq(("a", null)) + checkDataset(data.toDS(), data: _*) + } } case class SingleData(id: Int) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala new file mode 100644 index 000000000000..22fb496bc838 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.sql.test.SharedSQLContext + +class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext { + import testImplicits._ + + private val allFileBasedDataSources = Seq("orc", "parquet", "csv", "json", "text") + + allFileBasedDataSources.foreach { format => + test(s"Writing empty datasets should not fail - $format") { + withTempPath { dir => + Seq("str").toDS().limit(0).write.format(format).save(dir.getCanonicalPath) + } + } + } + + // `TEXT` data source always has a single column whose name is `value`. + allFileBasedDataSources.filterNot(_ == "text").foreach { format => + test(s"SPARK-23072 Write and read back unicode column names - $format") { + withTempPath { path => + val dir = path.getCanonicalPath + + // scalastyle:off nonascii + val df = Seq("a").toDF("한글") + // scalastyle:on nonascii + + df.write.format(format).option("header", "true").save(dir) + val answerDf = spark.read.format(format).option("header", "true").load(dir) + + assert(df.schema.sameType(answerDf.schema)) + checkAnswer(df, answerDf) + } + } + } + + // Only ORC/Parquet support this. `CSV` and `JSON` returns an empty schema. + // `TEXT` data source always has a single column whose name is `value`. 
+ Seq("orc", "parquet").foreach { format => + test(s"SPARK-15474 Write and read back non-emtpy schema with empty dataframe - $format") { + withTempPath { file => + val path = file.getCanonicalPath + val emptyDf = Seq((true, 1, "str")).toDF().limit(0) + emptyDf.write.format(format).save(path) + + val df = spark.read.format(format).load(path) + assert(df.schema.sameType(emptyDf.schema)) + checkAnswer(df, emptyDf) + } + } + } + + allFileBasedDataSources.foreach { format => + test(s"SPARK-22146 read files containing special characters using $format") { + val nameWithSpecialChars = s"sp&cial%chars" + withTempDir { dir => + val tmpFile = s"$dir/$nameWithSpecialChars" + spark.createDataset(Seq("a", "b")).write.format(format).save(tmpFile) + val fileContent = spark.read.format(format).load(tmpFile) + checkAnswer(fileContent, Seq(Row("a"), Row("b"))) + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala new file mode 100644 index 000000000000..c6dd7dadc9d9 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/FilterPushdownBenchmark.scala @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.io.File + +import scala.util.{Random, Try} + +import org.apache.spark.SparkConf +import org.apache.spark.sql.functions.monotonically_increasing_id +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.util.{Benchmark, Utils} + + +/** + * Benchmark to measure read performance with Filter pushdown. 
+ */ +object FilterPushdownBenchmark { + val conf = new SparkConf() + conf.set("orc.compression", "snappy") + conf.set("spark.sql.parquet.compression.codec", "snappy") + + private val spark = SparkSession.builder() + .master("local[1]") + .appName("FilterPushdownBenchmark") + .config(conf) + .getOrCreate() + + def withTempPath(f: File => Unit): Unit = { + val path = Utils.createTempDir() + path.delete() + try f(path) finally Utils.deleteRecursively(path) + } + + def withTempTable(tableNames: String*)(f: => Unit): Unit = { + try f finally tableNames.foreach(spark.catalog.dropTempView) + } + + def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { + val (keys, values) = pairs.unzip + val currentValues = keys.map(key => Try(spark.conf.get(key)).toOption) + (keys, values).zipped.foreach(spark.conf.set) + try f finally { + keys.zip(currentValues).foreach { + case (key, Some(value)) => spark.conf.set(key, value) + case (key, None) => spark.conf.unset(key) + } + } + } + + private def prepareTable(dir: File, numRows: Int, width: Int): Unit = { + import spark.implicits._ + val selectExpr = (1 to width).map(i => s"CAST(value AS STRING) c$i") + val df = spark.range(numRows).map(_ => Random.nextLong).selectExpr(selectExpr: _*) + .withColumn("id", monotonically_increasing_id()) + + val dirORC = dir.getCanonicalPath + "/orc" + val dirParquet = dir.getCanonicalPath + "/parquet" + + df.write.mode("overwrite").orc(dirORC) + df.write.mode("overwrite").parquet(dirParquet) + + spark.read.orc(dirORC).createOrReplaceTempView("orcTable") + spark.read.parquet(dirParquet).createOrReplaceTempView("parquetTable") + } + + def filterPushDownBenchmark( + values: Int, + title: String, + whereExpr: String, + selectExpr: String = "*"): Unit = { + val benchmark = new Benchmark(title, values, minNumIters = 5) + + Seq(false, true).foreach { pushDownEnabled => + val name = s"Parquet Vectorized ${if (pushDownEnabled) s"(Pushdown)" else ""}" + benchmark.addCase(name) { _ => + withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> s"$pushDownEnabled") { + spark.sql(s"SELECT $selectExpr FROM parquetTable WHERE $whereExpr").collect() + } + } + } + + Seq(false, true).foreach { pushDownEnabled => + val name = s"Native ORC Vectorized ${if (pushDownEnabled) s"(Pushdown)" else ""}" + benchmark.addCase(name) { _ => + withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> s"$pushDownEnabled") { + spark.sql(s"SELECT $selectExpr FROM orcTable WHERE $whereExpr").collect() + } + } + } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 + Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + + Select 0 row (id IS NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 7882 / 7957 2.0 501.1 1.0X + Parquet Vectorized (Pushdown) 55 / 60 285.2 3.5 142.9X + Native ORC Vectorized 5592 / 5627 2.8 355.5 1.4X + Native ORC Vectorized (Pushdown) 66 / 70 237.2 4.2 118.9X + + Select 0 row (7864320 < id < 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 7884 / 7909 2.0 501.2 1.0X + Parquet Vectorized (Pushdown) 739 / 752 21.3 47.0 10.7X + Native ORC Vectorized 5614 / 5646 2.8 356.9 1.4X + Native ORC Vectorized (Pushdown) 81 / 83 195.2 5.1 97.8X + + Select 1 row (id = 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + 
----------------------------------------------------------------------------------------------- + Parquet Vectorized 7905 / 8027 2.0 502.6 1.0X + Parquet Vectorized (Pushdown) 740 / 766 21.2 47.1 10.7X + Native ORC Vectorized 5684 / 5738 2.8 361.4 1.4X + Native ORC Vectorized (Pushdown) 78 / 81 202.4 4.9 101.7X + + Select 1 row (id <=> 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 7928 / 7993 2.0 504.1 1.0X + Parquet Vectorized (Pushdown) 747 / 772 21.0 47.5 10.6X + Native ORC Vectorized 5728 / 5753 2.7 364.2 1.4X + Native ORC Vectorized (Pushdown) 76 / 78 207.9 4.8 104.8X + + Select 1 row (7864320 <= id <= 7864320):Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 7939 / 8021 2.0 504.8 1.0X + Parquet Vectorized (Pushdown) 746 / 770 21.1 47.4 10.6X + Native ORC Vectorized 5690 / 5734 2.8 361.7 1.4X + Native ORC Vectorized (Pushdown) 76 / 79 206.7 4.8 104.3X + + Select 1 row (7864319 < id < 7864321): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 7972 / 8019 2.0 506.9 1.0X + Parquet Vectorized (Pushdown) 742 / 764 21.2 47.2 10.7X + Native ORC Vectorized 5704 / 5743 2.8 362.6 1.4X + Native ORC Vectorized (Pushdown) 76 / 78 207.9 4.8 105.4X + + Select 10% rows (id < 1572864): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 8733 / 8808 1.8 555.2 1.0X + Parquet Vectorized (Pushdown) 2213 / 2267 7.1 140.7 3.9X + Native ORC Vectorized 6420 / 6463 2.4 408.2 1.4X + Native ORC Vectorized (Pushdown) 1313 / 1331 12.0 83.5 6.7X + + Select 50% rows (id < 7864320): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 11518 / 11591 1.4 732.3 1.0X + Parquet Vectorized (Pushdown) 7962 / 7991 2.0 506.2 1.4X + Native ORC Vectorized 8927 / 8985 1.8 567.6 1.3X + Native ORC Vectorized (Pushdown) 6102 / 6160 2.6 387.9 1.9X + + Select 90% rows (id < 14155776): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 14255 / 14389 1.1 906.3 1.0X + Parquet Vectorized (Pushdown) 13564 / 13594 1.2 862.4 1.1X + Native ORC Vectorized 11442 / 11608 1.4 727.5 1.2X + Native ORC Vectorized (Pushdown) 10991 / 11029 1.4 698.8 1.3X + + Select all rows (id IS NOT NULL): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 14917 / 14938 1.1 948.4 1.0X + Parquet Vectorized (Pushdown) 14910 / 14964 1.1 948.0 1.0X + Native ORC Vectorized 11986 / 12069 1.3 762.0 1.2X + Native ORC Vectorized (Pushdown) 12037 / 12123 1.3 765.3 1.2X + + Select all rows (id > -1): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 14951 / 14976 1.1 950.6 1.0X + Parquet Vectorized (Pushdown) 14934 / 15016 1.1 949.5 1.0X + Native ORC Vectorized 12000 / 12156 1.3 763.0 1.2X + Native ORC Vectorized (Pushdown) 12079 / 12113 1.3 
767.9 1.2X + + Select all rows (id != -1): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ----------------------------------------------------------------------------------------------- + Parquet Vectorized 14930 / 14972 1.1 949.3 1.0X + Parquet Vectorized (Pushdown) 15015 / 15047 1.0 954.6 1.0X + Native ORC Vectorized 12090 / 12259 1.3 768.7 1.2X + Native ORC Vectorized (Pushdown) 12021 / 12096 1.3 764.2 1.2X + */ + benchmark.run() + } + + def main(args: Array[String]): Unit = { + val numRows = 1024 * 1024 * 15 + val width = 5 + val mid = numRows / 2 + + withTempPath { dir => + withTempTable("orcTable", "patquetTable") { + prepareTable(dir, numRows, width) + + Seq("id IS NULL", s"$mid < id AND id < $mid").foreach { whereExpr => + val title = s"Select 0 row ($whereExpr)".replace("id AND id", "id") + filterPushDownBenchmark(numRows, title, whereExpr) + } + + Seq( + s"id = $mid", + s"id <=> $mid", + s"$mid <= id AND id <= $mid", + s"${mid - 1} < id AND id < ${mid + 1}" + ).foreach { whereExpr => + val title = s"Select 1 row ($whereExpr)".replace("id AND id", "id") + filterPushDownBenchmark(numRows, title, whereExpr) + } + + val selectExpr = (1 to width).map(i => s"MAX(c$i)").mkString("", ",", ", MAX(id)") + + Seq(10, 50, 90).foreach { percent => + filterPushDownBenchmark( + numRows, + s"Select $percent% rows (id < ${numRows * percent / 100})", + s"id < ${numRows * percent / 100}", + selectExpr + ) + } + + Seq("id IS NOT NULL", "id > -1", "id != -1").foreach { whereExpr => + filterPushDownBenchmark( + numRows, + s"Select all rows ($whereExpr)", + whereExpr, + selectExpr) + } + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 5e077285ade5..083a0c0b1b9a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -28,8 +28,6 @@ import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} import org.apache.spark.sql.catalyst.util.StringUtils import org.apache.spark.sql.execution.aggregate import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, SortAggregateExec} -import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} -import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, CartesianProductExec, SortMergeJoinExec} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf @@ -1519,24 +1517,6 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } - test("decimal precision with multiply/division") { - checkAnswer(sql("select 10.3 * 3.0"), Row(BigDecimal("30.90"))) - checkAnswer(sql("select 10.3000 * 3.0"), Row(BigDecimal("30.90000"))) - checkAnswer(sql("select 10.30000 * 30.0"), Row(BigDecimal("309.000000"))) - checkAnswer(sql("select 10.300000000000000000 * 3.000000000000000000"), - Row(BigDecimal("30.900000000000000000000000000000000000", new MathContext(38)))) - checkAnswer(sql("select 10.300000000000000000 * 3.0000000000000000000"), - Row(null)) - - checkAnswer(sql("select 10.3 / 3.0"), Row(BigDecimal("3.433333"))) - checkAnswer(sql("select 10.3000 / 3.0"), Row(BigDecimal("3.4333333"))) - checkAnswer(sql("select 10.30000 / 30.0"), Row(BigDecimal("0.343333333"))) - checkAnswer(sql("select 10.300000000000000000 / 3.00000000000000000"), - Row(BigDecimal("3.433333333333333333333333333", new MathContext(38)))) - 
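[Editor's sketch, not part of the patch] The FilterPushdownBenchmark added above drives its comparison by flipping SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED and SQLConf.ORC_FILTER_PUSHDOWN_ENABLED around the same query. A rough standalone sketch of that toggle (assumes a SparkSession named spark; the path and the ad-hoc timing are illustrative only):

val path = "/tmp/pushdown-sketch-parquet"
spark.range(0, 15 * 1024 * 1024).toDF("id").write.mode("overwrite").parquet(path)
spark.read.parquet(path).createOrReplaceTempView("t")

Seq("false", "true").foreach { enabled =>
  // This is the key behind SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.
  spark.conf.set("spark.sql.parquet.filterPushdown", enabled)
  val start = System.nanoTime()
  spark.sql("SELECT * FROM t WHERE id IS NULL").collect()   // "Select 0 row" case
  println(s"filterPushdown=$enabled: ${(System.nanoTime() - start) / 1000000} ms")
}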
checkAnswer(sql("select 10.3000000000000000000 / 3.00000000000000000"), - Row(BigDecimal("3.4333333333333333333333333333", new MathContext(38)))) - } - test("SPARK-10215 Div of Decimal returns null") { val d = Decimal(1.12321).toBigDecimal val df = Seq((d, 1)).toDF("a", "b") @@ -2719,6 +2699,17 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } + test("SPARK-23079: constraints should be inferred correctly with aliases") { + withTable("t") { + spark.range(5).write.saveAsTable("t") + val t = spark.read.table("t") + val left = t.withColumn("xid", $"id" + lit(1)).as("x") + val right = t.withColumnRenamed("id", "xid").as("y") + val df = left.join(right, "xid").filter("id = 3").toDF() + checkAnswer(df, Row(4, 3)) + } + } + test("SRARK-22266: the same aggregate function was calculated multiple times") { val query = "SELECT a, max(b+1), max(b+1) + 1 FROM testData2 GROUP BY a" val df = sql(query) @@ -2759,20 +2750,4 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } } - - // Only New OrcFileFormat supports this - Seq(classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat].getCanonicalName, - "parquet").foreach { format => - test(s"SPARK-15474 Write and read back non-emtpy schema with empty dataframe - $format") { - withTempPath { file => - val path = file.getCanonicalPath - val emptyDf = Seq((true, 1, "str")).toDF.limit(0) - emptyDf.write.format(format).save(path) - - val df = spark.read.format(format).load(path) - assert(df.schema.sameType(emptyDf.schema)) - checkAnswer(df, emptyDf) - } - } - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala index c01666770720..8386f324d8fe 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql -import org.scalatest.BeforeAndAfterAll -import org.scalatest.BeforeAndAfterEach import scala.collection.mutable.ArrayBuffer import org.apache.spark.SparkFunSuite @@ -28,8 +26,7 @@ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.util.QueryExecutionListener -class SessionStateSuite extends SparkFunSuite - with BeforeAndAfterEach with BeforeAndAfterAll { +class SessionStateSuite extends SparkFunSuite { /** * A shared SparkSession for all tests in this suite. 
Make sure you reset any changes to this diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index 7f1c009ca6e7..af6a10b425b9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -17,12 +17,13 @@ package org.apache.spark.sql +import org.apache.spark.sql.api.java._ import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.execution.command.ExplainCommand -import org.apache.spark.sql.functions.{col, udf} +import org.apache.spark.sql.functions.udf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.test.SQLTestData._ -import org.apache.spark.sql.types.DataTypes +import org.apache.spark.sql.types.{DataTypes, DoubleType} private case class FunctionResult(f1: String, f2: String) @@ -79,7 +80,7 @@ class UDFSuite extends QueryTest with SharedSQLContext { val e = intercept[AnalysisException] { df.selectExpr("substr('abcd', 2, 3, 4)") } - assert(e.getMessage.contains("Invalid number of arguments for function substr")) + assert(e.getMessage.contains("Invalid number of arguments for function substr. Expected:")) } test("error reporting for incorrect number of arguments - udf") { @@ -88,7 +89,7 @@ class UDFSuite extends QueryTest with SharedSQLContext { spark.udf.register("foo", (_: String).length) df.selectExpr("foo(2, 3, 4)") } - assert(e.getMessage.contains("Invalid number of arguments for function foo")) + assert(e.getMessage.contains("Invalid number of arguments for function foo. Expected:")) } test("error reporting for undefined functions") { @@ -128,6 +129,13 @@ class UDFSuite extends QueryTest with SharedSQLContext { val df2 = testData.select(bar()) assert(df2.logicalPlan.asInstanceOf[Project].projectList.forall(!_.deterministic)) assert(df2.head().getDouble(0) >= 0.0) + + val javaUdf = udf(new UDF0[Double] { + override def call(): Double = Math.random() + }, DoubleType).asNondeterministic() + val df3 = testData.select(javaUdf()) + assert(df3.logicalPlan.asInstanceOf[Project].projectList.forall(!_.deterministic)) + assert(df3.head().getDouble(0) >= 0.0) } test("TwoArgument UDF") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala index a08433ba794d..cc8b600efa46 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala @@ -21,7 +21,7 @@ import scala.beans.{BeanInfo, BeanProperty} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} -import org.apache.spark.sql.catalyst.expressions.GenericInternalRow +import org.apache.spark.sql.catalyst.expressions.{Cast, ExpressionEvalHelper, GenericInternalRow, Literal} import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData} import org.apache.spark.sql.execution.datasources.parquet.ParquetTest import org.apache.spark.sql.functions._ @@ -44,6 +44,8 @@ object UDT { case v: MyDenseVector => java.util.Arrays.equals(this.data, v.data) case _ => false } + + override def toString: String = data.mkString("(", ", ", ")") } private[sql] class MyDenseVectorUDT extends UserDefinedType[MyDenseVector] { @@ -143,7 +145,8 @@ private[spark] class ExampleSubTypeUDT extends UserDefinedType[IExampleSubType] override def userClass: Class[IExampleSubType] = 
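[Editor's sketch, not part of the patch] The UDFSuite change above registers a Java UDF0 and marks it non-deterministic; the same can be done from Scala. A minimal sketch (assumes a SparkSession named spark with spark.implicits._ imported; names are illustrative):

import org.apache.spark.sql.functions.udf

// asNondeterministic() tells the optimizer not to fold or reorder the call,
// which is what the deterministic-flag assertions in the test above check.
val randomUdf = udf(() => math.random).asNondeterministic()

val df = Seq(1, 2, 3).toDF("id").select($"id", randomUdf().as("r"))
df.show()   // each row can carry a different value of r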
classOf[IExampleSubType] } -class UserDefinedTypeSuite extends QueryTest with SharedSQLContext with ParquetTest { +class UserDefinedTypeSuite extends QueryTest with SharedSQLContext with ParquetTest + with ExpressionEvalHelper { import testImplicits._ private lazy val pointsRDD = Seq( @@ -304,4 +307,12 @@ class UserDefinedTypeSuite extends QueryTest with SharedSQLContext with ParquetT pointsRDD.except(pointsRDD2), Seq(Row(0.0, new UDT.MyDenseVector(Array(0.2, 2.0))))) } + + test("SPARK-23054 Cast UserDefinedType to string") { + val udt = new UDT.MyDenseVectorUDT() + val vector = new UDT.MyDenseVector(Array(1.0, 3.0, 5.0, 7.0, 9.0)) + val data = udt.serialize(vector) + val ret = Cast(Literal(data, udt), StringType, None) + checkEvaluation(ret, "(1.0, 3.0, 5.0, 7.0, 9.0)") + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/GlobalTempViewSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/GlobalTempViewSuite.scala index cc943e0356f2..dcc6fa6403f3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/GlobalTempViewSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/GlobalTempViewSuite.scala @@ -36,7 +36,7 @@ class GlobalTempViewSuite extends QueryTest with SharedSQLContext { test("basic semantic") { val expectedErrorMsg = "not found" - try { + withGlobalTempView("src") { sql("CREATE GLOBAL TEMP VIEW src AS SELECT 1, 'a'") // If there is no database in table name, we should try local temp view first, if not found, @@ -79,19 +79,15 @@ class GlobalTempViewSuite extends QueryTest with SharedSQLContext { // We can also use Dataset API to replace global temp view Seq(2 -> "b").toDF("i", "j").createOrReplaceGlobalTempView("src") checkAnswer(spark.table(s"$globalTempDB.src"), Row(2, "b")) - } finally { - spark.catalog.dropGlobalTempView("src") } } test("global temp view is shared among all sessions") { - try { + withGlobalTempView("src") { sql("CREATE GLOBAL TEMP VIEW src AS SELECT 1, 2") checkAnswer(spark.table(s"$globalTempDB.src"), Row(1, 2)) val newSession = spark.newSession() checkAnswer(newSession.table(s"$globalTempDB.src"), Row(1, 2)) - } finally { - spark.catalog.dropGlobalTempView("src") } } @@ -105,27 +101,25 @@ class GlobalTempViewSuite extends QueryTest with SharedSQLContext { test("CREATE GLOBAL TEMP VIEW USING") { withTempPath { path => - try { + withGlobalTempView("src") { Seq(1 -> "a").toDF("i", "j").write.parquet(path.getAbsolutePath) sql(s"CREATE GLOBAL TEMP VIEW src USING parquet OPTIONS (PATH '${path.toURI}')") checkAnswer(spark.table(s"$globalTempDB.src"), Row(1, "a")) sql(s"INSERT INTO $globalTempDB.src SELECT 2, 'b'") checkAnswer(spark.table(s"$globalTempDB.src"), Row(1, "a") :: Row(2, "b") :: Nil) - } finally { - spark.catalog.dropGlobalTempView("src") } } } test("CREATE TABLE LIKE should work for global temp view") { - try { - sql("CREATE GLOBAL TEMP VIEW src AS SELECT 1 AS a, '2' AS b") - sql(s"CREATE TABLE cloned LIKE $globalTempDB.src") - val tableMeta = spark.sessionState.catalog.getTableMetadata(TableIdentifier("cloned")) - assert(tableMeta.schema == new StructType().add("a", "int", false).add("b", "string", false)) - } finally { - spark.catalog.dropGlobalTempView("src") - sql("DROP TABLE default.cloned") + withTable("cloned") { + withGlobalTempView("src") { + sql("CREATE GLOBAL TEMP VIEW src AS SELECT 1 AS a, '2' AS b") + sql(s"CREATE TABLE cloned LIKE $globalTempDB.src") + val tableMeta = spark.sessionState.catalog.getTableMetadata(TableIdentifier("cloned")) + assert(tableMeta.schema == new 
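[Editor's sketch, not part of the patch] The GlobalTempViewSuite hunks around this point mostly swap hand-written try/finally cleanup for the withGlobalTempView helper. For reference, the behaviour under test can be reproduced as follows (assumes a SparkSession named spark and the default global_temp database name):

spark.sql("CREATE GLOBAL TEMP VIEW src AS SELECT 1 AS id, 'a' AS name")
try {
  // Global temp views are only resolvable through the reserved database ...
  spark.table("global_temp.src").show()
  // ... and they are visible from other sessions on the same SparkContext.
  spark.newSession().table("global_temp.src").show()
} finally {
  spark.catalog.dropGlobalTempView("src")
}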
StructType() + .add("a", "int", false).add("b", "string", false)) + } } } @@ -146,26 +140,25 @@ class GlobalTempViewSuite extends QueryTest with SharedSQLContext { } test("should lookup global temp view if and only if global temp db is specified") { - try { - sql("CREATE GLOBAL TEMP VIEW same_name AS SELECT 3, 4") - sql("CREATE TEMP VIEW same_name AS SELECT 1, 2") + withTempView("same_name") { + withGlobalTempView("same_name") { + sql("CREATE GLOBAL TEMP VIEW same_name AS SELECT 3, 4") + sql("CREATE TEMP VIEW same_name AS SELECT 1, 2") - checkAnswer(sql("SELECT * FROM same_name"), Row(1, 2)) + checkAnswer(sql("SELECT * FROM same_name"), Row(1, 2)) - // we never lookup global temp views if database is not specified in table name - spark.catalog.dropTempView("same_name") - intercept[AnalysisException](sql("SELECT * FROM same_name")) + // we never lookup global temp views if database is not specified in table name + spark.catalog.dropTempView("same_name") + intercept[AnalysisException](sql("SELECT * FROM same_name")) - // Use qualified name to lookup a global temp view. - checkAnswer(sql(s"SELECT * FROM $globalTempDB.same_name"), Row(3, 4)) - } finally { - spark.catalog.dropTempView("same_name") - spark.catalog.dropGlobalTempView("same_name") + // Use qualified name to lookup a global temp view. + checkAnswer(sql(s"SELECT * FROM $globalTempDB.same_name"), Row(3, 4)) + } } } test("public Catalog should recognize global temp view") { - try { + withGlobalTempView("src") { sql("CREATE GLOBAL TEMP VIEW src AS SELECT 1, 2") assert(spark.catalog.tableExists(globalTempDB, "src")) @@ -175,8 +168,6 @@ class GlobalTempViewSuite extends QueryTest with SharedSQLContext { description = null, tableType = "TEMPORARY", isTemporary = true).toString) - } finally { - spark.catalog.dropGlobalTempView("src") } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index b50642d275ba..f8b26f5b28cc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -260,11 +260,16 @@ class PlannerSuite extends SharedSQLContext { // do they satisfy the distribution requirements? As a result, we need at least four test cases. private def assertDistributionRequirementsAreSatisfied(outputPlan: SparkPlan): Unit = { - if (outputPlan.children.length > 1 - && outputPlan.requiredChildDistribution.toSet != Set(UnspecifiedDistribution)) { - val childPartitionings = outputPlan.children.map(_.outputPartitioning) - if (!Partitioning.allCompatible(childPartitionings)) { - fail(s"Partitionings are not compatible: $childPartitionings") + if (outputPlan.children.length > 1) { + val childPartitionings = outputPlan.children.zip(outputPlan.requiredChildDistribution) + .filter { + case (_, UnspecifiedDistribution) => false + case (_, _: BroadcastDistribution) => false + case _ => true + }.map(_._1.outputPartitioning) + + if (childPartitionings.map(_.numPartitions).toSet.size > 1) { + fail(s"Partitionings doesn't have same number of partitions: $childPartitionings") } } outputPlan.children.zip(outputPlan.requiredChildDistribution).foreach { @@ -274,40 +279,7 @@ class PlannerSuite extends SharedSQLContext { } } - test("EnsureRequirements with incompatible child partitionings which satisfy distribution") { - // Consider an operator that requires inputs that are clustered by two expressions (e.g. 
- // sort merge join where there are multiple columns in the equi-join condition) - val clusteringA = Literal(1) :: Nil - val clusteringB = Literal(2) :: Nil - val distribution = ClusteredDistribution(clusteringA ++ clusteringB) - // Say that the left and right inputs are each partitioned by _one_ of the two join columns: - val leftPartitioning = HashPartitioning(clusteringA, 1) - val rightPartitioning = HashPartitioning(clusteringB, 1) - // Individually, each input's partitioning satisfies the clustering distribution: - assert(leftPartitioning.satisfies(distribution)) - assert(rightPartitioning.satisfies(distribution)) - // However, these partitionings are not compatible with each other, so we still need to - // repartition both inputs prior to performing the join: - assert(!leftPartitioning.compatibleWith(rightPartitioning)) - assert(!rightPartitioning.compatibleWith(leftPartitioning)) - val inputPlan = DummySparkPlan( - children = Seq( - DummySparkPlan(outputPartitioning = leftPartitioning), - DummySparkPlan(outputPartitioning = rightPartitioning) - ), - requiredChildDistribution = Seq(distribution, distribution), - requiredChildOrdering = Seq(Seq.empty, Seq.empty) - ) - val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(inputPlan) - assertDistributionRequirementsAreSatisfied(outputPlan) - if (outputPlan.collect { case e: ShuffleExchangeExec => true }.isEmpty) { - fail(s"Exchange should have been added:\n$outputPlan") - } - } - test("EnsureRequirements with child partitionings with different numbers of output partitions") { - // This is similar to the previous test, except it checks that partitionings are not compatible - // unless they produce the same number of partitions. val clustering = Literal(1) :: Nil val distribution = ClusteredDistribution(clustering) val inputPlan = DummySparkPlan( @@ -386,18 +358,15 @@ class PlannerSuite extends SharedSQLContext { } } - test("EnsureRequirements eliminates Exchange if child has Exchange with same partitioning") { + test("EnsureRequirements eliminates Exchange if child has same partitioning") { val distribution = ClusteredDistribution(Literal(1) :: Nil) - val finalPartitioning = HashPartitioning(Literal(1) :: Nil, 5) - val childPartitioning = HashPartitioning(Literal(2) :: Nil, 5) - assert(!childPartitioning.satisfies(distribution)) - val inputPlan = ShuffleExchangeExec(finalPartitioning, - DummySparkPlan( - children = DummySparkPlan(outputPartitioning = childPartitioning) :: Nil, - requiredChildDistribution = Seq(distribution), - requiredChildOrdering = Seq(Seq.empty)), - None) + val partitioning = HashPartitioning(Literal(1) :: Nil, 5) + assert(partitioning.satisfies(distribution)) + val inputPlan = ShuffleExchangeExec( + partitioning, + DummySparkPlan(outputPartitioning = partitioning), + None) val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(inputPlan) assertDistributionRequirementsAreSatisfied(outputPlan) if (outputPlan.collect { case e: ShuffleExchangeExec => true }.size == 2) { @@ -407,17 +376,13 @@ class PlannerSuite extends SharedSQLContext { test("EnsureRequirements does not eliminate Exchange with different partitioning") { val distribution = ClusteredDistribution(Literal(1) :: Nil) - // Number of partitions differ - val finalPartitioning = HashPartitioning(Literal(1) :: Nil, 8) - val childPartitioning = HashPartitioning(Literal(2) :: Nil, 5) - assert(!childPartitioning.satisfies(distribution)) - val inputPlan = ShuffleExchangeExec(finalPartitioning, - DummySparkPlan( - children = 
DummySparkPlan(outputPartitioning = childPartitioning) :: Nil, - requiredChildDistribution = Seq(distribution), - requiredChildOrdering = Seq(Seq.empty)), - None) + val partitioning = HashPartitioning(Literal(2) :: Nil, 5) + assert(!partitioning.satisfies(distribution)) + val inputPlan = ShuffleExchangeExec( + partitioning, + DummySparkPlan(outputPartitioning = partitioning), + None) val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(inputPlan) assertDistributionRequirementsAreSatisfied(outputPlan) if (outputPlan.collect { case e: ShuffleExchangeExec => true }.size == 1) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala index 08a4a21b20f6..14082197ba0b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala @@ -69,21 +69,25 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { } test("create a permanent view on a temp view") { - withView("jtv1", "temp_jtv1", "global_temp_jtv1") { - sql("CREATE TEMPORARY VIEW temp_jtv1 AS SELECT * FROM jt WHERE id > 3") - var e = intercept[AnalysisException] { - sql("CREATE VIEW jtv1 AS SELECT * FROM temp_jtv1 WHERE id < 6") - }.getMessage - assert(e.contains("Not allowed to create a permanent view `jtv1` by " + - "referencing a temporary view `temp_jtv1`")) - - val globalTempDB = spark.sharedState.globalTempViewManager.database - sql("CREATE GLOBAL TEMP VIEW global_temp_jtv1 AS SELECT * FROM jt WHERE id > 0") - e = intercept[AnalysisException] { - sql(s"CREATE VIEW jtv1 AS SELECT * FROM $globalTempDB.global_temp_jtv1 WHERE id < 6") - }.getMessage - assert(e.contains(s"Not allowed to create a permanent view `jtv1` by referencing " + - s"a temporary view `global_temp`.`global_temp_jtv1`")) + withView("jtv1") { + withTempView("temp_jtv1") { + withGlobalTempView("global_temp_jtv1") { + sql("CREATE TEMPORARY VIEW temp_jtv1 AS SELECT * FROM jt WHERE id > 3") + var e = intercept[AnalysisException] { + sql("CREATE VIEW jtv1 AS SELECT * FROM temp_jtv1 WHERE id < 6") + }.getMessage + assert(e.contains("Not allowed to create a permanent view `jtv1` by " + + "referencing a temporary view `temp_jtv1`")) + + val globalTempDB = spark.sharedState.globalTempViewManager.database + sql("CREATE GLOBAL TEMP VIEW global_temp_jtv1 AS SELECT * FROM jt WHERE id > 0") + e = intercept[AnalysisException] { + sql(s"CREATE VIEW jtv1 AS SELECT * FROM $globalTempDB.global_temp_jtv1 WHERE id < 6") + }.getMessage + assert(e.contains(s"Not allowed to create a permanent view `jtv1` by referencing " + + s"a temporary view `global_temp`.`global_temp_jtv1`")) + } + } } } @@ -289,7 +293,7 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { sql("CREATE TEMPORARY VIEW testView AS SELECT id FROM jt") } - assert(e.message.contains("Temporary table") && e.message.contains("already exists")) + assert(e.message.contains("Temporary view") && e.message.contains("already exists")) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala index 232c1beae799..3e31d22e15c0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala @@ 
-70,6 +70,7 @@ class UnsafeFixedWidthAggregationMapSuite TaskContext.setTaskContext(new TaskContextImpl( stageId = 0, + stageAttemptNumber = 0, partitionId = 0, taskAttemptId = Random.nextInt(10000), attemptNumber = 0, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala index 604502f2a57d..6af9f8b77f8d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala @@ -116,6 +116,7 @@ class UnsafeKVExternalSorterSuite extends SparkFunSuite with SharedSQLContext { val taskMemMgr = new TaskMemoryManager(memoryManager, 0) TaskContext.setTaskContext(new TaskContextImpl( stageId = 0, + stageAttemptNumber = 0, partitionId = 0, taskAttemptId = 98456, attemptNumber = 0, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala index dff88ce7f1b9..a3ae93810aa3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala @@ -114,7 +114,7 @@ class UnsafeRowSerializerSuite extends SparkFunSuite with LocalSparkContext { (i, converter(Row(i))) } val taskMemoryManager = new TaskMemoryManager(sc.env.memoryManager, 0) - val taskContext = new TaskContextImpl(0, 0, 0, 0, taskMemoryManager, new Properties, null) + val taskContext = new TaskContextImpl(0, 0, 0, 0, 0, taskMemoryManager, new Properties, null) val sorter = new ExternalSorter[Int, UnsafeRow, UnsafeRow]( taskContext, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala index bc05dca578c4..22ca128c2776 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala @@ -121,31 +121,23 @@ class WholeStageCodegenSuite extends QueryTest with SharedSQLContext { test("cache for primitive type should be in WholeStageCodegen with InMemoryTableScanExec") { import testImplicits._ - val dsInt = spark.range(3).cache - dsInt.count + val dsInt = spark.range(3).cache() + dsInt.count() val dsIntFilter = dsInt.filter(_ > 0) val planInt = dsIntFilter.queryExecution.executedPlan - assert(planInt.find(p => - p.isInstanceOf[WholeStageCodegenExec] && - p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec] && - p.asInstanceOf[WholeStageCodegenExec].child.asInstanceOf[FilterExec].child - .isInstanceOf[InMemoryTableScanExec] && - p.asInstanceOf[WholeStageCodegenExec].child.asInstanceOf[FilterExec].child - .asInstanceOf[InMemoryTableScanExec].supportCodegen).isDefined - ) + assert(planInt.collect { + case WholeStageCodegenExec(FilterExec(_, i: InMemoryTableScanExec)) if i.supportsBatch => () + }.length == 1) assert(dsIntFilter.collect() === Array(1, 2)) // cache for string type is not supported for InMemoryTableScanExec - val dsString = spark.range(3).map(_.toString).cache - dsString.count + val dsString = spark.range(3).map(_.toString).cache() + dsString.count() val dsStringFilter = dsString.filter(_ == "1") val planString = dsStringFilter.queryExecution.executedPlan - 
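[Editor's sketch, not part of the patch] The WholeStageCodegenSuite assertions around this point are rewritten from isInstanceOf chains into pattern matches over the executed plan. A minimal sketch of the same kind of plan inspection (assumes a SparkSession named spark):

import org.apache.spark.sql.execution.WholeStageCodegenExec
import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec

val ds = spark.range(3).cache()
ds.count()   // materialize the cache

val plan = ds.filter(_ > 0).queryExecution.executedPlan
// Collect the whole-stage-codegen subtrees that sit on top of an in-memory scan.
val codegenOverCache = plan.collect {
  case w: WholeStageCodegenExec if w.find(_.isInstanceOf[InMemoryTableScanExec]).isDefined => w
}
println(codegenOverCache.size)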
assert(planString.find(p => - p.isInstanceOf[WholeStageCodegenExec] && - p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec] && - !p.asInstanceOf[WholeStageCodegenExec].child.asInstanceOf[FilterExec].child - .isInstanceOf[InMemoryTableScanExec]).isDefined - ) + assert(planString.collect { + case WholeStageCodegenExec(FilterExec(_, i: InMemoryTableScanExec)) if !i.supportsBatch => () + }.length == 1) assert(dsStringFilter.collect() === Array("1")) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationStoreSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationStoreSuite.scala index 10f1ee279bed..3fad7dfddadc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationStoreSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationStoreSuite.scala @@ -35,7 +35,8 @@ class SortBasedAggregationStoreSuite extends SparkFunSuite with LocalSparkConte val conf = new SparkConf() sc = new SparkContext("local[2, 4]", "test", conf) val taskManager = new TaskMemoryManager(new TestMemoryManager(conf), 0) - TaskContext.setTaskContext(new TaskContextImpl(0, 0, 0, 0, taskManager, new Properties, null)) + TaskContext.setTaskContext( + new TaskContextImpl(0, 0, 0, 0, 0, taskManager, new Properties, null)) } override def afterAll(): Unit = TaskContext.unset() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala index 508c116aae92..c42bc60a59d6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala @@ -20,8 +20,8 @@ package org.apache.spark.sql.execution.arrow import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.ArrayData -import org.apache.spark.sql.execution.vectorized.ArrowColumnVector import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.ArrowColumnVector import org.apache.spark.unsafe.types.UTF8String class ArrowWriterSuite extends SparkFunSuite { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/CommandUtilsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/CommandUtilsSuite.scala new file mode 100644 index 000000000000..f3e15189a641 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/CommandUtilsSuite.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.command + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.catalog.CatalogStatistics + +class CommandUtilsSuite extends SparkFunSuite { + + test("Check if compareAndGetNewStats returns correct results") { + val oldStats1 = CatalogStatistics(sizeInBytes = 10, rowCount = Some(100)) + val newStats1 = CommandUtils.compareAndGetNewStats( + Some(oldStats1), newTotalSize = 10, newRowCount = Some(100)) + assert(newStats1.isEmpty) + val newStats2 = CommandUtils.compareAndGetNewStats( + Some(oldStats1), newTotalSize = -1, newRowCount = None) + assert(newStats2.isEmpty) + val newStats3 = CommandUtils.compareAndGetNewStats( + Some(oldStats1), newTotalSize = 20, newRowCount = Some(-1)) + assert(newStats3.isDefined) + newStats3.foreach { stat => + assert(stat.sizeInBytes === 20) + assert(stat.rowCount.isEmpty) + } + val newStats4 = CommandUtils.compareAndGetNewStats( + Some(oldStats1), newTotalSize = -1, newRowCount = Some(200)) + assert(newStats4.isDefined) + newStats4.foreach { stat => + assert(stat.sizeInBytes === 10) + assert(stat.rowCount.isDefined && stat.rowCount.get === 200) + } + } + + test("Check if compareAndGetNewStats can handle large values") { + // Tests for large values + val oldStats2 = CatalogStatistics(sizeInBytes = BigInt(Long.MaxValue) * 2) + val newStats5 = CommandUtils.compareAndGetNewStats( + Some(oldStats2), newTotalSize = BigInt(Long.MaxValue) * 2, None) + assert(newStats5.isEmpty) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala index eb7c33590b60..2b1aea08b122 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala @@ -54,6 +54,13 @@ class DDLParserSuite extends PlanTest with SharedSQLContext { } } + private def intercept(sqlCommand: String, messages: String*): Unit = { + val e = intercept[ParseException](parser.parsePlan(sqlCommand)).getMessage + messages.foreach { message => + assert(e.contains(message)) + } + } + private def parseAs[T: ClassTag](query: String): T = { parser.parsePlan(query) match { case t: T => t @@ -494,6 +501,37 @@ class DDLParserSuite extends PlanTest with SharedSQLContext { } } + test("Duplicate clauses - create table") { + def createTableHeader(duplicateClause: String, isNative: Boolean): String = { + val fileFormat = if (isNative) "USING parquet" else "STORED AS parquet" + s"CREATE TABLE my_tab(a INT, b STRING) $fileFormat $duplicateClause $duplicateClause" + } + + Seq(true, false).foreach { isNative => + intercept(createTableHeader("TBLPROPERTIES('test' = 'test2')", isNative), + "Found duplicate clauses: TBLPROPERTIES") + intercept(createTableHeader("LOCATION '/tmp/file'", isNative), + "Found duplicate clauses: LOCATION") + intercept(createTableHeader("COMMENT 'a table'", isNative), + "Found duplicate clauses: COMMENT") + intercept(createTableHeader("CLUSTERED BY(b) INTO 256 BUCKETS", isNative), + "Found duplicate clauses: CLUSTERED BY") + } + + // Only for native data source tables + intercept(createTableHeader("PARTITIONED BY (b)", isNative = true), + "Found duplicate clauses: PARTITIONED BY") + + // Only for Hive serde tables + intercept(createTableHeader("PARTITIONED BY (k int)", isNative = false), + "Found duplicate clauses: PARTITIONED BY") + intercept(createTableHeader("STORED AS parquet", isNative = 
false), + "Found duplicate clauses: STORED AS/BY") + intercept( + createTableHeader("ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe'", isNative = false), + "Found duplicate clauses: ROW FORMAT") + } + test("create table - with location") { val v1 = "CREATE TABLE my_tab(a INT, b STRING) USING parquet LOCATION '/tmp/file'" @@ -1153,38 +1191,119 @@ class DDLParserSuite extends PlanTest with SharedSQLContext { } } + test("Test CTAS against data source tables") { + val s1 = + """ + |CREATE TABLE IF NOT EXISTS mydb.page_view + |USING parquet + |COMMENT 'This is the staging page view table' + |LOCATION '/user/external/page_view' + |TBLPROPERTIES ('p1'='v1', 'p2'='v2') + |AS SELECT * FROM src + """.stripMargin + + val s2 = + """ + |CREATE TABLE IF NOT EXISTS mydb.page_view + |USING parquet + |LOCATION '/user/external/page_view' + |COMMENT 'This is the staging page view table' + |TBLPROPERTIES ('p1'='v1', 'p2'='v2') + |AS SELECT * FROM src + """.stripMargin + + val s3 = + """ + |CREATE TABLE IF NOT EXISTS mydb.page_view + |USING parquet + |COMMENT 'This is the staging page view table' + |LOCATION '/user/external/page_view' + |TBLPROPERTIES ('p1'='v1', 'p2'='v2') + |AS SELECT * FROM src + """.stripMargin + + checkParsing(s1) + checkParsing(s2) + checkParsing(s3) + + def checkParsing(sql: String): Unit = { + val (desc, exists) = extractTableDesc(sql) + assert(exists) + assert(desc.identifier.database == Some("mydb")) + assert(desc.identifier.table == "page_view") + assert(desc.storage.locationUri == Some(new URI("/user/external/page_view"))) + assert(desc.schema.isEmpty) // will be populated later when the table is actually created + assert(desc.comment == Some("This is the staging page view table")) + assert(desc.viewText.isEmpty) + assert(desc.viewDefaultDatabase.isEmpty) + assert(desc.viewQueryColumnNames.isEmpty) + assert(desc.partitionColumnNames.isEmpty) + assert(desc.provider == Some("parquet")) + assert(desc.properties == Map("p1" -> "v1", "p2" -> "v2")) + } + } + test("Test CTAS #1") { val s1 = - """CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view + """ + |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view |COMMENT 'This is the staging page view table' |STORED AS RCFILE |LOCATION '/user/external/page_view' |TBLPROPERTIES ('p1'='v1', 'p2'='v2') - |AS SELECT * FROM src""".stripMargin + |AS SELECT * FROM src + """.stripMargin - val (desc, exists) = extractTableDesc(s1) - assert(exists) - assert(desc.identifier.database == Some("mydb")) - assert(desc.identifier.table == "page_view") - assert(desc.tableType == CatalogTableType.EXTERNAL) - assert(desc.storage.locationUri == Some(new URI("/user/external/page_view"))) - assert(desc.schema.isEmpty) // will be populated later when the table is actually created - assert(desc.comment == Some("This is the staging page view table")) - // TODO will be SQLText - assert(desc.viewText.isEmpty) - assert(desc.viewDefaultDatabase.isEmpty) - assert(desc.viewQueryColumnNames.isEmpty) - assert(desc.partitionColumnNames.isEmpty) - assert(desc.storage.inputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileInputFormat")) - assert(desc.storage.outputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileOutputFormat")) - assert(desc.storage.serde == - Some("org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe")) - assert(desc.properties == Map("p1" -> "v1", "p2" -> "v2")) + val s2 = + """ + |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view + |STORED AS RCFILE + |COMMENT 'This is the staging page view table' + |TBLPROPERTIES ('p1'='v1', 'p2'='v2') + 
|LOCATION '/user/external/page_view' + |AS SELECT * FROM src + """.stripMargin + + val s3 = + """ + |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view + |TBLPROPERTIES ('p1'='v1', 'p2'='v2') + |LOCATION '/user/external/page_view' + |STORED AS RCFILE + |COMMENT 'This is the staging page view table' + |AS SELECT * FROM src + """.stripMargin + + checkParsing(s1) + checkParsing(s2) + checkParsing(s3) + + def checkParsing(sql: String): Unit = { + val (desc, exists) = extractTableDesc(sql) + assert(exists) + assert(desc.identifier.database == Some("mydb")) + assert(desc.identifier.table == "page_view") + assert(desc.tableType == CatalogTableType.EXTERNAL) + assert(desc.storage.locationUri == Some(new URI("/user/external/page_view"))) + assert(desc.schema.isEmpty) // will be populated later when the table is actually created + assert(desc.comment == Some("This is the staging page view table")) + // TODO will be SQLText + assert(desc.viewText.isEmpty) + assert(desc.viewDefaultDatabase.isEmpty) + assert(desc.viewQueryColumnNames.isEmpty) + assert(desc.partitionColumnNames.isEmpty) + assert(desc.storage.inputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileInputFormat")) + assert(desc.storage.outputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileOutputFormat")) + assert(desc.storage.serde == + Some("org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe")) + assert(desc.properties == Map("p1" -> "v1", "p2" -> "v2")) + } } test("Test CTAS #2") { - val s2 = - """CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view + val s1 = + """ + |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view |COMMENT 'This is the staging page view table' |ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe' | STORED AS @@ -1192,26 +1311,45 @@ class DDLParserSuite extends PlanTest with SharedSQLContext { | OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat' |LOCATION '/user/external/page_view' |TBLPROPERTIES ('p1'='v1', 'p2'='v2') - |AS SELECT * FROM src""".stripMargin + |AS SELECT * FROM src + """.stripMargin - val (desc, exists) = extractTableDesc(s2) - assert(exists) - assert(desc.identifier.database == Some("mydb")) - assert(desc.identifier.table == "page_view") - assert(desc.tableType == CatalogTableType.EXTERNAL) - assert(desc.storage.locationUri == Some(new URI("/user/external/page_view"))) - assert(desc.schema.isEmpty) // will be populated later when the table is actually created - // TODO will be SQLText - assert(desc.comment == Some("This is the staging page view table")) - assert(desc.viewText.isEmpty) - assert(desc.viewDefaultDatabase.isEmpty) - assert(desc.viewQueryColumnNames.isEmpty) - assert(desc.partitionColumnNames.isEmpty) - assert(desc.storage.properties == Map()) - assert(desc.storage.inputFormat == Some("parquet.hive.DeprecatedParquetInputFormat")) - assert(desc.storage.outputFormat == Some("parquet.hive.DeprecatedParquetOutputFormat")) - assert(desc.storage.serde == Some("parquet.hive.serde.ParquetHiveSerDe")) - assert(desc.properties == Map("p1" -> "v1", "p2" -> "v2")) + val s2 = + """ + |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view + |LOCATION '/user/external/page_view' + |TBLPROPERTIES ('p1'='v1', 'p2'='v2') + |ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe' + | STORED AS + | INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat' + | OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat' + |COMMENT 'This is the staging page view table' + |AS SELECT * FROM src + """.stripMargin + + checkParsing(s1) + checkParsing(s2) + + def checkParsing(sql: String): Unit = { + val 
(desc, exists) = extractTableDesc(sql) + assert(exists) + assert(desc.identifier.database == Some("mydb")) + assert(desc.identifier.table == "page_view") + assert(desc.tableType == CatalogTableType.EXTERNAL) + assert(desc.storage.locationUri == Some(new URI("/user/external/page_view"))) + assert(desc.schema.isEmpty) // will be populated later when the table is actually created + // TODO will be SQLText + assert(desc.comment == Some("This is the staging page view table")) + assert(desc.viewText.isEmpty) + assert(desc.viewDefaultDatabase.isEmpty) + assert(desc.viewQueryColumnNames.isEmpty) + assert(desc.partitionColumnNames.isEmpty) + assert(desc.storage.properties == Map()) + assert(desc.storage.inputFormat == Some("parquet.hive.DeprecatedParquetInputFormat")) + assert(desc.storage.outputFormat == Some("parquet.hive.DeprecatedParquetOutputFormat")) + assert(desc.storage.serde == Some("parquet.hive.serde.ParquetHiveSerDe")) + assert(desc.properties == Map("p1" -> "v1", "p2" -> "v2")) + } } test("Test CTAS #3") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index fdb9b2f51f9c..6ca21b5aa159 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -835,6 +835,31 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } } + test("rename temporary view - destination table with database name,with:CREATE TEMPORARY view") { + withTempView("view1") { + sql( + """ + |CREATE TEMPORARY VIEW view1 + |USING org.apache.spark.sql.sources.DDLScanSource + |OPTIONS ( + | From '1', + | To '10', + | Table 'test1' + |) + """.stripMargin) + + val e = intercept[AnalysisException] { + sql("ALTER TABLE view1 RENAME TO default.tab2") + } + assert(e.getMessage.contains( + "RENAME TEMPORARY VIEW from '`view1`' to '`default`.`tab2`': " + + "cannot specify database name 'default' in the destination table")) + + val catalog = spark.sessionState.catalog + assert(catalog.listTables("default") == Seq(TableIdentifier("view1"))) + } + } + test("rename temporary view") { withTempView("tab1", "tab2") { spark.range(10).createOrReplaceTempView("tab1") @@ -883,6 +908,42 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } } + test("rename temporary view - destination table already exists, with: CREATE TEMPORARY view") { + withTempView("view1", "view2") { + sql( + """ + |CREATE TEMPORARY VIEW view1 + |USING org.apache.spark.sql.sources.DDLScanSource + |OPTIONS ( + | From '1', + | To '10', + | Table 'test1' + |) + """.stripMargin) + + sql( + """ + |CREATE TEMPORARY VIEW view2 + |USING org.apache.spark.sql.sources.DDLScanSource + |OPTIONS ( + | From '1', + | To '10', + | Table 'test1' + |) + """.stripMargin) + + val e = intercept[AnalysisException] { + sql("ALTER TABLE view1 RENAME TO view2") + } + assert(e.getMessage.contains( + "RENAME TEMPORARY VIEW from '`view1`' to '`view2`': destination table already exists")) + + val catalog = spark.sessionState.catalog + assert(catalog.listTables("default") == + Seq(TableIdentifier("view1"), TableIdentifier("view2"))) + } + } + test("alter table: bucketing is not supported") { val catalog = spark.sessionState.catalog val tableIdent = TableIdentifier("tab1", Some("dbx")) @@ -991,6 +1052,10 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { sql("SHOW DATABASES LIKE '*db1A'"), Row("showdb1a") :: Nil) + checkAnswer( + 
sql("SHOW DATABASES '*db1A'"), + Row("showdb1a") :: Nil) + checkAnswer( sql("SHOW DATABASES LIKE 'showdb1A'"), Row("showdb1a") :: Nil) @@ -1724,12 +1789,22 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } test("block creating duplicate temp table") { - withView("t_temp") { + withTempView("t_temp") { sql("CREATE TEMPORARY VIEW t_temp AS SELECT 1, 2") val e = intercept[TempTableAlreadyExistsException] { sql("CREATE TEMPORARY TABLE t_temp (c3 int, c4 string) USING JSON") }.getMessage - assert(e.contains("Temporary table 't_temp' already exists")) + assert(e.contains("Temporary view 't_temp' already exists")) + } + } + + test("block creating duplicate temp view") { + withTempView("t_temp") { + sql("CREATE TEMPORARY VIEW t_temp AS SELECT 1, 2") + val e = intercept[TempTableAlreadyExistsException] { + sql("CREATE TEMPORARY VIEW t_temp (c3 int, c4 string) USING JSON") + }.getMessage + assert(e.contains("Temporary view 't_temp' already exists")) } } @@ -1971,8 +2046,8 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { s""" |CREATE TABLE t(a int, b int, c int, d int) |USING parquet - |PARTITIONED BY(a, b) |LOCATION "${dir.toURI}" + |PARTITIONED BY(a, b) """.stripMargin) spark.sql("INSERT INTO TABLE t PARTITION(a=1, b=2) SELECT 3, 4") checkAnswer(spark.table("t"), Row(3, 4, 1, 2) :: Nil) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala index caf03885e387..c1f2c18d1417 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.datasources import java.io.{File, FilenameFilter} import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, SortMergeJoinExec} import org.apache.spark.sql.test.SharedSQLContext class HadoopFsRelationSuite extends QueryTest with SharedSQLContext { @@ -39,4 +40,44 @@ class HadoopFsRelationSuite extends QueryTest with SharedSQLContext { assert(df.queryExecution.logical.stats.sizeInBytes === BigInt(totalSize)) } } + + test("SPARK-22790: spark.sql.sources.compressionFactor takes effect") { + import testImplicits._ + Seq(1.0, 0.5).foreach { compressionFactor => + withSQLConf("spark.sql.sources.fileCompressionFactor" -> compressionFactor.toString, + "spark.sql.autoBroadcastJoinThreshold" -> "400") { + withTempPath { workDir => + // the file size is 740 bytes + val workDirPath = workDir.getAbsolutePath + val data1 = Seq(100, 200, 300, 400).toDF("count") + data1.write.parquet(workDirPath + "/data1") + val df1FromFile = spark.read.parquet(workDirPath + "/data1") + val data2 = Seq(100, 200, 300, 400).toDF("count") + data2.write.parquet(workDirPath + "/data2") + val df2FromFile = spark.read.parquet(workDirPath + "/data2") + val joinedDF = df1FromFile.join(df2FromFile, Seq("count")) + if (compressionFactor == 0.5) { + val bJoinExec = joinedDF.queryExecution.executedPlan.collect { + case bJoin: BroadcastHashJoinExec => bJoin + } + assert(bJoinExec.nonEmpty) + val smJoinExec = joinedDF.queryExecution.executedPlan.collect { + case smJoin: SortMergeJoinExec => smJoin + } + assert(smJoinExec.isEmpty) + } else { + // compressionFactor is 1.0 + val bJoinExec = joinedDF.queryExecution.executedPlan.collect { + case bJoin: BroadcastHashJoinExec => bJoin + } + 
assert(bJoinExec.isEmpty) + val smJoinExec = joinedDF.queryExecution.executedPlan.collect { + case smJoin: SortMergeJoinExec => smJoin + } + assert(smJoinExec.nonEmpty) + } + } + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala index e00e057a18cc..f58c331f33ca 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala @@ -31,6 +31,7 @@ import org.apache.orc.OrcConf.COMPRESS import org.apache.orc.mapred.OrcStruct import org.apache.orc.mapreduce.OrcInputFormat +import org.apache.spark.SparkException import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, RecordReaderIterator} @@ -531,6 +532,52 @@ abstract class OrcQueryTest extends OrcTest { val df = spark.read.orc(path1.getCanonicalPath, path2.getCanonicalPath) assert(df.count() == 20) } + + test("Enabling/disabling ignoreCorruptFiles") { + def testIgnoreCorruptFiles(): Unit = { + withTempDir { dir => + val basePath = dir.getCanonicalPath + spark.range(1).toDF("a").write.orc(new Path(basePath, "first").toString) + spark.range(1, 2).toDF("a").write.orc(new Path(basePath, "second").toString) + spark.range(2, 3).toDF("a").write.json(new Path(basePath, "third").toString) + val df = spark.read.orc( + new Path(basePath, "first").toString, + new Path(basePath, "second").toString, + new Path(basePath, "third").toString) + checkAnswer(df, Seq(Row(0), Row(1))) + } + } + + def testIgnoreCorruptFilesWithoutSchemaInfer(): Unit = { + withTempDir { dir => + val basePath = dir.getCanonicalPath + spark.range(1).toDF("a").write.orc(new Path(basePath, "first").toString) + spark.range(1, 2).toDF("a").write.orc(new Path(basePath, "second").toString) + spark.range(2, 3).toDF("a").write.json(new Path(basePath, "third").toString) + val df = spark.read.schema("a long").orc( + new Path(basePath, "first").toString, + new Path(basePath, "second").toString, + new Path(basePath, "third").toString) + checkAnswer(df, Seq(Row(0), Row(1))) + } + } + + withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "true") { + testIgnoreCorruptFiles() + testIgnoreCorruptFilesWithoutSchemaInfer() + } + + withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "false") { + val m1 = intercept[SparkException] { + testIgnoreCorruptFiles() + }.getMessage + assert(m1.contains("Could not read footer for file")) + val m2 = intercept[SparkException] { + testIgnoreCorruptFilesWithoutSchemaInfer() + }.getMessage + assert(m2.contains("Malformed ORC file")) + } + } } class OrcQuerySuite extends OrcQueryTest with SharedSQLContext { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompressionCodecPrecedenceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompressionCodecPrecedenceSuite.scala new file mode 100644 index 000000000000..ed8fd2b45345 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompressionCodecPrecedenceSuite.scala @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.parquet + +import java.io.File + +import scala.collection.JavaConverters._ + +import org.apache.hadoop.fs.Path +import org.apache.parquet.hadoop.ParquetOutputFormat + +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSQLContext + +class ParquetCompressionCodecPrecedenceSuite extends ParquetTest with SharedSQLContext { + test("Test `spark.sql.parquet.compression.codec` config") { + Seq("NONE", "UNCOMPRESSED", "SNAPPY", "GZIP", "LZO").foreach { c => + withSQLConf(SQLConf.PARQUET_COMPRESSION.key -> c) { + val expected = if (c == "NONE") "UNCOMPRESSED" else c + val option = new ParquetOptions(Map.empty[String, String], spark.sessionState.conf) + assert(option.compressionCodecClassName == expected) + } + } + } + + test("[SPARK-21786] Test Acquiring 'compressionCodecClassName' for parquet in right order.") { + // When "compression" is configured, it should be the first choice. + withSQLConf(SQLConf.PARQUET_COMPRESSION.key -> "snappy") { + val props = Map("compression" -> "uncompressed", ParquetOutputFormat.COMPRESSION -> "gzip") + val option = new ParquetOptions(props, spark.sessionState.conf) + assert(option.compressionCodecClassName == "UNCOMPRESSED") + } + + // When "compression" is not configured, "parquet.compression" should be the preferred choice. + withSQLConf(SQLConf.PARQUET_COMPRESSION.key -> "snappy") { + val props = Map(ParquetOutputFormat.COMPRESSION -> "gzip") + val option = new ParquetOptions(props, spark.sessionState.conf) + assert(option.compressionCodecClassName == "GZIP") + } + + // When both "compression" and "parquet.compression" are not configured, + // spark.sql.parquet.compression.codec should be the right choice. 
+ withSQLConf(SQLConf.PARQUET_COMPRESSION.key -> "snappy") { + val props = Map.empty[String, String] + val option = new ParquetOptions(props, spark.sessionState.conf) + assert(option.compressionCodecClassName == "SNAPPY") + } + } + + private def getTableCompressionCodec(path: String): Seq[String] = { + val hadoopConf = spark.sessionState.newHadoopConf() + val codecs = for { + footer <- readAllFootersWithoutSummaryFiles(new Path(path), hadoopConf) + block <- footer.getParquetMetadata.getBlocks.asScala + column <- block.getColumns.asScala + } yield column.getCodec.name() + codecs.distinct + } + + private def createTableWithCompression( + tableName: String, + isPartitioned: Boolean, + compressionCodec: String, + rootDir: File): Unit = { + val options = + s""" + |OPTIONS('path'='${rootDir.toURI.toString.stripSuffix("/")}/$tableName', + |'parquet.compression'='$compressionCodec') + """.stripMargin + val partitionCreate = if (isPartitioned) "PARTITIONED BY (p)" else "" + sql( + s""" + |CREATE TABLE $tableName USING Parquet $options $partitionCreate + |AS SELECT 1 AS col1, 2 AS p + """.stripMargin) + } + + private def checkCompressionCodec(compressionCodec: String, isPartitioned: Boolean): Unit = { + withTempDir { tmpDir => + val tempTableName = "TempParquetTable" + withTable(tempTableName) { + createTableWithCompression(tempTableName, isPartitioned, compressionCodec, tmpDir) + val partitionPath = if (isPartitioned) "p=2" else "" + val path = s"${tmpDir.getPath.stripSuffix("/")}/$tempTableName/$partitionPath" + val realCompressionCodecs = getTableCompressionCodec(path) + assert(realCompressionCodecs.forall(_ == compressionCodec)) + } + } + } + + test("Create parquet table with compression") { + Seq(true, false).foreach { isPartitioned => + Seq("UNCOMPRESSED", "SNAPPY", "GZIP").foreach { compressionCodec => + checkCompressionCodec(compressionCodec, isPartitioned) + } + } + } + + test("Create table with unknown compression") { + Seq(true, false).foreach { isPartitioned => + val exception = intercept[IllegalArgumentException] { + checkCompressionCodec("aa", isPartitioned) + } + assert(exception.getMessage.contains("Codec [aa] is not available")) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala index 4c8c9ef6e043..6ad88ed997ce 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala @@ -320,14 +320,27 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext new Path(basePath, "first").toString, new Path(basePath, "second").toString, new Path(basePath, "third").toString) - checkAnswer( - df, - Seq(Row(0), Row(1))) + checkAnswer(df, Seq(Row(0), Row(1))) + } + } + + def testIgnoreCorruptFilesWithoutSchemaInfer(): Unit = { + withTempDir { dir => + val basePath = dir.getCanonicalPath + spark.range(1).toDF("a").write.parquet(new Path(basePath, "first").toString) + spark.range(1, 2).toDF("a").write.parquet(new Path(basePath, "second").toString) + spark.range(2, 3).toDF("a").write.json(new Path(basePath, "third").toString) + val df = spark.read.schema("a long").parquet( + new Path(basePath, "first").toString, + new Path(basePath, "second").toString, + new Path(basePath, "third").toString) + checkAnswer(df, Seq(Row(0), Row(1))) } } 
withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "true") { testIgnoreCorruptFiles() + testIgnoreCorruptFilesWithoutSchemaInfer() } withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "false") { @@ -335,6 +348,10 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext testIgnoreCorruptFiles() } assert(exception.getMessage().contains("is not a Parquet file")) + val exception2 = intercept[SparkException] { + testIgnoreCorruptFilesWithoutSchemaInfer() + } + assert(exception2.getMessage().contains("is not a Parquet file")) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala index 6da46ea3480b..0bcd54e1fcea 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala @@ -318,7 +318,7 @@ class BroadcastJoinSuite extends QueryTest with SQLTestUtils { case b: BroadcastNestedLoopJoinExec => assert(b.getClass.getSimpleName === joinMethod) assert(b.buildSide === buildSide) - case b: BroadcastNestedLoopJoinExec => + case b: BroadcastHashJoinExec => assert(b.getClass.getSimpleName === joinMethod) assert(b.buildSide === buildSide) case w: WholeStageCodegenExec => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala index e11705a227f4..85085d43061b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala @@ -18,20 +18,64 @@ package org.apache.spark.sql.execution.streaming import java.util.Optional +import java.util.concurrent.TimeUnit import scala.collection.JavaConverters._ import org.apache.spark.sql.Row import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming.continuous._ -import org.apache.spark.sql.execution.streaming.sources.{RateStreamBatchTask, RateStreamSourceV2, RateStreamV2Reader} +import org.apache.spark.sql.execution.streaming.sources.{RateStreamBatchTask, RateStreamMicroBatchReader, RateStreamSourceV2} import org.apache.spark.sql.sources.v2.DataSourceV2Options -import org.apache.spark.sql.sources.v2.streaming.ContinuousReadSupport +import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, MicroBatchReadSupport} import org.apache.spark.sql.streaming.StreamTest +import org.apache.spark.util.ManualClock class RateSourceV2Suite extends StreamTest { + import testImplicits._ + + case class AdvanceRateManualClock(seconds: Long) extends AddData { + override def addData(query: Option[StreamExecution]): (BaseStreamingSource, Offset) = { + assert(query.nonEmpty) + val rateSource = query.get.logicalPlan.collect { + case StreamingExecutionRelation(source: RateStreamMicroBatchReader, _) => source + }.head + rateSource.clock.asInstanceOf[ManualClock].advance(TimeUnit.SECONDS.toMillis(seconds)) + rateSource.setOffsetRange(Optional.empty(), Optional.empty()) + (rateSource, rateSource.getEndOffset()) + } + } + + test("microbatch in registry") { + DataSource.lookupDataSource("ratev2", spark.sqlContext.conf).newInstance() match { + case ds: MicroBatchReadSupport => + val reader = ds.createMicroBatchReader(Optional.empty(), "", DataSourceV2Options.empty()) + 
assert(reader.isInstanceOf[RateStreamMicroBatchReader]) + case _ => + throw new IllegalStateException("Could not find v2 read support for rate") + } + } + + test("basic microbatch execution") { + val input = spark.readStream + .format("rateV2") + .option("numPartitions", "1") + .option("rowsPerSecond", "10") + .option("useManualClock", "true") + .load() + testStream(input, useV2Sink = true)( + AdvanceRateManualClock(seconds = 1), + CheckLastBatch((0 until 10).map(v => new java.sql.Timestamp(v * 100L) -> v): _*), + StopStream, + StartStream(), + // Advance 2 seconds because creating a new RateSource will also create a new ManualClock + AdvanceRateManualClock(seconds = 2), + CheckLastBatch((10 until 20).map(v => new java.sql.Timestamp(v * 100L) -> v): _*) + ) + } + test("microbatch - numPartitions propagated") { - val reader = new RateStreamV2Reader( + val reader = new RateStreamMicroBatchReader( new DataSourceV2Options(Map("numPartitions" -> "11", "rowsPerSecond" -> "33").asJava)) reader.setOffsetRange(Optional.empty(), Optional.empty()) val tasks = reader.createReadTasks() @@ -39,7 +83,7 @@ class RateSourceV2Suite extends StreamTest { } test("microbatch - set offset") { - val reader = new RateStreamV2Reader(DataSourceV2Options.empty()) + val reader = new RateStreamMicroBatchReader(DataSourceV2Options.empty()) val startOffset = RateStreamOffset(Map((0, ValueRunTimeMsPair(0, 1000)))) val endOffset = RateStreamOffset(Map((0, ValueRunTimeMsPair(0, 2000)))) reader.setOffsetRange(Optional.of(startOffset), Optional.of(endOffset)) @@ -48,7 +92,7 @@ class RateSourceV2Suite extends StreamTest { } test("microbatch - infer offsets") { - val reader = new RateStreamV2Reader( + val reader = new RateStreamMicroBatchReader( new DataSourceV2Options(Map("numPartitions" -> "1", "rowsPerSecond" -> "100").asJava)) reader.clock.waitTillTime(reader.clock.getTimeMillis() + 100) reader.setOffsetRange(Optional.empty(), Optional.empty()) @@ -69,7 +113,7 @@ class RateSourceV2Suite extends StreamTest { } test("microbatch - predetermined batch size") { - val reader = new RateStreamV2Reader( + val reader = new RateStreamMicroBatchReader( new DataSourceV2Options(Map("numPartitions" -> "1", "rowsPerSecond" -> "20").asJava)) val startOffset = RateStreamOffset(Map((0, ValueRunTimeMsPair(0, 1000)))) val endOffset = RateStreamOffset(Map((0, ValueRunTimeMsPair(20, 2000)))) @@ -80,7 +124,7 @@ class RateSourceV2Suite extends StreamTest { } test("microbatch - data read") { - val reader = new RateStreamV2Reader( + val reader = new RateStreamMicroBatchReader( new DataSourceV2Options(Map("numPartitions" -> "11", "rowsPerSecond" -> "33").asJava)) val startOffset = RateStreamSourceV2.createInitialOffset(11, reader.creationTimeMs) val endOffset = RateStreamOffset(startOffset.partitionToValueAndRunTimeMs.toSeq.map { @@ -107,14 +151,14 @@ class RateSourceV2Suite extends StreamTest { DataSource.lookupDataSource("rate", spark.sqlContext.conf).newInstance() match { case ds: ContinuousReadSupport => val reader = ds.createContinuousReader(Optional.empty(), "", DataSourceV2Options.empty()) - assert(reader.isInstanceOf[ContinuousRateStreamReader]) + assert(reader.isInstanceOf[RateStreamContinuousReader]) case _ => throw new IllegalStateException("Could not find v2 read support for rate") } } test("continuous data") { - val reader = new ContinuousRateStreamReader( + val reader = new RateStreamContinuousReader( new DataSourceV2Options(Map("numPartitions" -> "2", "rowsPerSecond" -> "20").asJava)) reader.setOffset(Optional.empty()) val tasks = 
reader.createReadTasks() @@ -122,17 +166,17 @@ class RateSourceV2Suite extends StreamTest { val data = scala.collection.mutable.ListBuffer[Row]() tasks.asScala.foreach { - case t: RateStreamReadTask => + case t: RateStreamContinuousReadTask => val startTimeMs = reader.getStartOffset() .asInstanceOf[RateStreamOffset] .partitionToValueAndRunTimeMs(t.partitionIndex) .runTimeMs - val r = t.createDataReader().asInstanceOf[RateStreamDataReader] + val r = t.createDataReader().asInstanceOf[RateStreamContinuousDataReader] for (rowIndex <- 0 to 9) { r.next() data.append(r.get()) assert(r.getOffset() == - ContinuousRateStreamPartitionOffset( + RateStreamPartitionOffset( t.partitionIndex, t.partitionIndex + rowIndex * 2, startTimeMs + (rowIndex + 1) * 100)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriterSuite.scala new file mode 100644 index 000000000000..55acf2ba28d2 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriterSuite.scala @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.streaming.sources + +import java.io.ByteArrayOutputStream + +import org.scalatest.time.SpanSugar._ + +import org.apache.spark.sql.execution.streaming.MemoryStream +import org.apache.spark.sql.streaming.{StreamTest, Trigger} + +class ConsoleWriterSuite extends StreamTest { + import testImplicits._ + + test("microbatch - default") { + val input = MemoryStream[Int] + + val captured = new ByteArrayOutputStream() + Console.withOut(captured) { + val query = input.toDF().writeStream.format("console").start() + try { + input.addData(1, 2, 3) + query.processAllAvailable() + input.addData(4, 5, 6) + query.processAllAvailable() + input.addData() + query.processAllAvailable() + } finally { + query.stop() + } + } + + assert(captured.toString() == + """------------------------------------------- + |Batch: 0 + |------------------------------------------- + |+-----+ + ||value| + |+-----+ + || 1| + || 2| + || 3| + |+-----+ + | + |------------------------------------------- + |Batch: 1 + |------------------------------------------- + |+-----+ + ||value| + |+-----+ + || 4| + || 5| + || 6| + |+-----+ + | + |------------------------------------------- + |Batch: 2 + |------------------------------------------- + |+-----+ + ||value| + |+-----+ + |+-----+ + | + |""".stripMargin) + } + + test("microbatch - with numRows") { + val input = MemoryStream[Int] + + val captured = new ByteArrayOutputStream() + Console.withOut(captured) { + val query = input.toDF().writeStream.format("console").option("NUMROWS", 2).start() + try { + input.addData(1, 2, 3) + query.processAllAvailable() + } finally { + query.stop() + } + } + + assert(captured.toString() == + """------------------------------------------- + |Batch: 0 + |------------------------------------------- + |+-----+ + ||value| + |+-----+ + || 1| + || 2| + |+-----+ + |only showing top 2 rows + | + |""".stripMargin) + } + + test("microbatch - truncation") { + val input = MemoryStream[String] + + val captured = new ByteArrayOutputStream() + Console.withOut(captured) { + val query = input.toDF().writeStream.format("console").option("TRUNCATE", true).start() + try { + input.addData("123456789012345678901234567890") + query.processAllAvailable() + } finally { + query.stop() + } + } + + assert(captured.toString() == + """------------------------------------------- + |Batch: 0 + |------------------------------------------- + |+--------------------+ + || value| + |+--------------------+ + ||12345678901234567...| + |+--------------------+ + | + |""".stripMargin) + } + + test("continuous - default") { + val captured = new ByteArrayOutputStream() + Console.withOut(captured) { + val input = spark.readStream + .format("rate") + .option("numPartitions", "1") + .option("rowsPerSecond", "5") + .load() + .select('value) + + val query = input.writeStream.format("console").trigger(Trigger.Continuous(200)).start() + assert(query.isActive) + query.stop() + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala index 03490ad15a65..53432669e215 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala @@ -23,6 +23,7 @@ import org.apache.arrow.vector.complex._ import org.apache.spark.SparkFunSuite import 
org.apache.spark.sql.execution.arrow.ArrowUtils import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.ArrowColumnVector import org.apache.spark.unsafe.types.UTF8String class ArrowColumnVectorSuite extends SparkFunSuite { @@ -321,6 +322,42 @@ class ArrowColumnVectorSuite extends SparkFunSuite { allocator.close() } + test("non nullable struct") { + val allocator = ArrowUtils.rootAllocator.newChildAllocator("struct", 0, Long.MaxValue) + val schema = new StructType().add("int", IntegerType).add("long", LongType) + val vector = ArrowUtils.toArrowField("struct", schema, nullable = false, null) + .createVector(allocator).asInstanceOf[NullableMapVector] + + vector.allocateNew() + val intVector = vector.getChildByOrdinal(0).asInstanceOf[IntVector] + val longVector = vector.getChildByOrdinal(1).asInstanceOf[BigIntVector] + + vector.setIndexDefined(0) + intVector.setSafe(0, 1) + longVector.setSafe(0, 1L) + + vector.setIndexDefined(1) + intVector.setSafe(1, 2) + longVector.setNull(1) + + vector.setValueCount(2) + + val columnVector = new ArrowColumnVector(vector) + assert(columnVector.dataType === schema) + assert(columnVector.numNulls === 0) + + val row0 = columnVector.getStruct(0, 2) + assert(row0.getInt(0) === 1) + assert(row0.getLong(1) === 1L) + + val row1 = columnVector.getStruct(1, 2) + assert(row1.getInt(0) === 2) + assert(row1.isNullAt(1)) + + columnVector.close() + allocator.close() + } + test("struct") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("struct", 0, Long.MaxValue) val schema = new StructType().add("int", IntegerType).add("long", LongType) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala index 54b31cee031f..944240f3bade 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala @@ -21,10 +21,10 @@ import org.scalatest.BeforeAndAfterEach import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.SpecificInternalRow -import org.apache.spark.sql.catalyst.util.ArrayData import org.apache.spark.sql.execution.columnar.ColumnAccessor import org.apache.spark.sql.execution.columnar.compression.ColumnBuilderHelper import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.ColumnarArray import org.apache.spark.unsafe.types.UTF8String class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index 7848ebdcab6d..cd90681ecabc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -33,6 +33,7 @@ import org.apache.spark.sql.{RandomDataGenerator, Row} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.arrow.ArrowUtils import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector} import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.types.CalendarInterval @@ -874,14 +875,13 @@ class ColumnarBatchSuite extends SparkFunSuite { .add("intCol2", IntegerType) .add("string", 
BinaryType) - val capacity = ColumnarBatch.DEFAULT_BATCH_SIZE + val capacity = 4 * 1024 val columns = schema.fields.map { field => allocate(capacity, field.dataType, memMode) } - val batch = new ColumnarBatch(schema, columns.toArray, ColumnarBatch.DEFAULT_BATCH_SIZE) + val batch = new ColumnarBatch(columns.toArray) assert(batch.numCols() == 4) assert(batch.numRows() == 0) - assert(batch.capacity() > 0) assert(batch.rowIterator().hasNext == false) // Add a row [1, 1.1, NULL] @@ -918,10 +918,7 @@ class ColumnarBatchSuite extends SparkFunSuite { assert(it.hasNext == false) // Reset and add 3 rows - batch.reset() - assert(batch.numRows() == 0) - assert(batch.rowIterator().hasNext == false) - + columns.foreach(_.reset()) // Add rows [NULL, 2.2, 2, "abc"], [3, NULL, 3, ""], [4, 4.4, 4, "world] columns(0).putNull(0) columns(1).putDouble(0, 2.2) @@ -1155,7 +1152,7 @@ class ColumnarBatchSuite extends SparkFunSuite { val columnVectors = Seq(new ArrowColumnVector(vector1), new ArrowColumnVector(vector2)) val schema = StructType(Seq(StructField("int1", IntegerType), StructField("int2", IntegerType))) - val batch = new ColumnarBatch(schema, columnVectors.toArray[ColumnVector], 11) + val batch = new ColumnarBatch(columnVectors.toArray) batch.setNumRows(11) assert(batch.numCols() == 2) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index 8b7e2e5f4594..fef01c860db6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -21,6 +21,8 @@ import java.io.File import org.apache.spark.SparkException import org.apache.spark.sql.{AnalysisException, Row} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.PartitionOverwriteMode import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.util.Utils @@ -442,4 +444,80 @@ class InsertSuite extends DataSourceTest with SharedSQLContext { assert(e.contains("Only Data Sources providing FileFormat are supported")) } } + + test("SPARK-20236: dynamic partition overwrite without catalog table") { + withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> PartitionOverwriteMode.DYNAMIC.toString) { + withTempPath { path => + Seq((1, 1, 1)).toDF("i", "part1", "part2") + .write.partitionBy("part1", "part2").parquet(path.getAbsolutePath) + checkAnswer(spark.read.parquet(path.getAbsolutePath), Row(1, 1, 1)) + + Seq((2, 1, 1)).toDF("i", "part1", "part2") + .write.partitionBy("part1", "part2").mode("overwrite").parquet(path.getAbsolutePath) + checkAnswer(spark.read.parquet(path.getAbsolutePath), Row(2, 1, 1)) + + Seq((2, 2, 2)).toDF("i", "part1", "part2") + .write.partitionBy("part1", "part2").mode("overwrite").parquet(path.getAbsolutePath) + checkAnswer(spark.read.parquet(path.getAbsolutePath), Row(2, 1, 1) :: Row(2, 2, 2) :: Nil) + } + } + } + + test("SPARK-20236: dynamic partition overwrite") { + withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> PartitionOverwriteMode.DYNAMIC.toString) { + withTable("t") { + sql( + """ + |create table t(i int, part1 int, part2 int) using parquet + |partitioned by (part1, part2) + """.stripMargin) + + sql("insert into t partition(part1=1, part2=1) select 1") + checkAnswer(spark.table("t"), Row(1, 1, 1)) + + sql("insert overwrite table t partition(part1=1, part2=1) select 2") + checkAnswer(spark.table("t"), Row(2, 1, 1)) + + sql("insert overwrite table t partition(part1=2, part2) select 
2, 2") + checkAnswer(spark.table("t"), Row(2, 1, 1) :: Row(2, 2, 2) :: Nil) + + sql("insert overwrite table t partition(part1=1, part2=2) select 3") + checkAnswer(spark.table("t"), Row(2, 1, 1) :: Row(2, 2, 2) :: Row(3, 1, 2) :: Nil) + + sql("insert overwrite table t partition(part1=1, part2) select 4, 1") + checkAnswer(spark.table("t"), Row(4, 1, 1) :: Row(2, 2, 2) :: Row(3, 1, 2) :: Nil) + } + } + } + + test("SPARK-20236: dynamic partition overwrite with customer partition path") { + withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> PartitionOverwriteMode.DYNAMIC.toString) { + withTable("t") { + sql( + """ + |create table t(i int, part1 int, part2 int) using parquet + |partitioned by (part1, part2) + """.stripMargin) + + val path1 = Utils.createTempDir() + sql(s"alter table t add partition(part1=1, part2=1) location '$path1'") + sql(s"insert into t partition(part1=1, part2=1) select 1") + checkAnswer(spark.table("t"), Row(1, 1, 1)) + + sql("insert overwrite table t partition(part1=1, part2=1) select 2") + checkAnswer(spark.table("t"), Row(2, 1, 1)) + + sql("insert overwrite table t partition(part1=2, part2) select 2, 2") + checkAnswer(spark.table("t"), Row(2, 1, 1) :: Row(2, 2, 2) :: Nil) + + val path2 = Utils.createTempDir() + sql(s"alter table t add partition(part1=1, part2=2) location '$path2'") + sql("insert overwrite table t partition(part1=1, part2=2) select 3") + checkAnswer(spark.table("t"), Row(2, 1, 1) :: Row(2, 2, 2) :: Row(3, 1, 2) :: Nil) + + sql("insert overwrite table t partition(part1=1, part2) select 4, 1") + checkAnswer(spark.table("t"), Row(4, 1, 1) :: Row(2, 2, 2) :: Row(3, 1, 2) :: Nil) + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala index ab37e4984bd1..0ca29524c6d0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala @@ -24,10 +24,12 @@ import test.org.apache.spark.sql.sources.v2._ import org.apache.spark.SparkException import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector import org.apache.spark.sql.sources.{Filter, GreaterThan} import org.apache.spark.sql.sources.v2.reader._ import org.apache.spark.sql.test.SharedSQLContext -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.types.{IntegerType, StructType} +import org.apache.spark.sql.vectorized.ColumnarBatch class DataSourceV2Suite extends QueryTest with SharedSQLContext { import testImplicits._ @@ -56,7 +58,7 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext { } } - test("unsafe row implementation") { + test("unsafe row scan implementation") { Seq(classOf[UnsafeRowDataSourceV2], classOf[JavaUnsafeRowDataSourceV2]).foreach { cls => withClue(cls.getName) { val df = spark.read.format(cls.getName).load() @@ -67,6 +69,17 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext { } } + test("columnar batch scan implementation") { + Seq(classOf[BatchDataSourceV2], classOf[JavaBatchDataSourceV2]).foreach { cls => + withClue(cls.getName) { + val df = spark.read.format(cls.getName).load() + checkAnswer(df, (0 until 90).map(i => Row(i, -i))) + checkAnswer(df.select('j), (0 until 90).map(i => Row(-i))) + checkAnswer(df.filter('i > 50), (51 until 90).map(i => Row(i, -i))) + } 
+ } + } + test("schema required data source") { Seq(classOf[SchemaRequiredDataSource], classOf[JavaSchemaRequiredDataSource]).foreach { cls => withClue(cls.getName) { @@ -275,7 +288,7 @@ class UnsafeRowReadTask(start: Int, end: Int) private var current = start - 1 - override def createDataReader(): DataReader[UnsafeRow] = new UnsafeRowReadTask(start, end) + override def createDataReader(): DataReader[UnsafeRow] = this override def next(): Boolean = { current += 1 @@ -300,3 +313,55 @@ class SchemaRequiredDataSource extends DataSourceV2 with ReadSupportWithSchema { override def createReader(schema: StructType, options: DataSourceV2Options): DataSourceV2Reader = new Reader(schema) } + +class BatchDataSourceV2 extends DataSourceV2 with ReadSupport { + + class Reader extends DataSourceV2Reader with SupportsScanColumnarBatch { + override def readSchema(): StructType = new StructType().add("i", "int").add("j", "int") + + override def createBatchReadTasks(): JList[ReadTask[ColumnarBatch]] = { + java.util.Arrays.asList(new BatchReadTask(0, 50), new BatchReadTask(50, 90)) + } + } + + override def createReader(options: DataSourceV2Options): DataSourceV2Reader = new Reader +} + +class BatchReadTask(start: Int, end: Int) + extends ReadTask[ColumnarBatch] with DataReader[ColumnarBatch] { + + private final val BATCH_SIZE = 20 + private lazy val i = new OnHeapColumnVector(BATCH_SIZE, IntegerType) + private lazy val j = new OnHeapColumnVector(BATCH_SIZE, IntegerType) + private lazy val batch = new ColumnarBatch(Array(i, j)) + + private var current = start + + override def createDataReader(): DataReader[ColumnarBatch] = this + + override def next(): Boolean = { + i.reset() + j.reset() + + var count = 0 + while (current < end && count < BATCH_SIZE) { + i.putInt(count, current) + j.putInt(count, -current) + current += 1 + count += 1 + } + + if (count == 0) { + false + } else { + batch.setNumRows(count) + true + } + } + + override def get(): ColumnarBatch = { + batch + } + + override def close(): Unit = batch.close() +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala index 39bb57274061..5bb0f4d643bb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala @@ -74,11 +74,11 @@ abstract class FileStreamSourceTest protected def addData(source: FileStreamSource): Unit } - case class AddTextFileData(content: String, src: File, tmp: File) + case class AddTextFileData(content: String, src: File, tmp: File, tmpFilePrefix: String = "text") extends AddFileData { override def addData(source: FileStreamSource): Unit = { - val tempFile = Utils.tempFileWith(new File(tmp, "text")) + val tempFile = Utils.tempFileWith(new File(tmp, tmpFilePrefix)) val finalFile = new File(src, tempFile.getName) src.mkdirs() require(stringToFile(tempFile, content).renameTo(finalFile)) @@ -408,6 +408,52 @@ class FileStreamSourceSuite extends FileStreamSourceTest { } } + test("SPARK-21996 read from text files -- file name has space") { + withTempDirs { case (src, tmp) => + val textStream = createFileStream("text", src.getCanonicalPath) + val filtered = textStream.filter($"value" contains "keep") + + testStream(filtered)( + AddTextFileData("drop1\nkeep2\nkeep3", src, tmp, "text text"), + CheckAnswer("keep2", "keep3") + ) + } + } + + test("SPARK-21996 read from text files generated by file 
sink -- file name has space") { + val testTableName = "FileStreamSourceTest" + withTable(testTableName) { + withTempDirs { case (src, checkpoint) => + val output = new File(src, "text text") + val inputData = MemoryStream[String] + val ds = inputData.toDS() + + val query = ds.writeStream + .option("checkpointLocation", checkpoint.getCanonicalPath) + .format("text") + .start(output.getCanonicalPath) + + try { + inputData.addData("foo") + failAfter(streamingTimeout) { + query.processAllAvailable() + } + } finally { + query.stop() + } + + val df2 = spark.readStream.format("text").load(output.getCanonicalPath) + val query2 = df2.writeStream.format("memory").queryName(testTableName).start() + try { + query2.processAllAvailable() + checkDatasetUnorderly(spark.table(testTableName).as[String], "foo") + } finally { + query2.stop() + } + } + } + } + test("read from textfile") { withTempDirs { case (src, tmp) => val textStream = spark.readStream.textFile(src.getCanonicalPath) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index 4b7f0fbe97d4..efdb0e0e7cf1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -38,8 +38,9 @@ import org.apache.spark.sql.{Dataset, Encoder, QueryTest, Row} import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.execution.streaming.continuous.{ContinuousExecution, EpochCoordinatorRef, IncrementAndGetEpoch} +import org.apache.spark.sql.execution.streaming.continuous.{ContinuousExecution, ContinuousTrigger, EpochCoordinatorRef, IncrementAndGetEpoch} import org.apache.spark.sql.execution.streaming.sources.MemorySinkV2 import org.apache.spark.sql.execution.streaming.state.StateStore import org.apache.spark.sql.streaming.StreamingQueryListener._ @@ -80,6 +81,9 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be StateStore.stop() // stop the state store maintenance thread and unload store providers } + protected val defaultTrigger = Trigger.ProcessingTime(0) + protected val defaultUseV2Sink = false + /** How long to wait for an active stream to catch up when checking a result. */ val streamingTimeout = 10.seconds @@ -105,7 +109,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be * the active query, and then return the source object the data was added, as well as the * offset of added data. */ - def addData(query: Option[StreamExecution]): (Source, Offset) + def addData(query: Option[StreamExecution]): (BaseStreamingSource, Offset) } /** A trait that can be extended when testing a source. */ @@ -189,7 +193,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be /** Starts the stream, resuming if data has already been processed. It must not be running. 
*/ case class StartStream( - trigger: Trigger = Trigger.ProcessingTime(0), + trigger: Trigger = defaultTrigger, triggerClock: Clock = new SystemClock, additionalConfs: Map[String, String] = Map.empty, checkpointLocation: String = null) @@ -259,7 +263,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be def apply(): AssertOnQuery = Execute { case s: ContinuousExecution => - val newEpoch = EpochCoordinatorRef.get(s.runId.toString, SparkEnv.get) + val newEpoch = EpochCoordinatorRef.get(s.currentEpochCoordinatorId, SparkEnv.get) .askSync[Long](IncrementAndGetEpoch) s.awaitEpoch(newEpoch - 1) case _ => throw new IllegalStateException("microbatch cannot increment epoch") @@ -276,7 +280,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be def testStream( _stream: Dataset[_], outputMode: OutputMode = OutputMode.Append, - useV2Sink: Boolean = false)(actions: StreamAction*): Unit = synchronized { + useV2Sink: Boolean = defaultUseV2Sink)(actions: StreamAction*): Unit = synchronized { import org.apache.spark.sql.streaming.util.StreamManualClock // `synchronized` is added to prevent the user from calling multiple `testStream`s concurrently @@ -403,18 +407,11 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be def fetchStreamAnswer(currentStream: StreamExecution, lastOnly: Boolean) = { verify(currentStream != null, "stream not running") - // Get the map of source index to the current source objects - val indexToSource = currentStream - .logicalPlan - .collect { case StreamingExecutionRelation(s, _) => s } - .zipWithIndex - .map(_.swap) - .toMap // Block until all data added has been processed for all the source awaiting.foreach { case (sourceIndex, offset) => failAfter(streamingTimeout) { - currentStream.awaitOffset(indexToSource(sourceIndex), offset) + currentStream.awaitOffset(sourceIndex, offset) } } @@ -473,6 +470,12 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be // after starting the query. try { currentStream.awaitInitialization(streamingTimeout.toMillis) + currentStream match { + case s: ContinuousExecution => eventually("IncrementalExecution was not created") { + s.lastExecution.executedPlan // will fail if lastExecution is null + } + case _ => + } } catch { case _: StreamingQueryException => // Ignore the exception. `StopStream` or `ExpectFailure` will catch it as well. 
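// Editor's note -- illustrative sketch only, not part of the patch. The defaultTrigger and
// defaultUseV2Sink hooks added to StreamTest above exist so that a derived suite can re-run
// every inherited testStream()/StartStream() action under a different execution mode without
// editing each test. A hypothetical opt-in subclass (names here are assumptions) might look like:
//
//   class MyContinuousStreamSuite extends StreamTest {
//     // run all inherited tests with a continuous trigger and the V2 memory sink
//     override protected val defaultTrigger = Trigger.Continuous("1 second")
//     override protected val defaultUseV2Sink = true
//   }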
@@ -600,7 +603,10 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be def findSourceIndex(plan: LogicalPlan): Option[Int] = { plan - .collect { case StreamingExecutionRelation(s, _) => s } + .collect { + case StreamingExecutionRelation(s, _) => s + case DataSourceV2Relation(_, r) => r + } .zipWithIndex .find(_._1 == source) .map(_._2) @@ -613,9 +619,13 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be findSourceIndex(query.logicalPlan) }.orElse { findSourceIndex(stream.logicalPlan) + }.orElse { + queryToUse.flatMap { q => + findSourceIndex(q.lastExecution.logical) + } }.getOrElse { throw new IllegalArgumentException( - "Could find index of the source to which data was added") + "Could not find index of the source to which data was added") } // Store the expected offset of added data to wait for it later @@ -635,7 +645,10 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be } case CheckAnswerRowsContains(expectedAnswer, lastOnly) => - val sparkAnswer = fetchStreamAnswer(currentStream, lastOnly) + val sparkAnswer = currentStream match { + case null => fetchStreamAnswer(lastStream, lastOnly) + case s => fetchStreamAnswer(s, lastOnly) + } QueryTest.includesRows(expectedAnswer, sparkAnswer).foreach { error => failTest(error) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala index 9ff02dee288f..79d65192a14a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala @@ -174,6 +174,31 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { } } + test("continuous processing listeners should receive QueryTerminatedEvent") { + val df = spark.readStream.format("rate").load() + val listeners = (1 to 5).map(_ => new EventCollector) + try { + listeners.foreach(listener => spark.streams.addListener(listener)) + testStream(df, OutputMode.Append, useV2Sink = true)( + StartStream(Trigger.Continuous(1000)), + StopStream, + AssertOnQuery { query => + eventually(Timeout(streamingTimeout)) { + listeners.foreach(listener => assert(listener.terminationEvent !== null)) + listeners.foreach(listener => assert(listener.terminationEvent.id === query.id)) + listeners.foreach(listener => assert(listener.terminationEvent.runId === query.runId)) + listeners.foreach(listener => assert(listener.terminationEvent.exception === None)) + } + listeners.foreach(listener => listener.checkAsyncErrors()) + listeners.foreach(listener => listener.reset()) + true + } + ) + } finally { + listeners.foreach(spark.streams.removeListener) + } + } + test("adding and removing listener") { def isListenerActive(listener: EventCollector): Boolean = { listener.reset() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala index 2fa4595dab37..76201c63a270 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala @@ -424,6 +424,29 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging wi } } + test("SPARK-22975: MetricsReporter defaults when there was no progress reported") { + 
withSQLConf("spark.sql.streaming.metricsEnabled" -> "true") { + BlockingSource.latch = new CountDownLatch(1) + withTempDir { tempDir => + val sq = spark.readStream + .format("org.apache.spark.sql.streaming.util.BlockingSource") + .load() + .writeStream + .format("org.apache.spark.sql.streaming.util.BlockingSource") + .option("checkpointLocation", tempDir.toString) + .start() + .asInstanceOf[StreamingQueryWrapper] + .streamingQuery + + val gauges = sq.streamMetrics.metricRegistry.getGauges + assert(gauges.get("latency").getValue.asInstanceOf[Long] == 0) + assert(gauges.get("processingRate-total").getValue.asInstanceOf[Double] == 0.0) + assert(gauges.get("inputRate-total").getValue.asInstanceOf[Double] == 0.0) + sq.stop() + } + } + } + test("input row calculation with mixed batch and streaming sources") { val streamingTriggerDF = spark.createDataset(1 to 10).toDF val streamingInputDF = createSingleTriggerStreamingDF(streamingTriggerDF).toDF("value") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala index eda0d8ad4831..4b4ed82dc652 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala @@ -17,36 +17,18 @@ package org.apache.spark.sql.streaming.continuous -import java.io.{File, InterruptedIOException, IOException, UncheckedIOException} -import java.nio.channels.ClosedByInterruptException -import java.util.concurrent.{CountDownLatch, ExecutionException, TimeoutException, TimeUnit} +import java.util.UUID -import scala.reflect.ClassTag -import scala.util.control.ControlThrowable - -import com.google.common.util.concurrent.UncheckedExecutionException -import org.apache.commons.io.FileUtils -import org.apache.hadoop.conf.Configuration - -import org.apache.spark.{SparkContext, SparkEnv} -import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} +import org.apache.spark.{SparkContext, SparkEnv, SparkException} +import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart, SparkListenerTaskStart} import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.plans.logical.Range -import org.apache.spark.sql.catalyst.streaming.InternalOutputModes -import org.apache.spark.sql.execution.command.ExplainCommand import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2ScanExec, WriteToDataSourceV2Exec} import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous._ import org.apache.spark.sql.execution.streaming.sources.MemorySinkV2 -import org.apache.spark.sql.execution.streaming.state.{StateStore, StateStoreConf, StateStoreId, StateStoreProvider} import org.apache.spark.sql.functions._ -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.StreamSourceProvider import org.apache.spark.sql.streaming.{StreamTest, Trigger} -import org.apache.spark.sql.streaming.util.StreamManualClock import org.apache.spark.sql.test.TestSparkSession -import org.apache.spark.sql.types._ -import org.apache.spark.util.Utils class ContinuousSuiteBase extends StreamTest { // We need more than the default local[2] to be able to schedule all partitions simultaneously. 
@@ -61,7 +43,7 @@ class ContinuousSuiteBase extends StreamTest { case s: ContinuousExecution => assert(numTriggers >= 2, "must wait for at least 2 triggers to ensure query is initialized") val reader = s.lastExecution.executedPlan.collectFirst { - case DataSourceV2ScanExec(_, r: ContinuousRateStreamReader) => r + case DataSourceV2ScanExec(_, r: RateStreamContinuousReader) => r }.get val deltaMs = numTriggers * 1000 + 300 @@ -219,6 +201,41 @@ class ContinuousSuite extends ContinuousSuiteBase { StopStream) } + test("task failure kills the query") { + val df = spark.readStream + .format("rate") + .option("numPartitions", "5") + .option("rowsPerSecond", "5") + .load() + .select('value) + + // Get an arbitrary task from this query to kill. It doesn't matter which one. + var taskId: Long = -1 + val listener = new SparkListener() { + override def onTaskStart(start: SparkListenerTaskStart): Unit = { + taskId = start.taskInfo.taskId + } + } + spark.sparkContext.addSparkListener(listener) + try { + testStream(df, useV2Sink = true)( + StartStream(Trigger.Continuous(100)), + Execute(waitForRateSourceTriggers(_, 2)), + Execute { _ => + // Wait until a task is started, then kill its first attempt. + eventually(timeout(streamingTimeout)) { + assert(taskId != -1) + } + spark.sparkContext.killTaskAttempt(taskId) + }, + ExpectFailure[SparkException] { e => + e.getCause != null && e.getCause.getCause.isInstanceOf[ContinuousTaskRetryException] + }) + } finally { + spark.sparkContext.removeSparkListener(listener) + } + } + test("query without test harness") { val df = spark.readStream .format("rate") @@ -258,13 +275,9 @@ class ContinuousStressSuite extends ContinuousSuiteBase { AwaitEpoch(0), Execute(waitForRateSourceTriggers(_, 201)), IncrementEpoch(), - Execute { query => - val data = query.sink.asInstanceOf[MemorySinkV2].allData - val vals = data.map(_.getLong(0)).toSet - assert(scala.Range(0, 25000).forall { i => - vals.contains(i) - }) - }) + StopStream, + CheckAnswerRowsContains(scala.Range(0, 25000).map(Row(_))) + ) } test("automatic epoch advancement") { @@ -280,6 +293,7 @@ class ContinuousStressSuite extends ContinuousSuiteBase { AwaitEpoch(0), Execute(waitForRateSourceTriggers(_, 201)), IncrementEpoch(), + StopStream, CheckAnswerRowsContains(scala.Range(0, 25000).map(Row(_)))) } @@ -311,6 +325,7 @@ class ContinuousStressSuite extends ContinuousSuiteBase { StopStream, StartStream(Trigger.Continuous(2012)), AwaitEpoch(50), + StopStream, CheckAnswerRowsContains(scala.Range(0, 25000).map(Row(_)))) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala new file mode 100644 index 000000000000..f152174b0a7f --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala @@ -0,0 +1,249 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.streaming.sources + +import java.util.Optional + +import org.apache.spark.sql.{AnalysisException, Row} +import org.apache.spark.sql.execution.datasources.DataSource +import org.apache.spark.sql.execution.streaming.{LongOffset, RateStreamOffset} +import org.apache.spark.sql.execution.streaming.continuous.ContinuousTrigger +import org.apache.spark.sql.sources.DataSourceRegister +import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} +import org.apache.spark.sql.sources.v2.reader.ReadTask +import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, ContinuousWriteSupport, MicroBatchReadSupport, MicroBatchWriteSupport} +import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousReader, MicroBatchReader, Offset, PartitionOffset} +import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter +import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer +import org.apache.spark.sql.streaming.{OutputMode, StreamingQueryException, StreamTest, Trigger} +import org.apache.spark.sql.types.StructType +import org.apache.spark.util.Utils + +case class FakeReader() extends MicroBatchReader with ContinuousReader { + def setOffsetRange(start: Optional[Offset], end: Optional[Offset]): Unit = {} + def getStartOffset: Offset = RateStreamOffset(Map()) + def getEndOffset: Offset = RateStreamOffset(Map()) + def deserializeOffset(json: String): Offset = RateStreamOffset(Map()) + def commit(end: Offset): Unit = {} + def readSchema(): StructType = StructType(Seq()) + def stop(): Unit = {} + def mergeOffsets(offsets: Array[PartitionOffset]): Offset = RateStreamOffset(Map()) + def setOffset(start: Optional[Offset]): Unit = {} + + def createReadTasks(): java.util.ArrayList[ReadTask[Row]] = { + throw new IllegalStateException("fake source - cannot actually read") + } +} + +trait FakeMicroBatchReadSupport extends MicroBatchReadSupport { + override def createMicroBatchReader( + schema: Optional[StructType], + checkpointLocation: String, + options: DataSourceV2Options): MicroBatchReader = FakeReader() +} + +trait FakeContinuousReadSupport extends ContinuousReadSupport { + override def createContinuousReader( + schema: Optional[StructType], + checkpointLocation: String, + options: DataSourceV2Options): ContinuousReader = FakeReader() +} + +trait FakeMicroBatchWriteSupport extends MicroBatchWriteSupport { + def createMicroBatchWriter( + queryId: String, + epochId: Long, + schema: StructType, + mode: OutputMode, + options: DataSourceV2Options): Optional[DataSourceV2Writer] = { + throw new IllegalStateException("fake sink - cannot actually write") + } +} + +trait FakeContinuousWriteSupport extends ContinuousWriteSupport { + def createContinuousWriter( + queryId: String, + schema: StructType, + mode: OutputMode, + options: DataSourceV2Options): Optional[ContinuousWriter] = { + throw new IllegalStateException("fake sink - cannot actually write") + } +} + +class FakeReadMicroBatchOnly extends DataSourceRegister with FakeMicroBatchReadSupport { + override def shortName(): String = "fake-read-microbatch-only" +} + +class 
FakeReadContinuousOnly extends DataSourceRegister with FakeContinuousReadSupport { + override def shortName(): String = "fake-read-continuous-only" +} + +class FakeReadBothModes extends DataSourceRegister + with FakeMicroBatchReadSupport with FakeContinuousReadSupport { + override def shortName(): String = "fake-read-microbatch-continuous" +} + +class FakeReadNeitherMode extends DataSourceRegister { + override def shortName(): String = "fake-read-neither-mode" +} + +class FakeWriteMicroBatchOnly extends DataSourceRegister with FakeMicroBatchWriteSupport { + override def shortName(): String = "fake-write-microbatch-only" +} + +class FakeWriteContinuousOnly extends DataSourceRegister with FakeContinuousWriteSupport { + override def shortName(): String = "fake-write-continuous-only" +} + +class FakeWriteBothModes extends DataSourceRegister + with FakeMicroBatchWriteSupport with FakeContinuousWriteSupport { + override def shortName(): String = "fake-write-microbatch-continuous" +} + +class FakeWriteNeitherMode extends DataSourceRegister { + override def shortName(): String = "fake-write-neither-mode" +} + +class StreamingDataSourceV2Suite extends StreamTest { + + override def beforeAll(): Unit = { + super.beforeAll() + val fakeCheckpoint = Utils.createTempDir() + spark.conf.set("spark.sql.streaming.checkpointLocation", fakeCheckpoint.getCanonicalPath) + } + + val readFormats = Seq( + "fake-read-microbatch-only", + "fake-read-continuous-only", + "fake-read-microbatch-continuous", + "fake-read-neither-mode") + val writeFormats = Seq( + "fake-write-microbatch-only", + "fake-write-continuous-only", + "fake-write-microbatch-continuous", + "fake-write-neither-mode") + val triggers = Seq( + Trigger.Once(), + Trigger.ProcessingTime(1000), + Trigger.Continuous(1000)) + + private def testPositiveCase(readFormat: String, writeFormat: String, trigger: Trigger) = { + val query = spark.readStream + .format(readFormat) + .load() + .writeStream + .format(writeFormat) + .trigger(trigger) + .start() + query.stop() + } + + private def testNegativeCase( + readFormat: String, + writeFormat: String, + trigger: Trigger, + errorMsg: String) = { + val ex = intercept[UnsupportedOperationException] { + testPositiveCase(readFormat, writeFormat, trigger) + } + assert(ex.getMessage.contains(errorMsg)) + } + + private def testPostCreationNegativeCase( + readFormat: String, + writeFormat: String, + trigger: Trigger, + errorMsg: String) = { + val query = spark.readStream + .format(readFormat) + .load() + .writeStream + .format(writeFormat) + .trigger(trigger) + .start() + + eventually(timeout(streamingTimeout)) { + assert(query.exception.isDefined) + assert(query.exception.get.cause != null) + assert(query.exception.get.cause.getMessage.contains(errorMsg)) + } + } + + // Get a list of (read, write, trigger) tuples for test cases. + val cases = readFormats.flatMap { read => + writeFormats.flatMap { write => + triggers.map(t => (write, t)) + }.map { + case (write, t) => (read, write, t) + } + } + + for ((read, write, trigger) <- cases) { + testQuietly(s"stream with read format $read, write format $write, trigger $trigger") { + val readSource = DataSource.lookupDataSource(read, spark.sqlContext.conf).newInstance() + val writeSource = DataSource.lookupDataSource(write, spark.sqlContext.conf).newInstance() + (readSource, writeSource, trigger) match { + // Valid microbatch queries. 
+ case (_: MicroBatchReadSupport, _: MicroBatchWriteSupport, t) + if !t.isInstanceOf[ContinuousTrigger] => + testPositiveCase(read, write, trigger) + + // Valid continuous queries. + case (_: ContinuousReadSupport, _: ContinuousWriteSupport, _: ContinuousTrigger) => + testPositiveCase(read, write, trigger) + + // Invalid - can't read at all + case (r, _, _) + if !r.isInstanceOf[MicroBatchReadSupport] + && !r.isInstanceOf[ContinuousReadSupport] => + testNegativeCase(read, write, trigger, + s"Data source $read does not support streamed reading") + + // Invalid - trigger is continuous but writer is not + case (_, w, _: ContinuousTrigger) if !w.isInstanceOf[ContinuousWriteSupport] => + testNegativeCase(read, write, trigger, + s"Data source $write does not support continuous writing") + + // Invalid - can't write at all + case (_, w, _) + if !w.isInstanceOf[MicroBatchWriteSupport] + && !w.isInstanceOf[ContinuousWriteSupport] => + testNegativeCase(read, write, trigger, + s"Data source $write does not support streamed writing") + + // Invalid - trigger and writer are continuous but reader is not + case (r, _: ContinuousWriteSupport, _: ContinuousTrigger) + if !r.isInstanceOf[ContinuousReadSupport] => + testNegativeCase(read, write, trigger, + s"Data source $read does not support continuous processing") + + // Invalid - trigger is microbatch but writer is not + case (_, w, t) + if !w.isInstanceOf[MicroBatchWriteSupport] && !t.isInstanceOf[ContinuousTrigger] => + testNegativeCase(read, write, trigger, + s"Data source $write does not support streamed writing") + + // Invalid - trigger and writer are microbatch but reader is not + case (r, _, t) + if !r.isInstanceOf[MicroBatchReadSupport] && !t.isInstanceOf[ContinuousTrigger] => + testPostCreationNegativeCase(read, write, trigger, + s"Data source $read does not support microbatch processing") + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala index aa163d2211c3..8212fb912ec5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala @@ -422,21 +422,6 @@ class DataStreamReaderWriterSuite extends StreamTest with BeforeAndAfter { } } - test("ConsoleSink can be correctly loaded") { - LastOptions.clear() - val df = spark.readStream - .format("org.apache.spark.sql.streaming.test") - .load() - - val sq = df.writeStream - .format("console") - .option("checkpointLocation", newMetadataDir) - .trigger(ProcessingTime(2.seconds)) - .start() - - sq.awaitTermination(2000L) - } - test("prevent all column partitioning") { withTempDir { dir => val path = dir.getCanonicalPath @@ -450,16 +435,6 @@ class DataStreamReaderWriterSuite extends StreamTest with BeforeAndAfter { } } - test("ConsoleSink should not require checkpointLocation") { - LastOptions.clear() - val df = spark.readStream - .format("org.apache.spark.sql.streaming.test") - .load() - - val sq = df.writeStream.format("console").start() - sq.stop() - } - private def testMemorySinkCheckpointRecovery(chkLoc: String, provideInWriter: Boolean): Unit = { import testImplicits._ val ms = new MemoryStream[Int](0, sqlContext) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index 904f9f2ad0b2..bc4a120f7042 
100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -254,13 +254,26 @@ private[sql] trait SQLTestUtilsBase } /** - * Drops temporary table `tableName` after calling `f`. + * Drops temporary view `viewNames` after calling `f`. */ - protected def withTempView(tableNames: String*)(f: => Unit): Unit = { + protected def withTempView(viewNames: String*)(f: => Unit): Unit = { try f finally { // If the test failed part way, we don't want to mask the failure by failing to remove - // temp tables that never got created. - try tableNames.foreach(spark.catalog.dropTempView) catch { + // temp views that never got created. + try viewNames.foreach(spark.catalog.dropTempView) catch { + case _: NoSuchTableException => + } + } + } + + /** + * Drops global temporary view `viewNames` after calling `f`. + */ + protected def withGlobalTempView(viewNames: String*)(f: => Unit): Unit = { + try f finally { + // If the test failed part way, we don't want to mask the failure by failing to remove + // global temp views that never got created. + try viewNames.foreach(spark.catalog.dropGlobalTempView) catch { case _: NoSuchTableException => } } diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 3135a8a275da..6e04f528a1c2 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java index 47bfaa86021d..108074cce3d6 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java @@ -223,6 +223,18 @@ private void configureSession(Map sessionConfMap) throws HiveSQL @Override public void setOperationLogSessionDir(File operationLogRootDir) { + if (!operationLogRootDir.exists()) { + LOG.warn("The operation log root directory is removed, recreating: " + + operationLogRootDir.getAbsolutePath()); + if (!operationLogRootDir.mkdirs()) { + LOG.warn("Unable to create operation log root directory: " + + operationLogRootDir.getAbsolutePath()); + } + } + if (!operationLogRootDir.canWrite()) { + LOG.warn("The operation log root directory is not writable: " + + operationLogRootDir.getAbsolutePath()); + } sessionLogDir = new File(operationLogRootDir, sessionHandle.getHandleIdentifier().toString()); isOperationLogEnabled = true; if (!sessionLogDir.exists()) { diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 66fad85ea026..da50c9bd60fa 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../../pom.xml diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala index 92cb4ef11c9e..12c74368dd18 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala @@ -42,7 +42,7 @@ class HiveSessionStateBuilder(session: SparkSession, parentState: Option[Session * Create a Hive aware resource loader. 
*/ override protected lazy val resourceLoader: HiveSessionResourceLoader = { - val client: HiveClient = externalCatalog.client.newSession() + val client: HiveClient = externalCatalog.client new HiveSessionResourceLoader(session, client) } @@ -96,22 +96,7 @@ class HiveSessionStateBuilder(session: SparkSession, parentState: Option[Session override val sparkSession: SparkSession = session override def extraPlanningStrategies: Seq[Strategy] = - super.extraPlanningStrategies ++ customPlanningStrategies - - override def strategies: Seq[Strategy] = { - experimentalMethods.extraStrategies ++ - extraPlanningStrategies ++ Seq( - FileSourceStrategy, - DataSourceStrategy(conf), - SpecialLimits, - InMemoryScans, - HiveTableScans, - Scripts, - Aggregation, - JoinSelection, - BasicOperators - ) - } + super.extraPlanningStrategies ++ customPlanningStrategies ++ Seq(HiveTableScans, Scripts) } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala index c489690af8cd..c7717d70c996 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala @@ -47,7 +47,7 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf._ import org.apache.spark.sql.internal.StaticSQLConf.{CATALOG_IMPLEMENTATION, WAREHOUSE_PATH} import org.apache.spark.sql.types._ -import org.apache.spark.util.Utils +import org.apache.spark.util.{ChildFirstURLClassLoader, Utils} private[spark] object HiveUtils extends Logging { @@ -312,6 +312,8 @@ private[spark] object HiveUtils extends Logging { // starting from the given classLoader. def allJars(classLoader: ClassLoader): Array[URL] = classLoader match { case null => Array.empty[URL] + case childFirst: ChildFirstURLClassLoader => + childFirst.getURLs() ++ allJars(Utils.getSparkClassLoader) case urlClassLoader: URLClassLoader => urlClassLoader.getURLs ++ allJars(urlClassLoader.getParent) case other => allJars(other.getParent) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 7b7f4e0f1021..4b923f5235a9 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -330,7 +330,7 @@ private[hive] class HiveClientImpl( Option(client.getDatabase(dbName)).map { d => CatalogDatabase( name = d.getName, - description = d.getDescription, + description = Option(d.getDescription).getOrElse(""), locationUri = CatalogUtils.stringToURI(d.getLocationUri), properties = Option(d.getParameters).map(_.asScala.toMap).orNull) }.getOrElse(throw new NoSuchDatabaseException(dbName)) @@ -823,7 +823,8 @@ private[hive] class HiveClientImpl( } def reset(): Unit = withHiveState { - client.getAllTables("default").asScala.foreach { t => + try { + client.getAllTables("default").asScala.foreach { t => logDebug(s"Deleting table $t") val table = client.getTable("default", t) client.getIndexes("default", t, 255).asScala.foreach { index => @@ -837,6 +838,9 @@ private[hive] class HiveClientImpl( logDebug(s"Dropping Database: $db") client.dropDatabase(db, true, false, true) } + } finally { + runSqlHive("USE default") + } } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala 
index 95741c7b3028..237ed9bc0598 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala @@ -59,9 +59,11 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable sparkSession: SparkSession, options: Map[String, String], files: Seq[FileStatus]): Option[StructType] = { + val ignoreCorruptFiles = sparkSession.sessionState.conf.ignoreCorruptFiles OrcFileOperator.readSchema( files.map(_.getPath.toString), - Some(sparkSession.sessionState.newHadoopConf()) + Some(sparkSession.sessionState.newHadoopConf()), + ignoreCorruptFiles ) } @@ -129,6 +131,7 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable val broadcastedHadoopConf = sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) + val ignoreCorruptFiles = sparkSession.sessionState.conf.ignoreCorruptFiles (file: PartitionedFile) => { val conf = broadcastedHadoopConf.value.value @@ -138,7 +141,8 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable // SPARK-8501: Empty ORC files always have an empty schema stored in their footer. In this // case, `OrcFileOperator.readSchema` returns `None`, and we can't read the underlying file // using the given physical schema. Instead, we simply return an empty iterator. - val isEmptyFile = OrcFileOperator.readSchema(Seq(filePath.toString), Some(conf)).isEmpty + val isEmptyFile = + OrcFileOperator.readSchema(Seq(filePath.toString), Some(conf), ignoreCorruptFiles).isEmpty if (isEmptyFile) { Iterator.empty } else { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala index 5a3fcd7a759c..80e44ca50435 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala @@ -17,11 +17,14 @@ package org.apache.spark.sql.hive.orc +import java.io.IOException + import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector +import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.parser.CatalystSqlParser @@ -46,7 +49,10 @@ private[hive] object OrcFileOperator extends Logging { * create the result reader from that file. If no such file is found, it returns `None`. * @todo Needs to consider all files when schema evolution is taken into account. 
*/ - def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = { + def getFileReader(basePath: String, + config: Option[Configuration] = None, + ignoreCorruptFiles: Boolean = false) + : Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => @@ -65,16 +71,28 @@ private[hive] object OrcFileOperator extends Logging { } listOrcFiles(basePath, conf).iterator.map { path => - path -> OrcFile.createReader(fs, path) + val reader = try { + Some(OrcFile.createReader(fs, path)) + } catch { + case e: IOException => + if (ignoreCorruptFiles) { + logWarning(s"Skipped the footer in the corrupted file: $path", e) + None + } else { + throw new SparkException(s"Could not read footer for file: $path", e) + } + } + path -> reader }.collectFirst { - case (path, reader) if isWithNonEmptySchema(path, reader) => reader + case (path, Some(reader)) if isWithNonEmptySchema(path, reader) => reader } } - def readSchema(paths: Seq[String], conf: Option[Configuration]): Option[StructType] = { + def readSchema(paths: Seq[String], conf: Option[Configuration], ignoreCorruptFiles: Boolean) + : Option[StructType] = { // Take the first file where we can open a valid reader if we can find one. Otherwise just // return None to indicate we can't infer the schema. - paths.flatMap(getFileReader(_, conf)).headOption.map { reader => + paths.flatMap(getFileReader(_, conf, ignoreCorruptFiles)).headOption.map { reader => val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $paths, got Hive schema string: $schema") diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index b6be00dbb3a7..c84131fc3212 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -180,7 +180,13 @@ private[hive] class TestHiveSparkSession( ConfVars.METASTORE_INTEGER_JDO_PUSHDOWN.varname -> "true", // scratch directory used by Hive's metastore client ConfVars.SCRATCHDIR.varname -> TestHiveContext.makeScratchDir().toURI.toString, - ConfVars.METASTORE_CLIENT_CONNECT_RETRY_DELAY.varname -> "1") + ConfVars.METASTORE_CLIENT_CONNECT_RETRY_DELAY.varname -> "1") ++ + // After session cloning, the JDBC connect string for a JDBC metastore should not be changed. 
+ existingSharedState.map { state => + val connKey = + state.sparkContext.hadoopConfiguration.get(ConfVars.METASTORECONNECTURLKEY.varname) + ConfVars.METASTORECONNECTURLKEY.varname -> connKey + } metastoreTempConf.foreach { case (k, v) => sc.hadoopConfiguration.set(k, v) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala index 2e35fdeba464..0a522b6a11c8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala @@ -107,4 +107,10 @@ class HiveExternalCatalogSuite extends ExternalCatalogSuite { .filter(_.contains("Num Buckets")).head assert(bucketString.contains("10")) } + + test("SPARK-23001: NullPointerException when running desc database") { + val catalog = newBasicCatalog() + catalog.createDatabase(newDb("dbWithNullDesc").copy(description = null), ignoreIfExists = false) + assert(catalog.getDatabase("dbWithNullDesc").description == "") + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index a3d5b941a676..ae4aeb7b4ce4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -18,11 +18,14 @@ package org.apache.spark.sql.hive import java.io.File -import java.nio.file.Files +import java.nio.charset.StandardCharsets +import java.nio.file.{Files, Paths} import scala.sys.process._ -import org.apache.spark.TestUtils +import org.apache.hadoop.conf.Configuration + +import org.apache.spark.{SecurityManager, SparkConf, TestUtils} import org.apache.spark.sql.{QueryTest, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.CatalogTableType @@ -55,14 +58,19 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { private def tryDownloadSpark(version: String, path: String): Unit = { // Try mirrors a few times until one succeeds for (i <- 0 until 3) { + // we don't retry on a failure to get mirror url. If we can't get a mirror url, + // the test fails (getStringFromUrl will throw an exception) val preferredMirror = - Seq("wget", "https://www.apache.org/dyn/closer.lua?preferred=true", "-q", "-O", "-").!!.trim - val url = s"$preferredMirror/spark/spark-$version/spark-$version-bin-hadoop2.7.tgz" + getStringFromUrl("https://www.apache.org/dyn/closer.lua?preferred=true") + val filename = s"spark-$version-bin-hadoop2.7.tgz" + val url = s"$preferredMirror/spark/spark-$version/$filename" logInfo(s"Downloading Spark $version from $url") - if (Seq("wget", url, "-q", "-P", path).! 
== 0) { + try { + getFileFromUrl(url, path, filename) return + } catch { + case ex: Exception => logWarning(s"Failed to download Spark $version from $url", ex) } - logWarning(s"Failed to download Spark $version from $url") } fail(s"Unable to download Spark $version") } @@ -85,6 +93,34 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { new File(tmpDataDir, name).getCanonicalPath } + private def getFileFromUrl(urlString: String, targetDir: String, filename: String): Unit = { + val conf = new SparkConf + // if the caller passes the name of an existing file, we want doFetchFile to write over it with + // the contents from the specified url. + conf.set("spark.files.overwrite", "true") + val securityManager = new SecurityManager(conf) + val hadoopConf = new Configuration + + val outDir = new File(targetDir) + if (!outDir.exists()) { + outDir.mkdirs() + } + + // propagate exceptions up to the caller of getFileFromUrl + Utils.doFetchFile(urlString, outDir, filename, conf, securityManager, hadoopConf) + } + + private def getStringFromUrl(urlString: String): String = { + val contentFile = File.createTempFile("string-", ".txt") + contentFile.deleteOnExit() + + // exceptions will propagate to the caller of getStringFromUrl + getFileFromUrl(urlString, contentFile.getParent, contentFile.getName) + + val contentPath = Paths.get(contentFile.toURI) + new String(Files.readAllBytes(contentPath), StandardCharsets.UTF_8) + } + override def beforeAll(): Unit = { super.beforeAll() diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala index 18137e7ea1d6..83b4c862e254 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala @@ -146,6 +146,12 @@ class DataSourceWithHiveMetastoreCatalogSuite 'id cast StringType as 'd2 ).coalesce(1) + override def beforeAll(): Unit = { + super.beforeAll() + sparkSession.sessionState.catalog.reset() + sparkSession.metadataHive.reset() + } + Seq( "parquet" -> (( "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat", @@ -160,13 +166,13 @@ class DataSourceWithHiveMetastoreCatalogSuite )) ).foreach { case (provider, (inputFormat, outputFormat, serde)) => test(s"Persist non-partitioned $provider relation into metastore as managed table") { - withTable("t") { + withTable("default.t") { withSQLConf(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key -> "true") { testDF .write .mode(SaveMode.Overwrite) .format(provider) - .saveAsTable("t") + .saveAsTable("default.t") } val hiveTable = sessionState.catalog.getTableMetadata(TableIdentifier("t", Some("default"))) @@ -181,14 +187,15 @@ class DataSourceWithHiveMetastoreCatalogSuite assert(columns.map(_.name) === Seq("d1", "d2")) assert(columns.map(_.dataType) === Seq(DecimalType(10, 3), StringType)) - checkAnswer(table("t"), testDF) - assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM t") === Seq("1.1\t1", "2.1\t2")) + checkAnswer(table("default.t"), testDF) + assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM default.t") === + Seq("1.1\t1", "2.1\t2")) } } test(s"Persist non-partitioned $provider relation into metastore as external table") { withTempPath { dir => - withTable("t") { + withTable("default.t") { val path = dir.getCanonicalFile withSQLConf(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key -> "true") { @@ -197,7 +204,7 @@ class 
DataSourceWithHiveMetastoreCatalogSuite .mode(SaveMode.Overwrite) .format(provider) .option("path", path.toString) - .saveAsTable("t") + .saveAsTable("default.t") } val hiveTable = @@ -213,8 +220,8 @@ class DataSourceWithHiveMetastoreCatalogSuite assert(columns.map(_.name) === Seq("d1", "d2")) assert(columns.map(_.dataType) === Seq(DecimalType(10, 3), StringType)) - checkAnswer(table("t"), testDF) - assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM t") === + checkAnswer(table("default.t"), testDF) + assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM default.t") === Seq("1.1\t1", "2.1\t2")) } } @@ -222,9 +229,9 @@ class DataSourceWithHiveMetastoreCatalogSuite test(s"Persist non-partitioned $provider relation into metastore as managed table using CTAS") { withTempPath { dir => - withTable("t") { + withTable("default.t") { sql( - s"""CREATE TABLE t USING $provider + s"""CREATE TABLE default.t USING $provider |OPTIONS (path '${dir.toURI}') |AS SELECT 1 AS d1, "val_1" AS d2 """.stripMargin) @@ -242,8 +249,9 @@ class DataSourceWithHiveMetastoreCatalogSuite assert(columns.map(_.name) === Seq("d1", "d2")) assert(columns.map(_.dataType) === Seq(IntegerType, StringType)) - checkAnswer(table("t"), Row(1, "val_1")) - assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM t") === Seq("1\tval_1")) + checkAnswer(table("default.t"), Row(1, "val_1")) + assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM default.t") === + Seq("1\tval_1")) } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSessionStateSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSessionStateSuite.scala index 958ad3e1c3ce..3d1a0b054dc3 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSessionStateSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSessionStateSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.hive -import org.scalatest.BeforeAndAfterEach +import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.spark.sql._ import org.apache.spark.sql.hive.test.TestHiveSingleton @@ -25,8 +25,7 @@ import org.apache.spark.sql.hive.test.TestHiveSingleton /** * Run all tests from `SessionStateSuite` with a Hive based `SessionState`. 
*/ -class HiveSessionStateSuite extends SessionStateSuite - with TestHiveSingleton with BeforeAndAfterEach { +class HiveSessionStateSuite extends SessionStateSuite with TestHiveSingleton { override def beforeAll(): Unit = { // Reuse the singleton session @@ -38,4 +37,15 @@ class HiveSessionStateSuite extends SessionStateSuite activeSession = null super.afterAll() } + + test("Clone then newSession") { + val sparkSession = hiveContext.sparkSession + val conf = sparkSession.sparkContext.hadoopConfiguration + val oldValue = conf.get(ConfVars.METASTORECONNECTURLKEY.varname) + sparkSession.cloneSession() + sparkSession.sharedState.externalCatalog.client.newSession() + val newValue = conf.get(ConfVars.METASTORECONNECTURLKEY.varname) + assert(oldValue == newValue, + "cloneSession and then newSession should not affect the Derby directory") + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala index fdbfcf1a6844..8697d47e89e8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala @@ -17,11 +17,16 @@ package org.apache.spark.sql.hive +import java.net.URL + import org.apache.hadoop.hive.conf.HiveConf.ConfVars +import org.apache.spark.SparkConf +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.QueryTest import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader} class HiveUtilsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { @@ -42,4 +47,19 @@ class HiveUtilsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton assert(hiveConf("foo") === "bar") } } + + test("ChildFirstURLClassLoader's parent is null, get spark classloader instead") { + val conf = new SparkConf + val contextClassLoader = Thread.currentThread().getContextClassLoader + val loader = new ChildFirstURLClassLoader(Array(), contextClassLoader) + try { + Thread.currentThread().setContextClassLoader(loader) + HiveUtils.newClientForMetadata( + conf, + SparkHadoopUtil.newConfiguration(conf), + HiveUtils.newTemporaryConfiguration(useInMemoryDerby = true)) + } finally { + Thread.currentThread().setContextClassLoader(contextClassLoader) + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index c8caba83bf36..fade143a1755 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -23,14 +23,12 @@ import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.fs.Path -import org.apache.spark.SparkContext import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType} import org.apache.spark.sql.execution.command.CreateTableCommand import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.hive.HiveExternalCatalog._ -import org.apache.spark.sql.hive.client.HiveClient import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.StaticSQLConf._ @@ -1344,18 
+1342,6 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv } } - Seq("orc", "parquet", "csv", "json", "text").foreach { format => - test(s"SPARK-22146: read files containing special characters using $format") { - val nameWithSpecialChars = s"sp&cial%chars" - withTempDir { dir => - val tmpFile = s"$dir/$nameWithSpecialChars" - spark.createDataset(Seq("a", "b")).write.format(format).save(tmpFile) - val fileContent = spark.read.format(format).load(tmpFile) - checkAnswer(fileContent, Seq(Row("a"), Row("b"))) - } - } - } - private def withDebugMode(f: => Unit): Unit = { val previousValue = sparkSession.sparkContext.conf.get(DEBUG_MODE) try { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala index 94473a08dd31..e64389e56b5a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala @@ -163,6 +163,15 @@ class VersionsSuite extends SparkFunSuite with Logging { client.createDatabase(tempDB, ignoreIfExists = true) } + test(s"$version: createDatabase with null description") { + withTempDir { tmpDir => + val dbWithNullDesc = + CatalogDatabase("dbWithNullDesc", description = null, tmpDir.toURI, Map()) + client.createDatabase(dbWithNullDesc, ignoreIfExists = true) + assert(client.getDatabase("dbWithNullDesc").description == "") + } + } + test(s"$version: setCurrentDatabase") { client.setCurrentDatabase("default") } @@ -802,7 +811,7 @@ class VersionsSuite extends SparkFunSuite with Logging { test(s"$version: read avro file containing decimal") { val url = Thread.currentThread().getContextClassLoader.getResource("avroDecimal") - val location = new File(url.getFile) + val location = new File(url.getFile).toURI.toString val tableName = "tab1" val avroSchema = @@ -842,6 +851,8 @@ class VersionsSuite extends SparkFunSuite with Logging { } test(s"$version: SPARK-17920: Insert into/overwrite avro table") { + // skipped because it's failed in the condition on Windows + assume(!(Utils.isWindows && version == "0.12")) withTempDir { dir => val avroSchema = """ @@ -866,10 +877,10 @@ class VersionsSuite extends SparkFunSuite with Logging { val writer = new PrintWriter(schemaFile) writer.write(avroSchema) writer.close() - val schemaPath = schemaFile.getCanonicalPath + val schemaPath = schemaFile.toURI.toString val url = Thread.currentThread().getContextClassLoader.getResource("avroDecimal") - val srcLocation = new File(url.getFile).getCanonicalPath + val srcLocation = new File(url.getFile).toURI.toString val destTableName = "tab1" val srcTableName = "tab2" diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index f2e0c695ca38..65be24441867 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -875,12 +875,13 @@ class HiveDDLSuite test("desc table for Hive table - bucketed + sorted table") { withTable("tbl") { - sql(s""" - CREATE TABLE tbl (id int, name string) - PARTITIONED BY (ds string) - CLUSTERED BY(id) - SORTED BY(id, name) INTO 1024 BUCKETS - """) + sql( + s""" + |CREATE TABLE tbl (id int, name string) + |CLUSTERED BY(id) + |SORTED BY(id, name) INTO 1024 BUCKETS + |PARTITIONED BY (ds string) + 
""".stripMargin) val x = sql("DESC FORMATTED tbl").collect() assert(x.containsSlice( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala index dfabf1ec2a22..a4273de5fe26 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala @@ -171,4 +171,21 @@ class HiveExplainSuite extends QueryTest with SQLTestUtils with TestHiveSingleto sql("EXPLAIN EXTENDED CODEGEN SELECT 1") } } + + test("SPARK-23021 AnalysisBarrier should not cut off explain output for parsed logical plans") { + val df = Seq((1, 1)).toDF("a", "b").groupBy("a").count().limit(1) + val outputStream = new java.io.ByteArrayOutputStream() + Console.withOut(outputStream) { + df.explain(true) + } + assert(outputStream.toString.replaceAll("""#\d+""", "#0").contains( + s"""== Parsed Logical Plan == + |GlobalLimit 1 + |+- LocalLimit 1 + | +- AnalysisBarrier + | +- Aggregate [a#0], [a#0, count(1) AS count#0L] + | +- Project [_1#0 AS a#0, _2#0 AS b#0] + | +- LocalRelation [_1#0, _2#0] + |""".stripMargin)) + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 07ae3ae94584..33bcae91fdaf 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -461,51 +461,55 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("CTAS without serde without location") { - val originalConf = sessionState.conf.convertCTAS - - setConf(SQLConf.CONVERT_CTAS, true) - - val defaultDataSource = sessionState.conf.defaultDataSourceName - try { - sql("CREATE TABLE ctas1 AS SELECT key k, value FROM src ORDER BY k, value") - sql("CREATE TABLE IF NOT EXISTS ctas1 AS SELECT key k, value FROM src ORDER BY k, value") - val message = intercept[AnalysisException] { + withSQLConf(SQLConf.CONVERT_CTAS.key -> "true") { + val defaultDataSource = sessionState.conf.defaultDataSourceName + withTable("ctas1") { sql("CREATE TABLE ctas1 AS SELECT key k, value FROM src ORDER BY k, value") - }.getMessage - assert(message.contains("already exists")) - checkRelation("ctas1", true, defaultDataSource) - sql("DROP TABLE ctas1") + sql("CREATE TABLE IF NOT EXISTS ctas1 AS SELECT key k, value FROM src ORDER BY k, value") + val message = intercept[AnalysisException] { + sql("CREATE TABLE ctas1 AS SELECT key k, value FROM src ORDER BY k, value") + }.getMessage + assert(message.contains("already exists")) + checkRelation("ctas1", isDataSourceTable = true, defaultDataSource) + } // Specifying database name for query can be converted to data source write path // is not allowed right now. 
- sql("CREATE TABLE default.ctas1 AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", true, defaultDataSource) - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql("CREATE TABLE default.ctas1 AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation("ctas1", isDataSourceTable = true, defaultDataSource) + } - sql("CREATE TABLE ctas1 stored as textfile" + + withTable("ctas1") { + sql("CREATE TABLE ctas1 stored as textfile" + " AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "text") - sql("DROP TABLE ctas1") + checkRelation("ctas1", isDataSourceTable = false, "text") + } - sql("CREATE TABLE ctas1 stored as sequencefile" + - " AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "sequence") - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql("CREATE TABLE ctas1 stored as sequencefile" + + " AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation("ctas1", isDataSourceTable = false, "sequence") + } - sql("CREATE TABLE ctas1 stored as rcfile AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "rcfile") - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql("CREATE TABLE ctas1 stored as rcfile AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation("ctas1", isDataSourceTable = false, "rcfile") + } - sql("CREATE TABLE ctas1 stored as orc AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "orc") - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql("CREATE TABLE ctas1 stored as orc AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation("ctas1", isDataSourceTable = false, "orc") + } - sql("CREATE TABLE ctas1 stored as parquet AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "parquet") - sql("DROP TABLE ctas1") - } finally { - setConf(SQLConf.CONVERT_CTAS, originalConf) - sql("DROP TABLE IF EXISTS ctas1") + withTable("ctas1") { + sql( + """ + |CREATE TABLE ctas1 stored as parquet + |AS SELECT key k, value FROM src ORDER BY k, value + """.stripMargin) + checkRelation("ctas1", isDataSourceTable = false, "parquet") + } } } @@ -539,30 +543,40 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { val defaultDataSource = sessionState.conf.defaultDataSourceName val tempLocation = dir.toURI.getPath.stripSuffix("/") - sql(s"CREATE TABLE ctas1 LOCATION 'file:$tempLocation/c1'" + - " AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", true, defaultDataSource, Some(s"file:$tempLocation/c1")) - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql(s"CREATE TABLE ctas1 LOCATION 'file:$tempLocation/c1'" + + " AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation( + "ctas1", isDataSourceTable = true, defaultDataSource, Some(s"file:$tempLocation/c1")) + } - sql(s"CREATE TABLE ctas1 LOCATION 'file:$tempLocation/c2'" + - " AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", true, defaultDataSource, Some(s"file:$tempLocation/c2")) - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql(s"CREATE TABLE ctas1 LOCATION 'file:$tempLocation/c2'" + + " AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation( + "ctas1", isDataSourceTable = true, defaultDataSource, Some(s"file:$tempLocation/c2")) + } - sql(s"CREATE TABLE ctas1 stored as textfile LOCATION 'file:$tempLocation/c3'" + - " AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "text", 
Some(s"file:$tempLocation/c3")) - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql(s"CREATE TABLE ctas1 stored as textfile LOCATION 'file:$tempLocation/c3'" + + " AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation( + "ctas1", isDataSourceTable = false, "text", Some(s"file:$tempLocation/c3")) + } - sql(s"CREATE TABLE ctas1 stored as sequenceFile LOCATION 'file:$tempLocation/c4'" + - " AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "sequence", Some(s"file:$tempLocation/c4")) - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql(s"CREATE TABLE ctas1 stored as sequenceFile LOCATION 'file:$tempLocation/c4'" + + " AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation( + "ctas1", isDataSourceTable = false, "sequence", Some(s"file:$tempLocation/c4")) + } - sql(s"CREATE TABLE ctas1 stored as rcfile LOCATION 'file:$tempLocation/c5'" + - " AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "rcfile", Some(s"file:$tempLocation/c5")) - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql(s"CREATE TABLE ctas1 stored as rcfile LOCATION 'file:$tempLocation/c5'" + + " AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation( + "ctas1", isDataSourceTable = false, "rcfile", Some(s"file:$tempLocation/c5")) + } } } } @@ -2145,12 +2159,4 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } } - - Seq("orc", "parquet", "csv", "json", "text").foreach { format => - test(s"Writing empty datasets should not fail - $format") { - withTempDir { dir => - Seq("str").toDS.limit(0).write.format(format).save(dir.getCanonicalPath + "/tmp") - } - } - } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala index 17b7d8cfe127..d556a030e218 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala @@ -20,8 +20,10 @@ package org.apache.spark.sql.hive.orc import java.io.File import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.datasources.orc.OrcSuite import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.internal.HiveSerDe import org.apache.spark.util.Utils class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton { @@ -62,6 +64,33 @@ class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton { """.stripMargin) } + test("SPARK-22972: hive orc source") { + val tableName = "normal_orc_as_source_hive" + withTable(tableName) { + sql( + s""" + |CREATE TABLE $tableName + |USING org.apache.spark.sql.hive.orc + |OPTIONS ( + | PATH '${new File(orcTableAsDir.getAbsolutePath).toURI}' + |) + """.stripMargin) + + val tableMetadata = spark.sessionState.catalog.getTableMetadata( + TableIdentifier(tableName)) + assert(tableMetadata.storage.inputFormat == + Option("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")) + assert(tableMetadata.storage.outputFormat == + Option("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")) + assert(tableMetadata.storage.serde == + Option("org.apache.hadoop.hive.ql.io.orc.OrcSerde")) + assert(HiveSerDe.sourceToSerDe("org.apache.spark.sql.hive.orc") + .equals(HiveSerDe.sourceToSerDe("orc"))) + assert(HiveSerDe.sourceToSerDe("org.apache.spark.sql.orc") + .equals(HiveSerDe.sourceToSerDe("orc"))) + } + } + 
test("SPARK-19459/SPARK-18220: read char/varchar column written by Hive") { val location = Utils.createTempDir() val uri = location.toURI diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala new file mode 100644 index 000000000000..bf6efa7c4c08 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala @@ -0,0 +1,501 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.orc + +import java.io.File + +import scala.util.{Random, Try} + +import org.apache.spark.SparkConf +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ +import org.apache.spark.util.{Benchmark, Utils} + + +/** + * Benchmark to measure ORC read performance. + * + * This is in `sql/hive` module in order to compare `sql/core` and `sql/hive` ORC data sources. + */ +// scalastyle:off line.size.limit +object OrcReadBenchmark { + val conf = new SparkConf() + conf.set("orc.compression", "snappy") + + private val spark = SparkSession.builder() + .master("local[1]") + .appName("OrcReadBenchmark") + .config(conf) + .getOrCreate() + + // Set default configs. Individual cases will change them if necessary. 
+ spark.conf.set(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key, "true") + + def withTempPath(f: File => Unit): Unit = { + val path = Utils.createTempDir() + path.delete() + try f(path) finally Utils.deleteRecursively(path) + } + + def withTempTable(tableNames: String*)(f: => Unit): Unit = { + try f finally tableNames.foreach(spark.catalog.dropTempView) + } + + def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { + val (keys, values) = pairs.unzip + val currentValues = keys.map(key => Try(spark.conf.get(key)).toOption) + (keys, values).zipped.foreach(spark.conf.set) + try f finally { + keys.zip(currentValues).foreach { + case (key, Some(value)) => spark.conf.set(key, value) + case (key, None) => spark.conf.unset(key) + } + } + } + + private val NATIVE_ORC_FORMAT = classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat].getCanonicalName + private val HIVE_ORC_FORMAT = classOf[org.apache.spark.sql.hive.orc.OrcFileFormat].getCanonicalName + + private def prepareTable(dir: File, df: DataFrame, partition: Option[String] = None): Unit = { + val dirORC = dir.getCanonicalPath + + if (partition.isDefined) { + df.write.partitionBy(partition.get).orc(dirORC) + } else { + df.write.orc(dirORC) + } + + spark.read.format(NATIVE_ORC_FORMAT).load(dirORC).createOrReplaceTempView("nativeOrcTable") + spark.read.format(HIVE_ORC_FORMAT).load(dirORC).createOrReplaceTempView("hiveOrcTable") + } + + def numericScanBenchmark(values: Int, dataType: DataType): Unit = { + val benchmark = new Benchmark(s"SQL Single ${dataType.sql} Column Scan", values) + + withTempPath { dir => + withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { + import spark.implicits._ + spark.range(values).map(_ => Random.nextLong).createOrReplaceTempView("t1") + + prepareTable(dir, spark.sql(s"SELECT CAST(value as ${dataType.sql}) id FROM t1")) + + benchmark.addCase("Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Native ORC Vectorized") { _ => + spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() + } + + benchmark.addCase("Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Hive built-in ORC") { _ => + spark.sql("SELECT sum(id) FROM hiveOrcTable").collect() + } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 + Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz + + SQL Single TINYINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1135 / 1171 13.9 72.2 1.0X + Native ORC Vectorized 152 / 163 103.4 9.7 7.5X + Native ORC Vectorized with copy 149 / 162 105.4 9.5 7.6X + Hive built-in ORC 1380 / 1384 11.4 87.7 0.8X + + SQL Single SMALLINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1182 / 1244 13.3 75.2 1.0X + Native ORC Vectorized 145 / 156 108.7 9.2 8.2X + Native ORC Vectorized with copy 148 / 158 106.4 9.4 8.0X + Hive built-in ORC 1591 / 1636 9.9 101.2 0.7X + + SQL Single INT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1271 / 1271 12.4 80.8 
1.0X + Native ORC Vectorized 206 / 212 76.3 13.1 6.2X + Native ORC Vectorized with copy 200 / 213 78.8 12.7 6.4X + Hive built-in ORC 1776 / 1787 8.9 112.9 0.7X + + SQL Single BIGINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1344 / 1355 11.7 85.4 1.0X + Native ORC Vectorized 258 / 268 61.0 16.4 5.2X + Native ORC Vectorized with copy 252 / 257 62.4 16.0 5.3X + Hive built-in ORC 1818 / 1823 8.7 115.6 0.7X + + SQL Single FLOAT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1333 / 1352 11.8 84.8 1.0X + Native ORC Vectorized 310 / 324 50.7 19.7 4.3X + Native ORC Vectorized with copy 312 / 320 50.4 19.9 4.3X + Hive built-in ORC 1904 / 1918 8.3 121.0 0.7X + + SQL Single DOUBLE Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1408 / 1585 11.2 89.5 1.0X + Native ORC Vectorized 359 / 368 43.8 22.8 3.9X + Native ORC Vectorized with copy 364 / 371 43.2 23.2 3.9X + Hive built-in ORC 1881 / 1954 8.4 119.6 0.7X + */ + benchmark.run() + } + } + } + + def intStringScanBenchmark(values: Int): Unit = { + val benchmark = new Benchmark("Int and String Scan", values) + + withTempPath { dir => + withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { + import spark.implicits._ + spark.range(values).map(_ => Random.nextLong).createOrReplaceTempView("t1") + + prepareTable( + dir, + spark.sql("SELECT CAST(value AS INT) AS c1, CAST(value as STRING) AS c2 FROM t1")) + + benchmark.addCase("Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql("SELECT sum(c1), sum(length(c2)) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Native ORC Vectorized") { _ => + spark.sql("SELECT sum(c1), sum(length(c2)) FROM nativeOrcTable").collect() + } + + benchmark.addCase("Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql("SELECT sum(c1), sum(length(c2)) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Hive built-in ORC") { _ => + spark.sql("SELECT sum(c1), sum(length(c2)) FROM hiveOrcTable").collect() + } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 + Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz + + Int and String Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 2566 / 2592 4.1 244.7 1.0X + Native ORC Vectorized 1098 / 1113 9.6 104.7 2.3X + Native ORC Vectorized with copy 1527 / 1593 6.9 145.6 1.7X + Hive built-in ORC 3561 / 3705 2.9 339.6 0.7X + */ + benchmark.run() + } + } + } + + def partitionTableScanBenchmark(values: Int): Unit = { + val benchmark = new Benchmark("Partitioned Table", values) + + withTempPath { dir => + withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { + import spark.implicits._ + spark.range(values).map(_ => Random.nextLong).createOrReplaceTempView("t1") + + prepareTable(dir, spark.sql("SELECT value % 2 AS p, value AS id FROM t1"), Some("p")) + + benchmark.addCase("Data column - Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() + } + 
} + + benchmark.addCase("Data column - Native ORC Vectorized") { _ => + spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() + } + + benchmark.addCase("Data column - Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Data column - Hive built-in ORC") { _ => + spark.sql("SELECT sum(id) FROM hiveOrcTable").collect() + } + + benchmark.addCase("Partition column - Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql("SELECT sum(p) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Partition column - Native ORC Vectorized") { _ => + spark.sql("SELECT sum(p) FROM nativeOrcTable").collect() + } + + benchmark.addCase("Partition column - Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql("SELECT sum(p) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Partition column - Hive built-in ORC") { _ => + spark.sql("SELECT sum(p) FROM hiveOrcTable").collect() + } + + benchmark.addCase("Both columns - Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql("SELECT sum(p), sum(id) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Both columns - Native ORC Vectorized") { _ => + spark.sql("SELECT sum(p), sum(id) FROM nativeOrcTable").collect() + } + + benchmark.addCase("Both column - Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql("SELECT sum(p), sum(id) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Both columns - Hive built-in ORC") { _ => + spark.sql("SELECT sum(p), sum(id) FROM hiveOrcTable").collect() + } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 + Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz + + Partitioned Table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Data only - Native ORC MR 1447 / 1457 10.9 92.0 1.0X + Data only - Native ORC Vectorized 256 / 266 61.4 16.3 5.6X + Data only - Native ORC Vectorized with copy 263 / 273 59.8 16.7 5.5X + Data only - Hive built-in ORC 1960 / 1988 8.0 124.6 0.7X + Partition only - Native ORC MR 1039 / 1043 15.1 66.0 1.4X + Partition only - Native ORC Vectorized 48 / 53 326.6 3.1 30.1X + Partition only - Native ORC Vectorized with copy 48 / 53 328.4 3.0 30.2X + Partition only - Hive built-in ORC 1234 / 1242 12.7 78.4 1.2X + Both columns - Native ORC MR 1465 / 1475 10.7 93.1 1.0X + Both columns - Native ORC Vectorized 292 / 301 53.9 18.6 5.0X + Both column - Native ORC Vectorized with copy 348 / 354 45.1 22.2 4.2X + Both columns - Hive built-in ORC 2051 / 2060 7.7 130.4 0.7X + */ + benchmark.run() + } + } + } + + def repeatedStringScanBenchmark(values: Int): Unit = { + val benchmark = new Benchmark("Repeated String", values) + + withTempPath { dir => + withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { + spark.range(values).createOrReplaceTempView("t1") + + prepareTable(dir, spark.sql("SELECT CAST((id % 200) + 10000 as STRING) AS c1 FROM t1")) + + benchmark.addCase("Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql("SELECT sum(length(c1)) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Native ORC Vectorized") { _ => + spark.sql("SELECT 
sum(length(c1)) FROM nativeOrcTable").collect() + } + + benchmark.addCase("Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql("SELECT sum(length(c1)) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Hive built-in ORC") { _ => + spark.sql("SELECT sum(length(c1)) FROM hiveOrcTable").collect() + } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 + Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz + + Repeated String: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1271 / 1278 8.3 121.2 1.0X + Native ORC Vectorized 200 / 212 52.4 19.1 6.4X + Native ORC Vectorized with copy 342 / 347 30.7 32.6 3.7X + Hive built-in ORC 1874 / 2105 5.6 178.7 0.7X + */ + benchmark.run() + } + } + } + + def stringWithNullsScanBenchmark(values: Int, fractionOfNulls: Double): Unit = { + withTempPath { dir => + withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { + spark.range(values).createOrReplaceTempView("t1") + + prepareTable( + dir, + spark.sql( + s"SELECT IF(RAND(1) < $fractionOfNulls, NULL, CAST(id as STRING)) AS c1, " + + s"IF(RAND(2) < $fractionOfNulls, NULL, CAST(id as STRING)) AS c2 FROM t1")) + + val benchmark = new Benchmark(s"String with Nulls Scan ($fractionOfNulls%)", values) + + benchmark.addCase("Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql("SELECT SUM(LENGTH(c2)) FROM nativeOrcTable " + + "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() + } + } + + benchmark.addCase("Native ORC Vectorized") { _ => + spark.sql("SELECT SUM(LENGTH(c2)) FROM nativeOrcTable " + + "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() + } + + benchmark.addCase("Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql("SELECT SUM(LENGTH(c2)) FROM nativeOrcTable " + + "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() + } + } + + benchmark.addCase("Hive built-in ORC") { _ => + spark.sql("SELECT SUM(LENGTH(c2)) FROM hiveOrcTable " + + "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() + } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 + Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz + + String with Nulls Scan (0.0%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 2394 / 2886 4.4 228.3 1.0X + Native ORC Vectorized 699 / 729 15.0 66.7 3.4X + Native ORC Vectorized with copy 959 / 1025 10.9 91.5 2.5X + Hive built-in ORC 3899 / 3901 2.7 371.9 0.6X + + String with Nulls Scan (0.5%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 2234 / 2255 4.7 213.1 1.0X + Native ORC Vectorized 854 / 869 12.3 81.4 2.6X + Native ORC Vectorized with copy 1099 / 1128 9.5 104.8 2.0X + Hive built-in ORC 2767 / 2793 3.8 263.9 0.8X + + String with Nulls Scan (0.95%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1166 / 1202 9.0 111.2 1.0X + Native ORC Vectorized 338 / 345 31.1 32.2 3.5X + Native ORC Vectorized with copy 418 / 428 25.1 39.9 2.8X + Hive built-in ORC 1730 / 1761 6.1 164.9 0.7X + */ + benchmark.run() + } + } + } + + def 
columnsBenchmark(values: Int, width: Int): Unit = { + val benchmark = new Benchmark(s"Single Column Scan from $width columns", values) + + withTempPath { dir => + withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { + import spark.implicits._ + val middle = width / 2 + val selectExpr = (1 to width).map(i => s"value as c$i") + spark.range(values).map(_ => Random.nextLong).toDF() + .selectExpr(selectExpr: _*).createOrReplaceTempView("t1") + + prepareTable(dir, spark.sql("SELECT * FROM t1")) + + benchmark.addCase("Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql(s"SELECT sum(c$middle) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Native ORC Vectorized") { _ => + spark.sql(s"SELECT sum(c$middle) FROM nativeOrcTable").collect() + } + + benchmark.addCase("Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql(s"SELECT sum(c$middle) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Hive built-in ORC") { _ => + spark.sql(s"SELECT sum(c$middle) FROM hiveOrcTable").collect() + } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 + Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz + + Single Column Scan from 100 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1050 / 1053 1.0 1001.1 1.0X + Native ORC Vectorized 95 / 101 11.0 90.9 11.0X + Native ORC Vectorized with copy 95 / 102 11.0 90.9 11.0X + Hive built-in ORC 348 / 358 3.0 331.8 3.0X + + Single Column Scan from 200 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 2099 / 2108 0.5 2002.1 1.0X + Native ORC Vectorized 179 / 187 5.8 171.1 11.7X + Native ORC Vectorized with copy 176 / 188 6.0 167.6 11.9X + Hive built-in ORC 562 / 581 1.9 535.9 3.7X + + Single Column Scan from 300 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 3221 / 3246 0.3 3071.4 1.0X + Native ORC Vectorized 312 / 322 3.4 298.0 10.3X + Native ORC Vectorized with copy 306 / 320 3.4 291.6 10.5X + Hive built-in ORC 815 / 824 1.3 777.3 4.0X + */ + benchmark.run() + } + } + } + + def main(args: Array[String]): Unit = { + Seq(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType).foreach { dataType => + numericScanBenchmark(1024 * 1024 * 15, dataType) + } + intStringScanBenchmark(1024 * 1024 * 10) + partitionTableScanBenchmark(1024 * 1024 * 15) + repeatedStringScanBenchmark(1024 * 1024 * 10) + for (fractionOfNulls <- List(0.0, 0.50, 0.95)) { + stringWithNullsScanBenchmark(1024 * 1024 * 10, fractionOfNulls) + } + columnsBenchmark(1024 * 1024 * 1, 100) + columnsBenchmark(1024 * 1024 * 1, 200) + columnsBenchmark(1024 * 1024 * 1, 300) + } +} +// scalastyle:on line.size.limit diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala index 49be30435ad2..27f398ebf301 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala @@ -28,6 +28,8 @@ import org.apache.spark.sql.types._ class 
JsonHadoopFsRelationSuite extends HadoopFsRelationTest { override val dataSourceName: String = "json" + private val badJson = "\u0000\u0000\u0000A\u0001AAA" + // JSON does not write data of NullType and does not play well with BinaryType. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: NullType => false @@ -105,4 +107,36 @@ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { ) } } + + test("invalid json with leading nulls - from file (multiLine=true)") { + import testImplicits._ + withTempDir { tempDir => + val path = tempDir.getAbsolutePath + Seq(badJson, """{"a":1}""").toDS().write.mode("overwrite").text(path) + val expected = s"""$badJson\n{"a":1}\n""" + val schema = new StructType().add("a", IntegerType).add("_corrupt_record", StringType) + val df = + spark.read.format(dataSourceName).option("multiLine", true).schema(schema).load(path) + checkAnswer(df, Row(null, expected)) + } + } + + test("invalid json with leading nulls - from file (multiLine=false)") { + import testImplicits._ + withTempDir { tempDir => + val path = tempDir.getAbsolutePath + Seq(badJson, """{"a":1}""").toDS().write.mode("overwrite").text(path) + val schema = new StructType().add("a", IntegerType).add("_corrupt_record", StringType) + val df = + spark.read.format(dataSourceName).option("multiLine", false).schema(schema).load(path) + checkAnswer(df, Seq(Row(1, null), Row(null, badJson))) + } + } + + test("invalid json with leading nulls - from dataset") { + import testImplicits._ + checkAnswer( + spark.read.json(Seq(badJson).toDS()), + Row(badJson)) + } } diff --git a/streaming/pom.xml b/streaming/pom.xml index fea882ad1123..6e8e034fc4de 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 37427e8da62d..9523085cf551 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.3.1-SNAPSHOT ../pom.xml
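
Reviewer note (not part of the patch): the JsonHadoopFsRelationSuite additions above exercise Spark's permissive JSON parsing, where records that fail to parse are captured in a user-declared `_corrupt_record` column instead of failing the read. Below is a minimal, self-contained sketch of that pattern, assuming a local SparkSession; the object name, temp-path handling, and the local[1] master are illustrative choices, while the reader options and schema API are the standard Spark SQL ones the tests use.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}

object CorruptJsonSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("CorruptJsonSketch").getOrCreate()
    import spark.implicits._

    // Same malformed record the suite uses: leading NUL bytes make the line unparseable as JSON.
    val badJson = "\u0000\u0000\u0000A\u0001AAA"
    val path = java.nio.file.Files.createTempDirectory("corrupt-json").toString
    Seq(badJson, """{"a":1}""").toDS().write.mode("overwrite").text(path)

    // Declaring _corrupt_record in the schema routes unparseable input into that column.
    val schema = new StructType().add("a", IntegerType).add("_corrupt_record", StringType)

    // multiLine = false: each line is parsed independently, so only the bad line is quarantined
    // (one row with a = 1, one row whose _corrupt_record holds badJson).
    spark.read.schema(schema).option("multiLine", false).json(path).show(truncate = false)

    spark.stop()
  }
}

With multiLine = true the reader treats the whole file as a single JSON document, which is why the corresponding test expects a single row whose _corrupt_record column contains the entire file content.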