Changes from all commits
194 commits
b632e77
[SPARK-25317][CORE] Avoid perf regression in Murmur3 Hash on UTF8String
mgaido91 Sep 6, 2018
085f731
[SPARK-25268][GRAPHX] run Parallel Personalized PageRank throws seria…
shahidki31 Sep 6, 2018
f2d5022
[SPARK-25328][PYTHON] Add an example for having two columns as the gr…
HyukjinKwon Sep 6, 2018
3682d29
[SPARK-25072][PYSPARK] Forbid extra value for custom Row
Sep 6, 2018
a7cfe51
[SPARK-25108][SQL] Fix the show method to display the wide character …
xuejianbest Sep 6, 2018
ff832be
[SPARK-25208][SQL][FOLLOW-UP] Reduce code size.
ueshin Sep 7, 2018
24a3261
[SPARK-25330][BUILD][BRANCH-2.3] Revert Hadoop 2.7 to 2.7.3
wangyum Sep 7, 2018
3644c84
[SPARK-22357][CORE][FOLLOWUP] SparkContext.binaryFiles ignore minPart…
srowen Sep 7, 2018
f9b476c
[SPARK-25237][SQL] Remove updateBytesReadWithFileSize in FileScanRDD
Sep 7, 2018
872bad1
[SPARK-25267][SQL][TEST] Disable ConvertToLocalRelation in the test c…
dilipbiswal Sep 7, 2018
95a48b9
[SPARK-21786][SQL][FOLLOWUP] Add compressionCodec test for CTAS
fjh100456 Sep 7, 2018
80567fa
[MINOR][SS] Fix kafka-0-10-sql trivials
dongjinleekr Sep 7, 2018
904192a
[SPARK-25345][ML] Deprecate public APIs from ImageSchema
WeichenXu123 Sep 8, 2018
8f7d8a0
[SPARK-25375][SQL][TEST] Reenable qualified perm. function checks in …
dongjoon-hyun Sep 8, 2018
a00a160
Revert [SPARK-10399] [SPARK-23879] [SPARK-23762] [SPARK-25317]
gatorsmile Sep 9, 2018
6b7ea78
[MINOR][ML] Remove `BisectingKMeansModel.setDistanceMeasure` method
WeichenXu123 Sep 9, 2018
c1c1bda
[SPARK-25368][SQL] Incorrect predicate pushdown returns wrong result
wangyum Sep 9, 2018
0782dfa
[SPARK-25175][SQL] Field resolution should fail if there is ambiguity…
seancxmao Sep 10, 2018
c9ca359
[SPARK-25313][SQL][FOLLOW-UP] Fix InsertIntoHiveDirCommand output sch…
wangyum Sep 10, 2018
67bc7ef
[SPARK-24849][SPARK-24911][SQL][FOLLOW-UP] Converting a value of Stru…
gatorsmile Sep 10, 2018
5d98c31
[SPARK-25278][SQL] Avoid duplicated Exec nodes when the same logical …
mgaido91 Sep 10, 2018
ffd036a
[SPARK-23672][PYTHON] Document support for nested return types in sca…
holdenk Sep 10, 2018
fb4965a
[SPARK-25371][SQL] struct() should allow being called with 0 args
mgaido91 Sep 11, 2018
b7efca7
[SPARK-17916][SPARK-25241][SQL][FOLLOW-UP] Fix empty string being par…
mmolimar Sep 11, 2018
0b8bfbe
[SPARK-25389][SQL] INSERT OVERWRITE DIRECTORY STORED AS should preven…
dongjoon-hyun Sep 11, 2018
4414e02
[SPARK-25221][DEPLOY] Consistent trailing whitespace treatment of con…
gerashegalov Sep 11, 2018
16127e8
[SPARK-24889][CORE] Update block info when unpersist rdds
viirya Sep 11, 2018
99b37a9
[SPARK-25398] Minor bugs from comparing unrelated types
srowen Sep 11, 2018
3a6ef8b
Revert "[SPARK-23820][CORE] Enable use of long form of callsite in logs"
srowen Sep 11, 2018
0dbf145
[SPARK-25399][SS] Continuous processing state should not affect micro…
mukulmurthy Sep 11, 2018
40e4db0
[SPARK-25402][SQL] Null handling in BooleanSimplification
gatorsmile Sep 12, 2018
071babb
[SPARK-25352][SQL] Perform ordered global limit when limit number is …
viirya Sep 12, 2018
4c1428f
[SPARK-25363][SQL] Fix schema pruning in where clause by ignoring unn…
viirya Sep 12, 2018
15d2e9d
[SPARK-24882][SQL] Revert [] improve data source v2 API from branch 2.4
cloud-fan Sep 12, 2018
71f7013
[SPARK-23820][CORE] Enable use of long form of callsite in logs
michaelmior Sep 13, 2018
776dc42
[SPARK-25387][SQL] Fix for NPE caused by bad CSV input
MaxGekk Sep 13, 2018
6f4d647
[SPARK-25357][SQL] Add metadata to SparkPlanInfo to dump more informa…
LantaoJin Sep 13, 2018
ae5c7bb
[SPARK-25238][PYTHON] lint-python: Fix W605 warnings for pycodestyle 2.4
srowen Sep 13, 2018
abb5196
[SPARK-25295][K8S] Fix executor names collision
Sep 13, 2018
e7f511a
[SPARK-25352][SQL][FOLLOWUP] Add helper method and address style issue
viirya Sep 13, 2018
cc19f42
[SPARK-25170][DOC] Add list and short description of Spark Executor T…
LucaCanali Sep 13, 2018
35a84ba
[SPARK-25406][SQL] For ParquetSchemaPruningSuite.scala, move calls to…
mallman Sep 13, 2018
9273be0
[SPARK-25400][CORE][TEST] Increase test timeouts
squito Sep 13, 2018
1220ab8
Preparing Spark release v2.4.0-rc1
cloud-fan Sep 14, 2018
8cdf7f4
Preparing development version 2.4.1-SNAPSHOT
cloud-fan Sep 14, 2018
59054fa
[SPARK-25431][SQL][EXAMPLES] Fix function examples and unify the form…
ueshin Sep 14, 2018
d3f5475
[SPARK-25238][PYTHON] lint-python: Upgrade pycodestyle to v2.4.0
Sep 15, 2018
ae2ca0e
Revert "[SPARK-25431][SQL][EXAMPLES] Fix function examples and unify …
ueshin Sep 15, 2018
b40e5fe
[SPARK-25438][SQL][TEST] Fix FilterPushdownBenchmark to use the same …
dongjoon-hyun Sep 16, 2018
b839721
[SPARK-25439][TESTS][SQL] Fixes TPCHQuerySuite datatype of customer.c…
npoggi Sep 16, 2018
60af706
[SPARK-24418][FOLLOWUP][DOC] Update docs to show Scala 2.11.12
dongjoon-hyun Sep 16, 2018
1cb1e43
[MINOR][DOCS] Axe deprecated doc refs
Sep 16, 2018
fb1539a
[SPARK-22713][CORE][TEST][FOLLOWUP] Fix flaky ExternalAppendOnlyMapSu…
dongjoon-hyun Sep 17, 2018
e368efc
[SPARK-24685][BUILD][FOLLOWUP] Fix the nonexist profile name in relea…
jerryshao Sep 17, 2018
43c9b10
[SPARK-23425][SQL][FOLLOWUP] Support wildcards in HDFS path for loadt…
sujith71955 Sep 17, 2018
56f7068
[SPARK-25427][SQL][TEST] Add BloomFilter creation test cases
dongjoon-hyun Sep 17, 2018
d05596e
[SPARK-25431][SQL][EXAMPLES] Fix function examples and the example re…
ueshin Sep 17, 2018
08f7b14
[SPARK-24654][BUILD][FOLLOWUP] Update, fix LICENSE and NOTICE, and sp…
srowen Sep 17, 2018
963af13
[PYSPARK] Updates to pyspark broadcast
squito Aug 14, 2018
80e317b
[PYSPARK][SQL] Updates to RowQueue
squito Sep 6, 2018
7beb341
[CORE] Updates to remote cache reads
squito Aug 22, 2018
c403751
[SPARK-25443][BUILD] fix issues when building docs with release scrip…
cloud-fan Sep 18, 2018
ffd448b
[SPARK-24151][SQL] Case insensitive resolution of CURRENT_DATE and CU…
jamesthomp Sep 18, 2018
cc3fbea
[SPARK-25445][BUILD] the release script should be able to publish a s…
cloud-fan Sep 18, 2018
8a2992e
[SPARK-19550][DOC][FOLLOW-UP] Update tuning.md to use JDK8
wangyum Sep 18, 2018
67f2cb6
[SPARK-25291][K8S] Fixing Flakiness of Executor Pod tests
ifilonenko Sep 18, 2018
76514a0
[SPARK-25456][SQL][TEST] Fix PythonForeachWriterSuite
squito Sep 18, 2018
ba8560a
[SPARK-23200] Reset Kubernetes-specific config on Checkpoint restore
ssaavedra Sep 19, 2018
00ede12
[SPARK-23173][SQL] rename spark.sql.fromJsonForceNullableSchema
rxin Sep 19, 2018
f11f445
[SPARK-24626] Add statistics prefix to parallelFileListingInStatsComp…
rxin Sep 19, 2018
538ae62
[SPARK-25445][BUILD][FOLLOWUP] Resolve issues in release-build.sh for…
gengliangwang Sep 19, 2018
9fefb47
Revert "[SPARK-23173][SQL] rename spark.sql.fromJsonForceNullableSchema"
dongjoon-hyun Sep 19, 2018
83a75a8
[SPARK-22666][ML][FOLLOW-UP] Improve testcase to tolerate different s…
WeichenXu123 Sep 19, 2018
9031c78
[SPARK-25021][K8S][BACKPORT] Add spark.executor.pyspark.memory limit …
ifilonenko Sep 19, 2018
a9a8d3a
[SPARK-25425][SQL][BACKPORT-2.4] Extra options should override sessio…
MaxGekk Sep 19, 2018
99ae693
[SPARK-25471][PYTHON][TEST] Fix pyspark-sql test error when using Pyt…
BryanCutler Sep 20, 2018
535bf1c
[SPARK-24157][SS][FOLLOWUP] Rename to spark.sql.streaming.noDataMicro…
rxin Sep 20, 2018
06efed2
[SPARK-24341][FOLLOWUP][DOCS] Add migration note for IN subqueries be…
mgaido91 Sep 20, 2018
dfcff38
[SPARK-4502][SQL] Rename to spark.sql.optimizer.nestedSchemaPruning.e…
rxin Sep 20, 2018
e07042a
[MINOR][PYTHON][TEST] Use collect() instead of show() to make the out…
HyukjinKwon Sep 20, 2018
b3bdfd7
Revert [SPARK-19355][SPARK-25352]
viirya Sep 20, 2018
78dd1d8
[SPARK-25417][SQL] ArrayContains function may return incorrect result…
dilipbiswal Sep 20, 2018
fc03672
[MINOR][PYTHON] Use a helper in `PythonUtils` instead of direct acces…
HyukjinKwon Sep 20, 2018
c67c597
[SPARK-25450][SQL] PushProjectThroughUnion rule uses the same exprId …
maryannxue Sep 20, 2018
43c62e7
[SPARK-24918][CORE] Executor Plugin API
NiharS Sep 20, 2018
51f3659
[SPARK-24777][SQL] Add write benchmark for AVRO
gengliangwang Sep 21, 2018
5d74449
Revert "[SPARK-23715][SQL] the input of to/from_utc_timestamp can not…
gatorsmile Sep 21, 2018
aff6aed
[SPARK-25384][SQL] Clarify fromJsonForceNullableSchema will be remove…
rxin Sep 21, 2018
e425462
[SPARK-23549][SQL] Rename config spark.sql.legacy.compareDateTimestam…
rxin Sep 21, 2018
604828e
[SPARK-25469][SQL] Eval methods of Concat, Reverse and ElementAt shou…
mn-mikke Sep 21, 2018
ce66361
[SPARK-19724][SQL] allowCreatingManagedTableUsingNonemptyLocation sho…
rxin Sep 21, 2018
138a631
[SPARK-25321][ML] Revert SPARK-14681 to avoid API breaking change
WeichenXu123 Sep 21, 2018
1303eb5
[SPARK-25321][ML] Fix local LDA model constructor
WeichenXu123 Sep 21, 2018
c64e750
[MINOR][PYSPARK] Always Close the tempFile in _serialize_to_jvm
gatorsmile Sep 23, 2018
36e7c8f
[SPARKR] Match pyspark features in SparkR communication protocol
HyukjinKwon Sep 24, 2018
13bc58d
[SPARK-21318][SQL] Improve exception message thrown by `lookupFunction`
stanzhai Sep 24, 2018
51d5378
[SPARK-25416][SQL] ArrayPosition function may return incorrect result…
dilipbiswal Sep 24, 2018
ec38428
[SPARK-25460][BRANCH-2.4][SS] DataSourceV2: SS sources do not respect…
HyukjinKwon Sep 24, 2018
ffc081c
[SPARK-25502][CORE][WEBUI] Empty Page when page number exceeds the re…
shahidki31 Sep 24, 2018
e4c03e8
[SPARK-25503][CORE][WEBUI] Total task message in stage page is ambiguous
shahidki31 Sep 25, 2018
4ca4ef7
[SPARK-25519][SQL] ArrayRemove function may return incorrect result w…
dilipbiswal Sep 25, 2018
a709718
[SPARK-23907][SQL] Revert regr_* functions entirely
rxin Sep 25, 2018
544f86a
[SPARK-25495][SS] FetchedData.reset should reset all fields
zsxwing Sep 25, 2018
f91247f
[SPARK-25422][CORE] Don't memory map blocks streamed to disk.
squito Sep 26, 2018
3f20305
[SPARK-24324][PYTHON][FOLLOW-UP] Rename the Conf to spark.sql.legacy.…
gatorsmile Sep 26, 2018
d44b863
[SPARK-20937][DOCS] Describe spark.sql.parquet.writeLegacyFormat prop…
seancxmao Sep 26, 2018
9969827
[SPARK-25509][CORE] Windows doesn't support POSIX permissions
Sep 26, 2018
dc60476
[SPARK-25318] Add exception handling when wrapping the input stream d…
Sep 26, 2018
8d17200
[SPARK-24519][CORE] Compute SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS …
rxin Sep 26, 2018
2ff91f2
[SPARK-25454][SQL] add a new config for picking minimum precision for…
cloud-fan Sep 27, 2018
7656358
[SPARK-25540][SQL][PYSPARK] Make HiveContext in PySpark behave as the…
ueshin Sep 27, 2018
f12769e
[SPARK-25536][CORE] metric value for METRIC_OUTPUT_RECORDS_WRITTEN is…
shahidki31 Sep 27, 2018
01c000b
Revert "[SPARK-25540][SQL][PYSPARK] Make HiveContext in PySpark behav…
HyukjinKwon Sep 27, 2018
0cf4c5b
[SPARK-25468][WEBUI] Highlight current page index in the spark UI
Sep 27, 2018
0b4e581
[SPARK-23715][SQL][DOC] improve document for from/to_utc_timestamp
cloud-fan Sep 27, 2018
53eb858
[SPARK-25314][SQL] Fix Python UDF accessing attributes from both side…
xuanyuanking Sep 27, 2018
3c78ea2
[SPARK-25522][SQL] Improve type promotion for input arguments of elem…
dilipbiswal Sep 27, 2018
42f25f3
Preparing Spark release v2.4.0-rc2
cloud-fan Sep 27, 2018
659ecb5
Preparing development version 2.4.1-SNAPSHOT
cloud-fan Sep 27, 2018
0256f8a
[SPARK-25546][CORE] Don't cache value of EVENT_LOG_CALLSITE_LONG_FORM.
Sep 27, 2018
a43a082
[SPARK-25533][CORE][WEBUI] AppSummary should hold the information abo…
shahidki31 Sep 26, 2018
b2a1e2f
[SPARK-25505][SQL] The output order of grouping columns in Pivot is d…
maryannxue Sep 28, 2018
81391c2
[SPARK-23285][DOC][FOLLOWUP] Fix missing markup tag
dongjoon-hyun Sep 28, 2018
7614313
[SPARK-25542][CORE][TEST] Move flaky test in OpenHashMapSuite to Open…
viirya Sep 28, 2018
ec2c17a
[SPARK-25570][SQL][TEST] Replace 2.3.1 with 2.3.2 in HiveExternalCata…
dongjoon-hyun Sep 29, 2018
a14306b
[SPARK-25262][DOC][FOLLOWUP] Fix link tags in html table
viirya Sep 29, 2018
fef3027
[SPARK-25572][SPARKR] test only if not cran
felixcheung Sep 29, 2018
6f510c6
[SPARK-25568][CORE] Continue to update the remaining accumulators whe…
zsxwing Sep 30, 2018
8e6fb47
[CORE][MINOR] Fix obvious error and compiling for Scala 2.12.7
da-liii Sep 30, 2018
c886f05
[SPARK-25543][K8S] Print debug message iff execIdsRemovedInThisRound …
ScrapCodes Sep 30, 2018
7b1094b
[SPARK-25505][SQL][FOLLOWUP] Fix for attributes cosmetically differen…
mgaido91 Oct 1, 2018
82990e5
[SPARK-25453][SQL][TEST][.FFFFFFFFF] OracleIntegrationSuite IllegalAr…
seancxmao Oct 1, 2018
426c2bd
[SPARK-23401][PYTHON][TESTS] Add more data types for PandasUDFTests
alex7c4 Oct 1, 2018
ad7b3f6
[SPARK-25578][BUILD] Update to Scala 2.12.7
srowen Oct 2, 2018
ea4068a
[SPARK-25583][DOC] Add history-server related configuration in the do…
shahidki31 Oct 2, 2018
443d12d
[SPARK-25538][SQL] Zero-out all bytes when writing decimal
mgaido91 Oct 3, 2018
0763b75
[SPARK-25601][PYTHON] Register Grouped aggregate UDF Vectorized UDFs …
HyukjinKwon Oct 4, 2018
c9bb83a
[SPARK-25602][SQL] SparkPlan.getByteArrayRdd should not consume the i…
cloud-fan Oct 4, 2018
2c700ee
[SPARK-25521][SQL] Job id showing null in the logs when insert into c…
sujith71955 Oct 5, 2018
0a70afd
[SPARK-25644][SS] Fix java foreachBatch in DataStreamWriter
zsxwing Oct 5, 2018
a2991d2
[SPARK-25646][K8S] Fix docker-image-tool.sh on dev build.
Oct 6, 2018
48e2e6f
[SPARK-25644][SS][FOLLOWUP][BUILD] Fix Scala 2.12 build error due to …
dongjoon-hyun Oct 6, 2018
c8b9409
[SPARK-25671] Build external/spark-ganglia-lgpl in Jenkins Test
gatorsmile Oct 6, 2018
4214ddd
[SPARK-25673][BUILD] Remove Travis CI which enables Java lint check
HyukjinKwon Oct 8, 2018
692ddb3
[SPARK-25591][PYSPARK][SQL] Avoid overwriting deserialized accumulator
viirya Oct 8, 2018
193ce77
[SPARK-25677][DOC] spark.io.compression.codec = org.apache.spark.io.Z…
shivusondur Oct 8, 2018
4baa4d4
[SPARK-25639][DOCS] Added docs for foreachBatch, python foreach and m…
tdas Oct 8, 2018
404c840
[SPARK-25669][SQL] Check CSV header only when it exists
MaxGekk Oct 9, 2018
8e4a99b
Preparing Spark release v2.4.0-rc3
cloud-fan Oct 10, 2018
71b8739
Preparing development version 2.4.1-SNAPSHOT
cloud-fan Oct 10, 2018
cd40655
[SPARK-25636][CORE] spark-submit cuts off the failure reason when the…
Oct 10, 2018
e80ab13
[SPARK-25674][SQL] If the records are incremented by more than 1 at a…
10110346 Oct 11, 2018
1961f8e
[SPARK-25690][SQL] Analyzer rule HandleNullInputsForUDF does not stab…
maryannxue Oct 12, 2018
3dba5d4
[SPARK-25708][SQL] HAVING without GROUP BY means global aggregate
cloud-fan Oct 12, 2018
bb211cf
[SPARK-25697][CORE] When zstd compression enabled, InProgress applica…
shahidki31 Oct 12, 2018
1a33544
[SPARK-25660][SQL] Fix for the backward slash as CSV fields delimiter
MaxGekk Oct 12, 2018
0f58b98
[STREAMING][DOC] Fix typo & formatting for JavaDoc
mastloui-msft Oct 12, 2018
5554a33
[SPARK-25714] Fix Null Handling in the Optimizer rule BooleanSimplifi…
gatorsmile Oct 13, 2018
765cbca
[MINOR] Fix code comment in BooleanSimplification.
gatorsmile Oct 13, 2018
6634819
[SPARK-25718][SQL] Detect recursive reference in Avro schema and thro…
gengliangwang Oct 13, 2018
c4efcf1
[SPARK-25714][SQL][FOLLOWUP] improve the comment inside BooleanSimpli…
cloud-fan Oct 13, 2018
883ca3f
[SPARK-25726][SQL][TEST] Fix flaky test in SaveIntoDataSourceCommandS…
dongjoon-hyun Oct 14, 2018
3e776d7
[SPARK-25727][SQL] Add outputOrdering to otherCopyArgs in InMemoryRel…
gatorsmile Oct 14, 2018
b6e4aca
[SPARK-25700][SQL][BRANCH-2.4] Partially revert append mode support i…
HyukjinKwon Oct 15, 2018
d64b355
[SPARK-25738][SQL] Fix LOAD DATA INPATH for hdfs port
squito Oct 16, 2018
8bc7ab0
[SPARK-25674][FOLLOW-UP] Update the stats for each ColumnarBatch
gatorsmile Oct 16, 2018
77156f8
[SPARK-25736][SQL][TEST] add tests to verify the behavior of multi-co…
cloud-fan Oct 16, 2018
144cb94
[SPARK-25579][SQL] Use quoted attribute names if needed in pushed ORC…
dongjoon-hyun Oct 16, 2018
3591bd2
[SQL][CATALYST][MINOR] update some error comments
Oct 17, 2018
362103b
[SPARK-25754][DOC] Change CDN for MathJax
gengliangwang Oct 17, 2018
b698bd4
[SPARK-21402][SQL] Fix java array of structs deserialization
vofque Oct 17, 2018
ac9a6f0
[SPARK-25741][WEBUI] Long URLs are not rendered properly in web UI
gengliangwang Oct 17, 2018
71a6a9c
[SPARK-25758][ML] Deprecate computeCost on BisectingKMeans
mgaido91 Oct 18, 2018
7153551
[SPARK-24499][SQL][DOC] Split the page of sql-programming-guide.html …
xuanyuanking Oct 18, 2018
fd5b247
[SPARK-24499][DOC][FOLLOW-UP] Split the page of sql-programming-guide…
gatorsmile Oct 18, 2018
36307b1
[SPARK-25764][ML][EXAMPLES] Update BisectingKMeans example to use Clu…
mgaido91 Oct 19, 2018
9ed2e42
[MINOR][DOC] Spacing items in migration guide for readability and con…
HyukjinKwon Oct 19, 2018
df60d9f
[SPARK-25044][FOLLOW-UP] Change ScalaUDF constructor signature
maryannxue Oct 19, 2018
6a06b8c
[SPARK-25768][SQL] fix constant argument expecting UDAFs
peter-toth Oct 19, 2018
8926c4a
fix security issue of zinc
cloud-fan Oct 19, 2018
1ff8dd4
Preparing Spark release v2.4.0-rc4
cloud-fan Oct 19, 2018
9c0c6d4
Preparing development version 2.4.1-SNAPSHOT
cloud-fan Oct 19, 2018
1001d23
[SPARK-25704][CORE] Allocate a bit less than Int.MaxValue
squito Oct 19, 2018
432697c
Revert "[SPARK-25758][ML] Deprecate computeCost on BisectingKMeans"
gatorsmile Oct 19, 2018
e3a60b0
Revert "[SPARK-25764][ML][EXAMPLES] Update BisectingKMeans example to…
cloud-fan Oct 20, 2018
d6a02c5
[SPARK-24499][SQL][DOC][FOLLOWUP] Fix some broken links
dilipbiswal Oct 20, 2018
869242c
[MINOR][DOC] Update the building doc to use Maven 3.5.4 and Java 8 only
dongjoon-hyun Oct 20, 2018
0239277
[DOC][MINOR] Fix minor error in the code of graphx guide
WeichenXu123 Oct 20, 2018
c21d7e1
fix security issue of zinc(simplier version)
cloud-fan Oct 19, 2018
e69e2bf
Preparing Spark release v2.4.0-rc4
cloud-fan Oct 22, 2018
f33d888
Preparing development version 2.4.1-SNAPSHOT
cloud-fan Oct 22, 2018
b9b594a
[SPARK-25795][R][EXAMPLE] Fix CSV SparkR SQL Example
dongjoon-hyun Oct 22, 2018
4099565
[SPARK-24499][SQL][DOC][FOLLOW-UP] Fix spelling in doc
kiszk Oct 23, 2018
d5e6948
[SPARK-25805][SQL][TEST] Fix test for SPARK-25159
squito Oct 23, 2018
50 changes: 0 additions & 50 deletions .travis.yml

This file was deleted.

2 changes: 1 addition & 1 deletion R/pkg/DESCRIPTION
@@ -1,6 +1,6 @@
Package: SparkR
Type: Package
Version: 2.4.0
Version: 2.4.1
Title: R Frontend for Apache Spark
Description: Provides an R Frontend for Apache Spark.
Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
1 change: 0 additions & 1 deletion R/pkg/R/DataFrame.R
@@ -503,7 +503,6 @@ setMethod("createOrReplaceTempView",
#' @param x A SparkDataFrame
#' @param tableName A character vector containing the name of the table
#'
#' @family SparkDataFrame functions
#' @seealso \link{createOrReplaceTempView}
#' @rdname registerTempTable-deprecated
#' @name registerTempTable
1 change: 0 additions & 1 deletion R/pkg/R/catalog.R
@@ -69,7 +69,6 @@ createExternalTable <- function(x, ...) {
#' @param ... additional named parameters as options for the data source.
#' @return A SparkDataFrame.
#' @rdname createTable
#' @seealso \link{createExternalTable}
#' @examples
#'\dontrun{
#' sparkR.session()
43 changes: 31 additions & 12 deletions R/pkg/R/context.R
@@ -167,18 +167,30 @@ parallelize <- function(sc, coll, numSlices = 1) {
# 2-tuples of raws
serializedSlices <- lapply(slices, serialize, connection = NULL)

# The PRC backend cannot handle arguments larger than 2GB (INT_MAX)
# The RPC backend cannot handle arguments larger than 2GB (INT_MAX)
# If serialized data is safely less than that threshold we send it over the PRC channel.
# Otherwise, we write it to a file and send the file name
if (objectSize < sizeLimit) {
jrdd <- callJStatic("org.apache.spark.api.r.RRDD", "createRDDFromArray", sc, serializedSlices)
} else {
fileName <- writeToTempFile(serializedSlices)
jrdd <- tryCatch(callJStatic(
"org.apache.spark.api.r.RRDD", "createRDDFromFile", sc, fileName, as.integer(numSlices)),
finally = {
file.remove(fileName)
})
if (callJStatic("org.apache.spark.api.r.RUtils", "getEncryptionEnabled", sc)) {
# the length of slices here is the parallelism to use in the jvm's sc.parallelize()
parallelism <- as.integer(numSlices)
jserver <- newJObject("org.apache.spark.api.r.RParallelizeServer", sc, parallelism)
authSecret <- callJMethod(jserver, "secret")
port <- callJMethod(jserver, "port")
conn <- socketConnection(port = port, blocking = TRUE, open = "wb", timeout = 1500)
doServerAuth(conn, authSecret)
writeToConnection(serializedSlices, conn)
jrdd <- callJMethod(jserver, "getResult")
} else {
fileName <- writeToTempFile(serializedSlices)
jrdd <- tryCatch(callJStatic(
"org.apache.spark.api.r.RRDD", "createRDDFromFile", sc, fileName, as.integer(numSlices)),
finally = {
file.remove(fileName)
})
}
}

RDD(jrdd, "byte")
@@ -194,14 +206,21 @@ getMaxAllocationLimit <- function(sc) {
))
}

writeToConnection <- function(serializedSlices, conn) {
tryCatch({
for (slice in serializedSlices) {
writeBin(as.integer(length(slice)), conn, endian = "big")
writeBin(slice, conn, endian = "big")
}
}, finally = {
close(conn)
})
}

writeToTempFile <- function(serializedSlices) {
fileName <- tempfile()
conn <- file(fileName, "wb")
for (slice in serializedSlices) {
writeBin(as.integer(length(slice)), conn, endian = "big")
writeBin(slice, conn, endian = "big")
}
close(conn)
writeToConnection(serializedSlices, conn)
fileName
}

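For clarity, the new writeToConnection() helper above length-prefixes every serialized slice with a 4-byte big-endian integer followed by the raw bytes. A minimal, purely illustrative R sketch of a matching reader (the real consumer is the JVM-side RParallelizeServer; readSlices is a hypothetical name):

```r
# Hypothetical reader for the framing produced by writeToConnection():
# <4-byte big-endian length><slice bytes>, repeated until end of stream.
readSlices <- function(conn) {
  slices <- list()
  repeat {
    len <- readBin(conn, what = "integer", n = 1, size = 4, endian = "big")
    if (length(len) == 0) break                       # end of stream
    slices[[length(slices) + 1]] <- readBin(conn, what = "raw", n = len)
  }
  slices
}
```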
26 changes: 20 additions & 6 deletions R/pkg/R/functions.R
@@ -2203,9 +2203,16 @@ setMethod("from_json", signature(x = "Column", schema = "characterOrstructType")
})

#' @details
#' \code{from_utc_timestamp}: Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a
#' time in UTC, and renders that time as a timestamp in the given time zone. For example, 'GMT+1'
#' would yield '2017-07-14 03:40:00.0'.
#' \code{from_utc_timestamp}: This is a common function for databases supporting TIMESTAMP WITHOUT
#' TIMEZONE. This function takes a timestamp which is timezone-agnostic, and interprets it as a
#' timestamp in UTC, and renders that timestamp as a timestamp in the given time zone.
#' However, timestamp in Spark represents number of microseconds from the Unix epoch, which is not
#' timezone-agnostic. So in Spark this function just shift the timestamp value from UTC timezone to
#' the given timezone.
#' This function may return confusing result if the input is a string with timezone, e.g.
#' (\code{2018-03-13T06:18:23+00:00}). The reason is that, Spark firstly cast the string to
#' timestamp according to the timezone in the string, and finally display the result by converting
#' the timestamp to string according to the session local timezone.
#'
#' @rdname column_datetime_diff_functions
#'
@@ -2261,9 +2268,16 @@ setMethod("next_day", signature(y = "Column", x = "character"),
})

#' @details
#' \code{to_utc_timestamp}: Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a
#' time in the given time zone, and renders that time as a timestamp in UTC. For example, 'GMT+1'
#' would yield '2017-07-14 01:40:00.0'.
#' \code{to_utc_timestamp}: This is a common function for databases supporting TIMESTAMP WITHOUT
#' TIMEZONE. This function takes a timestamp which is timezone-agnostic, and interprets it as a
#' timestamp in the given timezone, and renders that timestamp as a timestamp in UTC.
#' However, timestamp in Spark represents number of microseconds from the Unix epoch, which is not
#' timezone-agnostic. So in Spark this function just shift the timestamp value from the given
#' timezone to UTC timezone.
#' This function may return confusing result if the input is a string with timezone, e.g.
#' (\code{2018-03-13T06:18:23+00:00}). The reason is that, Spark firstly cast the string to
#' timestamp according to the timezone in the string, and finally display the result by converting
#' the timestamp to string according to the session local timezone.
#'
#' @rdname column_datetime_diff_functions
#' @aliases to_utc_timestamp to_utc_timestamp,Column,character-method
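A hedged SparkR sketch of the shift semantics described in the updated documentation above, assuming an active SparkR session whose local time zone is UTC (the rendered strings depend on the session time zone, which is exactly the caveat the new docs add); the expected values come from the examples in the replaced doc text:

```r
# Illustrative only: a timestamp string without a time zone suffix.
df <- createDataFrame(data.frame(t = "2017-07-14 02:40:00", stringsAsFactors = FALSE))
head(select(df,
            from_utc_timestamp(df$t, "GMT+1"),   # expected: 2017-07-14 03:40:00
            to_utc_timestamp(df$t, "GMT+1")))    # expected: 2017-07-14 01:40:00
```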
32 changes: 32 additions & 0 deletions R/pkg/tests/fulltests/test_Serde.R
@@ -124,3 +124,35 @@ test_that("SerDe of list of lists", {
})

sparkR.session.stop()

# Note that this test should be at the end of tests since the configruations used here are not
# specific to sessions, and the Spark context is restarted.
test_that("createDataFrame large objects", {
for (encryptionEnabled in list("true", "false")) {
# To simulate a large object scenario, we set spark.r.maxAllocationLimit to a smaller value
conf <- list(spark.r.maxAllocationLimit = "100",
spark.io.encryption.enabled = encryptionEnabled)

suppressWarnings(sparkR.session(master = sparkRTestMaster,
sparkConfig = conf,
enableHiveSupport = FALSE))

sc <- getSparkContext()
actual <- callJStatic("org.apache.spark.api.r.RUtils", "getEncryptionEnabled", sc)
expected <- as.logical(encryptionEnabled)
expect_equal(actual, expected)

tryCatch({
# suppress warnings from dot in the field names. See also SPARK-21536.
df <- suppressWarnings(createDataFrame(iris, numPartitions = 3))
expect_equal(getNumPartitions(df), 3)
expect_equal(dim(df), dim(iris))

df <- createDataFrame(cars, numPartitions = 3)
expect_equal(collect(df), cars)
},
finally = {
sparkR.stop()
})
}
})
12 changes: 0 additions & 12 deletions R/pkg/tests/fulltests/test_sparkSQL.R
@@ -316,18 +316,6 @@ test_that("create DataFrame from RDD", {
unsetHiveContext()
})

test_that("createDataFrame uses files for large objects", {
# To simulate a large file scenario, we set spark.r.maxAllocationLimit to a smaller value
conf <- callJMethod(sparkSession, "conf")
callJMethod(conf, "set", "spark.r.maxAllocationLimit", "100")
df <- suppressWarnings(createDataFrame(iris, numPartitions = 3))
expect_equal(getNumPartitions(df), 3)

# Resetting the conf back to default value
callJMethod(conf, "set", "spark.r.maxAllocationLimit", toString(.Machine$integer.max / 10))
expect_equal(dim(df), dim(iris))
})

test_that("read/write csv as DataFrame", {
if (windows_with_hadoop()) {
csvPath <- tempfile(pattern = "sparkr-test", fileext = ".csv")
83 changes: 44 additions & 39 deletions R/pkg/tests/run-all.R
@@ -18,50 +18,55 @@
library(testthat)
library(SparkR)

# Turn all warnings into errors
options("warn" = 2)
# SPARK-25572
if (identical(Sys.getenv("NOT_CRAN"), "true")) {

if (.Platform$OS.type == "windows") {
Sys.setenv(TZ = "GMT")
}
# Turn all warnings into errors
options("warn" = 2)

# Setup global test environment
# Install Spark first to set SPARK_HOME
if (.Platform$OS.type == "windows") {
Sys.setenv(TZ = "GMT")
}

# NOTE(shivaram): We set overwrite to handle any old tar.gz files or directories left behind on
# CRAN machines. For Jenkins we should already have SPARK_HOME set.
install.spark(overwrite = TRUE)
# Setup global test environment
# Install Spark first to set SPARK_HOME

sparkRDir <- file.path(Sys.getenv("SPARK_HOME"), "R")
sparkRWhitelistSQLDirs <- c("spark-warehouse", "metastore_db")
invisible(lapply(sparkRWhitelistSQLDirs,
function(x) { unlink(file.path(sparkRDir, x), recursive = TRUE, force = TRUE)}))
sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE)
# NOTE(shivaram): We set overwrite to handle any old tar.gz files or directories left behind on
# CRAN machines. For Jenkins we should already have SPARK_HOME set.
install.spark(overwrite = TRUE)

sparkRTestMaster <- "local[1]"
sparkRTestConfig <- list()
if (identical(Sys.getenv("NOT_CRAN"), "true")) {
sparkRTestMaster <- ""
} else {
# Disable hsperfdata on CRAN
old_java_opt <- Sys.getenv("_JAVA_OPTIONS")
Sys.setenv("_JAVA_OPTIONS" = paste("-XX:-UsePerfData", old_java_opt))
tmpDir <- tempdir()
tmpArg <- paste0("-Djava.io.tmpdir=", tmpDir)
sparkRTestConfig <- list(spark.driver.extraJavaOptions = tmpArg,
spark.executor.extraJavaOptions = tmpArg)
}
sparkRDir <- file.path(Sys.getenv("SPARK_HOME"), "R")
sparkRWhitelistSQLDirs <- c("spark-warehouse", "metastore_db")
invisible(lapply(sparkRWhitelistSQLDirs,
function(x) { unlink(file.path(sparkRDir, x), recursive = TRUE, force = TRUE)}))
sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE)

test_package("SparkR")
sparkRTestMaster <- "local[1]"
sparkRTestConfig <- list()
if (identical(Sys.getenv("NOT_CRAN"), "true")) {
sparkRTestMaster <- ""
} else {
# Disable hsperfdata on CRAN
old_java_opt <- Sys.getenv("_JAVA_OPTIONS")
Sys.setenv("_JAVA_OPTIONS" = paste("-XX:-UsePerfData", old_java_opt))
tmpDir <- tempdir()
tmpArg <- paste0("-Djava.io.tmpdir=", tmpDir)
sparkRTestConfig <- list(spark.driver.extraJavaOptions = tmpArg,
spark.executor.extraJavaOptions = tmpArg)
}

if (identical(Sys.getenv("NOT_CRAN"), "true")) {
# set random seed for predictable results. mostly for base's sample() in tree and classification
set.seed(42)
# for testthat 1.0.2 later, change reporter from "summary" to default_reporter()
testthat:::run_tests("SparkR",
file.path(sparkRDir, "pkg", "tests", "fulltests"),
NULL,
"summary")
}
test_package("SparkR")

if (identical(Sys.getenv("NOT_CRAN"), "true")) {
# set random seed for predictable results. mostly for base's sample() in tree and classification
set.seed(42)
# for testthat 1.0.2 later, change reporter from "summary" to default_reporter()
testthat:::run_tests("SparkR",
file.path(sparkRDir, "pkg", "tests", "fulltests"),
NULL,
"summary")
}

SparkR:::uninstallDownloadedSpark()
SparkR:::uninstallDownloadedSpark()

}
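Since the whole body of run-all.R is now gated on NOT_CRAN, running the full SparkR suite locally needs that variable set. A hedged sketch, where the SPARK_HOME path is an assumption to adjust for your checkout:

```r
# Illustrative only: run the SparkR tests outside of CRAN checks.
Sys.setenv(NOT_CRAN = "true")
Sys.setenv(SPARK_HOME = "/path/to/spark")  # assumed location of the Spark checkout
source(file.path(Sys.getenv("SPARK_HOME"), "R", "pkg", "tests", "run-all.R"))
```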
2 changes: 1 addition & 1 deletion assembly/README
@@ -9,4 +9,4 @@ This module is off by default. To activate it specify the profile in the command

If you need to build an assembly for a different version of Hadoop the
hadoop-version system property needs to be set as in this example:
-Dhadoop.version=2.7.7
-Dhadoop.version=2.7.3
2 changes: 1 addition & 1 deletion assembly/pom.xml
@@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.11</artifactId>
<version>2.4.0-SNAPSHOT</version>
<version>2.4.1-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

2 changes: 2 additions & 0 deletions bin/docker-image-tool.sh
@@ -54,6 +54,8 @@ function build {
img_path=$IMG_PATH
--build-arg
spark_jars=assembly/target/scala-$SPARK_SCALA_VERSION/jars
--build-arg
k8s_tests=resource-managers/kubernetes/integration-tests/tests
)
else
# Not passed as an argument to docker, but used to validate the Spark directory.
4 changes: 4 additions & 0 deletions build/mvn
@@ -153,6 +153,7 @@ if [ -n "${ZINC_INSTALL_FLAG}" -o -z "`"${ZINC_BIN}" -status -port ${ZINC_PORT}`
export ZINC_OPTS=${ZINC_OPTS:-"$_COMPILE_JVM_OPTS"}
"${ZINC_BIN}" -shutdown -port ${ZINC_PORT}
"${ZINC_BIN}" -start -port ${ZINC_PORT} \
-server 127.0.0.1 -idle-timeout 30m \
-scala-compiler "${SCALA_COMPILER}" \
-scala-library "${SCALA_LIBRARY}" &>/dev/null
fi
@@ -164,3 +165,6 @@

# Last, call the `mvn` command as usual
"${MVN_BIN}" -DzincPort=${ZINC_PORT} "$@"

# Try to shut down zinc explicitly
"${ZINC_BIN}" -shutdown -port ${ZINC_PORT}
2 changes: 1 addition & 1 deletion common/kvstore/pom.xml
@@ -22,7 +22,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.11</artifactId>
<version>2.4.0-SNAPSHOT</version>
<version>2.4.1-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

2 changes: 1 addition & 1 deletion common/network-common/pom.xml
@@ -22,7 +22,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.11</artifactId>
<version>2.4.0-SNAPSHOT</version>
<version>2.4.1-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

@@ -36,7 +36,10 @@
*/
public abstract class ManagedBuffer {

/** Number of bytes of the data. */
/**
* Number of bytes of the data. If this buffer will decrypt for all of the views into the data,
* this is the size of the decrypted data.
*/
public abstract long size();

/**