Merge remote-tracking branch 'amplab-sparkr/master' into sparkr-runner

shivaram · shivaram · commit bf52b17a546c · 2015-02-20T10:36:35.000-08:00
Conflicts:
	pkg/R/sparkR.R
diff --git a/README.md b/README.md
@@ -13,12 +13,6 @@ SparkR requires Scala 2.10 and Spark version >= 0.9.0. Current build by default
 Apache Spark 1.1.0. You can also build SparkR against a
 different Spark version (>= 0.9.0) by modifying `pkg/src/build.sbt`.
 
-SparkR also requires the R package `rJava` to be installed. To install `rJava`,
-you can run the following command in R:
-
-    install.packages("rJava")
-
-
 ### Package installation
 To develop SparkR, you can build the scala package and the R package using
 
@@ -31,9 +25,9 @@ If you wish to try out the package directly from github, you can use [`install_g
 
 SparkR by default uses Apache Spark 1.1.0. You can switch to a different Spark
 version by setting the environment variable `SPARK_VERSION`. For example, to
-use Apache Spark 1.2.0, you can run
+use Apache Spark 1.3.0, you can run
 
-    SPARK_VERSION=1.2.0 ./install-dev.sh
+    SPARK_VERSION=1.3.0 ./install-dev.sh
 
 SparkR by default links to Hadoop 1.0.4. To use SparkR with other Hadoop
 versions, you will need to rebuild SparkR with the same version that [Spark is
@@ -97,8 +91,9 @@ To run one of them, use `./sparkR <filename> <args>`. For example:
 
     ./sparkR examples/pi.R local[2]
 
-You can also run the unit-tests for SparkR by running
+You can also run the unit-tests for SparkR by running (you need to install the [testthat](http://cran.r-project.org/web/packages/testthat/index.html) package first):
 
+    R -e 'install.packages("testthat", repos="http://cran.us.r-project.org")'
     ./run-tests.sh
 
 ## Running on EC2
@@ -110,7 +105,7 @@ Instructions for running SparkR on EC2 can be found in the
 Currently, SparkR supports running on YARN with the `yarn-client` mode. These steps show how to build SparkR with YARN support and run SparkR programs on a YARN cluster:
 
 ```
-# assumes Java, R, rJava, yarn, spark etc. are installed on the whole cluster.
+# assumes Java, R, yarn, spark etc. are installed on the whole cluster.
 cd SparkR-pkg/
 USE_YARN=1 SPARK_YARN_VERSION=2.4.0 SPARK_HADOOP_VERSION=2.4.0 ./install-dev.sh
 ```
diff --git a/pkg/NAMESPACE b/pkg/NAMESPACE
@@ -2,6 +2,7 @@
 exportClasses("RDD")
 exportClasses("Broadcast")
 exportMethods(
+              "aggregateByKey",
               "aggregateRDD",
               "cache",
               "checkpoint",
@@ -19,6 +20,7 @@ exportMethods(
               "flatMap",
               "flatMapValues",
               "fold",
+              "foldByKey",
               "foreach",
               "foreachPartition",
               "fullOuterJoin",
@@ -41,6 +43,7 @@ exportMethods(
               "numPartitions",
               "partitionBy",
               "persist",
+              "pipeRDD",
               "reduce",
               "reduceByKey",
               "reduceByKeyLocally",
diff --git a/pkg/R/RDD.R b/pkg/R/RDD.R
@@ -110,12 +110,10 @@ setMethod("getJRDD", signature(rdd = "PipelinedRDD"),
             computeFunc <- function(split, part) {
               rdd@func(split, part)
             }
-            serializedFuncArr <- serialize("computeFunc", connection = NULL,
-                                           ascii = TRUE)
+            serializedFuncArr <- serialize("computeFunc", connection = NULL)
 
             packageNamesArr <- serialize(.sparkREnv[[".packages"]],
-                                         connection = NULL,
-                                         ascii = TRUE)
+                                         connection = NULL)
 
             broadcastArr <- lapply(ls(.broadcastNames),
                                    function(name) { get(name, .broadcastNames) })
@@ -1275,6 +1273,43 @@ setMethod("aggregateRDD",
             Reduce(combOp, partitionList, zeroValue)
           })
 
+#' Pipes elements to a forked external process.
+#'
+#' The same as 'pipe()' in Spark.
+#'
+#' @param rdd The RDD whose elements are piped to the forked external process.
+#' @param command The command to fork an external process.
+#' @param env A named list to set environment variables of the external process.
+#' @return A new RDD created by piping all elements to a forked external process.
+#' @rdname pipeRDD
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' rdd <- parallelize(sc, 1:10)
+#' collect(pipeRDD(rdd, "more"))
+#' # Output: list("1", "2", ..., "10")
+#'}
+setGeneric("pipeRDD", function(rdd, command, env = list()) { 
+  standardGeneric("pipeRDD") 
+})
+
+#' @rdname pipeRDD
+#' @aliases pipeRDD,RDD,character-method
+setMethod("pipeRDD",
+          signature(rdd = "RDD", command = "character"),
+          function(rdd, command, env = list()) {
+            func <- function(part) {
+              trim.trailing.func <- function(x) {
+                sub("[\r\n]*$", "", toString(x))
+              }
+              input <- unlist(lapply(part, trim.trailing.func))
+              res <- system2(command, stdout = TRUE, input = input, env = env)
+              lapply(res, trim.trailing.func)
+            }
+            lapplyPartition(rdd, func)
+          })
+
 # TODO: Consider caching the name in the RDD's environment
 #' Return an RDD's name.
 #'
diff --git a/pkg/R/context.R b/pkg/R/context.R
@@ -179,7 +179,7 @@ includePackage <- function(sc, pkg) {
 #'}
 broadcast <- function(sc, object) {
   objName <- as.character(substitute(object))
-  serializedObj <- serialize(object, connection = NULL, ascii = TRUE)
+  serializedObj <- serialize(object, connection = NULL)
 
   jBroadcast <- callJMethod(sc, "broadcast", serializedObj)
   id <- as.character(callJMethod(jBroadcast, "id"))
diff --git a/pkg/R/pairRDD.R b/pkg/R/pairRDD.R
@@ -212,12 +212,10 @@ setMethod("partitionBy",
             depsBinArr <- getDependencies(partitionFunc)
 
             serializedHashFuncBytes <- serialize(as.character(substitute(partitionFunc)),
-                                                 connection = NULL,
-                                                 ascii = TRUE)
+                                                 connection = NULL)
 
             packageNamesArr <- serialize(.sparkREnv$.packages,
-                                         connection = NULL,
-                                         ascii = TRUE)
+                                         connection = NULL)
             broadcastArr <- lapply(ls(.broadcastNames), function(name) {
                                    get(name, .broadcastNames) })
             jrdd <- getJRDD(rdd)
@@ -497,6 +495,88 @@ setMethod("combineByKey",
             lapplyPartition(shuffled, mergeAfterShuffle)
           })
 
+#' Aggregate a pair RDD by each key.
+#' 
+#' Aggregate the values of each key in an RDD, using given combine functions
+#' and a neutral "zero value". This function can return a different result type,
+#' U, than the type of the values in this RDD, V. Thus, we need one operation
+#' for merging a V into a U and one operation for merging two U's. The former 
+#' operation is used for merging values within a partition, and the latter is 
+#' used for merging values between partitions. To avoid memory allocation, both 
+#' of these functions are allowed to modify and return their first argument 
+#' instead of creating a new U.
+#' 
+#' @param rdd An RDD.
+#' @param zeroValue A neutral "zero value".
+#' @param seqOp A function to aggregate the values of each key. It may return 
+#'              a different result type from the type of the values.
+#' @param combOp A function to aggregate results of seqOp.
+#' @return An RDD containing the aggregation result.
+#' @rdname aggregateByKey
+#' @seealso foldByKey, combineByKey
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' rdd <- parallelize(sc, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
+#' zeroValue <- list(0, 0)
+#' seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
+#' combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
+#' aggregateByKey(rdd, zeroValue, seqOp, combOp, 2L) 
+#'   # list(list(1, list(3, 2)), list(2, list(7, 2)))
+#'}
+setGeneric("aggregateByKey",
+           function(rdd, zeroValue, seqOp, combOp, numPartitions) {
+             standardGeneric("aggregateByKey")
+           })
+
+#' @rdname aggregateByKey
+#' @aliases aggregateByKey,RDD,ANY,ANY,ANY,integer-method
+setMethod("aggregateByKey",
+          signature(rdd = "RDD", zeroValue = "ANY", seqOp = "ANY",
+                    combOp = "ANY", numPartitions = "integer"),
+          function(rdd, zeroValue, seqOp, combOp, numPartitions) {
+            createCombiner <- function(v) {
+              do.call(seqOp, list(zeroValue, v))
+            }
+
+            combineByKey(rdd, createCombiner, seqOp, combOp, numPartitions)
+          })
+
+#' Fold a pair RDD by each key.
+#' 
+#' Aggregate the values of each key in an RDD, using an associative function "func"
+#' and a neutral "zero value" which may be added to the result an arbitrary 
+#' number of times, and must not change the result (e.g., 0 for addition, or 
+#' 1 for multiplication).
+#' 
+#' @param rdd An RDD.
+#' @param zeroValue A neutral "zero value".
+#' @param func An associative function for folding values of each key.
+#' @return An RDD containing the aggregation result.
+#' @rdname foldByKey
+#' @seealso aggregateByKey, combineByKey
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' rdd <- parallelize(sc, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
+#' foldByKey(rdd, 0, "+", 2L) # list(list(1, 3), list(2, 7))
+#'}
+setGeneric("foldByKey",
+           function(rdd, zeroValue, func, numPartitions) {
+             standardGeneric("foldByKey")
+           })
+
+#' @rdname foldByKey
+#' @aliases foldByKey,RDD,ANY,ANY,integer-method
+setMethod("foldByKey",
+          signature(rdd = "RDD", zeroValue = "ANY",
+                    func = "ANY", numPartitions = "integer"),
+          function(rdd, zeroValue, func, numPartitions) {
+            aggregateByKey(rdd, zeroValue, func, func, numPartitions)
+          })
+
 ############ Binary Functions #############
 
 #' Join two RDDs
diff --git a/pkg/R/sparkR.R b/pkg/R/sparkR.R
@@ -4,7 +4,6 @@ assemblyJarName <- "sparkr-assembly-0.1.jar"
 
 sparkR.onLoad <- function(libname, pkgname) {
   assemblyJarPath <- paste(libname, "/SparkR/", assemblyJarName, sep = "")
-  assemblyJarPath <- gsub(" ", "\\ ", assemblyJarPath, fixed = T)
   packageStartupMessage("[SparkR] Initializing with classpath ", assemblyJarPath, "\n")
  
   .sparkREnv$libname <- libname
@@ -90,17 +89,28 @@ sparkR.init <- function(
   sparkExecutorEnv = list(),
   sparkJars = "",
   sparkRLibDir = "",
-  sparkRBackendPort = as.integer(Sys.getenv("SPARKR_BACKEND_PORT", "12345"))) {
+  sparkRBackendPort = as.integer(Sys.getenv("SPARKR_BACKEND_PORT", "12345")),
+  sparkRRetryCount = 6) {
 
   if (exists(".sparkRjsc", envir = .sparkREnv)) {
     cat("Re-using existing Spark Context. Please stop SparkR with sparkR.stop() or restart R to create a new Spark Context\n")
     return(get(".sparkRjsc", envir = .sparkREnv))
   }
 
   sparkMem <- Sys.getenv("SPARK_MEM", "512m")
-  jars <- c(as.character(.sparkREnv$assemblyJarPath), as.character(sparkJars))
-
-  cp <- paste0(jars, collapse = ":")
+  jars <- suppressWarnings(
+    normalizePath(c(as.character(.sparkREnv$assemblyJarPath), as.character(sparkJars))))
+
+  # Classpath separator is ";" on Windows
+  # Windows file URIs need four slashes (////), see http://stackoverflow.com/a/18522792
+  if (.Platform$OS.type == "unix") {
+    collapseChar <- ":"
+    uriSep <- "//"
+  } else {
+    collapseChar <- ";"
+    uriSep <- "////"
+  }
+  cp <- paste0(jars, collapse = collapseChar)
 
   yarn_conf_dir <- Sys.getenv("YARN_CONF_DIR", "")
   if (yarn_conf_dir != "") {
@@ -126,10 +136,30 @@ sparkR.init <- function(
           sparkHome = sparkHome,
           sparkSubmitOpts = Sys.getenv("SPARKR_SUBMIT_ARGS", ""))
     }
-    Sys.sleep(2) # Wait for backend to come up
   }
+
   .sparkREnv$sparkRBackendPort <- sparkRBackendPort
-  connectBackend("localhost", sparkRBackendPort) # Connect to it
+  cat("Waiting for JVM to come up...\n")
+  tries <- 0
+  while (tries < sparkRRetryCount) {
+    if (!connExists(.sparkREnv)) {
+      Sys.sleep(2 ^ tries)
+      tryCatch({
+        connectBackend("localhost", .sparkREnv$sparkRBackendPort)
+      }, error = function(err) {
+        cat("Error in Connection, retrying...\n")
+      }, warning = function(war) {
+        cat("No Connection Found, retrying...\n")
+      })
+      tries <- tries + 1
+    } else {
+      cat("Connection ok.\n")
+      break
+    }
+  }
+  if (tries == sparkRRetryCount) {
+    stop(sprintf("Failed to connect JVM after %d tries.\n", sparkRRetryCount))
+  }
 
   if (nchar(sparkHome) != 0) {
     sparkHome <- normalizePath(sparkHome)
@@ -153,7 +183,7 @@ sparkR.init <- function(
   }
 
   nonEmptyJars <- Filter(function(x) { x != "" }, jars)
-  localJarPaths <- sapply(nonEmptyJars, function(j) { paste("file://", j, sep = "") })
+  localJarPaths <- sapply(nonEmptyJars, function(j) { utils::URLencode(paste("file:", uriSep, j, sep = "")) })
 
   assign(
     ".sparkRjsc",
diff --git a/pkg/R/sparkRClient.R b/pkg/R/sparkRClient.R
@@ -35,6 +35,8 @@ launchBackend <- function(
   } else {
     java_bin <- java_bin_name
   }
+  # Quote the classpath to make sure it handles spaces on Windows
+  classPath <- shQuote(classPath)
   combinedArgs <- paste(javaOpts, "-cp", classPath, mainClass, args, sep = " ")
   cat("Launching java with command ", java_bin, " ", combinedArgs, "\n")
   invisible(system2(java_bin, combinedArgs, wait = F))
diff --git a/pkg/inst/tests/test_rdd.R b/pkg/inst/tests/test_rdd.R
@@ -336,6 +336,23 @@ test_that("values() on RDDs", {
   expect_equal(actual, lapply(intPairs, function(x) { x[[2]] }))
 })
 
+test_that("pipeRDD() on RDDs", {
+  actual <- collect(pipeRDD(rdd, "more"))
+  expected <- as.list(as.character(1:10))
+  expect_equal(actual, expected)
+  
+  trailed.rdd <- parallelize(sc, c("1", "", "2\n", "3\n\r\n"))
+  actual <- collect(pipeRDD(trailed.rdd, "sort"))
+  expected <- list("", "1", "2", "3")
+  expect_equal(actual, expected)
+  
+  rev.nums <- 9:0
+  rev.rdd <- parallelize(sc, rev.nums, 2L)
+  actual <- collect(pipeRDD(rev.rdd, "sort"))
+  expected <- as.list(as.character(c(5:9, 0:4)))
+  expect_equal(actual, expected)
+})
+
 test_that("join() on pairwise RDDs", {
   rdd1 <- parallelize(sc, list(list(1,1), list(2,4)))
   rdd2 <- parallelize(sc, list(list(1,2), list(1,3)))
diff --git a/pkg/inst/tests/test_shuffle.R b/pkg/inst/tests/test_shuffle.R
diff --git a/pkg/man/aggregateByKey.Rd b/pkg/man/aggregateByKey.Rd
diff --git a/pkg/man/foldByKey.Rd b/pkg/man/foldByKey.Rd
diff --git a/pkg/man/pipeRDD.Rd b/pkg/man/pipeRDD.Rd

Original file line number	Diff line number	Diff line change
`@@ -35,6 +35,8 @@ launchBackend <- function(`
`35`	`35`	`} else {`
`36`	`36`	`java_bin <- java_bin_name`
`37`	`37`	`}`
	`38`	`+ # Quote the classpath to make sure it handles spaces on Windows`
	`39`	`+ classPath <- shQuote(classPath)`
`38`	`40`	`combinedArgs <- paste(javaOpts, "-cp", classPath, mainClass, args, sep = " ")`
`39`	`41`	`cat("Launching java with command ", java_bin, " ", combinedArgs, "\n")`
`40`	`42`	`invisible(system2(java_bin, combinedArgs, wait = F))`