Skip to content

Commit 8398f2e

Browse files
committed
Add sparkR-submit helper script
Also adjust R file path for YARN cluster mode
1 parent 7ca6512 commit 8398f2e

File tree

2 files changed

+67
-3
lines changed

2 files changed

+67
-3
lines changed

pkg/src/src/main/scala/edu/berkeley/cs/amplab/sparkr/SparkRRunner.scala

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ import java.net.URI
66
import scala.collection.mutable.ArrayBuffer
77
import scala.collection.JavaConversions._
88

9+
import org.apache.hadoop.fs.Path
10+
911
/**
1012
* Main class used to launch SparkR applications using spark-submit. It executes R as a
1113
* subprocess and then has it connect back to the JVM to access system properties etc.
@@ -19,9 +21,15 @@ object SparkRRunner {
1921
val sparkRBackendPort = sys.env.getOrElse("SPARKR_BACKEND_PORT", "12345").toInt
2022
val rCommand = "Rscript"
2123

22-
// val formattedPythonFile = formatPath(pythonFile)
23-
// TODO: Normalize path ?
24-
val rFileNormalized = rFile
24+
// Check if the file path exists.
25+
// If not, change directory to current working directory for YARN cluster mode
26+
val rF = new File(rFile)
27+
val rFileNormalized = if (!rF.exists()) {
28+
new Path(rFile).getName
29+
} else {
30+
rFile
31+
}
32+
2533

2634
// Launch a SparkR backend server for the R process to connect to; this will let it see our
2735
// Java system properties etc.

sparkR-submit

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
#!/bin/bash
# sparkR-submit: submit an R program to Spark (including yarn-cluster mode)
# via spark-submit. Assumes the SparkR package has been installed to a
# standard location using:
#   R CMD INSTALL pkg/

FWDIR="$(cd "$(dirname "$0")"; pwd)"

export PROJECT_HOME="$FWDIR"

export SPARKR_JAR_FILE="$FWDIR/lib/SparkR/sparkr-assembly-0.1.jar"

# Exit if the user hasn't set SPARK_HOME
if [ ! -f "$SPARK_HOME/bin/spark-submit" ]; then
  echo "SPARK_HOME must be set to use sparkR-submit" 1>&2
  exit 1
fi

source "$SPARK_HOME/bin/utils.sh"

function usage() {
  echo "Usage: ./sparkR-submit [options]" 1>&2
  "$SPARK_HOME"/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
  exit 0
}

# BUG FIX: the original `[[ "$@" = *--help ]]` only matched when --help was the
# *last* argument, and `*-h` matched any argument merely ending in "-h"
# (e.g. a file path). Check each argument for an exact flag match instead.
for arg in "$@"; do
  if [ "$arg" = "--help" ] || [ "$arg" = "-h" ]; then
    usage
  fi
done

# Add SparkR to .libPaths through a user R profile so the Rscript subprocess
# can locate the installed SparkR package.
# BUG FIX: use mktemp instead of the fixed, world-shared path
# /tmp/sparkR.profile, which clashes between concurrent users and is a
# symlink-attack surface.
R_PROFILE_FILE="$(mktemp /tmp/sparkR.profile.XXXXXX)"
export R_PROFILE_USER="$R_PROFILE_FILE"

# Quote the heredoc delimiter: the profile is R code and must be written
# verbatim, with no shell expansion.
cat > "$R_PROFILE_FILE" << 'EOF'
.First <- function() {
  projectHome <- Sys.getenv("PROJECT_HOME")
  .libPaths(c(paste(projectHome, "/lib", sep=""), .libPaths()))
  Sys.setenv(NOAWT=1)
}
EOF

# Let gatherSparkSubmitOpts report errors through our usage() function.
SUBMIT_USAGE_FUNCTION=usage

# If an R file is provided, run it through spark-submit; otherwise bail out.
if [[ "$1" =~ \.R$ ]]; then
  primary="$1"
  shift
  # Split the remaining arguments (preserving quotes and backslashes) into
  # spark-submit options and application options.
  gatherSparkSubmitOpts "$@"
  # BUG FIX: spark-submit lives under $SPARK_HOME (verified above), not under
  # this project directory ($FWDIR) as the original exec line assumed.
  exec "$SPARK_HOME"/bin/spark-submit --class edu.berkeley.cs.amplab.sparkr.SparkRRunner --files "$primary" "${SUBMISSION_OPTS[@]}" "$SPARKR_JAR_FILE" "$primary" "${APPLICATION_OPTS[@]}"
else
  echo "sparkR-submit can only be used to run R programs. Please use sparkR to launch a shell" 1>&2
  # BUG FIX: signal failure to the caller instead of exiting 0.
  exit 1
fi

0 commit comments

Comments
 (0)