Skip to content

Commit 8398f2e

Browse files
committed
Add sparkR-submit helper script
Also adjust R file path for YARN cluster mode
1 parent 7ca6512 commit 8398f2e

File tree

2 files changed

+67
-3
lines changed

2 files changed

+67
-3
lines changed

pkg/src/src/main/scala/edu/berkeley/cs/amplab/sparkr/SparkRRunner.scala

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ import java.net.URI
66
import scala.collection.mutable.ArrayBuffer
77
import scala.collection.JavaConversions._
88

9+
import org.apache.hadoop.fs.Path
10+
911
/**
1012
* Main class used to launch SparkR applications using spark-submit. It executes R as a
1113
* subprocess and then has it connect back to the JVM to access system properties etc.
@@ -19,9 +21,15 @@ object SparkRRunner {
1921
val sparkRBackendPort = sys.env.getOrElse("SPARKR_BACKEND_PORT", "12345").toInt
2022
val rCommand = "Rscript"
2123

22-
// val formattedPythonFile = formatPath(pythonFile)
23-
// TODO: Normalize path ?
24-
val rFileNormalized = rFile
24+
// Check if the file path exists.
25+
// If not, change directory to current working directory for YARN cluster mode
26+
val rF = new File(rFile)
27+
val rFileNormalized = if (!rF.exists()) {
28+
new Path(rFile).getName
29+
} else {
30+
rFile
31+
}
32+
2533

2634
// Launch a SparkR backend server for the R process to connect to; this will let it see our
2735
// Java system properties etc.

sparkR-submit

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
#!/bin/bash
# sparkR-submit: submit an R program to Spark (including yarn-cluster mode)
# via spark-submit. Assumes the SparkR package has been installed to a
# standard location using:
#   R CMD INSTALL pkg/

FWDIR="$(cd "$(dirname "$0")"; pwd)"

export PROJECT_HOME="$FWDIR"

export SPARKR_JAR_FILE="$FWDIR/lib/SparkR/sparkr-assembly-0.1.jar"

# Exit if the user hasn't set SPARK_HOME
if [ ! -f "$SPARK_HOME/bin/spark-submit" ]; then
  echo "SPARK_HOME must be set to use sparkR-submit" 1>&2
  exit 1
fi

source "$SPARK_HOME/bin/utils.sh"

function usage() {
  echo "Usage: ./sparkR-submit [options]" 1>&2
  "$SPARK_HOME"/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
  exit 0
}

# BUG FIX: the original `[[ "$@" = *--help ]]` only matched when --help was the
# *last* argument, and `*-h` matched any argument merely ending in "-h"
# (e.g. a file path). Check each argument for an exact flag match instead.
for arg in "$@"; do
  if [ "$arg" = "--help" ] || [ "$arg" = "-h" ]; then
    usage
  fi
done

# Add SparkR to .libPaths through a user R profile so the Rscript subprocess
# can locate the installed SparkR package.
# BUG FIX: use mktemp instead of the fixed, world-shared path
# /tmp/sparkR.profile, which clashes between concurrent users and is a
# symlink-attack surface.
R_PROFILE_FILE="$(mktemp /tmp/sparkR.profile.XXXXXX)"
export R_PROFILE_USER="$R_PROFILE_FILE"

# Quote the heredoc delimiter: the profile is R code and must be written
# verbatim, with no shell expansion.
cat > "$R_PROFILE_FILE" << 'EOF'
.First <- function() {
  projectHome <- Sys.getenv("PROJECT_HOME")
  .libPaths(c(paste(projectHome, "/lib", sep=""), .libPaths()))
  Sys.setenv(NOAWT=1)
}
EOF

# Let gatherSparkSubmitOpts report errors through our usage() function.
SUBMIT_USAGE_FUNCTION=usage

# If an R file is provided, run it through spark-submit; otherwise bail out.
if [[ "$1" =~ \.R$ ]]; then
  primary="$1"
  shift
  # Split the remaining arguments (preserving quotes and backslashes) into
  # spark-submit options and application options.
  gatherSparkSubmitOpts "$@"
  # BUG FIX: spark-submit lives under $SPARK_HOME (verified above), not under
  # this project directory ($FWDIR) as the original exec line assumed.
  exec "$SPARK_HOME"/bin/spark-submit --class edu.berkeley.cs.amplab.sparkr.SparkRRunner --files "$primary" "${SUBMISSION_OPTS[@]}" "$SPARKR_JAR_FILE" "$primary" "${APPLICATION_OPTS[@]}"
else
  echo "sparkR-submit can only be used to run R programs. Please use sparkR to launch a shell" 1>&2
  # BUG FIX: signal failure to the caller instead of exiting 0.
  exit 1
fi

0 commit comments

Comments
 (0)