
Commit 2abc3fe

Support GPU on AWS EC2

1 parent 49aaf65, commit 2abc3fe

File tree: 6 files changed, +244 -9 lines

ec2/gpu/create_image.sh (new file, +111 lines)
#!/bin/bash
# Creates an AMI for the Spark EC2 scripts starting with a stock Amazon
# Linux AMI.
# This has only been tested with Amazon Linux AMI 2014.03.2

set -e

if [ "$(id -u)" != "0" ]; then
  echo "This script must be run as root" 1>&2
  exit 1
fi

# Update
yum update -y
# Dev tools
sudo yum install -y java-1.8.0-openjdk-devel gcc gcc-c++ ant git
# Perf tools
sudo yum install -y dstat iotop strace sysstat htop perf
#sudo debuginfo-install -q -y glibc
#sudo debuginfo-install -q -y kernel
yum install -y kernel-devel-`uname -r`
sudo yum --enablerepo='*-debug*' install -q -y java-1.8.0-openjdk-debuginfo.x86_64

# PySpark and MLlib deps
# sudo yum install -y python-matplotlib python-tornado scipy libgfortran
sudo yum install -y libgfortran
# SparkR deps
#sudo yum install -y R
# Other handy tools
sudo yum install -y pssh
# Ganglia
#sudo yum install -y ganglia ganglia-web ganglia-gmond ganglia-gmetad

if [ "$1" == "gpu" ]; then
  # CUDA
  sudo yum install -y gcc-c++
  # Install the NVIDIA driver
  sudo wget -P /root -q http://us.download.nvidia.com/XFree86/Linux-x86_64/346.96/NVIDIA-Linux-x86_64-346.96.run
  sudo chmod +x /root/NVIDIA-Linux-x86_64-346.96.run
  sudo /root/NVIDIA-Linux-x86_64-346.96.run -s > /root/driver.log 2>&1
  # Install CUDA (without driver installation... for Amazon Linux 2015.09)
  sudo wget -P /root -q http://developer.download.nvidia.com/compute/cuda/7_0/Prod/local_installers/cuda_7.0.28_linux.run
  sudo chmod +x /root/cuda_7.0.28_linux.run
  sudo /root/cuda_7.0.28_linux.run -extract=/root
  sudo /root/cuda-linux64-rel-7.0.28-19326674.run -noprompt > /root/cuda.log 2>&1
  sudo rm -f /root/*.run
fi

# Root ssh config
sudo sed -i 's/PermitRootLogin.*/PermitRootLogin without-password/g' \
  /etc/ssh/sshd_config
sudo sed -i 's/disable_root.*/disable_root: 0/g' /etc/cloud/cloud.cfg

# Set up ephemeral mounts
sudo sed -i 's/mounts.*//g' /etc/cloud/cloud.cfg
sudo sed -i 's/.*ephemeral.*//g' /etc/cloud/cloud.cfg
sudo sed -i 's/.*swap.*//g' /etc/cloud/cloud.cfg

echo "mounts:" >> /etc/cloud/cloud.cfg
echo " - [ ephemeral0, /mnt, auto, \"defaults,noatime,nodiratime\", "\
"\"0\", \"0\" ]" >> /etc/cloud/cloud.cfg

for x in {1..23}; do
  echo " - [ ephemeral$x, /mnt$((x + 1)), auto, "\
"\"defaults,noatime,nodiratime\", \"0\", \"0\" ]" >> /etc/cloud/cloud.cfg
done

# Install Maven (for Hadoop)
cd /tmp
sudo wget -q "http://archive.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz"
sudo tar xf apache-maven-3.3.9-bin.tar.gz
sudo mv apache-maven-3.3.9 /opt

# Edit bash profile
echo "export PS1=\"\\u@\\h \\W]\\$ \"" >> ~/.bash_profile
echo "export JAVA_HOME=/usr/lib/jvm/java-1.8.0" >> ~/.bash_profile
echo "export M2_HOME=/opt/apache-maven-3.3.9" >> ~/.bash_profile
echo "export HADOOP_HOME=/root/hadoop" >> ~/.bash_profile
echo "export PATH=/usr/local/cuda/bin:\$PATH:\$M2_HOME/bin" >> ~/.bash_profile
echo "export LD_LIBRARY_PATH=/usr/local/cuda/lib64" >> ~/.bash_profile

source ~/.bash_profile

# Build Hadoop to install native libs
sudo mkdir /root/hadoop-native
cd /tmp
#sudo yum install -y protobuf-compiler cmake openssl-devel
#wget "http://archive.apache.org/dist/hadoop/common/hadoop-2.4.1/hadoop-2.4.1-src.tar.gz"
#tar xvzf hadoop-2.4.1-src.tar.gz
#cd hadoop-2.4.1-src
#mvn package -Pdist,native -DskipTests -Dtar
#sudo mv hadoop-dist/target/hadoop-2.4.1/lib/native/* /root/hadoop-native
sudo wget -q "http://archive.apache.org/dist/hadoop/common/hadoop-2.4.1/hadoop-2.4.1.tar.gz"
sudo tar xf hadoop-2.4.1.tar.gz
sudo mv hadoop-2.4.1 /root/hadoop

# Install Snappy lib (for Hadoop)
yum install -y snappy
sudo ln -sf /usr/lib64/libsnappy.so.1 /root/hadoop-native/.

# Create /usr/bin/realpath, which is used by R to find Java installations.
# NOTE: /usr/bin/realpath is missing in CentOS AMIs. See
# http://superuser.com/questions/771104/usr-bin-realpath-not-found-in-centos-6-5
sudo echo '#!/bin/bash' > /usr/bin/realpath
sudo echo 'readlink -e "$@"' >> /usr/bin/realpath
sudo chmod a+x /usr/bin/realpath

mkdir -p /tmp/spark-events
chmod 777 /tmp/spark-events
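As wired up in ec2/spark_ec2.py below, this script is copied to every node and run over SSH during cluster setup. For reference, a manual invocation on an Amazon Linux instance (as root) mirrors what the launcher does:

# Base packages only, as run on the master:
./create_image.sh > ./create_image_master.log

# Base packages plus the NVIDIA driver and CUDA 7.0, as run on each slave:
./create_image.sh gpu > ./create_image_slave.log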

ec2/gpu/spark-defaults.conf.add (new file, +5 lines)
spark.eventLog.enabled           true
spark.eventLog.dir               file:///tmp/spark-events
spark.history.fs.logDirectory    file:///tmp/spark-events
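These properties enable the Spark event log and point the history server at /tmp/spark-events, the directory created by create_image.sh above. During setup the snippet is appended to the master's generated configuration; the relevant command issued by spark_ec2.py is roughly:

cat spark-ec2/templates/root/spark/conf/spark-defaults.conf.add >> \
  spark-ec2/templates/root/spark/conf/spark-defaults.conf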

ec2/gpu/spark-env.sh.add (new file, +2 lines)
export LD_LIBRARY_PATH=/usr/local/cuda/lib64

ec2/gpu/spark_init.sh (new file, +41 lines)
#!/bin/bash

pushd /root > /dev/null

if [ -d "spark" ]; then
  echo "Spark seems to be installed. Exiting."
  return
fi

# Github tag:
if [[ "$SPARK_VERSION" == *\|* ]]
then
  mkdir spark
  pushd spark > /dev/null
  git init
  repo=`python -c "print '$SPARK_VERSION'.split('|')[0]"`
  git_hash=`python -c "print '$SPARK_VERSION'.split('|')[1]"`
  git remote add origin $repo
  git fetch origin
  git checkout $git_hash
  cd spark-gpu
  ./make-distribution.sh -Dscala-2.11 -Phadoop-2.4 -Pyarn
  popd > /dev/null

# Pre-packaged spark version:
else
  case "$SPARK_VERSION" in
    2.0.0)
      wget http://s3.amazonaws.com/spark-gpu-public/spark-gpu-latest-bin-hadoop2.4.tgz
      if [ $? != 0 ]; then
        echo "ERROR: Unknown Spark version"
        return -1
      fi
  esac

  echo "Unpacking Spark"
  tar xvf spark-*.tgz > /tmp/spark-ec2_spark.log
  rm spark-*.tgz
  mv spark-gpu spark
fi
popd > /dev/null
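The init script accepts $SPARK_VERSION in two forms: a plain release number handled by the case statement, or a "repository|commit" pair that triggers a source build. For illustration (the repository URL below is a placeholder, not one referenced by this commit):

# Pre-packaged release, fetched from the spark-gpu-public S3 bucket:
export SPARK_VERSION="2.0.0"

# Source build from a specific git repository and commit ("repo|hash"):
export SPARK_VERSION="https://github.com/example/spark-gpu.git|0123abc"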

ec2/gpu/user_data.txt (new file, +12 lines)
#!/bin/bash

sed -i 's/Defaults requiretty/Defaults !requiretty/g' /etc/sudoers

sed -i 's/PermitRootLogin forced-commands-only/PermitRootLogin without-password/g' /etc/ssh/sshd_config

cd /root/.ssh
cp authorized_keys authorized_keys.orig
cp /home/ec2-user/.ssh/authorized_keys .
service sshd restart

yum install -y git

ec2/spark_ec2.py (+73 -9 lines)
@@ -51,7 +51,7 @@
     raw_input = input
     xrange = range

-SPARK_EC2_VERSION = "1.6.0"
+SPARK_EC2_VERSION = "2.0.0"
 SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__))

 VALID_SPARK_VERSIONS = set([
@@ -76,6 +76,7 @@
     "1.5.1",
     "1.5.2",
     "1.6.0",
+    "2.0.0",
 ])

 SPARK_TACHYON_MAP = {
@@ -94,6 +95,7 @@
     "1.5.1": "0.7.1",
     "1.5.2": "0.7.1",
     "1.6.0": "0.8.2",
+    "2.0.0": "0.8.2",
 }

 DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION
@@ -192,12 +194,12 @@ def parse_args():
         help="If you have multiple profiles (AWS or boto config), you can configure " +
              "additional, named profiles by using this option (default: %default)")
     parser.add_option(
-        "-t", "--instance-type", default="m1.large",
+        "-t", "--instance-type", default="g2.2xlarge",
         help="Type of instance to launch (default: %default). " +
              "WARNING: must be 64-bit; small instances won't work")
     parser.add_option(
-        "-m", "--master-instance-type", default="",
-        help="Master instance type (leave empty for same as instance-type)")
+        "-m", "--master-instance-type", default="t2.micro",
+        help="Master instance type (default: %default).")
     parser.add_option(
         "-r", "--region", default="us-east-1",
         help="EC2 region used to launch instances in, or to find them in (default: %default)")
@@ -271,7 +273,7 @@ def parse_args():
         help="If specified, launch slaves as spot instances with the given " +
              "maximum price (in dollars)")
     parser.add_option(
-        "--ganglia", action="store_true", default=True,
+        "--ganglia", action="store_true", default=False,
         help="Setup Ganglia monitoring on cluster (default: %default). NOTE: " +
              "the Ganglia page will be publicly accessible")
     parser.add_option(
@@ -295,8 +297,8 @@ def parse_args():
         help="Extra options to give to master through SPARK_MASTER_OPTS variable " +
              "(e.g -Dspark.worker.timeout=180)")
     parser.add_option(
-        "--user-data", type="string", default="",
-        help="Path to a user-data file (most AMIs interpret this as an initialization script)")
+        "--user-data", type="string", default=SPARK_EC2_DIR + "/gpu/user_data.txt",
+        help="Path to a user-data file (default: %default) (most AMIs interpret this as an initialization script)")
     parser.add_option(
         "--authorized-address", type="string", default="0.0.0.0/0",
         help="Address to authorize on created security groups (default: %default)")
@@ -327,6 +329,9 @@ def parse_args():
     parser.add_option(
         "--instance-profile-name", default=None,
         help="IAM profile name to launch instances under")
+    parser.add_option(
+        "-g", "--gpu", action="store_true", default=True,
+        help="Enable GPU exploitation (default: %default)")

     (opts, args) = parser.parse_args()
     if len(args) != 2:
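With these option changes, GPU support is enabled by default, slaves default to g2.2xlarge, the master to t2.micro, and the GPU user-data script is passed automatically. A typical launch (key pair, identity file, and cluster name are placeholders; -k, -i, and -s are the usual spark-ec2 flags, unchanged by this commit) would look roughly like:

./spark-ec2 -k my-keypair -i ~/my-keypair.pem -s 2 -r us-east-1 launch gpu-cluster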
@@ -444,13 +449,35 @@ def get_validate_spark_version(version, repo):
     "t2.large": "hvm",
 }

+# Source: http://aws.amazon.com/amazon-linux-ami/
+# Last Updated: 2016-01-02
+EC2_AMAZON_HVM_AMI = {
+    "us-east-1": "ami-60b6c60a",
+    "us-west-2": "ami-f0091d91",
+    "us-west-1": "ami-d5ea86b5",
+    "eu-west-1": "ami-bff32ccc",
+    "eu-central-1": "ami-bc5b48d0",
+    "ap-southeast-1": "ami-c9b572aa",
+    "ap-northeast-1": "ami-383c1956",
+    "ap-southeast-2": "ami-48d38c2b",
+    "sa-east-1": "ami-6817af04",
+}

 def get_tachyon_version(spark_version):
     return SPARK_TACHYON_MAP.get(spark_version, "")


 # Attempt to resolve an appropriate AMI given the architecture and region of the request.
 def get_spark_ami(opts):
+    if opts.gpu:
+        if opts.region in EC2_AMAZON_HVM_AMI:
+            ami = EC2_AMAZON_HVM_AMI[opts.region]
+            print("Spark AMI: " + ami)
+            return ami
+        else:
+            print("Could not resolve AMAZON AMI for region: " + opts.region, file=stderr)
+            sys.exit(1)
+
     if opts.instance_type in EC2_INSTANCE_TYPES:
         instance_type = EC2_INSTANCE_TYPES[opts.instance_type]
     else:
@@ -851,11 +878,25 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key):
     )

     print("Running setup on master...")
-    setup_spark_cluster(master, opts)
+    setup_spark_cluster(master, slave_nodes, opts)
     print("Done!")


-def setup_spark_cluster(master, opts):
+def setup_spark_cluster(master, slave_nodes, opts):
+    if opts.gpu:
+        scp(master, opts, "%s/gpu/spark_init.sh" % SPARK_EC2_DIR, "spark-ec2/spark/init.sh")
+        scp(master, opts, "%s/gpu/spark-defaults.conf.add" % SPARK_EC2_DIR, "spark-ec2/templates/root/spark/conf/spark-defaults.conf.add")
+        scp(master, opts, "%s/gpu/spark-env.sh.add" % SPARK_EC2_DIR, "spark-ec2/templates/root/spark/conf/spark-env.sh.add")
+        ssh(master, opts, "cat spark-ec2/templates/root/spark/conf/spark-defaults.conf.add >> spark-ec2/templates/root/spark/conf/spark-defaults.conf")
+        ssh(master, opts, "cat spark-ec2/templates/root/spark/conf/spark-env.sh.add >> spark-ec2/templates/root/spark/conf/spark-env.sh")
+        scp(master, opts, "%s/gpu/create_image.sh" % SPARK_EC2_DIR, "create_image.sh")
+        ssh(master, opts, "chmod u+x ./create_image.sh")
+        ssh(master, opts, "./create_image.sh > ./create_image_master.log")
+        for slave in slave_nodes:
+            slave_address = get_dns_name(slave, opts.private_ips)
+            scp(slave_address, opts, "%s/gpu/create_image.sh" % SPARK_EC2_DIR, "create_image.sh")
+            ssh(slave_address, opts, "chmod u+x ./create_image.sh")
+            ssh(slave_address, opts, "./create_image.sh gpu > ./create_image_slave.log")
     ssh(master, opts, "chmod u+x spark-ec2/setup.sh")
     ssh(master, opts, "spark-ec2/setup.sh")
     print("Spark standalone cluster started at http://%s:8080" % master)
@@ -1180,6 +1221,29 @@ def ssh(host, opts, command):
             tries = tries + 1


+def scp(host, opts, src, dst):
+    tries = 0
+    while True:
+        try:
+            return subprocess.check_call(
+                ['scp'] + ssh_args(opts) +
+                [stringify_command(src), '%s@%s:%s' % (opts.user, host, stringify_command(dst))])
+        except subprocess.CalledProcessError as e:
+            if tries > 5:
+                # If this was an ssh failure, provide the user with hints.
+                if e.returncode == 255:
+                    raise UsageError(
+                        "Failed to SCP to remote host {0}.\n"
+                        "Please check that you have provided the correct --identity-file and "
+                        "--key-pair parameters and try again.".format(host))
+                else:
+                    raise e
+            print("Error executing remote command, retrying after 30 seconds: {0}".format(e),
+                  file=stderr)
+            time.sleep(30)
+            tries = tries + 1
+
+
 # Backported from Python 2.7 for compatiblity with 2.6 (See SPARK-1990)
 def _check_output(*popenargs, **kwargs):
     if 'stdout' in kwargs:
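The new scp() helper mirrors the retry and error-hint logic of ssh() above and shells out to the system scp binary. Assuming ssh_args() supplies the usual host-key and identity-file options (an assumption about code not shown in this diff), each transfer is roughly equivalent to:

scp -o StrictHostKeyChecking=no -i <identity-file> <src> root@<host>:<dst>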
