diff --git a/ec2/gpu/create_image.sh b/ec2/gpu/create_image.sh
new file mode 100755
index 0000000000..087a9613ef
--- /dev/null
+++ b/ec2/gpu/create_image.sh
@@ -0,0 +1,111 @@
+#!/bin/bash
+# Creates an AMI for the Spark EC2 scripts starting with a stock Amazon
+# Linux AMI.
+# This has only been tested with Amazon Linux AMI 2014.03.2
+
+set -e
+
+if [ "$(id -u)" != "0" ]; then
+  echo "This script must be run as root" 1>&2
+  exit 1
+fi
+
+
+# Update
+yum update -y
+# Dev tools
+sudo yum install -y java-1.8.0-openjdk-devel gcc gcc-c++ ant git
+# Perf tools
+sudo yum install -y dstat iotop strace sysstat htop perf
+#sudo debuginfo-install -q -y glibc
+#sudo debuginfo-install -q -y kernel
+yum install -y kernel-devel-`uname -r`
+sudo yum --enablerepo='*-debug*' install -q -y java-1.8.0-openjdk-debuginfo.x86_64
+
+# PySpark and MLlib deps
+# sudo yum install -y python-matplotlib python-tornado scipy libgfortran
+sudo yum install -y libgfortran
+# SparkR deps
+#sudo yum install -y R
+# Other handy tools
+sudo yum install -y pssh
+# Ganglia
+#sudo yum install -y ganglia ganglia-web ganglia-gmond ganglia-gmetad
+
+if [ "$1" == "gpu" ]; then
+# CUDA
+sudo yum install -y gcc-c++
+# Install NVIDIA Driver
+sudo wget -P /root -q http://us.download.nvidia.com/XFree86/Linux-x86_64/346.96/NVIDIA-Linux-x86_64-346.96.run
+sudo chmod +x /root/NVIDIA-Linux-x86_64-346.96.run
+sudo /root/NVIDIA-Linux-x86_64-346.96.run -s > /root/driver.log 2>&1
+# Install CUDA (without driver installation... for Amazon Linux 2015.09)
+sudo wget -P /root -q http://developer.download.nvidia.com/compute/cuda/7_0/Prod/local_installers/cuda_7.0.28_linux.run
+sudo chmod +x /root/cuda_7.0.28_linux.run
+sudo /root/cuda_7.0.28_linux.run -extract=/root
+sudo /root/cuda-linux64-rel-7.0.28-19326674.run -noprompt > /root/cuda.log 2>&1
+rm -f *.run
+fi
+
+# Root ssh config
+sudo sed -i 's/PermitRootLogin.*/PermitRootLogin without-password/g' \
+  /etc/ssh/sshd_config
+sudo sed -i 's/disable_root.*/disable_root: 0/g' /etc/cloud/cloud.cfg
+
+# Set up ephemeral mounts
+sudo sed -i 's/mounts.*//g' /etc/cloud/cloud.cfg
+sudo sed -i 's/.*ephemeral.*//g' /etc/cloud/cloud.cfg
+sudo sed -i 's/.*swap.*//g' /etc/cloud/cloud.cfg
+
+echo "mounts:" >> /etc/cloud/cloud.cfg
+echo " - [ ephemeral0, /mnt, auto, \"defaults,noatime,nodiratime\", "\
+  "\"0\", \"0\" ]" >> /etc/cloud/cloud.cfg
+
+for x in {1..23}; do
+  echo " - [ ephemeral$x, /mnt$((x + 1)), auto, "\
+    "\"defaults,noatime,nodiratime\", \"0\", \"0\" ]" >> /etc/cloud/cloud.cfg
+done
+
+# Install Maven (for Hadoop)
+cd /tmp
+sudo wget -q "http://archive.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz"
+sudo tar xf apache-maven-3.3.9-bin.tar.gz
+sudo mv apache-maven-3.3.9 /opt
+
+# Edit bash profile
+echo "export PS1=\"\\u@\\h \\W]\\$ \"" >> ~/.bash_profile
+echo "export JAVA_HOME=/usr/lib/jvm/java-1.8.0" >> ~/.bash_profile
+echo "export M2_HOME=/opt/apache-maven-3.3.9" >> ~/.bash_profile
+echo "export HADOOP_HOME=/root/hadoop" >> ~/.bash_profile
+echo "export PATH=/usr/local/cuda/bin:\$PATH:\$M2_HOME/bin" >> ~/.bash_profile
+echo "export LD_LIBRARY_PATH=/usr/local/cuda/lib64" >> ~/.bash_profile
+
+source ~/.bash_profile
+
+# Build Hadoop to install native libs
+sudo mkdir /root/hadoop-native
+cd /tmp
+#sudo yum install -y protobuf-compiler cmake openssl-devel
+#wget "http://archive.apache.org/dist/hadoop/common/hadoop-2.4.1/hadoop-2.4.1-src.tar.gz"
+#tar xvzf hadoop-2.4.1-src.tar.gz
+#cd hadoop-2.4.1-src
+#mvn package -Pdist,native -DskipTests -Dtar
+#sudo mv hadoop-dist/target/hadoop-2.4.1/lib/native/* /root/hadoop-native
+sudo wget -q "http://archive.apache.org/dist/hadoop/common/hadoop-2.4.1/hadoop-2.4.1.tar.gz"
+sudo tar xf hadoop-2.4.1.tar.gz
+sudo mv hadoop-2.4.1 /root/hadoop
+
+# Install Snappy lib (for Hadoop)
+yum install -y snappy
+sudo ln -sf /usr/lib64/libsnappy.so.1 /root/hadoop-native/.
+
+# Create /usr/bin/realpath which is used by R to find Java installations
+# NOTE: /usr/bin/realpath is missing in CentOS AMIs. See
+# http://superuser.com/questions/771104/usr-bin-realpath-not-found-in-centos-6-5
+sudo echo '#!/bin/bash' > /usr/bin/realpath
+sudo echo 'readlink -e "$@"' >> /usr/bin/realpath
+sudo chmod a+x /usr/bin/realpath
+
+mkdir -p /tmp/spark-events
+chmod 777 /tmp/spark-events
+
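For reference, a minimal sanity check that could be run on an instance after create_image.sh completes. This is a sketch, not part of the patch; it assumes the paths the script itself sets up (CUDA under /usr/local/cuda, Hadoop under /root/hadoop), and the GPU checks only apply when the script was invoked with the `gpu` argument.

```bash
#!/bin/bash
# Hypothetical post-build sanity check; run as root after create_image.sh.
set -e

ls /root/hadoop/bin/hadoop                  # pre-built Hadoop 2.4.1 unpacked
ls -l /root/hadoop-native/libsnappy.so.1    # Snappy symlink for Hadoop native IO
realpath /usr/bin/java                      # the realpath shim created above resolves paths
test -d /tmp/spark-events && echo "Spark event log directory present"

# Only meaningful when create_image.sh was invoked with the "gpu" argument:
if [ "$1" == "gpu" ]; then
  nvidia-smi                                # driver loaded and GPU visible
  /usr/local/cuda/bin/nvcc --version        # CUDA 7.0 toolkit on the expected path
fi
```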
diff --git a/ec2/gpu/spark-defaults.conf.add b/ec2/gpu/spark-defaults.conf.add
new file mode 100644
index 0000000000..09b094f479
--- /dev/null
+++ b/ec2/gpu/spark-defaults.conf.add
@@ -0,0 +1,5 @@
+
+spark.eventLog.enabled true
+spark.eventLog.dir file:///tmp/spark-events
+spark.history.fs.logDirectory file:///tmp/spark-events
+
diff --git a/ec2/gpu/spark-env.sh.add b/ec2/gpu/spark-env.sh.add
new file mode 100644
index 0000000000..ed6a53bad9
--- /dev/null
+++ b/ec2/gpu/spark-env.sh.add
@@ -0,0 +1,2 @@
+
+export LD_LIBRARY_PATH=/usr/local/cuda/lib64
diff --git a/ec2/gpu/spark_init.sh b/ec2/gpu/spark_init.sh
new file mode 100644
index 0000000000..d2cbc0931d
--- /dev/null
+++ b/ec2/gpu/spark_init.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+pushd /root > /dev/null
+
+if [ -d "spark" ]; then
+  echo "Spark seems to be installed. Exiting."
+  return
+fi
+
+# Git repo|commit:
+if [[ "$SPARK_VERSION" == *\|* ]]
+then
+  mkdir spark
+  pushd spark > /dev/null
+  git init
+  repo=`python -c "print '$SPARK_VERSION'.split('|')[0]"`
+  git_hash=`python -c "print '$SPARK_VERSION'.split('|')[1]"`
+  git remote add origin $repo
+  git fetch origin
+  git checkout $git_hash
+  cd spark-gpu
+  ./make-distribution.sh -Dscala-2.11 -Phadoop-2.4 -Pyarn
+  popd > /dev/null
+
+# Pre-packaged Spark version:
+else
+  case "$SPARK_VERSION" in
+  2.0.0)
+    wget http://s3.amazonaws.com/spark-gpu-public/spark-gpu-latest-bin-hadoop2.4.tgz
+    if [ $? != 0 ]; then
+      echo "ERROR: Unknown Spark version"
+      return 1
+    fi
+  esac
+
+  echo "Unpacking Spark"
+  tar xvf spark-*.tgz > /tmp/spark-ec2_spark.log
+  rm spark-*.tgz
+  mv spark-gpu spark
+fi
+popd > /dev/null
diff --git a/ec2/gpu/user_data.txt b/ec2/gpu/user_data.txt
new file mode 100644
index 0000000000..bcf7793e2f
--- /dev/null
+++ b/ec2/gpu/user_data.txt
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+sed -i 's/Defaults requiretty/Defaults !requiretty/g' /etc/sudoers
+
+sed -i 's/PermitRootLogin forced-commands-only/PermitRootLogin without-password/g' /etc/ssh/sshd_config
+
+cd /root/.ssh
+cp authorized_keys authorized_keys.orig
+cp /home/ec2-user/.ssh/authorized_keys .
+service sshd restart
+
+yum install -y git
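The SPARK_VERSION value consumed by spark_init.sh above is either a plain version string (pre-packaged binary) or a "repository|commit" pair split on `|`. A hedged sketch of both forms follows; the repository URL and commit hash are placeholders, not real endpoints.

```bash
# Pre-packaged binary: downloads spark-gpu-latest-bin-hadoop2.4.tgz from S3.
export SPARK_VERSION="2.0.0"

# Build from source: "<git-repo-url>|<commit-or-tag>", split on '|' by spark_init.sh.
export SPARK_VERSION="https://github.com/example/spark-gpu.git|0123abcd"

# spark_init.sh derives the two halves roughly like this:
repo=`python -c "print '$SPARK_VERSION'.split('|')[0]"`
git_hash=`python -c "print '$SPARK_VERSION'.split('|')[1]"`
echo "repo=$repo commit=$git_hash"
```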
diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index 19d5980560..2c05ac0624 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -51,7 +51,7 @@
     raw_input = input
     xrange = range
 
-SPARK_EC2_VERSION = "1.6.0"
+SPARK_EC2_VERSION = "2.0.0"
 SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__))
 
 VALID_SPARK_VERSIONS = set([
@@ -76,6 +76,7 @@
     "1.5.1",
     "1.5.2",
     "1.6.0",
+    "2.0.0",
 ])
 
 SPARK_TACHYON_MAP = {
@@ -94,6 +95,7 @@
     "1.5.1": "0.7.1",
     "1.5.2": "0.7.1",
     "1.6.0": "0.8.2",
+    "2.0.0": "0.8.2",
 }
 
 DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION
@@ -192,12 +194,12 @@ def parse_args():
         help="If you have multiple profiles (AWS or boto config), you can configure " +
              "additional, named profiles by using this option (default: %default)")
     parser.add_option(
-        "-t", "--instance-type", default="m1.large",
+        "-t", "--instance-type", default="g2.2xlarge",
         help="Type of instance to launch (default: %default). " +
              "WARNING: must be 64-bit; small instances won't work")
     parser.add_option(
-        "-m", "--master-instance-type", default="",
-        help="Master instance type (leave empty for same as instance-type)")
+        "-m", "--master-instance-type", default="t2.micro",
+        help="Master instance type (default: %default).")
     parser.add_option(
         "-r", "--region", default="us-east-1",
         help="EC2 region used to launch instances in, or to find them in (default: %default)")
@@ -271,7 +273,7 @@ def parse_args():
         help="If specified, launch slaves as spot instances with the given " +
              "maximum price (in dollars)")
     parser.add_option(
-        "--ganglia", action="store_true", default=True,
+        "--ganglia", action="store_true", default=False,
         help="Setup Ganglia monitoring on cluster (default: %default). NOTE: " +
              "the Ganglia page will be publicly accessible")
     parser.add_option(
@@ -295,8 +297,8 @@
         help="Extra options to give to master through SPARK_MASTER_OPTS variable " +
              "(e.g -Dspark.worker.timeout=180)")
     parser.add_option(
-        "--user-data", type="string", default="",
-        help="Path to a user-data file (most AMIs interpret this as an initialization script)")
+        "--user-data", type="string", default=SPARK_EC2_DIR+"/gpu/user_data.txt",
+        help="Path to a user-data file (default: %default) (most AMIs interpret this as an initialization script)")
     parser.add_option(
         "--authorized-address", type="string", default="0.0.0.0/0",
         help="Address to authorize on created security groups (default: %default)")
@@ -327,6 +329,9 @@
     parser.add_option(
         "--instance-profile-name", default=None,
         help="IAM profile name to launch instances under")
+    parser.add_option(
+        "-g", "--gpu", action="store_true", default=True,
+        help="Enable GPU support (default: %default)")
 
     (opts, args) = parser.parse_args()
     if len(args) != 2:
@@ -444,6 +449,19 @@ def get_validate_spark_version(version, repo):
     "t2.large": "hvm",
 }
 
+# Source: http://aws.amazon.com/amazon-linux-ami/
+# Last Updated: 2016-01-02
+EC2_AMAZON_HVM_AMI = {
+    "us-east-1": "ami-60b6c60a",
+    "us-west-2": "ami-f0091d91",
+    "us-west-1": "ami-d5ea86b5",
+    "eu-west-1": "ami-bff32ccc",
+    "eu-central-1": "ami-bc5b48d0",
+    "ap-southeast-1": "ami-c9b572aa",
+    "ap-northeast-1": "ami-383c1956",
+    "ap-southeast-2": "ami-48d38c2b",
+    "sa-east-1": "ami-6817af04",
+}
 
 def get_tachyon_version(spark_version):
     return SPARK_TACHYON_MAP.get(spark_version, "")
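With the defaults changed above, a typical launch only needs credentials and a cluster name. A hypothetical invocation is sketched below; the key pair, identity file, and cluster name are placeholders, and the short options used (-k, -i, -r, -s) are the standard spark_ec2.py ones.

```bash
# Launches a GPU cluster: g2.2xlarge slaves, t2.micro master, Ganglia off, and
# gpu/user_data.txt passed as EC2 user data -- all of these are now defaults.
./spark_ec2.py \
  -k my-keypair \
  -i ~/.ssh/my-keypair.pem \
  -r us-east-1 \
  -s 2 \
  launch gpu-cluster

# The defaults can still be overridden explicitly, e.g.:
#   --instance-type=g2.8xlarge --master-instance-type=m3.large
```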
@@ -451,6 +469,15 @@ def get_tachyon_version(spark_version):
 
 # Attempt to resolve an appropriate AMI given the architecture and region of the request.
 def get_spark_ami(opts):
+    if opts.gpu:
+        if opts.region in EC2_AMAZON_HVM_AMI:
+            ami = EC2_AMAZON_HVM_AMI[opts.region]
+            print("Spark AMI: " + ami)
+            return ami
+        else:
+            print("Could not resolve AMAZON AMI for region: " + opts.region, file=stderr)
+            sys.exit(1)
+
     if opts.instance_type in EC2_INSTANCE_TYPES:
         instance_type = EC2_INSTANCE_TYPES[opts.instance_type]
     else:
@@ -851,11 +878,25 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key):
     )
 
     print("Running setup on master...")
-    setup_spark_cluster(master, opts)
+    setup_spark_cluster(master, slave_nodes, opts)
     print("Done!")
 
 
-def setup_spark_cluster(master, opts):
+def setup_spark_cluster(master, slave_nodes, opts):
+    if opts.gpu:
+        scp(master, opts, "%s/gpu/spark_init.sh" % SPARK_EC2_DIR, "spark-ec2/spark/init.sh")
+        scp(master, opts, "%s/gpu/spark-defaults.conf.add" % SPARK_EC2_DIR, "spark-ec2/templates/root/spark/conf/spark-defaults.conf.add")
+        scp(master, opts, "%s/gpu/spark-env.sh.add" % SPARK_EC2_DIR, "spark-ec2/templates/root/spark/conf/spark-env.sh.add")
+        ssh(master, opts, "cat spark-ec2/templates/root/spark/conf/spark-defaults.conf.add >> spark-ec2/templates/root/spark/conf/spark-defaults.conf")
+        ssh(master, opts, "cat spark-ec2/templates/root/spark/conf/spark-env.sh.add >> spark-ec2/templates/root/spark/conf/spark-env.sh")
+        scp(master, opts, "%s/gpu/create_image.sh" % SPARK_EC2_DIR, "create_image.sh")
+        ssh(master, opts, "chmod u+x ./create_image.sh")
+        ssh(master, opts, "./create_image.sh > ./create_image_master.log")
+        for slave in slave_nodes:
+            slave_address = get_dns_name(slave, opts.private_ips)
+            scp(slave_address, opts, "%s/gpu/create_image.sh" % SPARK_EC2_DIR, "create_image.sh")
+            ssh(slave_address, opts, "chmod u+x ./create_image.sh")
+            ssh(slave_address, opts, "./create_image.sh gpu > ./create_image_slave.log")
     ssh(master, opts, "chmod u+x spark-ec2/setup.sh")
     ssh(master, opts, "spark-ec2/setup.sh")
     print("Spark standalone cluster started at http://%s:8080" % master)
@@ -1180,6 +1221,29 @@
             tries = tries + 1
 
 
+def scp(host, opts, src, dst):
+    tries = 0
+    while True:
+        try:
+            return subprocess.check_call(
+                ['scp'] + ssh_args(opts) +
+                [stringify_command(src), '%s@%s:%s' % (opts.user, host, stringify_command(dst))])
+        except subprocess.CalledProcessError as e:
+            if tries > 5:
+                # If this was an ssh failure, provide the user with hints.
+                if e.returncode == 255:
+                    raise UsageError(
+                        "Failed to SCP to remote host {0}.\n"
+                        "Please check that you have provided the correct --identity-file and "
+                        "--key-pair parameters and try again.".format(host))
+                else:
+                    raise e
+            print("Error executing remote command, retrying after 30 seconds: {0}".format(e),
+                  file=stderr)
+            time.sleep(30)
+            tries = tries + 1
+
+
 # Backported from Python 2.7 for compatiblity with 2.6 (See SPARK-1990)
 def _check_output(*popenargs, **kwargs):
     if 'stdout' in kwargs:
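After a launch, the effect of the setup code above can be checked by hand. A hypothetical verification follows; <master> and <slave> stand for the public DNS names printed by spark_ec2.py, and the key file is a placeholder.

```bash
# create_image.sh output is redirected to these logs by setup_spark_cluster():
ssh -i ~/.ssh/my-keypair.pem root@<master> 'tail create_image_master.log'
ssh -i ~/.ssh/my-keypair.pem root@<slave>  'tail create_image_slave.log && nvidia-smi'

# spark-defaults.conf.add points event logs at /tmp/spark-events, so the
# history server can be started on the master once Spark is installed:
ssh -i ~/.ssh/my-keypair.pem root@<master> '/root/spark/sbin/start-history-server.sh'
```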