111 changes: 111 additions & 0 deletions ec2/gpu/create_image.sh
@@ -0,0 +1,111 @@
#!/bin/bash
# Creates an AMI for the Spark EC2 scripts starting with a stock Amazon
# Linux AMI.
# This has only been tested with Amazon Linux AMI 2014.03.2

set -e

if [ "$(id -u)" != "0" ]; then
echo "This script must be run as root" 1>&2
exit 1
fi


# Update
yum update -y
# Dev tools
sudo yum install -y java-1.8.0-openjdk-devel gcc gcc-c++ ant git
# Perf tools
sudo yum install -y dstat iotop strace sysstat htop perf
#sudo debuginfo-install -q -y glibc
#sudo debuginfo-install -q -y kernel
yum install -y kernel-devel-`uname -r`
sudo yum --enablerepo='*-debug*' install -q -y java-1.8.0-openjdk-debuginfo.x86_64

# PySpark and MLlib deps
# sudo yum install -y python-matplotlib python-tornado scipy libgfortran
sudo yum install -y libgfortran
# SparkR deps
#sudo yum install -y R
# Other handy tools
sudo yum install -y pssh
# Ganglia
#sudo yum install -y ganglia ganglia-web ganglia-gmond ganglia-gmetad

if [ "$1" == "gpu" ]; then
# CUDA
sudo yum install -y gcc-c++
# Install NVIDIA Driver
sudo wget -P /root -q http://us.download.nvidia.com/XFree86/Linux-x86_64/346.96/NVIDIA-Linux-x86_64-346.96.run
sudo chmod +x /root/NVIDIA-Linux-x86_64-346.96.run
sudo /root/NVIDIA-Linux-x86_64-346.96.run -s > /root/driver.log 2>&1
# Install CUDA (without driver installation... for Amazon Linux 2015.09)
sudo wget -P /root -q http://developer.download.nvidia.com/compute/cuda/7_0/Prod/local_installers/cuda_7.0.28_linux.run
sudo chmod +x /root/cuda_7.0.28_linux.run
sudo /root/cuda_7.0.28_linux.run -extract=/root
sudo /root/cuda-linux64-rel-7.0.28-19326674.run -noprompt > /root/cuda.log 2>&1
rm -f /root/*.run
fi

# Root ssh config
sudo sed -i 's/PermitRootLogin.*/PermitRootLogin without-password/g' \
/etc/ssh/sshd_config
sudo sed -i 's/disable_root.*/disable_root: 0/g' /etc/cloud/cloud.cfg

# Set up ephemeral mounts
sudo sed -i 's/mounts.*//g' /etc/cloud/cloud.cfg
sudo sed -i 's/.*ephemeral.*//g' /etc/cloud/cloud.cfg
sudo sed -i 's/.*swap.*//g' /etc/cloud/cloud.cfg

echo "mounts:" >> /etc/cloud/cloud.cfg
echo " - [ ephemeral0, /mnt, auto, \"defaults,noatime,nodiratime\", "\
"\"0\", \"0\" ]" >> /etc/cloud.cloud.cfg

for x in {1..23}; do
echo " - [ ephemeral$x, /mnt$((x + 1)), auto, "\
"\"defaults,noatime,nodiratime\", \"0\", \"0\" ]" >> /etc/cloud/cloud.cfg
done

# Install Maven (for Hadoop)
cd /tmp
sudo wget -q "http://archive.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz"
sudo tar xf apache-maven-3.3.9-bin.tar.gz
sudo mv apache-maven-3.3.9 /opt

# Edit bash profile
echo "export PS1=\"\\u@\\h \\W]\\$ \"" >> ~/.bash_profile
echo "export JAVA_HOME=/usr/lib/jvm/java-1.8.0" >> ~/.bash_profile
echo "export M2_HOME=/opt/apache-maven-3.3.9" >> ~/.bash_profile
echo "export M2_HOME=/opt/hadoop-2.4.1" >> ~/.bash_profile
echo "export PATH=/usr/local/cuda/bin:\$PATH:\$M2_HOME/bin" >> ~/.bash_profile
echo "export LD_LIBRARY_PATH=/usr/local/cuda/lib64" >> ~/.bash_profile

source ~/.bash_profile

# Build Hadoop to install native libs
sudo mkdir /root/hadoop-native
cd /tmp
#sudo yum install -y protobuf-compiler cmake openssl-devel
#wget "http://archive.apache.org/dist/hadoop/common/hadoop-2.4.1/hadoop-2.4.1-src.tar.gz"
#tar xvzf hadoop-2.4.1-src.tar.gz
#cd hadoop-2.4.1-src
#mvn package -Pdist,native -DskipTests -Dtar
#sudo mv hadoop-dist/target/hadoop-2.4.1/lib/native/* /root/hadoop-native
sudo wget -q "http://archive.apache.org/dist/hadoop/common/hadoop-2.4.1/hadoop-2.4.1.tar.gz"
sudo tar xf hadoop-2.4.1.tar.gz
sudo mv hadoop-2.4.1 /root/hadoop

# Install Snappy lib (for Hadoop)
yum install -y snappy
sudo ln -sf /usr/lib64/libsnappy.so.1 /root/hadoop-native/.

# Create /usr/bin/realpath which is used by R to find Java installations
# NOTE: /usr/bin/realpath is missing in CentOS AMIs. See
# http://superuser.com/questions/771104/usr-bin-realpath-not-found-in-centos-6-5
cat > /usr/bin/realpath <<'EOF'
#!/bin/bash
readlink -e "$@"
EOF
chmod a+x /usr/bin/realpath

mkdir -p /tmp/spark-events
chmod 777 /tmp/spark-events
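For reference, a minimal sketch of how this image script is expected to be invoked, mirroring the calls added to setup_spark_cluster in spark_ec2.py below (host names are placeholders): it runs as root without arguments on the master, and with the gpu argument on each slave so the NVIDIA driver and CUDA 7.0 toolkit are installed there as well.

ssh root@MASTER_HOST "./create_image.sh > ./create_image_master.log"
ssh root@SLAVE_HOST "./create_image.sh gpu > ./create_image_slave.log"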

5 changes: 5 additions & 0 deletions ec2/gpu/spark-defaults.conf.add
@@ -0,0 +1,5 @@

spark.eventLog.enabled true
spark.eventLog.dir file:///tmp/spark-events
spark.history.fs.logDirectory file:///tmp/spark-events
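These settings enable event logging and point both the event log writer and the history server at /tmp/spark-events, which create_image.sh above creates with open permissions. A minimal sketch of exercising them, assuming Spark ends up under /root/spark as in spark_init.sh below:

mkdir -p /tmp/spark-events
/root/spark/sbin/start-history-server.sh
# Completed applications should then be browsable on port 18080 of the master.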

2 changes: 2 additions & 0 deletions ec2/gpu/spark-env.sh.add
@@ -0,0 +1,2 @@

export LD_LIBRARY_PATH=/usr/local/cuda/lib64
41 changes: 41 additions & 0 deletions ec2/gpu/spark_init.sh
@@ -0,0 +1,41 @@
#!/bin/bash

pushd /root > /dev/null

if [ -d "spark" ]; then
echo "Spark seems to be installed. Exiting."
return
fi

# Github tag:
if [[ "$SPARK_VERSION" == *\|* ]]
then
mkdir spark
pushd spark > /dev/null
git init
repo=`python -c "print '$SPARK_VERSION'.split('|')[0]"`
git_hash=`python -c "print '$SPARK_VERSION'.split('|')[1]"`
git remote add origin $repo
git fetch origin
git checkout $git_hash
cd spark-gpu
./make-distribution.sh -Dscala-2.11 -Phadoop-2.4 -Pyarn
popd > /dev/null

# Pre-packaged spark version:
else
case "$SPARK_VERSION" in
2.0.0)
wget http://s3.amazonaws.com/spark-gpu-public/spark-gpu-latest-bin-hadoop2.4.tgz
if [ $? != 0 ]; then
echo "ERROR: Unknown Spark version"
return -1
fi
esac

echo "Unpacking Spark"
tar xvf spark-*.tgz > /tmp/spark-ec2_spark.log
rm spark-*.tgz
mv spark-gpu spark
fi
popd > /dev/null
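This init script accepts two forms of $SPARK_VERSION: a plain release string handled by the case statement (downloading the pre-built spark-gpu package from S3), or a "repository|commit" pair that triggers a source build. A minimal sketch of both forms; the repository URL and commit hash below are placeholders, not values taken from this PR:

# Pre-packaged binary from the spark-gpu S3 bucket:
export SPARK_VERSION="2.0.0"
# Source build from a specific commit ("<git repo>|<git hash>"), hypothetical values:
export SPARK_VERSION="https://github.com/example/spark-gpu.git|0123abc"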
12 changes: 12 additions & 0 deletions ec2/gpu/user_data.txt
@@ -0,0 +1,12 @@
#!/bin/bash

sed -i 's/Defaults requiretty/Defaults !requiretty/g' /etc/sudoers

sed -i 's/PermitRootLogin forced-commands-only/PermitRootLogin without-password/g' /etc/ssh/sshd_config

cd /root/.ssh
cp authorized_keys authorized_keys.orig
cp /home/ec2-user/.ssh/authorized_keys .
service sshd restart

yum install -y git
82 changes: 73 additions & 9 deletions ec2/spark_ec2.py
@@ -51,7 +51,7 @@
raw_input = input
xrange = range

SPARK_EC2_VERSION = "1.6.0"
SPARK_EC2_VERSION = "2.0.0"
SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__))

VALID_SPARK_VERSIONS = set([
@@ -76,6 +76,7 @@
"1.5.1",
"1.5.2",
"1.6.0",
"2.0.0",
])

SPARK_TACHYON_MAP = {
@@ -94,6 +95,7 @@
"1.5.1": "0.7.1",
"1.5.2": "0.7.1",
"1.6.0": "0.8.2",
"2.0.0": "0.8.2",
}

DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION
@@ -192,12 +194,12 @@ def parse_args():
help="If you have multiple profiles (AWS or boto config), you can configure " +
"additional, named profiles by using this option (default: %default)")
parser.add_option(
"-t", "--instance-type", default="m1.large",
"-t", "--instance-type", default="g2.2xlarge",
help="Type of instance to launch (default: %default). " +
"WARNING: must be 64-bit; small instances won't work")
parser.add_option(
"-m", "--master-instance-type", default="",
help="Master instance type (leave empty for same as instance-type)")
"-m", "--master-instance-type", default="t2.micro",
help="Master instance type (default: %default).")
parser.add_option(
"-r", "--region", default="us-east-1",
help="EC2 region used to launch instances in, or to find them in (default: %default)")
@@ -271,7 +273,7 @@ def parse_args():
help="If specified, launch slaves as spot instances with the given " +
"maximum price (in dollars)")
parser.add_option(
"--ganglia", action="store_true", default=True,
"--ganglia", action="store_true", default=False,
help="Setup Ganglia monitoring on cluster (default: %default). NOTE: " +
"the Ganglia page will be publicly accessible")
parser.add_option(
@@ -295,8 +297,8 @@ def parse_args():
help="Extra options to give to master through SPARK_MASTER_OPTS variable " +
"(e.g -Dspark.worker.timeout=180)")
parser.add_option(
"--user-data", type="string", default="",
help="Path to a user-data file (most AMIs interpret this as an initialization script)")
"--user-data", type="string", default=SPARK_EC2_DIR+"/gpu/user_data.txt",
help="Path to a user-data file (default:%default) (most AMIs interpret this as an initialization script)")
parser.add_option(
"--authorized-address", type="string", default="0.0.0.0/0",
help="Address to authorize on created security groups (default: %default)")
@@ -327,6 +329,9 @@ def parse_args():
parser.add_option(
"--instance-profile-name", default=None,
help="IAM profile name to launch instances under")
parser.add_option(
"-g", "--gpu", action="store_true", default=True,
help="enable GPU exploitation (default: %default)")

(opts, args) = parser.parse_args()
if len(args) != 2:
@@ -444,13 +449,35 @@ def get_validate_spark_version(version, repo):
"t2.large": "hvm",
}

# Source: http://aws.amazon.com/amazon-linux-ami/
# Last Updated: 2016-01-02
EC2_AMAZON_HVM_AMI = {
"us-east-1": "ami-60b6c60a",
"us-west-2": "ami-f0091d91",
"us-west-1": "ami-d5ea86b5",
"eu-west-1": "ami-bff32ccc",
"eu-central-1": "ami-bc5b48d0",
"ap-southeast-1": "ami-c9b572aa",
"ap-northeast-1": "ami-383c1956",
"ap-southeast-2": "ami-48d38c2b",
"sa-east-1": "ami-6817af04",
}

def get_tachyon_version(spark_version):
return SPARK_TACHYON_MAP.get(spark_version, "")


# Attempt to resolve an appropriate AMI given the architecture and region of the request.
def get_spark_ami(opts):
if opts.gpu:
if opts.region in EC2_AMAZON_HVM_AMI:
ami = EC2_AMAZON_HVM_AMI[opts.region]
print("Spark AMI: " + ami)
return ami
else:
print("Could not resolve AMAZON AMI for region: " + opts.region, file=stderr)
sys.exit(1)

if opts.instance_type in EC2_INSTANCE_TYPES:
instance_type = EC2_INSTANCE_TYPES[opts.instance_type]
else:
@@ -851,11 +878,25 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key):
)

print("Running setup on master...")
setup_spark_cluster(master, opts)
setup_spark_cluster(master, slave_nodes, opts)
print("Done!")


def setup_spark_cluster(master, opts):
def setup_spark_cluster(master, slave_nodes, opts):
if opts.gpu:
scp(master, opts, "%s/gpu/spark_init.sh" % SPARK_EC2_DIR, "spark-ec2/spark/init.sh")
scp(master, opts, "%s/gpu/spark-defaults.conf.add" % SPARK_EC2_DIR, "spark-ec2/templates/root/spark/conf/spark-defaults.conf.add")
scp(master, opts, "%s/gpu/spark-env.sh.add" % SPARK_EC2_DIR, "spark-ec2/templates/root/spark/conf/spark-env.sh.add")
ssh(master, opts, "cat spark-ec2/templates/root/spark/conf/spark-defaults.conf.add >> spark-ec2/templates/root/spark/conf/spark-defaults.conf")
ssh(master, opts, "cat spark-ec2/templates/root/spark/conf/spark-env.sh.add >> spark-ec2/templates/root/spark/conf/spark-env.sh")
scp(master, opts, "%s/gpu/create_image.sh" % SPARK_EC2_DIR, "create_image.sh")
ssh(master, opts, "chmod u+x ./create_image.sh")
ssh(master, opts, "./create_image.sh > ./create_image_master.log")
for slave in slave_nodes:
slave_address = get_dns_name(slave, opts.private_ips)
scp(slave_address, opts, "%s/gpu/create_image.sh" % SPARK_EC2_DIR, "create_image.sh")
ssh(slave_address, opts, "chmod u+x ./create_image.sh")
ssh(slave_address, opts, "./create_image.sh gpu > ./create_image_slave.log")
ssh(master, opts, "chmod u+x spark-ec2/setup.sh")
ssh(master, opts, "spark-ec2/setup.sh")
print("Spark standalone cluster started at http://%s:8080" % master)
@@ -1180,6 +1221,29 @@ def ssh(host, opts, command):
tries = tries + 1


def scp(host, opts, src, dst):
tries = 0
while True:
try:
return subprocess.check_call(
['scp'] + ssh_args(opts) +
[stringify_command(src), '%s@%s:%s' % (opts.user, host, stringify_command(dst))])
except subprocess.CalledProcessError as e:
if tries > 5:
# If this was an scp failure, provide the user with hints.
if e.returncode == 255:
raise UsageError(
"Failed to SCP to remote host {0}.\n"
"Please check that you have provided the correct --identity-file and "
"--key-pair parameters and try again.".format(host))
else:
raise e
print("Error executing remote command, retrying after 30 seconds: {0}".format(e),
file=stderr)
time.sleep(30)
tries = tries + 1


# Backported from Python 2.7 for compatibility with 2.6 (See SPARK-1990)
def _check_output(*popenargs, **kwargs):
if 'stdout' in kwargs:
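With the defaults set in this patch (g2.2xlarge slaves, a t2.micro master, Ganglia off, gpu/user_data.txt as user data, and GPU setup enabled), launching a GPU cluster should not require any extra flags. A minimal sketch of a launch, with the key pair, identity file, and cluster name as placeholders:

./spark-ec2 -k MY_KEYPAIR -i ~/MY_KEYPAIR.pem -s 2 -r us-east-1 launch gpu-cluster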