
Commit 2abc3fe

Support GPU on AWS EC2

1 parent 49aaf65, commit 2abc3fe

File tree: 6 files changed, +244 -9 lines

ec2/gpu/create_image.sh (new file, +111 lines)
#!/bin/bash
# Creates an AMI for the Spark EC2 scripts starting with a stock Amazon
# Linux AMI.
# This has only been tested with Amazon Linux AMI 2014.03.2

set -e

if [ "$(id -u)" != "0" ]; then
  echo "This script must be run as root" 1>&2
  exit 1
fi

# Update
yum update -y
# Dev tools
sudo yum install -y java-1.8.0-openjdk-devel gcc gcc-c++ ant git
# Perf tools
sudo yum install -y dstat iotop strace sysstat htop perf
#sudo debuginfo-install -q -y glibc
#sudo debuginfo-install -q -y kernel
yum install -y kernel-devel-`uname -r`
sudo yum --enablerepo='*-debug*' install -q -y java-1.8.0-openjdk-debuginfo.x86_64

# PySpark and MLlib deps
# sudo yum install -y python-matplotlib python-tornado scipy libgfortran
sudo yum install -y libgfortran
# SparkR deps
#sudo yum install -y R
# Other handy tools
sudo yum install -y pssh
# Ganglia
#sudo yum install -y ganglia ganglia-web ganglia-gmond ganglia-gmetad

if [ "$1" == "gpu" ]; then
  # CUDA
  sudo yum install -y gcc-c++
  # Install the NVIDIA driver
  sudo wget -P /root -q http://us.download.nvidia.com/XFree86/Linux-x86_64/346.96/NVIDIA-Linux-x86_64-346.96.run
  sudo chmod +x /root/NVIDIA-Linux-x86_64-346.96.run
  sudo /root/NVIDIA-Linux-x86_64-346.96.run -s > /root/driver.log 2>&1
  # Install CUDA (without driver installation... for Amazon Linux 2015.09)
  sudo wget -P /root -q http://developer.download.nvidia.com/compute/cuda/7_0/Prod/local_installers/cuda_7.0.28_linux.run
  sudo chmod +x /root/cuda_7.0.28_linux.run
  sudo /root/cuda_7.0.28_linux.run -extract=/root
  sudo /root/cuda-linux64-rel-7.0.28-19326674.run -noprompt > /root/cuda.log 2>&1
  sudo rm -f /root/*.run
fi

# Root ssh config
sudo sed -i 's/PermitRootLogin.*/PermitRootLogin without-password/g' \
  /etc/ssh/sshd_config
sudo sed -i 's/disable_root.*/disable_root: 0/g' /etc/cloud/cloud.cfg

# Set up ephemeral mounts
sudo sed -i 's/mounts.*//g' /etc/cloud/cloud.cfg
sudo sed -i 's/.*ephemeral.*//g' /etc/cloud/cloud.cfg
sudo sed -i 's/.*swap.*//g' /etc/cloud/cloud.cfg

echo "mounts:" >> /etc/cloud/cloud.cfg
echo " - [ ephemeral0, /mnt, auto, \"defaults,noatime,nodiratime\", "\
"\"0\", \"0\" ]" >> /etc/cloud/cloud.cfg

for x in {1..23}; do
  echo " - [ ephemeral$x, /mnt$((x + 1)), auto, "\
"\"defaults,noatime,nodiratime\", \"0\", \"0\" ]" >> /etc/cloud/cloud.cfg
done

# Install Maven (for Hadoop)
cd /tmp
sudo wget -q "http://archive.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz"
sudo tar xf apache-maven-3.3.9-bin.tar.gz
sudo mv apache-maven-3.3.9 /opt

# Edit bash profile
echo "export PS1=\"\\u@\\h \\W]\\$ \"" >> ~/.bash_profile
echo "export JAVA_HOME=/usr/lib/jvm/java-1.8.0" >> ~/.bash_profile
echo "export M2_HOME=/opt/apache-maven-3.3.9" >> ~/.bash_profile
echo "export HADOOP_HOME=/root/hadoop" >> ~/.bash_profile
echo "export PATH=/usr/local/cuda/bin:\$PATH:\$M2_HOME/bin" >> ~/.bash_profile
echo "export LD_LIBRARY_PATH=/usr/local/cuda/lib64" >> ~/.bash_profile

source ~/.bash_profile

# Build Hadoop to install native libs
sudo mkdir /root/hadoop-native
cd /tmp
#sudo yum install -y protobuf-compiler cmake openssl-devel
#wget "http://archive.apache.org/dist/hadoop/common/hadoop-2.4.1/hadoop-2.4.1-src.tar.gz"
#tar xvzf hadoop-2.4.1-src.tar.gz
#cd hadoop-2.4.1-src
#mvn package -Pdist,native -DskipTests -Dtar
#sudo mv hadoop-dist/target/hadoop-2.4.1/lib/native/* /root/hadoop-native
sudo wget -q "http://archive.apache.org/dist/hadoop/common/hadoop-2.4.1/hadoop-2.4.1.tar.gz"
sudo tar xf hadoop-2.4.1.tar.gz
sudo mv hadoop-2.4.1 /root/hadoop

# Install Snappy lib (for Hadoop)
yum install -y snappy
sudo ln -sf /usr/lib64/libsnappy.so.1 /root/hadoop-native/.

# Create /usr/bin/realpath, which is used by R to find Java installations.
# NOTE: /usr/bin/realpath is missing in CentOS AMIs. See
# http://superuser.com/questions/771104/usr-bin-realpath-not-found-in-centos-6-5
sudo echo '#!/bin/bash' > /usr/bin/realpath
sudo echo 'readlink -e "$@"' >> /usr/bin/realpath
sudo chmod a+x /usr/bin/realpath

mkdir -p /tmp/spark-events
chmod 777 /tmp/spark-events
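As wired up in ec2/spark_ec2.py below, this script is copied to every node and run over SSH during cluster setup. For reference, a manual invocation on an Amazon Linux instance (as root) mirrors what the launcher does:

# Base packages only, as run on the master:
./create_image.sh > ./create_image_master.log

# Base packages plus the NVIDIA driver and CUDA 7.0, as run on each slave:
./create_image.sh gpu > ./create_image_slave.log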

ec2/gpu/spark-defaults.conf.add (new file, +5 lines)
spark.eventLog.enabled           true
spark.eventLog.dir               file:///tmp/spark-events
spark.history.fs.logDirectory    file:///tmp/spark-events
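These properties enable the Spark event log and point the history server at /tmp/spark-events, the directory created by create_image.sh above. During setup the snippet is appended to the master's generated configuration; the relevant command issued by spark_ec2.py is roughly:

cat spark-ec2/templates/root/spark/conf/spark-defaults.conf.add >> \
  spark-ec2/templates/root/spark/conf/spark-defaults.conf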

ec2/gpu/spark-env.sh.add (new file, +2 lines)
export LD_LIBRARY_PATH=/usr/local/cuda/lib64

ec2/gpu/spark_init.sh (new file, +41 lines)
#!/bin/bash

pushd /root > /dev/null

if [ -d "spark" ]; then
  echo "Spark seems to be installed. Exiting."
  return
fi

# Github tag:
if [[ "$SPARK_VERSION" == *\|* ]]
then
  mkdir spark
  pushd spark > /dev/null
  git init
  repo=`python -c "print '$SPARK_VERSION'.split('|')[0]"`
  git_hash=`python -c "print '$SPARK_VERSION'.split('|')[1]"`
  git remote add origin $repo
  git fetch origin
  git checkout $git_hash
  cd spark-gpu
  ./make-distribution.sh -Dscala-2.11 -Phadoop-2.4 -Pyarn
  popd > /dev/null

# Pre-packaged spark version:
else
  case "$SPARK_VERSION" in
    2.0.0)
      wget http://s3.amazonaws.com/spark-gpu-public/spark-gpu-latest-bin-hadoop2.4.tgz
      if [ $? != 0 ]; then
        echo "ERROR: Unknown Spark version"
        return -1
      fi
  esac

  echo "Unpacking Spark"
  tar xvf spark-*.tgz > /tmp/spark-ec2_spark.log
  rm spark-*.tgz
  mv spark-gpu spark
fi
popd > /dev/null
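The init script accepts $SPARK_VERSION in two forms: a plain release number handled by the case statement, or a "repository|commit" pair that triggers a source build. For illustration (the repository URL below is a placeholder, not one referenced by this commit):

# Pre-packaged release, fetched from the spark-gpu-public S3 bucket:
export SPARK_VERSION="2.0.0"

# Source build from a specific git repository and commit ("repo|hash"):
export SPARK_VERSION="https://github.com/example/spark-gpu.git|0123abc"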

ec2/gpu/user_data.txt (new file, +12 lines)
#!/bin/bash

sed -i 's/Defaults requiretty/Defaults !requiretty/g' /etc/sudoers

sed -i 's/PermitRootLogin forced-commands-only/PermitRootLogin without-password/g' /etc/ssh/sshd_config

cd /root/.ssh
cp authorized_keys authorized_keys.orig
cp /home/ec2-user/.ssh/authorized_keys .
service sshd restart

yum install -y git

ec2/spark_ec2.py (+73 -9 lines)
@@ -51,7 +51,7 @@
     raw_input = input
     xrange = range

-SPARK_EC2_VERSION = "1.6.0"
+SPARK_EC2_VERSION = "2.0.0"
 SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__))

 VALID_SPARK_VERSIONS = set([
@@ -76,6 +76,7 @@
     "1.5.1",
     "1.5.2",
     "1.6.0",
+    "2.0.0",
 ])

 SPARK_TACHYON_MAP = {
@@ -94,6 +95,7 @@
     "1.5.1": "0.7.1",
     "1.5.2": "0.7.1",
     "1.6.0": "0.8.2",
+    "2.0.0": "0.8.2",
 }

 DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION
@@ -192,12 +194,12 @@ def parse_args():
         help="If you have multiple profiles (AWS or boto config), you can configure " +
              "additional, named profiles by using this option (default: %default)")
     parser.add_option(
-        "-t", "--instance-type", default="m1.large",
+        "-t", "--instance-type", default="g2.2xlarge",
         help="Type of instance to launch (default: %default). " +
              "WARNING: must be 64-bit; small instances won't work")
     parser.add_option(
-        "-m", "--master-instance-type", default="",
-        help="Master instance type (leave empty for same as instance-type)")
+        "-m", "--master-instance-type", default="t2.micro",
+        help="Master instance type (default: %default).")
     parser.add_option(
         "-r", "--region", default="us-east-1",
         help="EC2 region used to launch instances in, or to find them in (default: %default)")
@@ -271,7 +273,7 @@ def parse_args():
         help="If specified, launch slaves as spot instances with the given " +
              "maximum price (in dollars)")
     parser.add_option(
-        "--ganglia", action="store_true", default=True,
+        "--ganglia", action="store_true", default=False,
         help="Setup Ganglia monitoring on cluster (default: %default). NOTE: " +
              "the Ganglia page will be publicly accessible")
     parser.add_option(
@@ -295,8 +297,8 @@ def parse_args():
         help="Extra options to give to master through SPARK_MASTER_OPTS variable " +
              "(e.g -Dspark.worker.timeout=180)")
     parser.add_option(
-        "--user-data", type="string", default="",
-        help="Path to a user-data file (most AMIs interpret this as an initialization script)")
+        "--user-data", type="string", default=SPARK_EC2_DIR + "/gpu/user_data.txt",
+        help="Path to a user-data file (default: %default) (most AMIs interpret this as an initialization script)")
     parser.add_option(
         "--authorized-address", type="string", default="0.0.0.0/0",
         help="Address to authorize on created security groups (default: %default)")
@@ -327,6 +329,9 @@ def parse_args():
     parser.add_option(
         "--instance-profile-name", default=None,
         help="IAM profile name to launch instances under")
+    parser.add_option(
+        "-g", "--gpu", action="store_true", default=True,
+        help="Enable GPU exploitation (default: %default)")

     (opts, args) = parser.parse_args()
     if len(args) != 2:
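With these option changes, GPU support is enabled by default, slaves default to g2.2xlarge, the master to t2.micro, and the GPU user-data script is passed automatically. A typical launch (key pair, identity file, and cluster name are placeholders; -k, -i, and -s are the usual spark-ec2 flags, unchanged by this commit) would look roughly like:

./spark-ec2 -k my-keypair -i ~/my-keypair.pem -s 2 -r us-east-1 launch gpu-cluster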
@@ -444,13 +449,35 @@ def get_validate_spark_version(version, repo):
     "t2.large": "hvm",
 }

+# Source: http://aws.amazon.com/amazon-linux-ami/
+# Last Updated: 2016-01-02
+EC2_AMAZON_HVM_AMI = {
+    "us-east-1": "ami-60b6c60a",
+    "us-west-2": "ami-f0091d91",
+    "us-west-1": "ami-d5ea86b5",
+    "eu-west-1": "ami-bff32ccc",
+    "eu-central-1": "ami-bc5b48d0",
+    "ap-southeast-1": "ami-c9b572aa",
+    "ap-northeast-1": "ami-383c1956",
+    "ap-southeast-2": "ami-48d38c2b",
+    "sa-east-1": "ami-6817af04",
+}

 def get_tachyon_version(spark_version):
     return SPARK_TACHYON_MAP.get(spark_version, "")


 # Attempt to resolve an appropriate AMI given the architecture and region of the request.
 def get_spark_ami(opts):
+    if opts.gpu:
+        if opts.region in EC2_AMAZON_HVM_AMI:
+            ami = EC2_AMAZON_HVM_AMI[opts.region]
+            print("Spark AMI: " + ami)
+            return ami
+        else:
+            print("Could not resolve AMAZON AMI for region: " + opts.region, file=stderr)
+            sys.exit(1)
+
     if opts.instance_type in EC2_INSTANCE_TYPES:
         instance_type = EC2_INSTANCE_TYPES[opts.instance_type]
     else:
@@ -851,11 +878,25 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key):
     )

     print("Running setup on master...")
-    setup_spark_cluster(master, opts)
+    setup_spark_cluster(master, slave_nodes, opts)
     print("Done!")


-def setup_spark_cluster(master, opts):
+def setup_spark_cluster(master, slave_nodes, opts):
+    if opts.gpu:
+        scp(master, opts, "%s/gpu/spark_init.sh" % SPARK_EC2_DIR, "spark-ec2/spark/init.sh")
+        scp(master, opts, "%s/gpu/spark-defaults.conf.add" % SPARK_EC2_DIR, "spark-ec2/templates/root/spark/conf/spark-defaults.conf.add")
+        scp(master, opts, "%s/gpu/spark-env.sh.add" % SPARK_EC2_DIR, "spark-ec2/templates/root/spark/conf/spark-env.sh.add")
+        ssh(master, opts, "cat spark-ec2/templates/root/spark/conf/spark-defaults.conf.add >> spark-ec2/templates/root/spark/conf/spark-defaults.conf")
+        ssh(master, opts, "cat spark-ec2/templates/root/spark/conf/spark-env.sh.add >> spark-ec2/templates/root/spark/conf/spark-env.sh")
+        scp(master, opts, "%s/gpu/create_image.sh" % SPARK_EC2_DIR, "create_image.sh")
+        ssh(master, opts, "chmod u+x ./create_image.sh")
+        ssh(master, opts, "./create_image.sh > ./create_image_master.log")
+        for slave in slave_nodes:
+            slave_address = get_dns_name(slave, opts.private_ips)
+            scp(slave_address, opts, "%s/gpu/create_image.sh" % SPARK_EC2_DIR, "create_image.sh")
+            ssh(slave_address, opts, "chmod u+x ./create_image.sh")
+            ssh(slave_address, opts, "./create_image.sh gpu > ./create_image_slave.log")
     ssh(master, opts, "chmod u+x spark-ec2/setup.sh")
     ssh(master, opts, "spark-ec2/setup.sh")
     print("Spark standalone cluster started at http://%s:8080" % master)
@@ -1180,6 +1221,29 @@ def ssh(host, opts, command):
             tries = tries + 1


+def scp(host, opts, src, dst):
+    tries = 0
+    while True:
+        try:
+            return subprocess.check_call(
+                ['scp'] + ssh_args(opts) +
+                [stringify_command(src), '%s@%s:%s' % (opts.user, host, stringify_command(dst))])
+        except subprocess.CalledProcessError as e:
+            if tries > 5:
+                # If this was an ssh failure, provide the user with hints.
+                if e.returncode == 255:
+                    raise UsageError(
+                        "Failed to SCP to remote host {0}.\n"
+                        "Please check that you have provided the correct --identity-file and "
+                        "--key-pair parameters and try again.".format(host))
+                else:
+                    raise e
+            print("Error executing remote command, retrying after 30 seconds: {0}".format(e),
+                  file=stderr)
+            time.sleep(30)
+            tries = tries + 1
+
+
 # Backported from Python 2.7 for compatiblity with 2.6 (See SPARK-1990)
 def _check_output(*popenargs, **kwargs):
     if 'stdout' in kwargs:
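The new scp() helper mirrors the retry and error-hint logic of ssh() above and shells out to the system scp binary. Assuming ssh_args() supplies the usual host-key and identity-file options (an assumption about code not shown in this diff), each transfer is roughly equivalent to:

scp -o StrictHostKeyChecking=no -i <identity-file> <src> root@<host>:<dst>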
