From f67dbf3f99cec0804170574a40614b149b71ea0f Mon Sep 17 00:00:00 2001
From: Vivek Panyam
Date: Thu, 27 Jan 2022 01:03:37 -0500
Subject: [PATCH] [CI] Add scripts to create a buildkite AMI (#528)

### Summary:

This PR enables us to build a buildkite CI stack AMI that has GPU support (NVIDIA drivers + the NVIDIA container toolkit).

See https://github.com/buildkite/elastic-ci-stack-for-aws for general info on the buildkite CI stack.

### Test Plan:

- Ran a buildkite pipeline to build an AMI using `create_ami.sh` and confirmed that the resulting AMI has GPU support
- Confirmed that instances started from that AMI correctly build the rest of Neuropod

---
 build/ci/buildkite_image/README.md              |  9 ++++++
 build/ci/buildkite_image/create_ami.sh          | 23 +++++++++++++
 build/ci/buildkite_image/get_base_ami.py        | 22 +++++++++++++
 build/ci/buildkite_image/gpu_ami.json           | 32 +++++++++++++++++++
 .../install-nvidia-container-toolkit.sh         | 26 +++++++++++++++
 5 files changed, 112 insertions(+)
 create mode 100644 build/ci/buildkite_image/README.md
 create mode 100755 build/ci/buildkite_image/create_ami.sh
 create mode 100644 build/ci/buildkite_image/get_base_ami.py
 create mode 100644 build/ci/buildkite_image/gpu_ami.json
 create mode 100644 build/ci/buildkite_image/install-nvidia-container-toolkit.sh

diff --git a/build/ci/buildkite_image/README.md b/build/ci/buildkite_image/README.md
new file mode 100644
index 00000000..dcf2a103
--- /dev/null
+++ b/build/ci/buildkite_image/README.md
@@ -0,0 +1,9 @@
Builds a buildkite image that has GPU drivers and the NVIDIA container toolkit installed.

Make sure you have Docker installed. You also need the `requests` and `cfn-flip` Python packages (`cfn-flip` provides the `cfn_tools` module that `get_base_ami.py` imports).

Instructions to run:

```
AWS_ACCESS_KEY_ID=... AWS_SECRET_ACCESS_KEY=... AWS_REGION=us-east-2 BUILDKITE_STACK_VERSION=5.3.0 PACKER_LOG=1 ./create_ami.sh
```

diff --git a/build/ci/buildkite_image/create_ami.sh b/build/ci/buildkite_image/create_ami.sh
new file mode 100755
index 00000000..3f0d2614
--- /dev/null
+++ b/build/ci/buildkite_image/create_ami.sh
@@ -0,0 +1,23 @@
#!/bin/bash
set -eux

# Get the base AMI for this region and stack version
BASE_AMI=$(python3 ./get_base_ami.py --aws-region "${AWS_REGION}" --buildkite-stack-version "${BUILDKITE_STACK_VERSION}")

# Get this instance's public and local IPs via IMDSv2
# (disable xtrace so the IMDS token isn't written to the log)
set +x
TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
PUBLIC_IP=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -s http://169.254.169.254/latest/meta-data/public-ipv4)
LOCAL_IP=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -s http://169.254.169.254/latest/meta-data/local-ipv4)
set -x

echo "Public IP: ${PUBLIC_IP}. Local IP: ${LOCAL_IP}"

# Run packer in a container to build the AMI. The public IP is used to restrict
# SSH access to the temporary build instance to this machine.
docker run \
  -e AWS_ACCESS_KEY_ID \
  -e AWS_SECRET_ACCESS_KEY \
  -e PACKER_LOG \
  -v "${PWD}:/src" \
  --rm \
  -w /src \
  hashicorp/packer build -timestamp-ui -var "region=${AWS_REGION}" -var "source_ami=${BASE_AMI}" -var "public_ip=${PUBLIC_IP}" gpu_ami.json

diff --git a/build/ci/buildkite_image/get_base_ami.py b/build/ci/buildkite_image/get_base_ami.py
new file mode 100644
index 00000000..00eac022
--- /dev/null
+++ b/build/ci/buildkite_image/get_base_ami.py
@@ -0,0 +1,22 @@
import argparse
import requests

from cfn_tools import load_yaml


parser = argparse.ArgumentParser(description='Get the base AMI to use for a specific region and version of the buildkite elastic stack')
parser.add_argument('--aws-region', help='The AWS region (e.g. us-east-1)', required=True)
parser.add_argument('--buildkite-stack-version', help='The buildkite stack version (e.g. 5.3.0)', required=True)
args = parser.parse_args()

# Fetch the CloudFormation template for the requested stack version
r = requests.get(f"https://s3.amazonaws.com/buildkite-aws-stack/v{args.buildkite_stack_version}/aws-stack.yml")

if r.status_code != 200:
    raise ValueError(f"Failed to fetch buildkite stack config (HTTP {r.status_code})")

# Parse it and look up the base AMI for this region
config = load_yaml(r.text)
base_ami = config["Mappings"]["AWSRegion2AMI"][args.aws_region]["linuxamd64"]

print(base_ami)
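For context, the stack template stores its per-region base AMIs in a CloudFormation mapping. A minimal sketch of the structure `get_base_ami.py` indexes into (the AMI IDs below are placeholders; real values change with every stack release):

```
# Illustrative shape of the "Mappings" section of aws-stack.yml (placeholder values).
# CloudFormation templates use custom YAML tags such as !Ref and !GetAtt, which is
# why the script parses them with cfn_tools.load_yaml rather than a plain YAML loader.
config = {
    "Mappings": {
        "AWSRegion2AMI": {
            "us-east-1": {"linuxamd64": "ami-0aaaaaaaaaaaaaaaa"},
            "us-east-2": {"linuxamd64": "ami-0bbbbbbbbbbbbbbbb"},
            # ...one entry per region the stack supports
        }
    }
}

# Equivalent of the lookup performed by get_base_ami.py:
print(config["Mappings"]["AWSRegion2AMI"]["us-east-2"]["linuxamd64"])
```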
diff --git a/build/ci/buildkite_image/gpu_ami.json b/build/ci/buildkite_image/gpu_ami.json
new file mode 100644
index 00000000..5e548c71
--- /dev/null
+++ b/build/ci/buildkite_image/gpu_ami.json
@@ -0,0 +1,32 @@
{
    "variables": {
        "region": "us-east-1",
        "instance_type": "p2.xlarge"
    },

    "builders": [
        {
            "type": "amazon-ebs",
            "region": "{{user `region`}}",
            "source_ami": "{{user `source_ami`}}",
            "instance_type": "{{user `instance_type`}}",
            "ssh_username": "ec2-user",
            "ami_name": "neuropod-buildkite-stack-gpu-{{isotime | clean_resource_name}}",
            "ami_description": "Buildkite Elastic Stack AMI w/ GPU support",
            "ami_groups": ["all"],
            "temporary_security_group_source_cidrs": ["{{user `public_ip`}}/32"]
        }
    ],
    "provisioners": [
        {
            "type": "shell",
            "script": "install-nvidia-container-toolkit.sh"
        },
        {
            "type": "shell",
            "inline": [
                "rm /home/ec2-user/.ssh/authorized_keys"
            ]
        }
    ]
}

diff --git a/build/ci/buildkite_image/install-nvidia-container-toolkit.sh b/build/ci/buildkite_image/install-nvidia-container-toolkit.sh
new file mode 100644
index 00000000..544f94da
--- /dev/null
+++ b/build/ci/buildkite_image/install-nvidia-container-toolkit.sh
@@ -0,0 +1,26 @@
#!/bin/bash
set -eux

# Install NVIDIA drivers (pinned to a known-good version)
# https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html
sudo yum groupinstall -y "Development Tools"
sudo yum install -y kernel-devel-$(uname -r) kernel-headers-$(uname -r)
BASE_URL=https://us.download.nvidia.com/tesla
DRIVER_VERSION=470.82.01
curl -fsSL -O $BASE_URL/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
sudo sh NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --silent

# Install the container toolkit
# Instructions from https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html
distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \
    && curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo | sudo tee /etc/yum.repos.d/nvidia-docker.repo

sudo yum clean expire-cache
sudo yum install -y nvidia-docker2
sudo systemctl restart docker

# Automatically start docker on boot
sudo systemctl --now enable docker

# Sanity check: the container should be able to see the GPU
docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi
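Once `create_ami.sh` finishes, packer prints the ID of the new AMI, which can then be plugged into the elastic CI stack. A minimal sketch of that step (not part of this PR), assuming the stack template's documented `ImageId` and `BuildkiteAgentToken` parameters; the stack name, AMI ID, and agent token below are placeholders:

```
import boto3

REGION = "us-east-2"
STACK_VERSION = "5.3.0"
CUSTOM_AMI = "ami-0123456789abcdef0"  # placeholder: the AMI produced by create_ami.sh

# Launch the buildkite elastic CI stack using the GPU-enabled AMI
cf = boto3.client("cloudformation", region_name=REGION)
cf.create_stack(
    StackName="buildkite-ci-gpu",  # hypothetical stack name
    TemplateURL=f"https://s3.amazonaws.com/buildkite-aws-stack/v{STACK_VERSION}/aws-stack.yml",
    Parameters=[
        # ImageId overrides the stock AMI; BuildkiteAgentToken comes from your Buildkite org
        {"ParameterKey": "ImageId", "ParameterValue": CUSTOM_AMI},
        {"ParameterKey": "BuildkiteAgentToken", "ParameterValue": "<your-agent-token>"},
    ],
    # The stack defines IAM roles, so CloudFormation requires these capabilities
    Capabilities=["CAPABILITY_NAMED_IAM", "CAPABILITY_AUTO_EXPAND"],
)
```

A real deployment would set additional stack parameters (instance types, scaling limits, queue name, etc.) per the elastic-ci-stack-for-aws documentation.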