Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 74 additions & 31 deletions ci-operator/step-registry/ipi/conf/aws/ipi-conf-aws-commands.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,73 +18,113 @@ REGION="${LEASED_RESOURCE}"
# for general purpose work. Use by default, when supported in the
# region.
IS_M6A_REGION="no"
if aws ec2 describe-instance-type-offerings --region "${REGION}" | grep m6a ; then
if aws ec2 describe-instance-type-offerings --region "${REGION}" | grep -q m6a ; then
IS_M6A_REGION="yes"
fi

#######################################
# Choose between a desired EC2 instance type and a fallback type, based on
# whether recent CI jobs reported InsufficientInstanceCapacity for the desired
# type in the current region.
# Globals:   REGION (read)
# Arguments: $1 - desired instance type (e.g. m6a.xlarge)
#            $2 - fallback instance type (e.g. m6i.xlarge)
# Outputs:   the selected instance type on stdout; diagnostics on stderr
# Returns:   0 always; if the search service cannot be queried after all
#            retries, the desired type is selected.
#######################################
function eval_instance_capacity() {
  local DESIRED_TYPE="$1"
  local FALLBACK_TYPE="$2"
  # During our initial adoption of m6a, AWS has reported insufficient capacity at peak hours. For cost effectiveness
  # and to ensure AWS eventually adds m6a capacity due to these errors, we want to continue to use them. However,
  # if left unchecked, these peak hour errors can derail a statistically significant number of jobs.
  # To mitigate the capacity issues, search.ci.openshift.org can tell us if previous jobs have failed to provision
  # the desired instance type - in this region - in the last x minutes.
  # If we find such an error, use the fallback instance type.

  # Example error
  # error creating EC2 instance: InsufficientInstanceCapacity: We currently do not have sufficient m6a.xlarge capacity
  # in the Availability Zone you requested (us-east-1c). Our system will be working on provisioning additional capacity.
  # You can currently get m6a.xlarge capacity by not specifying an Availability Zone in your request or choosing
  # us-east-1a, us-east-1b, us-east-1d, us-east-1f.\n status code: 500, request id: ...

  local LOOK_BACK_PERIOD="30m"
  local MAX_RETRIES=30
  local TARGET_TYPE="${DESIRED_TYPE}"
  local SEARCH_URL="https://search.ci.openshift.org/search?search=InsufficientInstanceCapacity.*${DESIRED_TYPE}.*${REGION}&maxAge=${LOOK_BACK_PERIOD}&context=0&type=build-log"
  local retry response err_count

  # Query failures are handled by the retry loop, so errexit must not abort
  # the function. Remember the caller's setting so it can be restored exactly
  # (rather than unconditionally enabling errexit on the way out).
  local restore_errexit="no"
  if [[ "$-" == *e* ]]; then
    restore_errexit="yes"
    set +o errexit
  fi

  for ((retry = 1; retry <= MAX_RETRIES; retry++)); do
    # curl and jq are checked separately so a transport failure is detected
    # even when the caller has not enabled pipefail. The count must be a plain
    # number: empty or garbled output is a failed query, not evidence of a
    # capacity problem.
    if response=$(curl -L -s "${SEARCH_URL}") \
      && err_count=$(jq length <<<"${response}") \
      && [[ "${err_count}" =~ ^[0-9]+$ ]]; then
      if [[ "${err_count}" != "0" ]]; then
        >&2 echo "Recent instance AWS availability issue for ${DESIRED_TYPE} in ${REGION}; falling back to ${FALLBACK_TYPE}"
        TARGET_TYPE="${FALLBACK_TYPE}"
      fi
      break # err_count of 0 keeps DESIRED_TYPE
    fi
    >&2 echo "Error querying search.ci.openshift.org for AWS instance availability information (retry ${retry} of ${MAX_RETRIES})."
    # No point in waiting after the final attempt.
    if ((retry < MAX_RETRIES)); then
      sleep 2
    fi
  done

  echo "${TARGET_TYPE}"
  if [[ "${restore_errexit}" == "yes" ]]; then
    set -o errexit
  fi
}

# Do not change auto-types unless it is coordinated with the cloud
# financial operations team. Savings plans may be in place to
# decrease the cost of certain instance families.
if [[ "${COMPUTE_NODE_TYPE}" == "" ]]; then
if [[ "${IS_M6A_REGION}" == "yes" ]]; then
COMPUTE_NODE_TYPE="m6a.xlarge"
COMPUTE_NODE_TYPE=$(eval_instance_capacity "m6a.xlarge" "m6i.xlarge")
else
COMPUTE_NODE_TYPE="m6i.xlarge"
fi
fi

# Translate the requested cluster size variant into the control plane
# instance size suffix. Any variant not listed here gets "xlarge".
case "${SIZE_VARIANT}" in
  xlarge)
    CONTROL_PLANE_INSTANCE_SIZE="8xlarge"
    ;;
  large)
    CONTROL_PLANE_INSTANCE_SIZE="4xlarge"
    ;;
  compact)
    CONTROL_PLANE_INSTANCE_SIZE="2xlarge"
    ;;
  *)
    CONTROL_PLANE_INSTANCE_SIZE="xlarge"
    ;;
esac

# BootstrapInstanceType gets its value from pkg/types/aws/defaults/platform.go
architecture=${OCP_ARCH:-"amd64"}

if [[ "${IS_M6A_REGION}" == "yes" ]]; then
arch_instance_type=m6a
else
arch_instance_type=m6i
fi

if [[ "${CLUSTER_TYPE}" == "aws-arm64" ]]; then
architecture="arm64"
fi

# Select the control plane instance type and record its instance family in
# arch_instance_type, which is used below to build the bootstrap node type.
if [[ x"${architecture}" == x"arm64" ]]; then
# arm64 clusters always use the m6g family; no capacity check is performed.
arch_instance_type=m6g
CONTROL_PLANE_INSTANCE_TYPE="${arch_instance_type}.${CONTROL_PLANE_INSTANCE_SIZE}"
else
if [[ "${IS_M6A_REGION}" == "yes" ]]; then
# Prefer m6a, but let eval_instance_capacity switch to the m6i equivalent
# when recent jobs reported insufficient m6a capacity in this region.
CONTROL_PLANE_INSTANCE_TYPE=$(eval_instance_capacity "m6a.${CONTROL_PLANE_INSTANCE_SIZE}" "m6i.${CONTROL_PLANE_INSTANCE_SIZE}")
else
CONTROL_PLANE_INSTANCE_TYPE="m6i.${CONTROL_PLANE_INSTANCE_SIZE}"
fi
# Keep only the family prefix (the text before the first '.') of whichever
# type was chosen, e.g. "m6a.xlarge" -> "m6a".
arch_instance_type=$(echo -n "${CONTROL_PLANE_INSTANCE_TYPE}" | cut -d . -f 1)
fi

BOOTSTRAP_NODE_TYPE=${arch_instance_type}.large

workers=3
if [[ "${SIZE_VARIANT}" == "compact" ]]; then
workers=0
fi

master_type=${arch_instance_type}.xlarge
if [[ "${SIZE_VARIANT}" == "xlarge" ]]; then
master_type=${arch_instance_type}.8xlarge
elif [[ "${SIZE_VARIANT}" == "large" ]]; then
master_type=${arch_instance_type}.4xlarge
elif [[ "${SIZE_VARIANT}" == "compact" ]]; then
master_type=${arch_instance_type}.2xlarge
fi

# Generate working availability zones from the region
mapfile -t AVAILABILITY_ZONES < <(aws --region "${REGION}" ec2 describe-availability-zones | jq -r '.AvailabilityZones[] | select(.State == "available") | .ZoneName' | sort -u)
# Generate availability zones with OpenShift Installer required instance types

if [[ "${COMPUTE_NODE_TYPE}" == "${BOOTSTRAP_NODE_TYPE}" && "${COMPUTE_NODE_TYPE}" == "${master_type}" ]]; then ## all regions are the same
mapfile -t INSTANCE_ZONES < <(aws --region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${master_type}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 1 ' | awk '{print $2}')
elif [[ "${master_type}" == null && "${COMPUTE_NODE_TYPE}" == null ]]; then ## two null regions
mapfile -t INSTANCE_ZONES < <(aws --region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${master_type}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 1 ' | awk '{print $2}')
elif [[ "${master_type}" == null || "${COMPUTE_NODE_TYPE}" == null ]]; then ## one null region
if [[ "${BOOTSTRAP_NODE_TYPE}" == "${COMPUTE_NODE_TYPE}" || "${BOOTSTRAP_NODE_TYPE}" == "${master_type}" || "${master_type}" == "${COMPUTE_NODE_TYPE}" ]]; then ## "one null region and duplicates"
mapfile -t INSTANCE_ZONES < <(aws --region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${master_type}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 1 ' | awk '{print $2}')
if [[ "${COMPUTE_NODE_TYPE}" == "${BOOTSTRAP_NODE_TYPE}" && "${COMPUTE_NODE_TYPE}" == "${CONTROL_PLANE_INSTANCE_TYPE}" ]]; then ## all regions are the same
mapfile -t INSTANCE_ZONES < <(aws --region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${CONTROL_PLANE_INSTANCE_TYPE}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 1 ' | awk '{print $2}')
elif [[ "${CONTROL_PLANE_INSTANCE_TYPE}" == null && "${COMPUTE_NODE_TYPE}" == null ]]; then ## two null regions
mapfile -t INSTANCE_ZONES < <(aws --region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${CONTROL_PLANE_INSTANCE_TYPE}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 1 ' | awk '{print $2}')
elif [[ "${CONTROL_PLANE_INSTANCE_TYPE}" == null || "${COMPUTE_NODE_TYPE}" == null ]]; then ## one null region
if [[ "${BOOTSTRAP_NODE_TYPE}" == "${COMPUTE_NODE_TYPE}" || "${BOOTSTRAP_NODE_TYPE}" == "${CONTROL_PLANE_INSTANCE_TYPE}" || "${CONTROL_PLANE_INSTANCE_TYPE}" == "${COMPUTE_NODE_TYPE}" ]]; then ## "one null region and duplicates"
mapfile -t INSTANCE_ZONES < <(aws --region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${CONTROL_PLANE_INSTANCE_TYPE}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 1 ' | awk '{print $2}')
else ## "one null region and no duplicates"
mapfile -t INSTANCE_ZONES < <(aws --region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${master_type}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 2 ' | awk '{print $2}')
mapfile -t INSTANCE_ZONES < <(aws --region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${CONTROL_PLANE_INSTANCE_TYPE}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 2 ' | awk '{print $2}')
fi
elif [[ "${BOOTSTRAP_NODE_TYPE}" == "${COMPUTE_NODE_TYPE}" || "${BOOTSTRAP_NODE_TYPE}" == "${master_type}" || "${master_type}" == "${COMPUTE_NODE_TYPE}" ]]; then ## duplicates regions with no null region
mapfile -t INSTANCE_ZONES < <(aws --region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${master_type}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 2 ' | awk '{print $2}')
elif [[ "${BOOTSTRAP_NODE_TYPE}" != "${COMPUTE_NODE_TYPE}" && "${COMPUTE_NODE_TYPE}" != "${master_type}" ]]; then # three different regions
mapfile -t INSTANCE_ZONES < <(aws --region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${master_type}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 3 ' | awk '{print $2}')
elif [[ "${BOOTSTRAP_NODE_TYPE}" == "${COMPUTE_NODE_TYPE}" || "${BOOTSTRAP_NODE_TYPE}" == "${CONTROL_PLANE_INSTANCE_TYPE}" || "${CONTROL_PLANE_INSTANCE_TYPE}" == "${COMPUTE_NODE_TYPE}" ]]; then ## duplicates regions with no null region
mapfile -t INSTANCE_ZONES < <(aws --region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${CONTROL_PLANE_INSTANCE_TYPE}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 2 ' | awk '{print $2}')
elif [[ "${BOOTSTRAP_NODE_TYPE}" != "${COMPUTE_NODE_TYPE}" && "${COMPUTE_NODE_TYPE}" != "${CONTROL_PLANE_INSTANCE_TYPE}" ]]; then # three different regions
mapfile -t INSTANCE_ZONES < <(aws --region "${REGION}" ec2 describe-instance-type-offerings --location-type availability-zone --filters Name=instance-type,Values="${BOOTSTRAP_NODE_TYPE}","${CONTROL_PLANE_INSTANCE_TYPE}","${COMPUTE_NODE_TYPE}" | jq -r '.InstanceTypeOfferings[].Location' | sort | uniq -c | grep ' 3 ' | awk '{print $2}')
fi
# Generate availability zones based on these 2 criterias
# Generate availability zones based on these 2 criteria
mapfile -t ZONES < <(echo "${AVAILABILITY_ZONES[@]}" "${INSTANCE_ZONES[@]}" | sed 's/ /\n/g' | sort -R | uniq -d)
# Calculate the maximum number of availability zones from the region
MAX_ZONES_COUNT="${#ZONES[@]}"
Expand Down Expand Up @@ -114,6 +154,9 @@ else
echo "zones already set in install-config.yaml, skipped"
fi

echo "Using control plane instance type: ${CONTROL_PLANE_INSTANCE_TYPE}"
echo "Using compute instance type: ${COMPUTE_NODE_TYPE}"

PATCH="${SHARED_DIR}/install-config-common.yaml.patch"
cat > "${PATCH}" << EOF
baseDomain: ${BASE_DOMAIN}
Expand All @@ -127,7 +170,7 @@ controlPlane:
name: master
platform:
aws:
type: ${master_type}
type: ${CONTROL_PLANE_INSTANCE_TYPE}
compute:
- architecture: ${architecture}
name: worker
Expand Down