Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/ensure-builder/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ runs:
echo "runner_label=$USERNAME-$runner_type" >> $GITHUB_OUTPUT
if [[ $TYPE == builder-x86 ]]; then
# 128-core x86 instance types with least evictions
echo "instance_type=r6in.32xlarge r6a.32xlarge i4i.32xlarge r7iz.32xlarge" >> $GITHUB_OUTPUT
echo "instance_type=i4i.32xlarge m6a.32xlarge m6i.32xlarge m6id.32xlarge m6idn.32xlarge m6in.32xlarge m7a.32xlarge r6a.32xlarge r6i.32xlarge r6id.32xlarge r6in.32xlarge r7iz.32xlarge" >> $GITHUB_OUTPUT
echo "ami_id=ami-04d8422a9ba4de80f" >> $GITHUB_OUTPUT
echo "ebs_cache_size=256" >> $GITHUB_OUTPUT
echo "runner_concurrency=20" >> $GITHUB_OUTPUT
Expand Down
12 changes: 6 additions & 6 deletions .github/ensure-tester-with-images/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,14 +60,14 @@ runs:
export BUILDER_SPOT_IP=${{ env.BUILDER_SPOT_IP }}
export BUILDER_SPOT_KEY=~/.ssh/build_instance_key
scripts/run_on_builder "
sudo mkdir -p /var/lib/docker/tmp
sudo mkdir -p /var/lib/docker/tmp-images

sudo flock /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.lock bash -c '
if ! [ -f /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli ] ; then
docker save aztecprotocol/aztec:${{ env.IMAGE_KEY }} aztecprotocol/end-to-end:${{ env.IMAGE_KEY }} | brotli -2 > /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli.tmp
mv /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli.tmp /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli
sudo flock /var/lib/docker/tmp-images/${{ env.IMAGE_KEY }}.lock bash -c '
if ! [ -f /var/lib/docker/tmp-images/${{ env.IMAGE_KEY }}.brotli ] ; then
docker save aztecprotocol/aztec:${{ env.IMAGE_KEY }} aztecprotocol/end-to-end:${{ env.IMAGE_KEY }} | brotli -2 > /var/lib/docker/tmp-images/${{ env.IMAGE_KEY }}.brotli.tmp
mv /var/lib/docker/tmp-images/${{ env.IMAGE_KEY }}.brotli.tmp /var/lib/docker/tmp-images/${{ env.IMAGE_KEY }}.brotli
fi'
sudo cat /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli
sudo cat /var/lib/docker/tmp-images/${{ env.IMAGE_KEY }}.brotli
" | brotli --decompress | docker load

- name: Test
Expand Down
2 changes: 1 addition & 1 deletion .github/ensure-tester/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ runs:
elif [[ $TYPE == 128core-* ]]; then
SIZE=32xlarge
fi
echo "instance_type=m6a.$SIZE r6in.$SIZE r6a.$SIZE i4i.$SIZE r7iz.$SIZE" >> $GITHUB_OUTPUT
echo "instance_type=i4i.$SIZE m6a.$SIZE m6i.$SIZE m6id.$SIZE m6idn.$SIZE m6in.$SIZE m7a.$SIZE r6a.$SIZE r6i.$SIZE r6id.$SIZE r6in.$SIZE r7iz.$SIZE" >> $GITHUB_OUTPUT

- name: Start Tester
uses: ./.github/spot-runner-action
Expand Down
25 changes: 10 additions & 15 deletions .github/spot-runner-action/dist/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -260,13 +260,6 @@ class Ec2Instance {
LaunchTemplateData: {
ImageId: this.config.ec2AmiId,
InstanceInitiatedShutdownBehavior: "terminate",
InstanceRequirements: {
// We do not know what the instance types correspond to
// just let the user send a list of allowed instance types
VCpuCount: { Min: 0 },
MemoryMiB: { Min: 0 },
AllowedInstanceTypes: this.config.ec2InstanceType,
},
SecurityGroupIds: [this.config.ec2SecurityGroupId],
KeyName: this.config.ec2KeyName,
UserData: userDataScript,
Expand Down Expand Up @@ -326,6 +319,9 @@ class Ec2Instance {
Type: "instant",
LaunchTemplateConfigs: [fleetLaunchConfig],
ClientToken: this.config.clientToken || undefined,
SpotOptions: {
AllocationStrategy: "price-capacity-optimized",
},
TargetCapacitySpecification: {
TotalTargetCapacity: 1,
OnDemandTargetCapacity: useOnDemand ? 1 : 0,
Expand All @@ -336,13 +332,13 @@ class Ec2Instance {
const client = yield this.getEc2Client();
const fleet = yield client.createFleet(createFleetRequest).promise();
if (fleet.Errors && fleet.Errors.length > 0) {
core.warning(JSON.stringify(fleet.Errors, null, 2));
for (const error of fleet.Errors) {
if (error.ErrorCode === "RequestLimitExceeded" ||
error.ErrorCode === "InsufficientInstanceCapacity") {
return error.ErrorCode;
}
}
core.error(JSON.stringify(fleet.Errors, null, 2));
}
const instances = ((fleet === null || fleet === void 0 ? void 0 : fleet.Instances) || [])[0] || {};
return (instances.InstanceIds || [])[0] || "";
Expand Down Expand Up @@ -728,11 +724,10 @@ function requestAndWaitForSpot(config) {
}
let instanceId = "";
for (const ec2Strategy of ec2SpotStrategies) {
let backoff = 1;
let backoff = 0;
core.info(`Starting instance with ${ec2Strategy} strategy`);
// 6 * 10000ms = 1 minute per strategy, unless we hit RequestLimitExceeded, then we do exponential backoff
// TODO make longer lived spot request?
for (let i = 0; i < 6; i++) {
const MAX_ATTEMPTS = 3; // uses exponential backoff
for (let i = 0; i < MAX_ATTEMPTS; i++) {
// Start instance
const instanceIdOrError = yield ec2Client.requestMachine(
// we fallback to on-demand
Expand All @@ -742,15 +737,15 @@ function requestAndWaitForSpot(config) {
instanceIdOrError === "InsufficientInstanceCapacity") {
core.info("Failed to create instance due to " +
instanceIdOrError +
" , waiting 10 seconds and trying again.");
backoff += 1;
", waiting " + 5 * Math.pow(2, backoff) + " seconds and trying again.");
}
else {
instanceId = instanceIdOrError;
break;
}
// exponential backoff: wait 5 seconds, doubling the delay after each failed attempt
yield new Promise((r) => setTimeout(r, 10000 * Math.pow(2, backoff)));
yield new Promise((r) => setTimeout(r, 5000 * Math.pow(2, backoff)));
backoff += 1;
}
if (instanceId) {
core.info("Successfully requested instance with ID " + instanceId);
Expand Down
12 changes: 4 additions & 8 deletions .github/spot-runner-action/src/ec2.ts
Original file line number Diff line number Diff line change
Expand Up @@ -179,13 +179,6 @@ export class Ec2Instance {
LaunchTemplateData: {
ImageId: this.config.ec2AmiId,
InstanceInitiatedShutdownBehavior: "terminate",
InstanceRequirements: {
// We do not know what the instance types correspond to
// just let the user send a list of allowed instance types
VCpuCount: { Min: 0 },
MemoryMiB: { Min: 0 },
AllowedInstanceTypes: this.config.ec2InstanceType,
},
SecurityGroupIds: [this.config.ec2SecurityGroupId],
KeyName: this.config.ec2KeyName,
UserData: userDataScript,
Expand Down Expand Up @@ -245,6 +238,9 @@ export class Ec2Instance {
Type: "instant",
LaunchTemplateConfigs: [fleetLaunchConfig],
ClientToken: this.config.clientToken || undefined,
SpotOptions: {
AllocationStrategy: "price-capacity-optimized",
},
TargetCapacitySpecification: {
TotalTargetCapacity: 1,
OnDemandTargetCapacity: useOnDemand ? 1 : 0,
Expand All @@ -255,6 +251,7 @@ export class Ec2Instance {
const client = await this.getEc2Client();
const fleet = await client.createFleet(createFleetRequest).promise();
if (fleet.Errors && fleet.Errors.length > 0) {
core.warning(JSON.stringify(fleet.Errors, null, 2));
for (const error of fleet.Errors) {
if (
error.ErrorCode === "RequestLimitExceeded" ||
Expand All @@ -263,7 +260,6 @@ export class Ec2Instance {
return error.ErrorCode;
}
}
core.error(JSON.stringify(fleet.Errors, null, 2));
}
const instances: CreateFleetInstance = (fleet?.Instances || [])[0] || {};
return (instances.InstanceIds || [])[0] || "";
Expand Down
9 changes: 5 additions & 4 deletions .github/spot-runner-action/src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ async function requestAndWaitForSpot(config: ActionConfig): Promise<string> {
for (const ec2Strategy of ec2SpotStrategies) {
let backoff = 0;
core.info(`Starting instance with ${ec2Strategy} strategy`);
for (let i = 0; i < 6; i++) {
const MAX_ATTEMPTS = 3; // uses exponential backoff
for (let i = 0; i < MAX_ATTEMPTS; i++) {
// Start instance
const instanceIdOrError =
await ec2Client.requestMachine(
Expand All @@ -75,18 +76,18 @@ async function requestAndWaitForSpot(config: ActionConfig): Promise<string> {
instanceIdOrError === "RequestLimitExceeded" ||
instanceIdOrError === "InsufficientInstanceCapacity"
) {
backoff += 1;
core.info(
"Failed to create instance due to " +
instanceIdOrError +
" , waiting " + 10000 * 2 ** backoff + " seconds and trying again."
", waiting " + 5 * 2 ** backoff + " seconds and trying again."
);
} else {
instanceId = instanceIdOrError;
break;
}
// exponential backoff: wait 5 seconds, doubling the delay after each failed attempt
await new Promise((r) => setTimeout(r, 10000 * 2 ** backoff));
await new Promise((r) => setTimeout(r, 5000 * 2 ** backoff));
backoff += 1;
}
if (instanceId) {
core.info("Successfully requested instance with ID " + instanceId);
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ jobs:
concurrency_key: docs-preview-${{ inputs.username || github.actor }}-x86
- name: "Docs Preview"
timeout-minutes: 30
run: earthly --no-output ./docs/+deploy-preview --PR=${{ github.event.number }} --AZTEC_BOT_COMMENTER_GITHUB_TOKEN=${{ secrets.AZTEC_BOT_GITHUB_TOKEN }} --NETLIFY_AUTH_TOKEN=${{ secrets.NETLIFY_AUTH_TOKEN }} --NETLIFY_SITE_ID=${{ secrets.NETLIFY_SITE_ID }}
run: earthly-ci --no-output ./docs/+deploy-preview --PR=${{ github.event.number }} --AZTEC_BOT_COMMENTER_GITHUB_TOKEN=${{ secrets.AZTEC_BOT_GITHUB_TOKEN }} --NETLIFY_AUTH_TOKEN=${{ secrets.NETLIFY_AUTH_TOKEN }} --NETLIFY_SITE_ID=${{ secrets.NETLIFY_SITE_ID }}

bb-bench:
runs-on: ubuntu-latest
Expand Down
4 changes: 2 additions & 2 deletions scripts/ci/attach_ebs_cache.sh
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,9 @@ fi
# Create a mount point and mount the volume
mkdir -p /var/lib/docker
mount $BLKDEVICE /var/lib/docker
service docker restart
# clear our images temp folder
rm -rf /var/lib/docker/tmp
rm -rf /var/lib/docker/tmp-images
systemctl restart docker
# important: everything (except earthly ls) should go through earthly-ci
scripts/earthly-ci bootstrap
touch /home/ubuntu/.setup-complete
10 changes: 8 additions & 2 deletions scripts/earthly-ci
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ OUTPUT_FILE=$(mktemp)
INCONSISTENT_GRAPH_STATE_COUNT=0 # Counter for 'inconsistent graph state' errors

# Maximum attempts
MAX_ATTEMPTS=8
MAX_ATTEMPTS=5
ATTEMPT_COUNT=0

# earthly settings
Expand Down Expand Up @@ -45,9 +45,15 @@ while [ $ATTEMPT_COUNT -lt $MAX_ATTEMPTS ]; do
echo "Got 'inconsistent graph state' or 'failed to get state for index'. Sleeping for 30 seconds and retrying."
sleep 30
elif grep 'Error: pull ping error: pull ping response' $OUTPUT_FILE >/dev/null; then
echo "Got 'Error: pull ping error: pull ping response', intermittent failure when writing out images to docker"
echo "Got 'Error: pull ping error: pull ping response', intermittent failure when writing out images to docker. If this persists, try 'systemctl restart docker' on the spot instance."
elif grep '================================= System Info ==================================' $OUTPUT_FILE >/dev/null; then
echo "Detected an Earthly daemon restart, possibly due to it (mis)detecting a cache setting change, trying again..."
elif grep 'dial unix /run/buildkit/buildkitd.sock' $OUTPUT_FILE >/dev/null; then
echo "Detected earthly unable to find buildkit, waiting and trying again..."
sleep 20
elif grep 'The container name "/earthly-buildkitd" is already in use by container' $OUTPUT_FILE >/dev/null; then
echo "Detected earthly bootstrap happening in parallel and failing, waiting and trying again."
sleep 20
else
# If other errors, exit the script
exit 1
Expand Down