From 78375e67c5ca7c69a85913da712dd9622c850b93 Mon Sep 17 00:00:00 2001
From: spypsy <spypsy@outlook.com>
Date: Mon, 3 Jun 2024 16:21:34 +0000
Subject: [PATCH 1/4] feat: autoscale prover agents on AWS

---
 yarn-project/aztec/terraform/node/main.tf | 139 +++++++++++++++++++---
 1 file changed, 124 insertions(+), 15 deletions(-)

diff --git a/yarn-project/aztec/terraform/node/main.tf b/yarn-project/aztec/terraform/node/main.tf
index 4b177950eb47..b371945b1958 100644
--- a/yarn-project/aztec/terraform/node/main.tf
+++ b/yarn-project/aztec/terraform/node/main.tf
@@ -551,18 +551,16 @@ resource "aws_security_group_rule" "allow-node-udp-out" {
 
 
 
-
-// Configuration for proving agents
-
+# Configuration for proving agents
 resource "aws_cloudwatch_log_group" "aztec-proving-agent-log-group" {
-  count             = local.total_agents
-  name              = "/fargate/service/${var.DEPLOY_TAG}/aztec-proving-agent-${floor(count.index / local.agents_per_sequencer) + 1}-${(count.index % local.agents_per_sequencer) + 1}"
+  count             = local.node_count
+  name              = "/fargate/service/${var.DEPLOY_TAG}/aztec-proving-agent-group-${count.index + 1}"
   retention_in_days = 14
 }
 
 resource "aws_service_discovery_service" "aztec-proving-agent" {
-  count = local.total_agents
-  name  = "${var.DEPLOY_TAG}-aztec-proving-agent-${floor(count.index / local.agents_per_sequencer) + 1}-${(count.index % local.agents_per_sequencer) + 1}"
+  count = local.node_count
+  name  = "${var.DEPLOY_TAG}-aztec-proving-agent-group-${count.index + 1}"
 
   health_check_custom_config {
     failure_threshold = 1
@@ -593,7 +591,7 @@ resource "aws_service_discovery_service" "aztec-proving-agent" {
 
 # Define task definitions for each node.
 resource "aws_ecs_task_definition" "aztec-proving-agent" {
-  count                    = local.total_agents
+  count                    = local.node_count
   family                   = "${var.DEPLOY_TAG}-aztec-proving-agent-${floor(count.index / local.agents_per_sequencer) + 1}-${(count.index % local.agents_per_sequencer) + 1}"
   requires_compatibilities = ["FARGATE"]
   network_mode             = "awsvpc"
@@ -605,7 +603,7 @@ resource "aws_ecs_task_definition" "aztec-proving-agent" {
   container_definitions = <<DEFINITIONS
 [
   {
-    "name": "${var.DEPLOY_TAG}-aztec-proving-agent-${floor(count.index / local.agents_per_sequencer) + 1}-${(count.index % local.agents_per_sequencer) + 1}",
+    "name": "${var.DEPLOY_TAG}-aztec-proving-agent-group-${count.index + 1}",
     "image": "${var.DOCKERHUB_ACCOUNT}/aztec:${var.DEPLOY_TAG}",
     "command": ["start", "--prover"],
     "essential": true,
@@ -629,8 +627,8 @@ resource "aws_ecs_task_definition" "aztec-proving-agent" {
         "value": "${var.DEPLOY_TAG}"
       },
       {
-        "name": "PROVER_URL",
-        "value": "http://${var.DEPLOY_TAG}-aztec-node-${floor(count.index / local.agents_per_sequencer) + 1}.local/${var.DEPLOY_TAG}/aztec-node-${floor(count.index / local.agents_per_sequencer) + 1}"
+        "name": "AZTEC_NODE_URL",
+        "value": "http://${var.DEPLOY_TAG}-aztec-node-${count.index + 1}.local/${var.DEPLOY_TAG}/aztec-node-${count.index + 1}"
       },
       {
         "name": "PROVER_AGENTS",
@@ -660,7 +658,7 @@ resource "aws_ecs_task_definition" "aztec-proving-agent" {
     "logConfiguration": {
       "logDriver": "awslogs",
       "options": {
-        "awslogs-group": "/fargate/service/${var.DEPLOY_TAG}/aztec-proving-agent-${floor(count.index / local.agents_per_sequencer) + 1}-${(count.index % local.agents_per_sequencer) + 1}",
+        "awslogs-group": "${aws_cloudwatch_log_group.aztec-proving-agent-log-group[count.index].name}",
         "awslogs-region": "eu-west-2",
         "awslogs-stream-prefix": "ecs"
       }
@@ -671,8 +669,8 @@ DEFINITIONS
 }
 
 resource "aws_ecs_service" "aztec-proving-agent" {
-  count                              = local.total_agents
-  name                               = "${var.DEPLOY_TAG}-aztec-proving-agent-${floor(count.index / local.agents_per_sequencer) + 1}-${(count.index % local.agents_per_sequencer) + 1}"
+  count                              = local.node_count
+  name                               = "${var.DEPLOY_TAG}-aztec-proving-agent-group-${count.index + 1}"
   cluster                            = data.terraform_remote_state.setup_iac.outputs.ecs_cluster_id
   launch_type                        = "FARGATE"
   desired_count                      = 1
@@ -691,9 +689,120 @@ resource "aws_ecs_service" "aztec-proving-agent" {
 
   service_registries {
     registry_arn   = aws_service_discovery_service.aztec-proving-agent[count.index].arn
-    container_name = "${var.DEPLOY_TAG}-aztec-proving-agent-${floor(count.index / local.agents_per_sequencer) + 1}-${(count.index % local.agents_per_sequencer) + 1}"
+    container_name = "${var.DEPLOY_TAG}-aztec-proving-agent-group-${count.index + 1}"
     container_port = 80
   }
 
   task_definition = aws_ecs_task_definition.aztec-proving-agent[count.index].family
 }
+
+
+# Create CloudWatch alarms on proving-agent CPU utilization that trigger the scaling policies
+resource "aws_cloudwatch_metric_alarm" "cpu_high" {
+  count               = local.node_count
+  alarm_name          = "${var.DEPLOY_TAG}-proving-agent-cpu-high-${count.index + 1}"
+  comparison_operator = "GreaterThanThreshold"
+  evaluation_periods  = "1"
+  metric_name         = "CPUUtilization"
+  namespace           = "AWS/ECS"
+  period              = "300"
+  statistic           = "Average"
+  threshold           = "10"
+  alarm_description   = "Alert when CPU utilization is greater than 10%"
+  dimensions = {
+    ClusterName = data.terraform_remote_state.setup_iac.outputs.ecs_cluster_id
+    ServiceName = "${aws_ecs_service.aztec-proving-agent[count.index].name}"
+  }
+  alarm_actions = [aws_appautoscaling_policy.scale_out[count.index].arn]
+}
+
+resource "aws_cloudwatch_metric_alarm" "cpu_low" {
+  count               = local.node_count
+  alarm_name          = "${var.DEPLOY_TAG}-proving-agent-cpu-low-${count.index + 1}"
+  comparison_operator = "LessThanThreshold"
+  evaluation_periods  = "1"
+  metric_name         = "CPUUtilization"
+  namespace           = "AWS/ECS"
+  period              = "300"
+  statistic           = "Average"
+  threshold           = "10"
+  alarm_description   = "Alarm when CPU utilization is less than 10%"
+  dimensions = {
+    ClusterName = data.terraform_remote_state.setup_iac.outputs.ecs_cluster_id
+    ServiceName = "${aws_ecs_service.aztec-proving-agent[count.index].name}"
+  }
+  alarm_actions = [aws_appautoscaling_policy.scale_in[count.index].arn]
+}
+
+# Create Auto Scaling Target for ECS Service
+resource "aws_appautoscaling_target" "ecs_proving_agent" {
+  count              = local.node_count
+  max_capacity       = var.AGENTS_PER_SEQUENCER
+  min_capacity       = 1
+  resource_id        = "service/${data.terraform_remote_state.setup_iac.outputs.ecs_cluster_id}/${aws_ecs_service.aztec_proving_agent[count.index].name}"
+  scalable_dimension = "ecs:service:DesiredCount"
+  service_namespace  = "ecs"
+}
+
+# Create Scaling Policy for Scaling Out
+resource "aws_appautoscaling_policy" "scale_out" {
+  count              = local.node_count
+  name               = "${var.DEPLOY_TAG}-scale-out-${count.index}"
+  policy_type        = "StepScaling"
+  resource_id        = aws_appautoscaling_target.ecs_proving_agent[count.index].resource_id
+  scalable_dimension = aws_appautoscaling_target.ecs_proving_agent[count.index].scalable_dimension
+  service_namespace  = aws_appautoscaling_target.ecs_proving_agent[count.index].service_namespace
+
+  step_scaling_policy_configuration {
+    adjustment_type         = "ChangeInCapacity"
+    cooldown                = 60
+    metric_aggregation_type = "Average"
+
+    step_adjustment {
+      scaling_adjustment          = 1
+      metric_interval_lower_bound = 0
+    }
+  }
+}
+
+# Create Scaling Policy for Scaling In
+resource "aws_appautoscaling_policy" "scale_in" {
+  count              = local.node_count
+  name               = "${var.DEPLOY_TAG}-scale-in-${count.index + 1}"
+  policy_type        = "StepScaling"
+  resource_id        = aws_appautoscaling_target.ecs_proving_agent[count.index].resource_id
+  scalable_dimension = aws_appautoscaling_target.ecs_proving_agent[count.index].scalable_dimension
+  service_namespace  = aws_appautoscaling_target.ecs_proving_agent[count.index].service_namespace
+
+  step_scaling_policy_configuration {
+    adjustment_type         = "ChangeInCapacity"
+    cooldown                = 60
+    metric_aggregation_type = "Average"
+
+    step_adjustment {
+      scaling_adjustment          = -1
+      metric_interval_upper_bound = 0
+    }
+  }
+}
+
+# Link the High CPU alarm to the scale out policy
+resource "aws_cloudwatch_metric_alarm" "cpu_high" {
+  count               = local.node_count
+  alarm_name          = "${var.DEPLOY_TAG}-cpu-high-${count.index + 1}"
+  comparison_operator = "GreaterThanThreshold"
+  evaluation_periods  = "1"
+  metric_name         = "CPUUtilization"
+  namespace           = "AWS/ECS"
+  period              = "300"
+  statistic           = "Average"
+  threshold           = "10"
+  alarm_description   = "Alarm when CPU utilization is greater than 10%"
+  dimensions = {
+    ClusterName = data.terraform_remote_state.setup_iac.outputs.ecs_cluster_id
+    ServiceName = "${aws_ecs_service.aztec_proving_agent[count.index].name}"
+  }
+  alarm_actions             = [aws_appautoscaling_policy.scale_out[count.index].arn]
+  insufficient_data_actions = []
+  ok_actions                = []
+}

From 75552b1bf66e14cd7b005c989c2885d47105054c Mon Sep 17 00:00:00 2001
From: spypsy <spypsy@outlook.com>
Date: Mon, 3 Jun 2024 16:23:55 +0000
Subject: [PATCH 2/4] fix some vars

---
 yarn-project/aztec/terraform/node/main.tf | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/yarn-project/aztec/terraform/node/main.tf b/yarn-project/aztec/terraform/node/main.tf
index b371945b1958..80cc2a6dd66e 100644
--- a/yarn-project/aztec/terraform/node/main.tf
+++ b/yarn-project/aztec/terraform/node/main.tf
@@ -59,7 +59,6 @@ locals {
   node_count             = length(local.publisher_private_keys)
   data_dir               = "/usr/src/yarn-project/aztec/data"
   agents_per_sequencer   = var.AGENTS_PER_SEQUENCER
-  total_agents           = local.node_count * local.agents_per_sequencer
 }
 
 resource "aws_cloudwatch_log_group" "aztec-node-log-group" {
@@ -737,7 +736,7 @@ resource "aws_cloudwatch_metric_alarm" "cpu_low" {
 # Create Auto Scaling Target for ECS Service
 resource "aws_appautoscaling_target" "ecs_proving_agent" {
   count              = local.node_count
-  max_capacity       = var.AGENTS_PER_SEQUENCER
+  max_capacity       = local.agents_per_sequencer
   min_capacity       = 1
-  resource_id        = "service/${data.terraform_remote_state.setup_iac.outputs.ecs_cluster_id}/${aws_ecs_service.aztec_proving_agent[count.index].name}"
+  resource_id        = "service/${data.terraform_remote_state.setup_iac.outputs.ecs_cluster_id}/${aws_ecs_service.aztec-proving-agent[count.index].name}"
   scalable_dimension = "ecs:service:DesiredCount"

From a838f9e21b971d26e4bc77afad0299701de4cc02 Mon Sep 17 00:00:00 2001
From: spypsy <spypsy@outlook.com>
Date: Mon, 3 Jun 2024 16:45:35 +0000
Subject: [PATCH 3/4] adjustments for immediate scaling

---
 yarn-project/aztec/terraform/node/main.tf | 41 ++++++-----------------
 1 file changed, 10 insertions(+), 31 deletions(-)

diff --git a/yarn-project/aztec/terraform/node/main.tf b/yarn-project/aztec/terraform/node/main.tf
index 80cc2a6dd66e..340506a7a916 100644
--- a/yarn-project/aztec/terraform/node/main.tf
+++ b/yarn-project/aztec/terraform/node/main.tf
@@ -591,7 +591,7 @@ resource "aws_service_discovery_service" "aztec-proving-agent" {
 # Define task definitions for each node.
 resource "aws_ecs_task_definition" "aztec-proving-agent" {
   count                    = local.node_count
-  family                   = "${var.DEPLOY_TAG}-aztec-proving-agent-${floor(count.index / local.agents_per_sequencer) + 1}-${(count.index % local.agents_per_sequencer) + 1}"
+  family                   = "${var.DEPLOY_TAG}-aztec-proving-agent-group-${count.index + 1}"
   requires_compatibilities = ["FARGATE"]
   network_mode             = "awsvpc"
   cpu                      = "16384"
@@ -704,8 +704,8 @@ resource "aws_cloudwatch_metric_alarm" "cpu_high" {
   evaluation_periods  = "1"
   metric_name         = "CPUUtilization"
   namespace           = "AWS/ECS"
-  period              = "300"
-  statistic           = "Average"
+  period              = "60"
+  statistic           = "Maximum"
   threshold           = "10"
   alarm_description   = "Alert when CPU utilization is greater than 10%"
   dimensions = {
@@ -722,8 +722,8 @@ resource "aws_cloudwatch_metric_alarm" "cpu_low" {
   evaluation_periods  = "1"
   metric_name         = "CPUUtilization"
   namespace           = "AWS/ECS"
-  period              = "300"
-  statistic           = "Average"
+  period              = "60"
+  statistic           = "Maximum"
   threshold           = "10"
   alarm_description   = "Alarm when CPU utilization is less than 10%"
   dimensions = {
@@ -747,7 +747,7 @@ resource "aws_appautoscaling_target" "ecs_proving_agent" {
 resource "aws_appautoscaling_policy" "scale_out" {
   count              = local.node_count
-  name               = "${var.DEPLOY_TAG}-scale-out-${count.index}"
+  name               = "${var.DEPLOY_TAG}-scale-out-${count.index + 1}"
   policy_type        = "StepScaling"
   resource_id        = aws_appautoscaling_target.ecs_proving_agent[count.index].resource_id
   scalable_dimension = aws_appautoscaling_target.ecs_proving_agent[count.index].scalable_dimension
   service_namespace  = aws_appautoscaling_target.ecs_proving_agent[count.index].service_namespace
@@ -755,10 +755,10 @@ resource "aws_appautoscaling_policy" "scale_out" {
   step_scaling_policy_configuration {
     adjustment_type         = "ChangeInCapacity"
     cooldown                = 60
-    metric_aggregation_type = "Average"
+    metric_aggregation_type = "Maximum"
 
     step_adjustment {
-      scaling_adjustment          = 1
+      scaling_adjustment          = local.agents_per_sequencer - 1 # -1 since we're adding our target to the existing 1
       metric_interval_lower_bound = 0
     }
   }
@@ -776,32 +776,11 @@ resource "aws_appautoscaling_policy" "scale_in" {
   step_scaling_policy_configuration {
     adjustment_type         = "ChangeInCapacity"
     cooldown                = 60
-    metric_aggregation_type = "Average"
+    metric_aggregation_type = "Maximum"
 
     step_adjustment {
-      scaling_adjustment          = -1
+      scaling_adjustment          = -local.agents_per_sequencer + 1 # +1 since we're removing our target from the existing 1
       metric_interval_upper_bound = 0
     }
   }
 }
-
-# Link the High CPU alarm to the scale out policy
-resource "aws_cloudwatch_metric_alarm" "cpu_high" {
-  count               = local.node_count
-  alarm_name          = "${var.DEPLOY_TAG}-cpu-high-${count.index + 1}"
-  comparison_operator = "GreaterThanThreshold"
-  evaluation_periods  = "1"
-  metric_name         = "CPUUtilization"
-  namespace           = "AWS/ECS"
-  period              = "300"
-  statistic           = "Average"
-  threshold           = "10"
-  alarm_description   = "Alarm when CPU utilization is greater than 10%"
-  dimensions = {
-    ClusterName = data.terraform_remote_state.setup_iac.outputs.ecs_cluster_id
-    ServiceName = "${aws_ecs_service.aztec_proving_agent[count.index].name}"
-  }
-  alarm_actions             = [aws_appautoscaling_policy.scale_out[count.index].arn]
-  insufficient_data_actions = []
-  ok_actions                = []
-}

From 8e9b87067e9be798ae9896cfc8fdd218e84c9017 Mon Sep 17 00:00:00 2001
From: spypsy <spypsy@outlook.com>
Date: Mon, 3 Jun 2024 18:27:23 +0000
Subject: [PATCH 4/4] use ExactCapacity

---
 yarn-project/aztec/terraform/node/main.tf | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/yarn-project/aztec/terraform/node/main.tf b/yarn-project/aztec/terraform/node/main.tf
index 340506a7a916..4c93d283d1f4 100644
--- a/yarn-project/aztec/terraform/node/main.tf
+++ b/yarn-project/aztec/terraform/node/main.tf
@@ -753,12 +753,12 @@ resource "aws_appautoscaling_policy" "scale_out" {
   service_namespace  = aws_appautoscaling_target.ecs_proving_agent[count.index].service_namespace
 
   step_scaling_policy_configuration {
-    adjustment_type         = "ChangeInCapacity"
+    adjustment_type         = "ExactCapacity"
     cooldown                = 60
     metric_aggregation_type = "Maximum"
 
     step_adjustment {
-      scaling_adjustment          = local.agents_per_sequencer - 1 # -1 since we're adding our target to the existing 1
+      scaling_adjustment          = local.agents_per_sequencer
       metric_interval_lower_bound = 0
     }
   }
@@ -774,12 +774,12 @@ resource "aws_appautoscaling_policy" "scale_in" {
   service_namespace  = aws_appautoscaling_target.ecs_proving_agent[count.index].service_namespace
 
   step_scaling_policy_configuration {
-    adjustment_type         = "ChangeInCapacity"
+    adjustment_type         = "ExactCapacity"
     cooldown                = 60
     metric_aggregation_type = "Maximum"
 
     step_adjustment {
-      scaling_adjustment          = -local.agents_per_sequencer + 1 # +1 since we're removing our target from the existing 1
+      scaling_adjustment          = 1
       metric_interval_upper_bound = 0
     }
   }