From 78375e67c5ca7c69a85913da712dd9622c850b93 Mon Sep 17 00:00:00 2001 From: spypsy Date: Mon, 3 Jun 2024 16:21:34 +0000 Subject: [PATCH 1/4] feat: autoscale prover agents on AWS --- yarn-project/aztec/terraform/node/main.tf | 139 +++++++++++++++++++--- 1 file changed, 124 insertions(+), 15 deletions(-) diff --git a/yarn-project/aztec/terraform/node/main.tf b/yarn-project/aztec/terraform/node/main.tf index 4b177950eb47..b371945b1958 100644 --- a/yarn-project/aztec/terraform/node/main.tf +++ b/yarn-project/aztec/terraform/node/main.tf @@ -551,18 +551,16 @@ resource "aws_security_group_rule" "allow-node-udp-out" { - -// Configuration for proving agents - +# Configuration for proving agents resource "aws_cloudwatch_log_group" "aztec-proving-agent-log-group" { - count = local.total_agents - name = "/fargate/service/${var.DEPLOY_TAG}/aztec-proving-agent-${floor(count.index / local.agents_per_sequencer) + 1}-${(count.index % local.agents_per_sequencer) + 1}" + count = local.node_count + name = "/fargate/service/${var.DEPLOY_TAG}/aztec-proving-agent-group-${count.index + 1}" retention_in_days = 14 } resource "aws_service_discovery_service" "aztec-proving-agent" { - count = local.total_agents - name = "${var.DEPLOY_TAG}-aztec-proving-agent-${floor(count.index / local.agents_per_sequencer) + 1}-${(count.index % local.agents_per_sequencer) + 1}" + count = local.node_count + name = "${var.DEPLOY_TAG}-aztec-proving-agent-group-${count.index + 1}" health_check_custom_config { failure_threshold = 1 @@ -593,7 +591,7 @@ resource "aws_service_discovery_service" "aztec-proving-agent" { # Define task definitions for each node. 
resource "aws_ecs_task_definition" "aztec-proving-agent" { - count = local.total_agents + count = local.node_count family = "${var.DEPLOY_TAG}-aztec-proving-agent-${floor(count.index / local.agents_per_sequencer) + 1}-${(count.index % local.agents_per_sequencer) + 1}" requires_compatibilities = ["FARGATE"] network_mode = "awsvpc" @@ -605,7 +603,7 @@ resource "aws_ecs_task_definition" "aztec-proving-agent" { container_definitions = < Date: Mon, 3 Jun 2024 16:23:55 +0000 Subject: [PATCH 2/4] fix some vars --- yarn-project/aztec/terraform/node/main.tf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/yarn-project/aztec/terraform/node/main.tf b/yarn-project/aztec/terraform/node/main.tf index b371945b1958..80cc2a6dd66e 100644 --- a/yarn-project/aztec/terraform/node/main.tf +++ b/yarn-project/aztec/terraform/node/main.tf @@ -59,7 +59,6 @@ locals { node_count = length(local.publisher_private_keys) data_dir = "/usr/src/yarn-project/aztec/data" agents_per_sequencer = var.AGENTS_PER_SEQUENCER - total_agents = local.node_count * local.agents_per_sequencer } resource "aws_cloudwatch_log_group" "aztec-node-log-group" { @@ -737,7 +736,7 @@ resource "aws_cloudwatch_metric_alarm" "cpu_low" { # Create Auto Scaling Target for ECS Service resource "aws_appautoscaling_target" "ecs_proving_agent" { count = local.node_count - max_capacity = var.AGENTS_PER_SEQUENCER + max_capacity = local.agents_per_sequencer min_capacity = 1 resource_id = "service/${data.terraform_remote_state.setup_iac.outputs.ecs_cluster_id}/${aws_ecs_service.aztec_proving_agent[count.index].name}" scalable_dimension = "ecs:service:DesiredCount" From a838f9e21b971d26e4bc77afad0299701de4cc02 Mon Sep 17 00:00:00 2001 From: spypsy Date: Mon, 3 Jun 2024 16:45:35 +0000 Subject: [PATCH 3/4] adjustments for immediate scaling --- yarn-project/aztec/terraform/node/main.tf | 41 ++++++----------------- 1 file changed, 10 insertions(+), 31 deletions(-) diff --git a/yarn-project/aztec/terraform/node/main.tf
b/yarn-project/aztec/terraform/node/main.tf index 80cc2a6dd66e..340506a7a916 100644 --- a/yarn-project/aztec/terraform/node/main.tf +++ b/yarn-project/aztec/terraform/node/main.tf @@ -591,7 +591,7 @@ resource "aws_service_discovery_service" "aztec-proving-agent" { # Define task definitions for each node. resource "aws_ecs_task_definition" "aztec-proving-agent" { count = local.node_count - family = "${var.DEPLOY_TAG}-aztec-proving-agent-${floor(count.index / local.agents_per_sequencer) + 1}-${(count.index % local.agents_per_sequencer) + 1}" + family = "${var.DEPLOY_TAG}-aztec-proving-agent-group-${count.index + 1}" requires_compatibilities = ["FARGATE"] network_mode = "awsvpc" cpu = "16384" @@ -704,8 +704,8 @@ resource "aws_cloudwatch_metric_alarm" "cpu_high" { evaluation_periods = "1" metric_name = "CPUUtilization" namespace = "AWS/ECS" - period = "300" - statistic = "Average" + period = "60" + statistic = "Maximum" threshold = "10" alarm_description = "Alert when CPU utilization is greater than 10%" dimensions = { @@ -722,8 +722,8 @@ resource "aws_cloudwatch_metric_alarm" "cpu_low" { evaluation_periods = "1" metric_name = "CPUUtilization" namespace = "AWS/ECS" - period = "300" - statistic = "Average" + period = "60" + statistic = "Maximum" threshold = "10" alarm_description = "Alarm when CPU utilization is less than 10%" dimensions = { @@ -747,7 +747,7 @@ resource "aws_appautoscaling_target" "ecs_proving_agent" { resource "aws_appautoscaling_policy" "scale_out" { count = local.node_count name = "${var.DEPLOY_TAG}-scale-out-${count.index}" - policy_type = "StepScaling" + policy_type = "TargetTrackingScaling" resource_id = aws_appautoscaling_target.ecs_proving_agent[count.index].resource_id scalable_dimension = aws_appautoscaling_target.ecs_proving_agent[count.index].scalable_dimension service_namespace = aws_appautoscaling_target.ecs_proving_agent[count.index].service_namespace @@ -755,10 +755,10 @@ resource "aws_appautoscaling_policy" "scale_out" { 
step_scaling_policy_configuration { adjustment_type = "ChangeInCapacity" cooldown = 60 - metric_aggregation_type = "Average" + metric_aggregation_type = "Maximum" step_adjustment { - scaling_adjustment = 1 + scaling_adjustment = local.agents_per_sequencer - 1 # -1 since we're adding our target to the existing 1 metric_interval_lower_bound = 0 } } @@ -776,32 +776,11 @@ resource "aws_appautoscaling_policy" "scale_in" { step_scaling_policy_configuration { adjustment_type = "ChangeInCapacity" cooldown = 60 - metric_aggregation_type = "Average" + metric_aggregation_type = "Maximum" step_adjustment { - scaling_adjustment = -1 + scaling_adjustment = -local.agents_per_sequencer + 1 # +1 since we're removing our target from the existing 1 metric_interval_upper_bound = 0 } } } - -# Link the High CPU alarm to the scale out policy -resource "aws_cloudwatch_metric_alarm" "cpu_high" { - count = local.node_count - alarm_name = "${var.DEPLOY_TAG}-cpu-high-${count.index + 1}" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = "1" - metric_name = "CPUUtilization" - namespace = "AWS/ECS" - period = "300" - statistic = "Average" - threshold = "10" - alarm_description = "Alarm when CPU utilization is greater than 10%" - dimensions = { - ClusterName = data.terraform_remote_state.setup_iac.outputs.ecs_cluster_id - ServiceName = "${aws_ecs_service.aztec_proving_agent[count.index].name}" - } - alarm_actions = [aws_appautoscaling_policy.scale_out[count.index].arn] - insufficient_data_actions = [] - ok_actions = [] -} From 8e9b87067e9be798ae9896cfc8fdd218e84c9017 Mon Sep 17 00:00:00 2001 From: spypsy Date: Mon, 3 Jun 2024 18:27:23 +0000 Subject: [PATCH 4/4] use ExactCapacity --- yarn-project/aztec/terraform/node/main.tf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/yarn-project/aztec/terraform/node/main.tf b/yarn-project/aztec/terraform/node/main.tf index 340506a7a916..4c93d283d1f4 100644 --- a/yarn-project/aztec/terraform/node/main.tf +++ 
b/yarn-project/aztec/terraform/node/main.tf @@ -753,12 +753,12 @@ resource "aws_appautoscaling_policy" "scale_out" { service_namespace = aws_appautoscaling_target.ecs_proving_agent[count.index].service_namespace step_scaling_policy_configuration { - adjustment_type = "ChangeInCapacity" + adjustment_type = "ExactCapacity" cooldown = 60 metric_aggregation_type = "Maximum" step_adjustment { - scaling_adjustment = local.agents_per_sequencer - 1 # -1 since we're adding our target to the existing 1 + scaling_adjustment = local.agents_per_sequencer metric_interval_lower_bound = 0 } } @@ -774,12 +774,12 @@ resource "aws_appautoscaling_policy" "scale_in" { service_namespace = aws_appautoscaling_target.ecs_proving_agent[count.index].service_namespace step_scaling_policy_configuration { - adjustment_type = "ChangeInCapacity" + adjustment_type = "ExactCapacity" cooldown = 60 metric_aggregation_type = "Maximum" step_adjustment { - scaling_adjustment = -local.agents_per_sequencer + 1 # +1 since we're removing our target from the existing 1 + scaling_adjustment = 1 metric_interval_upper_bound = 0 } }