Summary of this patch (text ahead of the first "diff --git" line is preamble):

  spartan/metrics/values/prod.yaml
    * opentelemetry-collector: sets replicaCount 3 and resource requests
      (12Gi memory, 1.5 CPU); adds "pool: spot" to the nodeSelector and a
      toleration for the cloud.google.com/gke-spot=true:NoSchedule taint.
    * prometheus server: raises requests from 7Gi / 1.5 CPU to 26Gi / 3.5 CPU,
      adds the same "pool: spot" selector and gke-spot toleration, and lowers
      replicaCount from 10 to 3.
    * grafana: adds resource requests (5Gi memory, 1.5 CPU); its nodeSelector
      is unchanged.

  spartan/terraform/gke-cluster/cluster/main.tf
    * adds two autoscaled (0-8 nodes) spot node pools, n2-highmem-2 and
      n2-highmem-4, labelled node-type=infra / pool=spot and tainted with
      cloud.google.com/gke-spot=true:NO_SCHEDULE, which is the label/taint pair
      the selectors and tolerations above refer to.

NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch; one blank context line is assumed at
the start of the third prod.yaml hunk. Verify against the original commit
before applying.

diff --git a/spartan/metrics/values/prod.yaml b/spartan/metrics/values/prod.yaml
index c2c2b33f5e1b..d84f0621192a 100644
--- a/spartan/metrics/values/prod.yaml
+++ b/spartan/metrics/values/prod.yaml
@@ -1,6 +1,17 @@
 opentelemetry-collector:
+  replicaCount: 3
+  resources:
+    requests:
+      memory: 12Gi
+      cpu: "1.5"
   nodeSelector:
     node-type: infra
+    pool: spot
+  tolerations:
+    - key: "cloud.google.com/gke-spot"
+      operator: "Equal"
+      value: "true"
+      effect: "NoSchedule"
   ports:
     jaeger-compact:
       enabled: false
@@ -29,14 +40,21 @@ prometheus:
   server:
     resources:
       requests:
-        memory: 7Gi
-        cpu: 1.5
+        memory: 26Gi
+        cpu: "3.5"
     nodeSelector:
       node-type: infra
+      pool: spot
+    tolerations:
+      - key: "cloud.google.com/gke-spot"
+        operator: "Equal"
+        value: "true"
+        effect: "NoSchedule"
+
     persistentVolume:
       enabled: true
       size: 100Gi
-    replicaCount: 10
+    replicaCount: 3
     statefulSet:
       enabled: true
   alertmanager:
@@ -57,6 +75,10 @@ tempo:
 
 # https://artifacthub.io/packages/helm/grafana/grafana
 grafana:
+  resources:
+    requests:
+      memory: 5Gi
+      cpu: "1.5"
   nodeSelector:
     node-type: infra
   service:
diff --git a/spartan/terraform/gke-cluster/cluster/main.tf b/spartan/terraform/gke-cluster/cluster/main.tf
index 4d5e93f0ed7b..afa68c485b0a 100644
--- a/spartan/terraform/gke-cluster/cluster/main.tf
+++ b/spartan/terraform/gke-cluster/cluster/main.tf
@@ -273,3 +273,93 @@ resource "google_container_node_pool" "spot_nodes_2core" {
     auto_upgrade = false
   }
 }
+
+# Autoscaled spot node pool of 2-core high-memory machines (n2-highmem-2), used for metrics
+resource "google_container_node_pool" "spot_nodes_2core-highmem" {
+  name     = "${var.cluster_name}-2core-highmem-spot"
+  location = var.zone
+  cluster  = var.cluster_name
+  version  = var.node_version
+  # Autoscaling bounds; a minimum of 0 lets the pool scale down to no nodes
+  autoscaling {
+    min_node_count = 0
+    max_node_count = 8
+  }
+
+  # Node configuration: machine type, spot provisioning, identity, labels and taint
+  node_config {
+    machine_type = "n2-highmem-2"
+    spot         = true
+
+    service_account = var.service_account
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/cloud-platform"
+    ]
+
+    labels = {
+      env       = "production"
+      pool      = "spot"
+      local-ssd = "false"
+      node-type = "infra"
+    }
+    tags = ["aztec-gke-node", "spot"]
+
+    # Taint the spot nodes (NO_SCHEDULE) so only pods tolerating cloud.google.com/gke-spot=true are placed on them
+    taint {
+      key    = "cloud.google.com/gke-spot"
+      value  = "true"
+      effect = "NO_SCHEDULE"
+    }
+  }
+
+  # Node management: auto-repair on, auto-upgrade off
+  management {
+    auto_repair  = true
+    auto_upgrade = false
+  }
+}
+
+# Autoscaled spot node pool of 4-core high-memory machines (n2-highmem-4), used for metrics
+resource "google_container_node_pool" "spot_nodes_4core-highmem" {
+  name     = "${var.cluster_name}-4core-highmem-spot"
+  location = var.zone
+  cluster  = var.cluster_name
+  version  = var.node_version
+  # Autoscaling bounds; a minimum of 0 lets the pool scale down to no nodes
+  autoscaling {
+    min_node_count = 0
+    max_node_count = 8
+  }
+
+  # Node configuration: machine type, spot provisioning, identity, labels and taint
+  node_config {
+    machine_type = "n2-highmem-4"
+    spot         = true
+
+    service_account = var.service_account
+    oauth_scopes = [
+      "https://www.googleapis.com/auth/cloud-platform"
+    ]
+
+    labels = {
+      env       = "production"
+      pool      = "spot"
+      local-ssd = "false"
+      node-type = "infra"
+    }
+    tags = ["aztec-gke-node", "spot"]
+
+    # Taint the spot nodes (NO_SCHEDULE) so only pods tolerating cloud.google.com/gke-spot=true are placed on them
+    taint {
+      key    = "cloud.google.com/gke-spot"
+      value  = "true"
+      effect = "NO_SCHEDULE"
+    }
+  }
+
+  # Node management: auto-repair on, auto-upgrade off
+  management {
+    auto_repair  = true
+    auto_upgrade = false
+  }
+}