Skip to content

Commit

Permalink
Sync Terraform & Helm changes
Browse files Browse the repository at this point in the history
GitOrigin-RevId: 79f4d473a837aabd9ffa7f7b1646c5e6e9427de2
  • Loading branch information
sionescu committed Jan 24, 2025
1 parent 7ca6db3 commit 6705ea5
Show file tree
Hide file tree
Showing 22 changed files with 138 additions and 40 deletions.
4 changes: 2 additions & 2 deletions terraform/aptos-node/aws/versions.tf
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
terraform {
required_version = "~> 1.9.1"
required_version = "~> 1.9.8"
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 4.35.0"
version = "~> 4.67.0"
}
helm = {
source = "hashicorp/helm"
Expand Down
14 changes: 14 additions & 0 deletions terraform/aptos-node/gcp/kubernetes.tf
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,13 @@ resource "helm_release" "validator" {
}
validator = {
name = var.validator_name
config = {
storage = {
rocksdb_configs = {
enable_storage_sharding = var.enable_storage_sharding
}
}
}
storage = {
class = kubernetes_storage_class.ssd.metadata[0].name
}
Expand All @@ -64,6 +71,13 @@ resource "helm_release" "validator" {
}]
}
fullnode = {
config = {
storage = {
rocksdb_configs = {
enable_storage_sharding = var.enable_storage_sharding
}
}
}
storage = {
class = kubernetes_storage_class.ssd.metadata[0].name
}
Expand Down
6 changes: 6 additions & 0 deletions terraform/aptos-node/gcp/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,12 @@ variable "num_fullnode_groups" {
default = 1
}

# Boolean toggle for storage sharding; enabled unless the caller overrides it.
variable "enable_storage_sharding" {
  type        = bool
  default     = true
  description = "Enable storage sharding for VN and VFN nodes"
}

variable "gke_maintenance_policy" {
description = "The maintenance policy to use for the cluster. See https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#maintenance_policy"
type = object({
Expand Down
17 changes: 13 additions & 4 deletions terraform/fullnode/aws/backup.tf
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,16 @@ data "aws_iam_policy_document" "backup-assume-role" {
values = ["sts.amazonaws.com"]
}
}
# Trust-policy statement: lets the AWS Backup service principal
# (backup.amazonaws.com) call sts:AssumeRole on this role.
statement {
  effect  = "Allow"
  actions = ["sts:AssumeRole"]

  principals {
    identifiers = ["backup.amazonaws.com"]
    type        = "Service"
  }
}
}

data "aws_iam_policy_document" "backup" {
Expand All @@ -69,10 +79,9 @@ data "aws_iam_policy_document" "backup" {
}

resource "aws_iam_role" "backup" {
name = "aptos-${local.workspace_name}-backup"
path = var.iam_path
permissions_boundary = var.permissions_boundary_policy
assume_role_policy = data.aws_iam_policy_document.backup-assume-role.json
name = "aptos-${local.workspace_name}-backup"
path = var.iam_path
assume_role_policy = data.aws_iam_policy_document.backup-assume-role.json
}

resource "aws_iam_role_policy" "backup" {
Expand Down
8 changes: 8 additions & 0 deletions terraform/fullnode/aws/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,11 @@ output "kubernetes_ca_certificate" {
output "kubernetes_token" {
value = data.aws_eks_cluster_auth.aptos.token
}

# Exports the whole aws_iam_role.backup resource object (all of its attributes),
# not a single field, so consumers select what they need, e.g. `.arn` or `.name`.
output "s3_backup_role" {
  description = "The aws_iam_role.backup resource object (IAM role used for backups)"
  value       = aws_iam_role.backup
}

# Exports the whole aws_s3_bucket.backup resource object (all of its attributes),
# not a single field, so consumers select what they need, e.g. `.bucket` or `.arn`.
output "s3_backup_bucket" {
  description = "The aws_s3_bucket.backup resource object (S3 bucket used for backups)"
  value       = aws_s3_bucket.backup
}
4 changes: 2 additions & 2 deletions terraform/fullnode/aws/versions.tf
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
terraform {
required_version = "~> 1.9.1"
required_version = "~> 1.9.8"
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 4.35.0"
version = "~> 4.67.0"
}
helm = {
source = "hashicorp/helm"
Expand Down
22 changes: 13 additions & 9 deletions terraform/helm/aptos-node/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,15 @@ Aptos blockchain node deployment
| cluster_name | string | `"unknown"` | |
| enablePrivilegedMode | bool | `false` | TEST ONLY: Enable running as root for profiling |
| fullnode.affinity | object | `{}` | |
| fullnode.config | object | `{"full_node_networks":[{"network_id":"public","seeds":{}}]}` | Fullnode configuration. See NodeConfig https://github.com/aptos-labs/aptos-core/blob/main/config/src/config/mod.rs |
| fullnode.config | object | `{"full_node_networks":[{"network_id":"public"}]}` | Fullnode configuration. See NodeConfig https://github.com/aptos-labs/aptos-core/blob/main/config/src/config/mod.rs |
| fullnode.force_enable_telemetry | bool | `false` | Flag to force enable telemetry service (useful for forge tests) |
| fullnode.groups | list | `[{"dns_name":"vfn","name":"fullnode","replicas":1}]` | Specify fullnode groups by `name` and number of `replicas` |
| fullnode.nodeSelector | object | `{}` | |
| fullnode.podAnnotations | string | `nil` | |
| fullnode.resources.limits.cpu | int | `14` | |
| fullnode.resources.limits.memory | string | `"56Gi"` | |
| fullnode.resources.requests.cpu | int | `14` | |
| fullnode.resources.requests.memory | string | `"56Gi"` | |
| fullnode.resources.limits.cpu | int | `30` | |
| fullnode.resources.limits.memory | string | `"60Gi"` | |
| fullnode.resources.requests.cpu | int | `30` | |
| fullnode.resources.requests.memory | string | `"60Gi"` | |
| fullnode.rust_log | string | `"info"` | Log level for the fullnode |
| fullnode.storage.class | string | `nil` | Kubernetes storage class to use for fullnode persistent storage |
| fullnode.storage.size | string | `"2048Gi"` | Size of fullnode persistent storage |
Expand Down Expand Up @@ -65,14 +65,17 @@ Aptos blockchain node deployment
| service.fullnode.enableRestApi | bool | `true` | Enable the REST API on fullnodes |
| service.fullnode.external.type | string | `"LoadBalancer"` | The Kubernetes ServiceType to use for fullnodes' HAProxy |
| service.fullnode.externalTrafficPolicy | string | `"Local"` | The externalTrafficPolicy for the fullnode service |
| service.fullnode.internal.annotations | object | `{}` | |
| service.fullnode.internal.headless | bool | `false` | |
| service.fullnode.internal.type | string | `"ClusterIP"` | The Kubernetes ServiceType to use for fullnodes |
| service.fullnode.loadBalancerSourceRanges | string | `nil` | If set and if the ServiceType is LoadBalancer, allow traffic to fullnodes from these CIDRs |
| service.internalDomain | string | `nil` | If set, the base domain name to use for internal LBs |
| service.validator.enableAdminPort | bool | `false` | Enable the admin port on the validator |
| service.validator.enableMetricsPort | bool | `false` | Enable the metrics port on the validator |
| service.validator.enableRestApi | bool | `true` | Enable the REST API on the validator |
| service.validator.external.type | string | `"LoadBalancer"` | The Kubernetes ServiceType to use for validator's HAProxy |
| service.validator.externalTrafficPolicy | string | `"Local"` | The externalTrafficPolicy for the validator service |
| service.validator.internal.annotations | object | `{}` | |
| service.validator.internal.headless | bool | `false` | |
| service.validator.internal.type | string | `"ClusterIP"` | The Kubernetes ServiceType to use for validator |
| service.validator.loadBalancerSourceRanges | string | `nil` | If set and if the ServiceType is LoadBalancer, allow traffic to validators from these CIDRs |
Expand All @@ -88,14 +91,15 @@ Aptos blockchain node deployment
| validator.name | string | `nil` | Internal: name of your validator for use in labels |
| validator.nodeSelector | object | `{}` | |
| validator.podAnnotations | string | `nil` | |
| validator.resources.limits.cpu | int | `14` | |
| validator.resources.limits.memory | string | `"56Gi"` | |
| validator.resources.requests.cpu | int | `14` | |
| validator.resources.requests.memory | string | `"56Gi"` | |
| validator.resources.limits.cpu | int | `30` | |
| validator.resources.limits.memory | string | `"60Gi"` | |
| validator.resources.requests.cpu | int | `30` | |
| validator.resources.requests.memory | string | `"60Gi"` | |
| validator.rust_log | string | `"info"` | Log level for the validator |
| validator.storage.class | string | `nil` | Kubernetes storage class to use for validator persistent storage |
| validator.storage.size | string | `"2048Gi"` | Size of validator persistent storage |
| validator.tolerations | list | `[]` | |
| validator.useConsensusHealthCheckAsStartupProbe | bool | `false` | |

## Resource Descriptions

Expand Down
11 changes: 11 additions & 0 deletions terraform/helm/aptos-node/templates/validator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,17 @@ spec:
{{- end }}
{{- with $.Values.validator }}
imagePullPolicy: {{ .image.pullPolicy }}
{{- if $.Values.validator.useConsensusHealthCheckAsStartupProbe }}
# Optional startup probe, rendered only when validator.useConsensusHealthCheckAsStartupProbe is set.
# It issues HTTP GET /consensus_health_check on port 9101 once per second with a 3s timeout;
# a single success marks the container as started.
startupProbe:
httpGet:
path: /consensus_health_check
port: 9101
scheme: HTTP
failureThreshold: 2147483647 # Max int32: effectively never give up, since we don't want to restart the pod automatically even if it can't participate in consensus.
periodSeconds: 1
successThreshold: 1
timeoutSeconds: 3
{{- end }}
args:
- /bin/bash
- -c
Expand Down
1 change: 1 addition & 0 deletions terraform/helm/aptos-node/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ validator:
podAnnotations:
# Determines which log levels are retained by the Vector pipeline
# aptos.dev/min-log-level-to-retain: warn
useConsensusHealthCheckAsStartupProbe: false # Once https://github.com/aptos-labs/aptos-core/pull/15512 is rolled out to all networks, this flag can be removed and the probe always added. As of Dec 21, 2024 it is only in devnet.

fullnode:
# -- Specify fullnode groups by `name` and number of `replicas`
Expand Down
2 changes: 1 addition & 1 deletion terraform/helm/autoscaling/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,4 @@
| metrics-server.resources.requests.memory | string | `"200Mi"` | |

----------------------------------------------
Autogenerated from chart metadata using [helm-docs v1.13.1](https://github.com/norwoodj/helm-docs/releases/v1.13.1)
Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2)
2 changes: 1 addition & 1 deletion terraform/helm/chaos/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,4 @@ Chaos Mesh for Aptos
| serviceAccount.name | string | `nil` | |

----------------------------------------------
Autogenerated from chart metadata using [helm-docs v1.13.1](https://github.com/norwoodj/helm-docs/releases/v1.13.1)
Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2)
2 changes: 1 addition & 1 deletion terraform/helm/forge/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ Aptos Forge test framework
| serviceAccount.name | string | `nil` | |

----------------------------------------------
Autogenerated from chart metadata using [helm-docs v1.13.1](https://github.com/norwoodj/helm-docs/releases/v1.13.1)
Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2)
24 changes: 14 additions & 10 deletions terraform/helm/fullnode/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,11 @@
| backup.config.azure.account | string | `nil` | |
| backup.config.azure.container | string | `nil` | |
| backup.config.azure.sas | string | `nil` | |
| backup.config.concurrent_data_requests | string | `nil` | Number of concurrent requests to the PFN backup port |
| backup.config.gcs.bucket | string | `nil` | |
| backup.config.location | string | `nil` | Which of the below backup configurations to use |
| backup.config.r2.bucket | string | `nil` | |
| backup.config.r2.endpoint_url | string | `nil` | |
| backup.config.s3.bucket | string | `nil` | |
| backup.config.state_snapshot_interval_epochs | int | `2` | State snapshot interval epochs |
| backup.config.transaction_batch_size | int | `1000000` | Transaction batch size |
Expand All @@ -22,10 +25,10 @@
| backup.image.repo | string | `"aptoslabs/tools"` | Image repo to use for backup images |
| backup.image.tag | string | `nil` | Image tag to use for backup images |
| backup.nodeSelector | object | `{}` | |
| backup.resources.limits.cpu | int | `4` | |
| backup.resources.limits.cpu | int | `6` | |
| backup.resources.limits.memory | string | `"8Gi"` | |
| backup.resources.requests.cpu | int | `4` | |
| backup.resources.requests.memory | string | `"8Gi"` | |
| backup.resources.requests.memory | string | `"4Gi"` | |
| backup.tolerations | list | `[]` | |
| backup_compaction.affinity | object | `{}` | |
| backup_compaction.nodeSelector | object | `{}` | |
Expand All @@ -36,11 +39,12 @@
| backup_compaction.schedule | string | `"@daily"` | The schedule for backup compaction |
| backup_compaction.tolerations | list | `[]` | |
| backup_verify.affinity | object | `{}` | |
| backup_verify.config.concurrent_downloads | int | `50` | |
| backup_verify.nodeSelector | object | `{}` | |
| backup_verify.resources.limits.cpu | int | `8` | |
| backup_verify.resources.limits.memory | string | `"32Gi"` | |
| backup_verify.resources.requests.cpu | int | `4` | |
| backup_verify.resources.requests.memory | string | `"16Gi"` | |
| backup_verify.resources.limits.cpu | int | `32` | |
| backup_verify.resources.limits.memory | string | `"60Gi"` | |
| backup_verify.resources.requests.cpu | int | `8` | |
| backup_verify.resources.requests.memory | string | `"8Gi"` | |
| backup_verify.schedule | string | `"@daily"` | The schedule for backup verification |
| backup_verify.tolerations | list | `[]` | |
| chain.era | int | `1` | Bump this number to wipe the underlying storage |
Expand All @@ -61,10 +65,10 @@
| manageImages | bool | `true` | If true, helm will always override the deployed image with what is configured in the helm values. If not, helm will take the latest image from the currently running workloads, which is useful if you have a separate procedure to update images (e.g. rollout) |
| metrics.destination | string | `"dev"` | The upstream sink for metrics. Supported values are "dev" and "prod" |
| nodeSelector | object | `{}` | |
| resources.limits.cpu | int | `14` | |
| resources.limits.memory | string | `"56Gi"` | |
| resources.requests.cpu | int | `14` | |
| resources.requests.memory | string | `"56Gi"` | |
| resources.limits.cpu | int | `30` | |
| resources.limits.memory | string | `"60Gi"` | |
| resources.requests.cpu | int | `30` | |
| resources.requests.memory | string | `"60Gi"` | |
| restore.affinity | object | `{}` | |
| restore.config.azure.account | string | `nil` | |
| restore.config.azure.container | string | `nil` | |
Expand Down
16 changes: 16 additions & 0 deletions terraform/helm/fullnode/files/backup/r2.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Backup command-adapter configuration for an R2 bucket accessed through the
# AWS CLI with a custom endpoint ($R2_ENDPOINT_URL).
#
# Every command reads $BUCKET, $SUB_DIR and $R2_ENDPOINT_URL from the
# environment. Payloads are gzip-compressed on upload and decompressed on
# download. All s3:// arguments are double-quoted so bucket names, sub
# directories and file names are never word-split or glob-expanded by the shell.
env_vars: []
commands:
  # The backup handle is simply the backup name.
  create_backup: 'echo "$BACKUP_NAME"'
  # Print the new file handle on stdout, close stdout, then stream stdin
  # (gzip-compressed) to the bucket.
  create_for_write: |
    FILE_HANDLE="$BACKUP_HANDLE/$FILE_NAME"
    echo "$FILE_HANDLE"
    exec 1>&-
    gzip -c | aws s3 cp --endpoint-url "$R2_ENDPOINT_URL" - "s3://$BUCKET/$SUB_DIR/$FILE_HANDLE"
  # Stream the object for $FILE_HANDLE to stdout, decompressing it on the fly.
  open_for_read: 'aws s3 cp --endpoint-url "$R2_ENDPOINT_URL" "s3://$BUCKET/$SUB_DIR/$FILE_HANDLE" - | gzip -cd'
  # Same flow as create_for_write, but the handle lives under metadata/.
  save_metadata_line: |
    FILE_HANDLE="metadata/$FILE_NAME"
    echo "$FILE_HANDLE"
    exec 1>&-
    gzip -c | aws s3 cp --endpoint-url "$R2_ENDPOINT_URL" - "s3://$BUCKET/$SUB_DIR/$FILE_HANDLE"
  # List metadata/ and rewrite each listing line to "metadata/<name>".
  # `||:` keeps the pipeline going when the listing command fails.
  list_metadata_files: '(aws s3 ls --endpoint-url "$R2_ENDPOINT_URL" "s3://$BUCKET/$SUB_DIR/metadata/" ||:) | sed -ne "s#.* \(.*\)#metadata/\1#p"'
  # Move a metadata file from metadata/ to metadata_backup/.
  backup_metadata_file: 'aws s3 mv --endpoint-url "$R2_ENDPOINT_URL" "s3://$BUCKET/$SUB_DIR/metadata/$FILE_NAME" "s3://$BUCKET/$SUB_DIR/metadata_backup/$FILE_NAME" --no-progress'
16 changes: 16 additions & 0 deletions terraform/helm/fullnode/templates/_backup.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,22 @@
- name: SAS
value: {{ .config.azure.sas }}
{{- end }}
{{- if hasPrefix "r2" (toString .config.location) }}
# Environment for backup locations whose name starts with "r2".
# BUCKET and R2_ENDPOINT_URL come from the chart values config.r2.bucket and config.r2.endpoint_url.
# The AWS CLI credentials are read from a Secret named "r2-credentials"
# (keys "access-key-id" and "secret-access-key").
# NOTE(review): that Secret is not defined in the visible part of this chart - confirm where it is provisioned.
- name: BUCKET
value: {{ .config.r2.bucket }}
- name: R2_ENDPOINT_URL
value: {{ .config.r2.endpoint_url }}
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
name: r2-credentials
key: access-key-id
- name: AWS_SECRET_ACCESS_KEY
valueFrom:
secretKeyRef:
name: r2-credentials
key: secret-access-key
{{- end }}
{{- if hasPrefix "scw_s3" (toString .config.location) }}
- name: AWS_ACCESS_KEY_ID
value: {{ .config.scw_s3.access_key }}
Expand Down
4 changes: 4 additions & 0 deletions terraform/helm/fullnode/templates/backup-verify.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ spec:
{{- end }}
- --metadata-cache-dir
- /tmp/aptos-backup-verify-metadata
{{- if .Values.backup_verify.config.concurrent_downloads }}
# Pass --concurrent-downloads only when backup_verify.config.concurrent_downloads is set to a truthy value.
# The value is quoted so the argument is rendered as a string.
- --concurrent-downloads
- "{{ .Values.backup_verify.config.concurrent_downloads }}"
{{- end }}
- --command-adapter-config
# use the same config with the backup sts
- /opt/aptos/etc/{{ .Values.backup.config.location }}.yaml
Expand Down
15 changes: 10 additions & 5 deletions terraform/helm/fullnode/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,11 @@ backup:
pullPolicy: IfNotPresent
resources:
limits:
cpu: 8
cpu: 6
memory: 8Gi
requests:
cpu: 4
memory: 8Gi
memory: 4Gi
nodeSelector: {}
tolerations: []
affinity: {}
Expand All @@ -135,6 +135,9 @@ backup:
bucket:
gcs:
bucket:
r2:
bucket:
endpoint_url:
azure:
account:
container:
Expand All @@ -151,14 +154,16 @@ backup_verify:
schedule: "@daily"
resources:
limits:
cpu: 8
memory: 16Gi
cpu: 32
memory: 60Gi
requests:
cpu: 8
memory: 16Gi
memory: 8Gi
nodeSelector: {}
tolerations: []
affinity: {}
config:
concurrent_downloads: 50

backup_compaction:
# -- The schedule for backup compaction
Expand Down
2 changes: 1 addition & 1 deletion terraform/helm/genesis/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,4 +57,4 @@ Aptos blockchain automated genesis ceremony for testnets
| serviceAccount.name | string | `nil` | The name of the service account to use. If not set and create is true, a name is generated using the fullname template |

----------------------------------------------
Autogenerated from chart metadata using [helm-docs v1.13.1](https://github.com/norwoodj/helm-docs/releases/v1.13.1)
Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2)
2 changes: 1 addition & 1 deletion terraform/helm/kube-state-metrics/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@
| kube-state-metrics.podAnnotations."prometheus.io/scrape" | string | `"true"` | |

----------------------------------------------
Autogenerated from chart metadata using [helm-docs v1.13.1](https://github.com/norwoodj/helm-docs/releases/v1.13.1)
Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2)
2 changes: 1 addition & 1 deletion terraform/helm/monitoring/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,4 +85,4 @@
| validator.name | string | `nil` | |

----------------------------------------------
Autogenerated from chart metadata using [helm-docs v1.13.1](https://github.com/norwoodj/helm-docs/releases/v1.13.1)
Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2)
2 changes: 1 addition & 1 deletion terraform/helm/prometheus-node-exporter/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@
| prometheus-node-exporter.podAnnotations."prometheus.io/scrape" | string | `"true"` | |

----------------------------------------------
Autogenerated from chart metadata using [helm-docs v1.13.1](https://github.com/norwoodj/helm-docs/releases/v1.13.1)
Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2)
2 changes: 1 addition & 1 deletion terraform/helm/testnet-addons/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,4 @@ Additional components for aptos-nodes testnet
| waypoint.tolerations | list | `[]` | |

----------------------------------------------
Autogenerated from chart metadata using [helm-docs v1.13.1](https://github.com/norwoodj/helm-docs/releases/v1.13.1)
Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2)

0 comments on commit 6705ea5

Please sign in to comment.