diff --git a/spartan/scripts/network_pause.sh b/spartan/scripts/network_pause.sh
index 3d28a5389b44..706b9553b911 100755
--- a/spartan/scripts/network_pause.sh
+++ b/spartan/scripts/network_pause.sh
@@ -31,16 +31,55 @@ if [[ -z "$NAMESPACE" ]]; then
   usage
 fi
 
-log "Snapshotting $NAMESPACE"
-$scripts_dir/manual_snapshot.sh $NAMESPACE
+CONFIGMAP_NAME="network-pause-state"
 
-log "Waiting for snapshot upload"
-sleep 60 # staging-ignition takes 28s
+# Guard against double-pause (would overwrite saved state with zeros)
+if kubectl get configmap "$CONFIGMAP_NAME" -n "$NAMESPACE" &>/dev/null; then
+  die "Namespace $NAMESPACE is already paused (ConfigMap $CONFIGMAP_NAME exists). Run network_resume.sh first."
+fi
+
+# Snapshot if the cronjob exists (not all networks have snapshots enabled)
+SNAPSHOT_CRONJOB="$NAMESPACE-snapshot-aztec-snapshots"
+if kubectl get cronjob "$SNAPSHOT_CRONJOB" -n "$NAMESPACE" &>/dev/null; then
+  log "Snapshotting $NAMESPACE"
+  "$scripts_dir/manual_snapshot.sh" "$NAMESPACE"
+  log "Waiting for snapshot upload"
+  sleep 60 # staging-ignition takes 28s
+else
+  log "Snapshot cronjob not found ($SNAPSHOT_CRONJOB), skipping snapshot"
+fi
+
+# Collect current replica counts before scaling down
+log "Collecting current replica counts"
+
+SS_JSON=$(kubectl get statefulset -n "$NAMESPACE" -o json | \
+  jq '[.items[] | {key: .metadata.name, value: .spec.replicas}] | from_entries')
+DEPLOY_JSON=$(kubectl get deployment -n "$NAMESPACE" -o json | \
+  jq '[.items[] | {key: .metadata.name, value: .spec.replicas}] | from_entries')
+
+CRONJOB_JSON=$(kubectl get cronjob -n "$NAMESPACE" -o json | \
+  jq '[.items[] | select(.spec.suspend != true) | .metadata.name]')
+
+STATE_JSON=$(jq -n \
+  --arg paused_at "$(date -Is)" \
+  --argjson statefulsets "$SS_JSON" \
+  --argjson deployments "$DEPLOY_JSON" \
+  --argjson cronjobs "$CRONJOB_JSON" \
+  '{paused_at: $paused_at, statefulsets: $statefulsets, deployments: $deployments, cronjobs: $cronjobs}')
+
+log "Saving pause state to ConfigMap $CONFIGMAP_NAME"
+kubectl create configmap "$CONFIGMAP_NAME" \
+  -n "$NAMESPACE" \
+  --from-literal=state="$STATE_JSON"
+
+# Scale everything down except eth-devnet (L1 beacon chain cannot recover from long pauses)
 
 log "Pausing namespace $NAMESPACE"
 for item_type in statefulset deployment; do
-  for item in $(kubectl get $item_type -n $NAMESPACE -o jsonpath='{.items[*].metadata.name}'); do
-    kubectl scale -n $NAMESPACE $item_type/$item --replicas 0
+  for item in $(kubectl get "$item_type" -n "$NAMESPACE" -o json | \
+    jq -r '.items[] | select(.metadata.labels["app.kubernetes.io/name"] != "eth-devnet") | .metadata.name'); do
+    log "  Scaling $item_type/$item to 0"
+    kubectl scale -n "$NAMESPACE" "$item_type/$item" --replicas 0
   done
 done
 
@@ -48,3 +87,5 @@ log "Suspending cronjobs"
 for item in $(kubectl get cronjob -n $NAMESPACE -o jsonpath='{.items[*].metadata.name}'); do
   kubectl -n $NAMESPACE patch cronjobs $item -p '{"spec" : {"suspend" : true }}'
 done
+
+log "Namespace $NAMESPACE paused successfully. State saved to ConfigMap $CONFIGMAP_NAME."
diff --git a/spartan/scripts/network_resume.sh b/spartan/scripts/network_resume.sh
new file mode 100755
index 000000000000..6058a17bf250
--- /dev/null
+++ b/spartan/scripts/network_resume.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+spartan=$(git rev-parse --show-toplevel)/spartan
+scripts_dir=$spartan/scripts
+
+log() { echo "[INFO] $(date -Is) - $*"; }
+err() { echo "[ERROR] $(date -Is) - $*" >&2; }
+die() { err "$*"; exit 1; }
+
+usage() {
+  echo "Usage: $0 [namespace]"
+  echo ""
+  echo "Arguments:"
+  echo "  namespace - Kubernetes namespace (default: from NAMESPACE env var)"
+  echo ""
+  echo "Environment variables:"
+  echo "  NAMESPACE - K8s namespace (required if not passed as argument)"
+  echo ""
+  exit 1
+}
+
+NAMESPACE="${1:-${NAMESPACE:-}}"
+
+if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
+  usage
+fi
+
+if [[ -z "$NAMESPACE" ]]; then
+  usage
+fi
+
+CONFIGMAP_NAME="network-pause-state"
+
+# Read saved state
+log "Reading pause state from ConfigMap $CONFIGMAP_NAME"
+STATE_JSON=$(kubectl get configmap "$CONFIGMAP_NAME" -n "$NAMESPACE" -o jsonpath='{.data.state}') || \
+  die "ConfigMap $CONFIGMAP_NAME not found in namespace $NAMESPACE. Is the network paused?"
+
+echo "$STATE_JSON" | jq . >/dev/null 2>&1 || die "Invalid JSON in ConfigMap $CONFIGMAP_NAME"
+paused_at=$(echo "$STATE_JSON" | jq -r '.paused_at')
+log "Network was paused at $paused_at"
+
+# Restore statefulset replicas (// 0 guards a null saved .spec.replicas)
+log "Restoring statefulsets"
+for name in $(echo "$STATE_JSON" | jq -r '.statefulsets | keys[]'); do
+  replicas=$(echo "$STATE_JSON" | jq -r --arg name "$name" '.statefulsets[$name] // 0')
+  if [[ "$replicas" -gt 0 ]]; then
+    log "  Scaling statefulset/$name to $replicas replicas"
+    kubectl scale -n "$NAMESPACE" statefulset/"$name" --replicas "$replicas"
+  fi
+done
+
+# Restore deployment replicas
+log "Restoring deployments"
+for name in $(echo "$STATE_JSON" | jq -r '.deployments | keys[]'); do
+  replicas=$(echo "$STATE_JSON" | jq -r --arg name "$name" '.deployments[$name] // 0')
+  if [[ "$replicas" -gt 0 ]]; then
+    log "  Scaling deployment/$name to $replicas replicas"
+    kubectl scale -n "$NAMESPACE" deployment/"$name" --replicas "$replicas"
+  fi
+done
+
+# Unsuspend only cronjobs that were active before pause
+log "Unsuspending cronjobs"
+for name in $(echo "$STATE_JSON" | jq -r '.cronjobs[]'); do
+  log "  Unsuspending cronjob/$name"
+  kubectl -n "$NAMESPACE" patch cronjobs "$name" -p '{"spec" : {"suspend" : false }}'
+done
+
+# Clean up
+log "Cleaning up ConfigMap $CONFIGMAP_NAME"
+kubectl delete configmap "$CONFIGMAP_NAME" -n "$NAMESPACE"
+
+log "Namespace $NAMESPACE resumed successfully."