diff --git a/charts/ozone/templates/_helpers.tpl b/charts/ozone/templates/_helpers.tpl index 71ea063..59794b4 100644 --- a/charts/ozone/templates/_helpers.tpl +++ b/charts/ozone/templates/_helpers.tpl @@ -51,22 +51,116 @@ app.kubernetes.io/instance: {{ .Release.Name }} {{- $pods | join "," }} {{- end }} -{{/* Common configuration environment variables */}} -{{- define "ozone.configuration.env" -}} +{{/* List of comma separated om ids */}} +{{- define "ozone.om.cluster.ids" -}} + {{- $pods := list }} + {{- $replicas := .Values.om.replicas | int }} + {{- range $i := until $replicas }} + {{- $pods = append $pods (printf "%s-om-%d" $.Release.Name $i) }} + {{- end }} + {{- $pods | join "," }} +{{- end }} + +{{/* List of comma separated scm ids */}} +{{- define "ozone.scm.cluster.ids" -}} + {{- $pods := list }} + {{- $replicas := .Values.scm.replicas | int }} + {{- range $i := until $replicas }} + {{- $pods = append $pods (printf "%s-scm-%d" $.Release.Name $i) }} + {{- end }} + {{- $pods | join "," }} +{{- end }} + +{{/* List of decommission om nodes */}} +{{- define "ozone.om.decommissioned.nodes" -}} + {{- $nodes := list }} + {{- $statefulset := lookup "apps/v1" "StatefulSet" $.Release.Namespace (printf "%s-om" $.Release.Name) -}} + {{- if $statefulset }} + {{- $oldCount := $statefulset.spec.replicas | int -}} + {{- $newCount := .Values.om.replicas | int }} + {{- $minCount := max $newCount 1 -}} + {{- range $i := until $oldCount }} + {{- if ge $i $minCount }} + {{- $nodes = append $nodes (printf "%s-om-%d" $.Release.Name $i) }} + {{- end }} + {{- end }} + {{- end }} + {{- $nodes | join "," }} +{{- end }} + +{{/* List of bootstrap om nodes */}} +{{- define "ozone.om.bootstrap.nodes" -}} + {{- $nodes := list }} + {{- $statefulset := lookup "apps/v1" "StatefulSet" $.Release.Namespace (printf "%s-om" $.Release.Name) -}} + {{- if $statefulset }} + {{- $oldCount := $statefulset.spec.replicas | int -}} + {{- $newCount := .Values.om.replicas | int }} + {{- range $i := until
$newCount }} + {{- if ge $i $oldCount }} + {{- $nodes = append $nodes (printf "%s-om-%d" $.Release.Name $i) }} + {{- end }} + {{- end }} + {{- end }} + {{- $nodes | join ","}} +{{- end }} + +{{/* List of decommission scm nodes */}} +{{- define "ozone.scm.decommissioned.nodes" -}} + {{- $nodes := list }} + {{- $statefulset := lookup "apps/v1" "StatefulSet" $.Release.Namespace (printf "%s-scm" $.Release.Name) -}} + {{- if $statefulset }} + {{- $oldCount := $statefulset.spec.replicas | int -}} + {{- $newCount := .Values.scm.replicas | int }} + {{- range $i := until $oldCount }} + {{- if ge $i $newCount }} + {{- $nodes = append $nodes (printf "%s-scm-%d" $.Release.Name $i) }} + {{- end }} + {{- end }} + {{- end }} + {{- $nodes | join "," -}} +{{- end }} + +{{/* List of decommission data nodes */}} +{{- define "ozone.data.decommissioned.hosts" -}} + {{- $hosts := list }} + {{- $statefulset := lookup "apps/v1" "StatefulSet" $.Release.Namespace (printf "%s-datanode" $.Release.Name) -}} + {{- if $statefulset }} + {{- $oldCount := $statefulset.spec.replicas | int -}} + {{- $newCount := .Values.datanode.replicas | int }} + {{- range $i := until $oldCount }} + {{- if ge $i $newCount }} + {{- $hosts = append $hosts (printf "%s-datanode-%d.%s-datanode-headless.%s.svc.cluster.local" $.Release.Name $i $.Release.Name $.Release.Namespace) }} + {{- end }} + {{- end }} + {{- end }} + {{- $hosts | join "," -}} +{{- end }} + +{{- define "ozone.configuration.env.common" -}} - name: OZONE-SITE.XML_hdds.datanode.dir value: /data/storage - name: OZONE-SITE.XML_ozone.scm.datanode.id.dir value: /data/metadata - name: OZONE-SITE.XML_ozone.metadata.dirs value: /data/metadata -- name: OZONE-SITE.XML_ozone.scm.block.client.address - value: {{ include "ozone.scm.pods" . }} -- name: OZONE-SITE.XML_ozone.scm.client.address - value: {{ include "ozone.scm.pods" . }} -- name: OZONE-SITE.XML_ozone.scm.names - value: {{ include "ozone.scm.pods" . 
}} -- name: OZONE-SITE.XML_ozone.om.address - value: {{ include "ozone.om.pods" . }} +- name: OZONE-SITE.XML_ozone.scm.ratis.enable + value: "true" +- name: OZONE-SITE.XML_ozone.scm.service.ids + value: {{ .Values.clusterId | quote }} +- name: OZONE-SITE.XML_ozone.scm.nodes.{{ .Values.clusterId }} + value: {{ include "ozone.scm.cluster.ids" . }} + {{/*- name: OZONE-SITE.XML_ozone.scm.skip.bootstrap.validation*/}} + {{/* value: {{ quote .Values.scm.skipBootstrapValidation }}*/}} +{{- range $i, $val := until ( .Values.scm.replicas | int ) }} +- name: {{ printf "OZONE-SITE.XML_ozone.scm.address.%s.%s-scm-%d" $.Values.clusterId $.Release.Name $i }} + value: {{ printf "%s-scm-%d.%s-scm-headless.%s.svc.cluster.local" $.Release.Name $i $.Release.Name $.Release.Namespace }} +{{- end }} +- name: OZONE-SITE.XML_ozone.scm.primordial.node.id + value: {{ printf "%s-scm-0" $.Release.Name }} +- name: OZONE-SITE.XML_ozone.om.ratis.enable + value: "true" +- name: OZONE-SITE.XML_ozone.om.service.ids + value: {{ .Values.clusterId | quote }} - name: OZONE-SITE.XML_hdds.scm.safemode.min.datanode value: "3" - name: OZONE-SITE.XML_ozone.datanode.pipeline.limit @@ -78,3 +172,49 @@ app.kubernetes.io/instance: {{ .Release.Name }} value: "{{- printf "%s-recon.%s.svc.cluster.local" $.Release.Name $.Release.Namespace }}:9891" {{- end }} {{- end }} + +{{/* Common configuration environment variables */}} +{{- define "ozone.configuration.env" -}} +{{- $bOmNodes := ternary (splitList "," (include "ozone.om.bootstrap.nodes" .)) (list) (ne "" (include "ozone.om.bootstrap.nodes" .)) }} +{{- $dOmNodes := ternary (splitList "," (include "ozone.om.decommissioned.nodes" .)) (list) (ne "" (include "ozone.om.decommissioned.nodes" .)) }} +{{- $activeOmNodes := ternary (splitList "," (include "ozone.om.cluster.ids" .)) (list) (ne "" (include "ozone.om.cluster.ids" .)) }} +{{ include "ozone.configuration.env.common" .
}} +{{- if gt (len $dOmNodes) 0 }} +{{- $decomIds := $dOmNodes | join "," }} +- name: OZONE-SITE.XML_ozone.om.decommissioned.nodes.{{ .Values.clusterId }} + value: {{ $decomIds }} +{{- else}} +- name: OZONE-SITE.XML_ozone.om.decommissioned.nodes.{{ .Values.clusterId }} + value: "" +{{- end }} +- name: OZONE-SITE.XML_ozone.om.nodes.{{ .Values.clusterId }} + value: {{ $activeOmNodes | join "," }} +{{- range $tempId := $activeOmNodes }} +- name: {{ printf "OZONE-SITE.XML_ozone.om.address.%s.%s" $.Values.clusterId $tempId }} + value: {{ printf "%s.%s-om-headless.%s.svc.cluster.local" $tempId $.Release.Name $.Release.Namespace }} +{{- end }} +{{- range $tempId := $dOmNodes }} +- name: {{ printf "OZONE-SITE.XML_ozone.om.address.%s.%s" $.Values.clusterId $tempId }} + value: {{ printf "%s-helm-manager-decommission-%s-svc.%s.svc.cluster.local" $.Release.Name $tempId $.Release.Namespace }} +{{- end }} +{{- end }} + +{{/* Common configuration environment variables for pre hook */}} +{{- define "ozone.configuration.env.prehook" -}} +{{- $bOmNodes := ternary (splitList "," (include "ozone.om.bootstrap.nodes" .)) (list) (ne "" (include "ozone.om.bootstrap.nodes" .)) }} +{{- $dOmNodes := ternary (splitList "," (include "ozone.om.decommissioned.nodes" .)) (list) (ne "" (include "ozone.om.decommissioned.nodes" .)) }} +{{- $activeOmNodes := ternary (splitList "," (include "ozone.om.cluster.ids" .)) (list) (ne "" (include "ozone.om.cluster.ids" .)) }} +{{- $allOmNodes := concat $activeOmNodes $dOmNodes }} +{{ include "ozone.configuration.env.common" . 
}} +- name: OZONE-SITE.XML_ozone.om.decommissioned.nodes.{{ .Values.clusterId }} + value: "" +{{- range $tempId := $allOmNodes }} +- name: {{ printf "OZONE-SITE.XML_ozone.om.address.%s.%s" $.Values.clusterId $tempId }} + value: {{ printf "%s.%s-om-headless.%s.svc.cluster.local" $tempId $.Release.Name $.Release.Namespace }} +{{- end }} +{{ $allOmNodes = append $allOmNodes "om-leader-transfer"}} +- name: OZONE-SITE.XML_ozone.om.nodes.{{ .Values.clusterId }} + value: {{ $allOmNodes | join "," }} +- name: "OZONE-SITE.XML_ozone.om.address.{{ .Values.clusterId }}.om-leader-transfer" + value: localhost +{{- end }} \ No newline at end of file diff --git a/charts/ozone/templates/datanode/datanode-service-headless.yaml b/charts/ozone/templates/datanode/datanode-service-headless.yaml index 375abb1..6c62959 100644 --- a/charts/ozone/templates/datanode/datanode-service-headless.yaml +++ b/charts/ozone/templates/datanode/datanode-service-headless.yaml @@ -28,6 +28,10 @@ spec: ports: - name: ui port: {{ .Values.datanode.service.port }} + - name: ratis-ipc + port: {{ .Values.datanode.service.ratisIpcPort }} + - name: ipc + port: {{ .Values.datanode.service.ipcPort }} selector: {{- include "ozone.selectorLabels" . 
| nindent 4 }} app.kubernetes.io/component: datanode diff --git a/charts/ozone/templates/datanode/datanode-statefulset.yaml b/charts/ozone/templates/datanode/datanode-statefulset.yaml index e47a8d4..5b9dfce 100644 --- a/charts/ozone/templates/datanode/datanode-statefulset.yaml +++ b/charts/ozone/templates/datanode/datanode-statefulset.yaml @@ -18,7 +18,7 @@ {{- $env := concat .Values.env .Values.datanode.env }} {{- $envFrom := concat .Values.envFrom .Values.datanode.envFrom }} -{{- $podAnnotations := mergeOverwrite (deepCopy .Values.podAnnotations) .Values.datanode.podAnnotations }} +{{- $podAnnotations := mergeOverwrite (deepCopy (default dict .Values.podAnnotations)) (default dict .Values.datanode.podAnnotations) }} {{- $nodeSelector := or .Values.datanode.nodeSelector .Values.nodeSelector }} {{- $affinity := or .Values.datanode.affinity .Values.affinity }} {{- $tolerations := or .Values.datanode.tolerations .Values.tolerations }} @@ -40,7 +40,7 @@ spec: template: metadata: annotations: - checksum/config: {{ include (print $.Template.BasePath "/ozone-configmap.yaml") . | sha256sum }} + checksum/config: {{ include (print $.Template.BasePath "/ozone-configmap.yaml") . | cat (include "ozone.configuration.env" .) | sha256sum }} {{- with $podAnnotations }} {{- toYaml . 
| nindent 8 }} {{- end }} @@ -69,6 +69,10 @@ spec: ports: - name: ui containerPort: {{ .Values.datanode.service.port }} + - name: ratis-ipc + containerPort: {{ .Values.datanode.service.ratisIpcPort }} + - name: ipc + containerPort: {{ .Values.datanode.service.ipcPort }} livenessProbe: httpGet: path: / diff --git a/charts/ozone/templates/helm/om-decommission-job.yaml b/charts/ozone/templates/helm/om-decommission-job.yaml new file mode 100644 index 0000000..45b2d98 --- /dev/null +++ b/charts/ozone/templates/helm/om-decommission-job.yaml @@ -0,0 +1,105 @@ +{{- if .Values.om.persistence.enabled }} +{{- $dnodes := ternary (splitList "," (include "ozone.om.decommissioned.nodes" .)) (list) (ne "" (include "ozone.om.decommissioned.nodes" .)) }} +{{- $env := concat .Values.env .Values.helm.env }} +{{- $envFrom := concat .Values.envFrom .Values.helm.envFrom }} +{{- $nodeSelector := or .Values.helm.nodeSelector .Values.nodeSelector }} +{{- $affinity := or .Values.helm.affinity .Values.affinity }} +{{- $tolerations := or .Values.helm.tolerations .Values.tolerations }} +{{- $securityContext := or .Values.helm.securityContext .Values.securityContext }} +{{- if (gt (len $dnodes) 0) }} +{{- range $dnode := $dnodes }} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ printf "%s-helm-manager-decommission-%s" $.Release.Name $dnode }} + labels: + {{- include "ozone.labels" $ | nindent 4 }} + app.kubernetes.io/component: helm-manager + annotations: + "helm.sh/hook": post-upgrade + "helm.sh/hook-weight": "0" + "helm.sh/hook-delete-policy": hook-succeeded, hook-failed +spec: + backoffLimit: {{ $.Values.helm.backoffLimit }} + template: + metadata: + labels: + {{- include "ozone.selectorLabels" $ | nindent 8 }} + app.kubernetes.io/component: helm-manager + spec: + containers: + - name: om-decommission + image: "{{ $.Values.image.repository }}:{{ $.Values.image.tag | default $.Chart.AppVersion }}" + imagePullPolicy: {{ $.Values.image.pullPolicy }} + {{- with $.Values.om.command }} 
+ command: {{- tpl (toYaml .) $ | nindent 12 }} + {{- end }} + args: + - sh + - -c + - | + set -e + decommission_finalizer() { + echo "Init decommission finalizer process..." + while true; do + IFS= read -r line; + echo "$line" + if echo "$line" | grep -q "Successfully decommissioned OM {{ $dnode }}"; then + echo "{{ $dnode }} was successfully decommissioned!" + if [ -d /old{{ $.Values.om.persistence.path }} ]; then + echo "Delete old data on pvc to enable rescheduling without manual PVC deletion!" + rm -rf /old{{ $.Values.om.persistence.path }}/* + echo "Data deleted!" + fi + break; + fi + done + echo "Decommission finalizer process finished!" + exit 0 + } + exec ozone admin om decommission -id={{ $.Values.clusterId }} -nodeid={{ $dnode }} -hostname={{ printf "%s-helm-manager-decommission-%s-svc.%s.svc.cluster.local" $.Release.Name $dnode $.Release.Namespace }} | decommission_finalizer + env: + {{- include "ozone.configuration.env" $ | nindent 12 }} + {{- with $env }} + {{- tpl (toYaml .) $ | nindent 12 }} + {{- end }} + {{- with $envFrom }} + envFrom: {{- tpl (toYaml .) $ | nindent 12 }} + {{- end }} + ports: + - name: om-rpc + containerPort: {{ $.Values.om.service.rpcPort }} + - name: om-ratis + containerPort: {{ $.Values.om.service.ratisPort }} + volumeMounts: + - name: config + mountPath: {{ $.Values.configuration.dir }} + - name: om-data + mountPath: {{ $.Values.om.persistence.path }} + - name: om-data-old + mountPath: /old{{ $.Values.om.persistence.path }} + {{- with $nodeSelector }} + nodeSelector: {{- toYaml . | nindent 8 }} + {{- end }} + {{- with $securityContext }} + securityContext: {{- toYaml . | nindent 8 }} + {{- end }} + volumes: + - name: om-data-old + persistentVolumeClaim: + claimName: {{ $.Release.Name}}-om-{{ $dnode }} + - name: om-data + emptyDir: { } + - name: config + projected: + sources: + - configMap: + name: {{ $.Release.Name }}-ozone + {{- with $.Values.configuration.filesFrom }} + {{- tpl (toYaml .) 
$ | nindent 14 }} + {{- end }} + restartPolicy: Never +{{- end }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/ozone/templates/helm/om-decommission-service.yaml b/charts/ozone/templates/helm/om-decommission-service.yaml new file mode 100644 index 0000000..466c5c5 --- /dev/null +++ b/charts/ozone/templates/helm/om-decommission-service.yaml @@ -0,0 +1,30 @@ +{{- if .Values.om.persistence.enabled }} +{{- $dnodes := ternary (splitList "," (include "ozone.om.decommissioned.nodes" .)) (list) (ne "" (include "ozone.om.decommissioned.nodes" .)) }} +{{- if (gt (len $dnodes) 0) }} +{{- range $dnode := $dnodes }} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ printf "%s-helm-manager-decommission-%s-svc" $.Release.Name $dnode }} + labels: + {{- include "ozone.labels" $ | nindent 4 }} + app.kubernetes.io/component: helm-manager + annotations: + "helm.sh/hook": post-upgrade + "helm.sh/hook-weight": "-10" + "helm.sh/hook-delete-policy": hook-succeeded, hook-failed +spec: + selector: + job-name: {{ printf "%s-helm-manager-decommission-%s" $.Release.Name $dnode }} + ports: + - name: rpc + port: {{ $.Values.om.service.rpcPort }} + targetPort: {{ $.Values.om.service.rpcPort }} + - name: ratis + port: {{ $.Values.om.service.ratisPort }} + targetPort: {{ $.Values.om.service.ratisPort }} + type: ClusterIP +{{- end }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/ozone/templates/helm/om-leader-transfer-job.yaml b/charts/ozone/templates/helm/om-leader-transfer-job.yaml new file mode 100644 index 0000000..673b47a --- /dev/null +++ b/charts/ozone/templates/helm/om-leader-transfer-job.yaml @@ -0,0 +1,84 @@ +{{- if .Values.om.persistence.enabled }} +{{- $dnodes := ternary (splitList "," (include "ozone.om.decommissioned.nodes" .)) (list) (ne "" (include "ozone.om.decommissioned.nodes" .)) }} +{{- $env := concat .Values.env .Values.helm.env }} +{{- $envFrom := concat .Values.envFrom .Values.helm.envFrom }} +{{- $nodeSelector := or 
.Values.helm.nodeSelector .Values.nodeSelector }} +{{- $affinity := or .Values.helm.affinity .Values.affinity }} +{{- $tolerations := or .Values.helm.tolerations .Values.tolerations }} +{{- $securityContext := or .Values.helm.securityContext .Values.securityContext }} +{{- if (gt (len $dnodes) 0) }} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ printf "%s-helm-manager-leader-transfer" $.Release.Name }} + labels: + {{- include "ozone.labels" $ | nindent 4 }} + app.kubernetes.io/component: helm-manager + annotations: + "helm.sh/hook": pre-upgrade + "helm.sh/hook-weight": "0" + "helm.sh/hook-delete-policy": hook-succeeded,hook-failed +spec: + backoffLimit: {{ $.Values.helm.backoffLimit }} + template: + metadata: + labels: + {{- include "ozone.selectorLabels" $ | nindent 8 }} + app.kubernetes.io/component: helm-manager + spec: + containers: + - name: om-leader-transfer + image: "{{ $.Values.image.repository }}:{{ $.Values.image.tag | default $.Chart.AppVersion }}" + imagePullPolicy: {{ $.Values.image.pullPolicy }} + {{- with $.Values.om.command }} + command: {{- tpl (toYaml .) $ | nindent 12 }} + {{- end }} + args: + - sh + - -c + - | + set -e + exec ozone admin om transfer -id={{ $.Values.clusterId }} -n={{ $.Release.Name }}-om-0 + env: + {{- include "ozone.configuration.env.prehook" $ | nindent 12 }} + {{- with $env }} + {{- tpl (toYaml .) $ | nindent 12 }} + {{- end }} + {{- with $envFrom }} + envFrom: + {{- tpl (toYaml .) $ | nindent 12 }} + {{- end }} + ports: + - name: om-rpc + containerPort: {{ $.Values.om.service.rpcPort }} + {{- if gt (int $.Values.om.replicas) 1 }} + - name: om-ratis + containerPort: {{ $.Values.om.service.ratisPort }} + {{- end }} + volumeMounts: + - name: config + mountPath: {{ $.Values.configuration.dir }} + - name: om-data + mountPath: {{ $.Values.om.persistence.path }} + {{- with $nodeSelector }} + nodeSelector: {{- toYaml . | nindent 8 }} + {{- end }} + {{- with $securityContext }} + securityContext: {{- toYaml . 
| nindent 8 }} + {{- end }} + volumes: + - name: om-data + emptyDir: { } + - name: config + projected: + sources: + - configMap: + name: {{ $.Release.Name }}-ozone + {{- with $.Values.configuration.filesFrom }} + {{- tpl (toYaml .) $ | nindent 14 }} + {{- end }} + restartPolicy: Never + +{{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/ozone/templates/om/om-bootstrap-configmap.yaml b/charts/ozone/templates/om/om-bootstrap-configmap.yaml new file mode 100644 index 0000000..141e19e --- /dev/null +++ b/charts/ozone/templates/om/om-bootstrap-configmap.yaml @@ -0,0 +1,98 @@ +{{- if and .Values.om.persistence.enabled (gt (len (ternary (splitList "," (include "ozone.om.bootstrap.nodes" .)) (list) (ne "" (include "ozone.om.bootstrap.nodes" .)))) 0) }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Release.Name }}-om-bootstrap-script + labels: + {{- include "ozone.labels" . | nindent 4 }} + app.kubernetes.io/component: om +data: + om-bootstrap.sh: |- + #!/bin/bash + set -e + + HELM_MANAGER_PATH="{{ .Values.om.persistence.path }}{{ .Values.helm.persistence.path }}" + HELM_MANAGER_BOOTSTRAPPED_FILE="$HELM_MANAGER_PATH/bootstrapped" + + # These are templated from Helm + OZONE_OM_ARGS_LIST="{{- range .Values.om.args }} {{ . }} {{- end }}" + OZONE_OM_BOOTSTRAP_NODES="{{ include "ozone.om.bootstrap.nodes" . }}" + OZONE_OM_CLUSTER_IDS="{{ include "ozone.om.cluster.ids" . }}" + OZONE_CLUSTER_ID="{{ .Values.clusterId }}" + + if [ -z "$OZONE_OM_BOOTSTRAP_NODES" ]; then + echo "No bootstrap handling needed!"
+ exit 0 + fi + + joinArr() { + local IFS="," + echo "$*" + } + + run_bootstrap() { + local overwriteCmd="$1" + local max_attempts=3 + local attempt=1 + local base_delay=5 + local exit_code=0 + + echo "Bootstrapping node config for this node: $overwriteCmd" + + while [ $attempt -le $max_attempts ]; do + echo "Bootstrap attempt $attempt of $max_attempts" + + if ozone admin om --set "ozone.om.nodes.$OZONE_CLUSTER_ID=$overwriteCmd" --bootstrap; then + echo "$HOSTNAME was successfully bootstrapped!" + mkdir -p "$HELM_MANAGER_PATH" + touch "$HELM_MANAGER_BOOTSTRAPPED_FILE" + exit 0 + else + exit_code=$? + echo "Bootstrap failed with exit code $exit_code, attempt $attempt of $max_attempts" + + if [ $attempt -lt $max_attempts ]; then + local delay=$((base_delay * (1 << (attempt - 1)))) + echo "Retrying in $delay seconds..." + sleep $delay + fi + + attempt=$((attempt + 1)) + fi + done + + echo "Bootstrap failed after $max_attempts attempts with exit code $exit_code" + exit 1 + } + + bootstrapHosts="$OZONE_OM_BOOTSTRAP_NODES" + echo "Need to handle bootstrap for nodes $bootstrapHosts" + + IFS=',' read -r -a hostArray <<< "$bootstrapHosts" + doBootstrap=false + nodesConfigOverwriteList=() + + for host in "${hostArray[@]}"; do + if [[ "$host" == "$HOSTNAME" ]]; then + doBootstrap=true + activeNodesConfig="$OZONE_OM_CLUSTER_IDS" + IFS=',' read -r -a overwriteArray <<< "$activeNodesConfig" + for overwriteHost in "${overwriteArray[@]}"; do + nodesConfigOverwriteList+=("$overwriteHost") + if [[ "$overwriteHost" == "$HOSTNAME" ]]; then + break; + fi + done + break + fi + done + + if [ "$doBootstrap" = true ] && [ ! -f "$HELM_MANAGER_BOOTSTRAPPED_FILE" ]; then + echo "$HOSTNAME must be started with bootstrap arg!" + overwriteCmd="$(joinArr "${nodesConfigOverwriteList[@]}")" + run_bootstrap "$overwriteCmd" + else + echo "$HOSTNAME must not be started with bootstrap arg, or is already bootstrapped."
+ exit 0 + fi +{{- end }} diff --git a/charts/ozone/templates/om/om-service-headless.yaml b/charts/ozone/templates/om/om-service-headless.yaml index d16659b..8aaccaa 100644 --- a/charts/ozone/templates/om/om-service-headless.yaml +++ b/charts/ozone/templates/om/om-service-headless.yaml @@ -28,6 +28,12 @@ spec: ports: - name: ui port: {{ .Values.om.service.port }} + - name: rpc + port: {{ .Values.om.service.rpcPort }} + {{- if gt (int .Values.om.replicas) 1 }} + - name: ratis + port: {{ .Values.om.service.ratisPort }} + {{- end }} selector: {{- include "ozone.selectorLabels" . | nindent 4 }} app.kubernetes.io/component: om diff --git a/charts/ozone/templates/om/om-statefulset.yaml b/charts/ozone/templates/om/om-statefulset.yaml index 379cec8..75f7d89 100644 --- a/charts/ozone/templates/om/om-statefulset.yaml +++ b/charts/ozone/templates/om/om-statefulset.yaml @@ -18,11 +18,12 @@ {{- $env := concat .Values.env .Values.om.env }} {{- $envFrom := concat .Values.envFrom .Values.om.envFrom }} -{{- $podAnnotations := mergeOverwrite (deepCopy .Values.podAnnotations) .Values.om.podAnnotations }} +{{- $podAnnotations := mergeOverwrite (deepCopy (default dict .Values.podAnnotations)) (default dict .Values.om.podAnnotations) }} {{- $nodeSelector := or .Values.om.nodeSelector .Values.nodeSelector }} {{- $affinity := or .Values.om.affinity .Values.affinity }} {{- $tolerations := or .Values.om.tolerations .Values.tolerations }} {{- $securityContext := or .Values.om.securityContext .Values.securityContext }} +{{- $bnodes := ternary (splitList "," (include "ozone.om.bootstrap.nodes" .)) (list) (ne "" (include "ozone.om.bootstrap.nodes" .)) }} apiVersion: apps/v1 kind: StatefulSet metadata: @@ -40,7 +41,7 @@ spec: template: metadata: annotations: - checksum/config: {{ include (print $.Template.BasePath "/ozone-configmap.yaml") . | sha256sum }} + checksum/config: {{ include (print $.Template.BasePath "/ozone-configmap.yaml") . | cat (include "ozone.configuration.env" .)
| sha256sum }} {{- with $podAnnotations }} {{- toYaml . | nindent 8 }} {{- end }} @@ -48,6 +49,28 @@ spec: {{- include "ozone.selectorLabels" . | nindent 8 }} app.kubernetes.io/component: om spec: + {{- if and .Values.om.persistence.enabled (gt (len $bnodes) 0) }} + initContainers: + - name: om-bootstrap + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: ["/bin/bash", "/scripts/om-bootstrap.sh"] + env: + {{- include "ozone.configuration.env" . | nindent 12 }} + {{- with $env }} + {{- tpl (toYaml .) $ | nindent 12 }} + {{- end }} + {{- with $envFrom }} + envFrom: {{- tpl (toYaml .) $ | nindent 12 }} + {{- end }} + volumeMounts: + - name: config + mountPath: {{ .Values.configuration.dir }} + - name: {{ .Release.Name }}-om + mountPath: {{ .Values.om.persistence.path }} + - name: om-bootstrap-script + mountPath: /scripts + {{- end }} containers: - name: om image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" @@ -55,13 +78,11 @@ spec: {{- with .Values.om.command }} command: {{- tpl (toYaml .) $ | nindent 12 }} {{- end }} - {{- with .Values.om.args }} - args: {{- tpl (toYaml .) $ | nindent 12 }} - {{- end }} + args: {{- tpl (toYaml .Values.om.args) $ | nindent 12 }} env: {{- include "ozone.configuration.env" .
| nindent 12 }} - name: WAITFOR - value: {{ $.Release.Name }}-scm-0.{{ $.Release.Name }}-scm-headless:9876 + value: {{ $.Release.Name }}-scm-0.{{ $.Release.Name }}-scm-headless:{{ .Values.scm.service.port }} - name: ENSURE_OM_INITIALIZED value: /data/metadata/om/current/VERSION {{- with $env }} @@ -72,9 +93,11 @@ spec: {{- end }} ports: - name: rpc - containerPort: 9862 + containerPort: {{ .Values.om.service.rpcPort }} - name: ui containerPort: {{ .Values.om.service.port }} + - name: ratis + containerPort: {{ .Values.om.service.ratisPort }} livenessProbe: httpGet: path: / @@ -101,6 +124,12 @@ spec: securityContext: {{- toYaml . | nindent 8 }} {{- end }} volumes: + {{- if and .Values.om.persistence.enabled (gt (len $bnodes) 0) }} + - name: om-bootstrap-script + configMap: + name: {{ .Release.Name }}-om-bootstrap-script + defaultMode: 0777 + {{- end }} - name: config projected: sources: @@ -111,7 +140,7 @@ spec: {{- end }} {{- if not .Values.om.persistence.enabled }} - name: {{ .Release.Name }}-om - emptyDir: {} + emptyDir: { } {{- end }} {{- if .Values.om.persistence.enabled }} volumeClaimTemplates: diff --git a/charts/ozone/templates/ozone-configmap.yaml b/charts/ozone/templates/ozone-configmap.yaml index dbee026..94f101a 100644 --- a/charts/ozone/templates/ozone-configmap.yaml +++ b/charts/ozone/templates/ozone-configmap.yaml @@ -21,5 +21,9 @@ kind: ConfigMap metadata: name: {{ .Release.Name }}-ozone labels: {{- include "ozone.labels" . 
| nindent 4 }} + annotations: + "helm.sh/hook": pre-upgrade, pre-install + "helm.sh/hook-weight": "-10" + "helm.sh/resource-policy": keep data: {{- tpl (toYaml .Values.configuration.files) $ | nindent 4 }} diff --git a/charts/ozone/templates/s3g/s3g-statefulset.yaml b/charts/ozone/templates/s3g/s3g-statefulset.yaml index 4a11f07..51f15fb 100644 --- a/charts/ozone/templates/s3g/s3g-statefulset.yaml +++ b/charts/ozone/templates/s3g/s3g-statefulset.yaml @@ -18,7 +18,7 @@ {{- $env := concat .Values.env .Values.s3g.env }} {{- $envFrom := concat .Values.envFrom .Values.s3g.envFrom }} -{{- $podAnnotations := mergeOverwrite (deepCopy .Values.podAnnotations) .Values.s3g.podAnnotations }} +{{- $podAnnotations := mergeOverwrite (deepCopy (default dict .Values.podAnnotations)) (default dict .Values.s3g.podAnnotations) }} {{- $nodeSelector := or .Values.s3g.nodeSelector .Values.nodeSelector }} {{- $affinity := or .Values.s3g.affinity .Values.affinity }} {{- $tolerations := or .Values.s3g.tolerations .Values.tolerations }} @@ -40,7 +40,7 @@ spec: template: metadata: annotations: - checksum/config: {{ include (print $.Template.BasePath "/ozone-configmap.yaml") . | sha256sum }} + checksum/config: {{ include (print $.Template.BasePath "/ozone-configmap.yaml") . | cat (include "ozone.configuration.env" .) | sha256sum }} {{- with $podAnnotations }} {{- toYaml . 
| nindent 8 }} {{- end }} diff --git a/charts/ozone/templates/scm/scm-service-headless.yaml b/charts/ozone/templates/scm/scm-service-headless.yaml index dce5857..d71c004 100644 --- a/charts/ozone/templates/scm/scm-service-headless.yaml +++ b/charts/ozone/templates/scm/scm-service-headless.yaml @@ -28,6 +28,18 @@ spec: ports: - name: ui port: {{ .Values.scm.service.port }} + - name: rpc-datanode + port: {{ .Values.scm.service.rpcDatanodePort }} + - name: block-client + port: {{ .Values.scm.service.blockClientPort }} + - name: rpc-client + port: {{ .Values.scm.service.rpcClientPort }} + {{- if gt (int .Values.scm.replicas) 1 }} + - name: ratis + port: {{ .Values.scm.service.ratisPort }} + - name: grpc + port: {{ .Values.scm.service.grpcPort }} + {{- end }} selector: {{- include "ozone.selectorLabels" . | nindent 4 }} app.kubernetes.io/component: scm diff --git a/charts/ozone/templates/scm/scm-statefulset.yaml b/charts/ozone/templates/scm/scm-statefulset.yaml index 27cf1f3..6c1d144 100644 --- a/charts/ozone/templates/scm/scm-statefulset.yaml +++ b/charts/ozone/templates/scm/scm-statefulset.yaml @@ -18,7 +18,7 @@ {{- $env := concat .Values.env .Values.scm.env }} {{- $envFrom := concat .Values.envFrom .Values.scm.envFrom }} -{{- $podAnnotations := mergeOverwrite (deepCopy .Values.podAnnotations) .Values.scm.podAnnotations }} +{{- $podAnnotations := mergeOverwrite (deepCopy (default dict .Values.podAnnotations)) (default dict .Values.scm.podAnnotations) }} {{- $nodeSelector := or .Values.scm.nodeSelector .Values.nodeSelector }} {{- $affinity := or .Values.scm.affinity .Values.affinity }} {{- $tolerations := or .Values.scm.tolerations .Values.tolerations }} @@ -32,6 +32,7 @@ metadata: app.kubernetes.io/component: scm spec: replicas: {{ .Values.scm.replicas }} + podManagementPolicy: Parallel serviceName: {{ .Release.Name }}-scm-headless selector: matchLabels: @@ -40,7 +41,7 @@ spec: template: metadata: annotations: - checksum/config: {{ include (print $.Template.BasePath 
"/ozone-configmap.yaml") . | sha256sum }} + checksum/config: {{ include (print $.Template.BasePath "/ozone-configmap.yaml") . | cat (include "ozone.configuration.env" .) | sha256sum }} {{- with $podAnnotations }} {{- toYaml . | nindent 8 }} {{- end }} @@ -65,6 +66,24 @@ spec: mountPath: {{ .Values.configuration.dir }} - name: {{ .Release.Name }}-scm mountPath: {{ .Values.scm.persistence.path }} + {{- if gt (int .Values.scm.replicas) 1 }} + - name: bootstrap + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + args: ["ozone", "scm", "--bootstrap"] + env: + {{- include "ozone.configuration.env" . | nindent 12 }} + {{- with $env }} + {{- tpl (toYaml .) $ | nindent 12 }} + {{- end }} + {{- with $envFrom }} + envFrom: {{- tpl (toYaml .) $ | nindent 12 }} + {{- end }} + volumeMounts: + - name: config + mountPath: {{ .Values.configuration.dir }} + - name: {{ .Release.Name }}-scm + mountPath: {{ .Values.scm.persistence.path }} + {{- end }} containers: - name: scm image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" @@ -85,11 +104,19 @@ spec: {{- end }} ports: - name: rpc-client - containerPort: 9860 + containerPort: {{ .Values.scm.service.rpcClientPort }} + - name: block-client + containerPort: {{ .Values.scm.service.blockClientPort }} - name: rpc-datanode - containerPort: 9861 + containerPort: {{ .Values.scm.service.rpcDatanodePort }} - name: ui containerPort: {{ .Values.scm.service.port }} + {{- if gt (int .Values.scm.replicas) 1 }} + - name: ratis + containerPort: {{ .Values.scm.service.ratisPort }} + - name: grpc + containerPort: {{ .Values.scm.service.grpcPort }} + {{- end }} livenessProbe: httpGet: path: / diff --git a/charts/ozone/values.yaml b/charts/ozone/values.yaml index a06ea87..d89032a 100644 --- a/charts/ozone/values.yaml +++ b/charts/ozone/values.yaml @@ -21,6 +21,9 @@ image: imagePullSecrets: [] +# Cluster ID +clusterId: cluster1 + # Common environment variables (templated) 
env: [] # Common envFrom items to set up environment variables (templated) @@ -89,6 +92,8 @@ datanode: service: type: ClusterIP port: 9882 + ratisIpcPort: 9858 + ipcPort: 9859 nodePort: ~ labels: {} annotations: {} @@ -109,7 +114,7 @@ datanode: # Ozone Manager configuration om: # Number of Ozone Manager replicas - replicas: 1 + replicas: 3 # Command to launch Ozone Manager (templated) command: ~ # Arguments to launch Ozone Manager (templated) @@ -125,7 +130,18 @@ om: # Constrain Ozone Manager pods to nodes with specific node labels nodeSelector: {} # Constrain Ozone Manager pods to nodes by affinity/anti-affinity rules - affinity: {} + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/component + operator: In + values: + - scm + topologyKey: kubernetes.io/hostname # Allow to schedule Ozone Manager pods on nodes with matching taints tolerations: [] # Ozone Manager security context (overwrites common security context) @@ -134,6 +150,8 @@ om: service: type: ClusterIP port: 9874 + ratisPort: 9872 + rpcPort: 9862 nodePort: ~ labels: {} annotations: {} @@ -151,6 +169,65 @@ om: # The name of a specific storage class name to use storageClassName: ~ +# Storage Container Manager configuration +scm: + # Number of Storage Container Manager replicas + replicas: 1 + # Command to launch Storage Container Manager (templated) + command: ~ + # Arguments to launch Storage Container Manager (templated) + args: ["ozone", "scm"] + # Additional Storage Container Manager environment variables (templated) + env: [] + # Additional Storage Container Manager envFrom items to set up environment variables (templated) + envFrom: [] + # Storage Container Manager resource requests and limits + resources: {} + # Constrain Storage Container Manager pods to nodes with specific node labels + nodeSelector: {} + # Constrain Storage Container Manager pods to nodes by 
affinity/anti-affinity rules + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/component + operator: In + values: + - om + topologyKey: kubernetes.io/hostname + # Allow to schedule Storage Container Manager pods on nodes with matching taints + tolerations: [] + # Storage Container Manager security context (overwrites common security context) + securityContext: {} + # Storage Container Manager service configuration + service: + type: ClusterIP + port: 9876 + rpcDatanodePort: 9861 + blockClientPort: 9863 + rpcClientPort: 9860 + ratisPort: 9894 + grpcPort: 9895 + nodePort: ~ + labels: {} + annotations: {} + # Storage Container Manager persistence + persistence: + # Enable persistence + enabled: false + # Persistence access modes + accessModes: + - ReadWriteOnce + # Path for Storage Container Manager volume mount + path: /data + # Volume size + size: 10Gi + # The name of a specific storage class name to use + storageClassName: ~ + # S3 Gateway configuration s3g: # Number of S3 Gateway replicas @@ -203,17 +280,11 @@ s3g: # The name of a specific storage class name to use storageClassName: ~ -# Storage Container Manager configuration -scm: - # Number of Storage Container Manager replicas - replicas: 1 - # Command to launch Storage Container Manager (templated) - command: ~ - # Arguments to launch Storage Container Manager (templated) - args: ["ozone", "scm"] - # Additional Storage Container Manager environment variables (templated) +# Helm Manager configuration +helm: + # Additional Helm Manager environment variables (templated) env: [] - # Additional Storage Container Manager envFrom items to set up environment variables (templated) + # Additional Helm Manager envFrom items to set up environment variables (templated) envFrom: [] # Storage Container Manager resource requests and limits resources: {} @@ -221,20 +292,19 @@ scm: podAnnotations: 
{} # Constrain Storage Container Manager pods to nodes with specific node labels nodeSelector: {} - # Constrain Storage Container Manager pods to nodes by affinity/anti-affinity rules + # Constrain Helm Manager pods to nodes by affinity/anti-affinity rules affinity: {} - # Allow to schedule Storage Container Manager pods on nodes with matching taints + # Allow to schedule Helm Manager pods on nodes with matching taints tolerations: [] - # Storage Container Manager security context (overwrites common security context) + # Helm Manager security context (overwrites common security context) securityContext: {} - # Storage Container Manager service configuration - service: - type: ClusterIP - port: 9876 - nodePort: ~ - labels: {} - annotations: {} - # Storage Container Manager persistence + # Decommissioning is handled with a post-upgrade helm hook job. + # To avoid endless retries of decommissioning, this limit is set. + # This can happen if PVC has been deleted or is not reachable. + # This is used for decommissioning OM + backoffLimit: 5 + # Helm Manager persistence (this is enabled automatically if at least one + # of datanode, scm or om is enabled) persistence: # Enable persistence enabled: false