Health monitoring agent chart: bump the default image tag and move the
non-NVIDIA instance types into their own DaemonSet.

Summary of the hunks below:
  * templates/_helpers.tpl: the default image tag used by the
    "health-monitoring-agent.imageUri" helper changes from
    1.0.790.0_1.0.266.0 to 1.0.819.0_1.0.267.0.
  * templates/health-monitoring-agent.yaml: ml.inf2.*, ml.trn1.32xlarge,
    ml.trn1n.32xlarge and ml.trn2.48xlarge are removed from the existing
    DaemonSet's instance-type node affinity, and a second DaemonSet named
    health-monitoring-agent-non-nvidia is appended whose node affinity lists
    exactly those seven instance types. The new DaemonSet uses the same image
    helper and the same service account (health-monitoring-agent), and sets
    NVIDIA_VISIBLE_DEVICES to "void" and NVIDIA_DRIVER_CAPABILITIES to "".
  * values.yaml and readme.md: the example image URIs are updated to the new
    tag; no other text changes.

The appended manifest ends with a trailing newline (the earlier revision of
this patch left the file without one).

NOTE(review): this patch was rebuilt from a copy whose line breaks and leading
whitespace had been collapsed. The indentation of the YAML and Markdown context
lines is inferred from the document structure, so confirm it against the target
files, or apply with "git apply --ignore-whitespace".

diff --git a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/_helpers.tpl b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/_helpers.tpl
index 06f1564d..968b6ad9 100644
--- a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/_helpers.tpl
+++ b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/_helpers.tpl
@@ -55,7 +55,7 @@ Generate the health monitoring agent image URI based on AWS region
 */}}
 {{- define "health-monitoring-agent.imageUri" -}}
 {{- $region := "" -}}
-{{- $imageTag := .Values.imageTag | default "1.0.790.0_1.0.266.0" -}}
+{{- $imageTag := .Values.imageTag | default "1.0.819.0_1.0.267.0" -}}
 
 {{/* Debug: Show image tag selection if debug is enabled */}}
 {{- if .Values.debug -}}
diff --git a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml
index 17c9a3d8..c7bee94c 100644
--- a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml
+++ b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml
@@ -85,12 +85,6 @@ spec:
                       - ml.g5.16xlarge
                       - ml.g5.24xlarge
                       - ml.g5.48xlarge
-                      - ml.inf2.xlarge
-                      - ml.inf2.8xlarge
-                      - ml.inf2.24xlarge
-                      - ml.inf2.48xlarge
-                      - ml.trn1.32xlarge
-                      - ml.trn1n.32xlarge
                       - ml.g6.xlarge
                       - ml.g6.2xlarge
                       - ml.g6.4xlarge
@@ -109,7 +103,6 @@ spec:
                       - ml.g6e.12xlarge
                       - ml.g6e.24xlarge
                       - ml.g6e.48xlarge
-                      - ml.trn2.48xlarge
                       - ml.p6-b200.48xlarge
                       - ml.p6e-gb200.36xlarge
       containers:
@@ -166,3 +159,93 @@ spec:
           operator: Exists
         - effect: NoExecute
           operator: Exists
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: health-monitoring-agent-non-nvidia
+  namespace: {{ .Values.namespace }}
+  labels:
+    app: health-monitoring-agent-non-nvidia
+spec:
+  selector:
+    matchLabels:
+      app: health-monitoring-agent-non-nvidia
+  template:
+    metadata:
+      labels:
+        app: health-monitoring-agent-non-nvidia
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: node.kubernetes.io/instance-type
+                    operator: In
+                    values:
+                      - ml.inf2.xlarge
+                      - ml.inf2.8xlarge
+                      - ml.inf2.24xlarge
+                      - ml.inf2.48xlarge
+                      - ml.trn1.32xlarge
+                      - ml.trn1n.32xlarge
+                      - ml.trn2.48xlarge
+      containers:
+        - name: health-monitoring-agent-non-nvidia
+          args:
+            - --enable-k8s-exporter=false
+            - --config.system-log-monitor=/config/system-message-monitor.json
+          image: {{ include "health-monitoring-agent.imageUri" . }}
+          resources:
+            limits:
+              cpu: 500m
+              memory: 512Mi
+            requests:
+              cpu: 500m
+              memory: 512Mi
+          imagePullPolicy: IfNotPresent
+          securityContext:
+            runAsUser: 1000
+            runAsGroup: 2000
+          env:
+            - name: NODE_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+            - name: NODE_IP
+              valueFrom:
+                fieldRef:
+                  fieldPath: status.hostIP
+            - name: NVIDIA_VISIBLE_DEVICES
+              value: "void"
+            - name: NVIDIA_DRIVER_CAPABILITIES
+              value: ""
+          volumeMounts:
+            - name: log
+              mountPath: /var/log
+            - name: kmsg
+              mountPath: /dev/kmsg
+              readOnly: true
+            # Make sure node problem detector is in the same timezone
+            # with the host.
+            - name: localtime
+              mountPath: /etc/localtime
+              readOnly: true
+      serviceAccountName: health-monitoring-agent
+      volumes:
+        - name: log
+          # Config `log` to your system log directory
+          hostPath:
+            path: /var/log/
+        - name: kmsg
+          hostPath:
+            path: /dev/kmsg
+        - name: localtime
+          hostPath:
+            path: /etc/localtime
+      tolerations:
+        - effect: NoSchedule
+          operator: Exists
+        - effect: NoExecute
+          operator: Exists
diff --git a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml
index ab98571d..4717b266 100644
--- a/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml
+++ b/helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml
@@ -25,7 +25,7 @@ imageTag: ""
 
 # Override the health monitoring agent image URI
 # If specified, this will override the automatic region-based URI selection
-# Example: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.790.0_1.0.266.0"
+# Example: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.819.0_1.0.267.0"
 hmaimage: ""
 
 # Enable debug output for region selection process
diff --git a/helm_chart/readme.md b/helm_chart/readme.md
index 15f1ba61..f357efd6 100644
--- a/helm_chart/readme.md
+++ b/helm_chart/readme.md
@@ -234,19 +234,19 @@ helm upgrade dependencies helm_chart/HyperPodHelmChart --namespace kube-system
 
 - **Supported Regions and their ECR URIs**:
   ```
-  us-east-1 (US East (N. Virginia)): 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.790.0_1.0.266.0
-  us-west-2 (US West (Oregon)): 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.790.0_1.0.266.0
-  us-east-2 (US East (Ohio)): 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.790.0_1.0.266.0
-  us-west-1 (US West (N. California)): 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.790.0_1.0.266.0
-  eu-central-1 (Europe (Frankfurt)): 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.790.0_1.0.266.0
-  eu-north-1 (Europe (Stockholm)): 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.790.0_1.0.266.0
-  eu-west-1 (Europe (Ireland)): 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.790.0_1.0.266.0
-  eu-west-2 (Europe (London)): 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.790.0_1.0.266.0
-  ap-northeast-1 (Asia Pacific (Tokyo)): 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.790.0_1.0.266.0
-  ap-south-1 (Asia Pacific (Mumbai)): 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.790.0_1.0.266.0
-  ap-southeast-1 (Asia Pacific (Singapore)): 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.790.0_1.0.266.0
-  ap-southeast-2 (Asia Pacific (Sydney)): 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.790.0_1.0.266.0
-  sa-east-1 (South America (São Paulo)): 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.790.0_1.0.266.0
+  us-east-1 (US East (N. Virginia)): 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.819.0_1.0.267.0
+  us-west-2 (US West (Oregon)): 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.819.0_1.0.267.0
+  us-east-2 (US East (Ohio)): 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.819.0_1.0.267.0
+  us-west-1 (US West (N. California)): 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.819.0_1.0.267.0
+  eu-central-1 (Europe (Frankfurt)): 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.819.0_1.0.267.0
+  eu-north-1 (Europe (Stockholm)): 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.819.0_1.0.267.0
+  eu-west-1 (Europe (Ireland)): 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.819.0_1.0.267.0
+  eu-west-2 (Europe (London)): 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.819.0_1.0.267.0
+  ap-northeast-1 (Asia Pacific (Tokyo)): 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.819.0_1.0.267.0
+  ap-south-1 (Asia Pacific (Mumbai)): 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.819.0_1.0.267.0
+  ap-southeast-1 (Asia Pacific (Singapore)): 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.819.0_1.0.267.0
+  ap-southeast-2 (Asia Pacific (Sydney)): 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.819.0_1.0.267.0
+  sa-east-1 (South America (São Paulo)): 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.819.0_1.0.267.0
   ```
 
 ## 7. Troubleshooting