From 0ebad71ea9cf915e313abfe0f857c48566cb06aa Mon Sep 17 00:00:00 2001 From: Ivan Kolodiazhnyi Date: Mon, 23 Sep 2024 13:24:41 +0300 Subject: [PATCH] feat: Add optional deployment for NVIDIA Networking NIC Configuration Operator Signed-off-by: Ivan Kolodiazhnyi --- deployment/network-operator/Chart.yaml | 4 + .../.helmignore | 23 ++ .../Chart.yaml | 6 + ....nvidia.com_nicconfigurationtemplates.yaml | 192 ++++++++++++ ...nfiguration.net.nvidia.com_nicdevices.yaml | 279 ++++++++++++++++++ .../templates/_helpers.tpl | 58 ++++ .../templates/config-daemon.yaml | 64 ++++ .../templates/operator.yaml | 56 ++++ .../templates/role.yaml | 99 +++++++ .../templates/role_binding.yaml | 16 + .../templates/serviceaccount.yaml | 12 + .../supported-nic-firmware-configmap.yaml | 13 + .../values.yaml | 52 ++++ deployment/network-operator/values.yaml | 17 ++ hack/release.go | 4 + hack/release.yaml | 8 + hack/templates/values/values.template | 17 ++ 17 files changed, 920 insertions(+) create mode 100644 deployment/network-operator/charts/nic-configuration-operator-chart/.helmignore create mode 100644 deployment/network-operator/charts/nic-configuration-operator-chart/Chart.yaml create mode 100644 deployment/network-operator/charts/nic-configuration-operator-chart/crds/configuration.net.nvidia.com_nicconfigurationtemplates.yaml create mode 100644 deployment/network-operator/charts/nic-configuration-operator-chart/crds/configuration.net.nvidia.com_nicdevices.yaml create mode 100644 deployment/network-operator/charts/nic-configuration-operator-chart/templates/_helpers.tpl create mode 100644 deployment/network-operator/charts/nic-configuration-operator-chart/templates/config-daemon.yaml create mode 100644 deployment/network-operator/charts/nic-configuration-operator-chart/templates/operator.yaml create mode 100644 deployment/network-operator/charts/nic-configuration-operator-chart/templates/role.yaml create mode 100644 
deployment/network-operator/charts/nic-configuration-operator-chart/templates/role_binding.yaml create mode 100644 deployment/network-operator/charts/nic-configuration-operator-chart/templates/serviceaccount.yaml create mode 100644 deployment/network-operator/charts/nic-configuration-operator-chart/templates/supported-nic-firmware-configmap.yaml create mode 100644 deployment/network-operator/charts/nic-configuration-operator-chart/values.yaml diff --git a/deployment/network-operator/Chart.yaml b/deployment/network-operator/Chart.yaml index 8a365e6d..e7fe335a 100644 --- a/deployment/network-operator/Chart.yaml +++ b/deployment/network-operator/Chart.yaml @@ -20,3 +20,7 @@ dependencies: name: sriov-network-operator repository: '' version: 0.1.0 +- condition: nicConfigurationOperator.enabled + name: nic-configuration-operator-chart + repository: '' + version: 0.0.1 diff --git a/deployment/network-operator/charts/nic-configuration-operator-chart/.helmignore b/deployment/network-operator/charts/nic-configuration-operator-chart/.helmignore new file mode 100644 index 00000000..0e8a0eb3 --- /dev/null +++ b/deployment/network-operator/charts/nic-configuration-operator-chart/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/deployment/network-operator/charts/nic-configuration-operator-chart/Chart.yaml b/deployment/network-operator/charts/nic-configuration-operator-chart/Chart.yaml new file mode 100644 index 00000000..64974caa --- /dev/null +++ b/deployment/network-operator/charts/nic-configuration-operator-chart/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: nic-configuration-operator-chart +description: A Helm chart for NIC Configuration Operator +type: application +version: 0.0.1 +appVersion: "latest" diff --git a/deployment/network-operator/charts/nic-configuration-operator-chart/crds/configuration.net.nvidia.com_nicconfigurationtemplates.yaml b/deployment/network-operator/charts/nic-configuration-operator-chart/crds/configuration.net.nvidia.com_nicconfigurationtemplates.yaml new file mode 100644 index 00000000..3802273f --- /dev/null +++ b/deployment/network-operator/charts/nic-configuration-operator-chart/crds/configuration.net.nvidia.com_nicconfigurationtemplates.yaml @@ -0,0 +1,192 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.14.0 + name: nicconfigurationtemplates.configuration.net.nvidia.com +spec: + group: configuration.net.nvidia.com + names: + kind: NicConfigurationTemplate + listKind: NicConfigurationTemplateList + plural: nicconfigurationtemplates + singular: nicconfigurationtemplate + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: NicConfigurationTemplate is the Schema for the nicconfigurationtemplates + API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. 
+ Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: Defines the desired state of NICs + properties: + nicSelector: + description: NIC selector configuration + properties: + nicType: + description: Type of the NIC to be selected, e.g. 101d,1015,a2d6 + etc. + type: string + pciAddresses: + description: Array of PCI addresses to be selected, e.g. "0000:03:00.0" + items: + type: string + type: array + serialNumbers: + description: Serial numbers of the NICs to be selected, e.g. 
MT2116X09299 + items: + type: string + type: array + required: + - nicType + type: object + nodeSelector: + additionalProperties: + type: string + description: NodeSelector contains labels required on the node + type: object + resetToDefault: + default: false + description: |- + ResetToDefault specifies whether node agent needs to perform a reset flow + The following operations will be performed: + * Nvconfig reset of all non-volatile configurations + - Mstconfig -d reset for each PF + - Mstconfig -d set ADVANCED_PCI_SETTINGS=1 + * Node reboot + - Applies new NIC NV config + - Will undo any runtime configuration previously performed for the device/driver + type: boolean + template: + description: Configuration template to be applied to matching devices + properties: + gpuDirectOptimized: + description: GPU Direct optimization settings + properties: + enabled: + description: Optimize GPU Direct + type: boolean + env: + description: GPU direct environment, e.g. Baremetal + type: string + required: + - enabled + - env + type: object + linkType: + description: LinkType to be configured, Ethernet|Infiniband + enum: + - Ethernet + - Infiniband + type: string + numVfs: + description: Number of VFs to be configured + type: integer + pciPerformanceOptimized: + description: PCI performance optimization settings + properties: + enabled: + description: Specifies whether to enable PCI performance optimization + type: boolean + maxAccOutRead: + description: Specifies the PCIe Max Accumulative Outstanding + read bytes + type: integer + maxReadRequest: + description: Specifies the size of a single PCI read request + in bytes + enum: + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + type: integer + required: + - enabled + type: object + rawNvConfig: + description: List of arbitrary nv config parameters + items: + properties: + name: + description: Name of the arbitrary nvconfig parameter + type: string + value: + description: Value of the arbitrary nvconfig parameter + type: 
string + required: + - name + - value + type: object + type: array + roceOptimized: + description: RoCE optimization settings + properties: + enabled: + description: Optimize RoCE + type: boolean + qos: + description: Quality of Service settings + properties: + pfc: + description: Priority-based Flow Control configuration, + e.g. "0,0,0,1,0,0,0,0" + pattern: ^([01],){7}[01]$ + type: string + trust: + description: Trust mode for QoS settings, e.g. trust-dscp + type: string + required: + - pfc + - trust + type: object + required: + - enabled + - qos + type: object + required: + - linkType + - numVfs + type: object + required: + - nicSelector + - template + type: object + status: + description: Defines the observed state of NicConfigurationTemplate + properties: + nicDevices: + description: NicDevice CRs matching this configuration template + items: + type: string + type: array + required: + - nicDevices + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/deployment/network-operator/charts/nic-configuration-operator-chart/crds/configuration.net.nvidia.com_nicdevices.yaml b/deployment/network-operator/charts/nic-configuration-operator-chart/crds/configuration.net.nvidia.com_nicdevices.yaml new file mode 100644 index 00000000..c0e4f2b2 --- /dev/null +++ b/deployment/network-operator/charts/nic-configuration-operator-chart/crds/configuration.net.nvidia.com_nicdevices.yaml @@ -0,0 +1,279 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.14.0 + name: nicdevices.configuration.net.nvidia.com +spec: + group: configuration.net.nvidia.com + names: + kind: NicDevice + listKind: NicDeviceList + plural: nicdevices + singular: nicdevice + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: NicDevice is the Schema for the nicdevices API + properties: + apiVersion: + description: |- + APIVersion 
defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: NicDeviceSpec defines the desired state of NicDevice + properties: + configuration: + description: Configuration specifies the configuration requested by + NicConfigurationTemplate + properties: + resetToDefault: + description: |- + ResetToDefault specifies whether node agent needs to perform a reset flow + The following operations will be performed: + * Nvconfig reset of all non-volatile configurations + - Mstconfig -d reset for each PF + - Mstconfig -d set ADVANCED_PCI_SETTINGS=1 + * Node reboot + - Applies new NIC NV config + - Will undo any runtime configuration previously performed for the device/driver + type: boolean + template: + description: Configuration template applied from the NicConfigurationTemplate + CR + properties: + gpuDirectOptimized: + description: GPU Direct optimization settings + properties: + enabled: + description: Optimize GPU Direct + type: boolean + env: + description: GPU direct environment, e.g. 
Baremetal + type: string + required: + - enabled + - env + type: object + linkType: + description: LinkType to be configured, Ethernet|Infiniband + enum: + - Ethernet + - Infiniband + type: string + numVfs: + description: Number of VFs to be configured + type: integer + pciPerformanceOptimized: + description: PCI performance optimization settings + properties: + enabled: + description: Specifies whether to enable PCI performance + optimization + type: boolean + maxAccOutRead: + description: Specifies the PCIe Max Accumulative Outstanding + read bytes + type: integer + maxReadRequest: + description: Specifies the size of a single PCI read request + in bytes + enum: + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + type: integer + required: + - enabled + type: object + rawNvConfig: + description: List of arbitrary nv config parameters + items: + properties: + name: + description: Name of the arbitrary nvconfig parameter + type: string + value: + description: Value of the arbitrary nvconfig parameter + type: string + required: + - name + - value + type: object + type: array + roceOptimized: + description: RoCE optimization settings + properties: + enabled: + description: Optimize RoCE + type: boolean + qos: + description: Quality of Service settings + properties: + pfc: + description: Priority-based Flow Control configuration, + e.g. "0,0,0,1,0,0,0,0" + pattern: ^([01],){7}[01]$ + type: string + trust: + description: Trust mode for QoS settings, e.g. 
trust-dscp + type: string + required: + - pfc + - trust + type: object + required: + - enabled + - qos + type: object + required: + - linkType + - numVfs + type: object + type: object + type: object + status: + description: NicDeviceStatus defines the observed state of NicDevice + properties: + conditions: + description: List of conditions observed for the device + items: + description: "Condition contains details for one aspect of the current + state of this API Resource.\n---\nThis struct is intended for + direct use as an array at the field path .status.conditions. For + example,\n\n\n\ttype FooStatus struct{\n\t // Represents the + observations of a foo's current state.\n\t // Known .status.conditions.type + are: \"Available\", \"Progressing\", and \"Degraded\"\n\t // + +patchMergeKey=type\n\t // +patchStrategy=merge\n\t // +listType=map\n\t + \ // +listMapKey=type\n\t Conditions []metav1.Condition `json:\"conditions,omitempty\" + patchStrategy:\"merge\" patchMergeKey:\"type\" protobuf:\"bytes,1,rep,name=conditions\"`\n\n\n\t + \ // other fields\n\t}" + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. 
+ format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: |- + type of condition in CamelCase or in foo.example.com/CamelCase. + --- + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be + useful (see .node.status.conditions), the ability to deconflict is important. + The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + firmwareVersion: + description: Firmware version currently installed on the device, e.g. + 22.31.1014 + type: string + node: + description: Node where the device is located + type: string + partNumber: + description: Part number of the device, e.g. MCX713106AEHEA_QP1 + type: string + ports: + description: List of ports for the device + items: + description: NicDevicePortSpec describes the ports of the NIC + properties: + networkInterface: + description: NetworkInterface is the name of the network interface + for this port, e.g. eth1 + type: string + pci: + description: PCI is a PCI address of the port, e.g. 
0000:3b:00.0 + type: string + rdmaInterface: + description: RdmaInterface is the name of the rdma interface + for this port, e.g. mlx5_1 + type: string + required: + - pci + type: object + type: array + psid: + description: Product Serial ID of the device, e.g. MT_0000000221 + type: string + serialNumber: + description: Serial number of the device, e.g. MT2116X09299 + type: string + type: + description: Type of device, e.g. ConnectX7 + type: string + required: + - firmwareVersion + - node + - partNumber + - ports + - psid + - serialNumber + - type + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/deployment/network-operator/charts/nic-configuration-operator-chart/templates/_helpers.tpl b/deployment/network-operator/charts/nic-configuration-operator-chart/templates/_helpers.tpl new file mode 100644 index 00000000..9c367f0b --- /dev/null +++ b/deployment/network-operator/charts/nic-configuration-operator-chart/templates/_helpers.tpl @@ -0,0 +1,58 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "nic-configuration-operator.name" -}} +{{- default "nic-configuration-operator" .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "nic-configuration-operator.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default "nic-configuration-operator" .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. 
+*/}}
+{{- define "nic-configuration-operator.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Common labels
+*/}}
+{{- define "nic-configuration-operator.labels" -}}
+helm.sh/chart: {{ include "nic-configuration-operator.chart" . }}
+{{ include "nic-configuration-operator.selectorLabels" . }}
+{{- if .Chart.AppVersion }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
+
+{{/*
+Selector labels
+*/}}
+{{- define "nic-configuration-operator.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "nic-configuration-operator.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end }}
+
+{{/*
+Create the name of the service account to use
+*/}}
+{{- define "nic-configuration-operator.serviceAccountName" -}}
+{{- include "nic-configuration-operator.fullname" . }}
+{{- end }}
diff --git a/deployment/network-operator/charts/nic-configuration-operator-chart/templates/config-daemon.yaml b/deployment/network-operator/charts/nic-configuration-operator-chart/templates/config-daemon.yaml
new file mode 100644
index 00000000..1412d713
--- /dev/null
+++ b/deployment/network-operator/charts/nic-configuration-operator-chart/templates/config-daemon.yaml
@@ -0,0 +1,64 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nic-configuration-daemon
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app.kubernetes.io/name: nic-configuration-daemon
+    app.kubernetes.io/created-by: nic-configuration-operator
+    app.kubernetes.io/part-of: nic-configuration-operator
+    {{- include "nic-configuration-operator.labels" . | nindent 4 }}
+spec:
+  selector:
+    matchLabels:
+      control-plane: nic-configuration-daemon
+      {{- include "nic-configuration-operator.selectorLabels" . | nindent 6 }}
+  template:
+    metadata:
+      annotations:
+        kubectl.kubernetes.io/default-container: nic-configuration-daemon
+      labels:
+        control-plane: nic-configuration-daemon
+        {{- include "nic-configuration-operator.selectorLabels" . | nindent 8 }}
+    spec:
+      nodeSelector: {{- toYaml .Values.configDaemon.nodeSelector | nindent 8 }}
+      serviceAccountName: {{ include "nic-configuration-operator.serviceAccountName" . }}
+      terminationGracePeriodSeconds: 10
+      hostNetwork: true
+      hostPID: true
+      priorityClassName: system-node-critical
+      containers:
+        - image: "{{ .Values.configDaemon.image.repository }}/{{ .Values.configDaemon.image.name }}:{{ .Values.configDaemon.image.tag | default .Chart.AppVersion }}"
+          name: nic-configuration-daemon
+          securityContext:
+            privileged: true  # daemon writes NIC configuration through the host /sys and /proc mounts below
+          resources: {{- toYaml .Values.configDaemon.resources | nindent 12 }}
+          env:
+            - name: NODE_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+            - name: NAMESPACE
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.namespace
+          volumeMounts:
+            - name: sys
+              mountPath: /sys
+              readOnly: false
+            - name: proc
+              mountPath: /proc
+              readOnly: false
+            - name: host
+              mountPath: /host
+              readOnly: true
+      volumes:
+        - name: sys
+          hostPath:
+            path: /sys
+        - name: proc
+          hostPath:
+            path: /proc
+        - name: host
+          hostPath:
+            path: /
diff --git a/deployment/network-operator/charts/nic-configuration-operator-chart/templates/operator.yaml b/deployment/network-operator/charts/nic-configuration-operator-chart/templates/operator.yaml
new file mode 100644
index 00000000..8b8b90d6
--- /dev/null
+++ b/deployment/network-operator/charts/nic-configuration-operator-chart/templates/operator.yaml
@@ -0,0 +1,56 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "nic-configuration-operator.fullname" .
}}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app.kubernetes.io/component: manager
+    app.kubernetes.io/created-by: nic-configuration-operator
+    app.kubernetes.io/part-of: nic-configuration-operator
+    {{- include "nic-configuration-operator.labels" . | nindent 4 }}
+spec:
+  replicas: {{ .Values.operator.replicas }}  # values.yaml defines operator.replicas; there is no top-level replicaCount
+  selector:
+    matchLabels:
+      control-plane: {{ .Release.Name }}-controller-manager
+      {{- include "nic-configuration-operator.selectorLabels" . | nindent 6 }}
+  template:
+    metadata:
+      labels:
+        control-plane: {{ .Release.Name }}-controller-manager
+        {{- include "nic-configuration-operator.selectorLabels" . | nindent 8 }}
+      annotations:
+        kubectl.kubernetes.io/default-container: manager
+    spec:
+      tolerations: {{- toYaml .Values.operator.tolerations | nindent 8 }}
+      nodeSelector: {{- toYaml .Values.operator.nodeSelector | nindent 8 }}
+      affinity: {{- toYaml .Values.operator.affinity | nindent 8 }}
+      imagePullSecrets: {{ .Values.imagePullSecrets | default list | toJson }}
+      securityContext:
+        runAsNonRoot: true
+      serviceAccountName: {{ include "nic-configuration-operator.serviceAccountName" . }}
+      terminationGracePeriodSeconds: 10
+      containers:
+        - name: manager
+          command:
+            - /manager
+          image: "{{ .Values.operator.image.repository }}/{{ .Values.operator.image.name }}:{{ .Values.operator.image.tag | default .Chart.AppVersion }}"
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop:
+                - ALL
+          livenessProbe:
+            httpGet:
+              path: /healthz
+              port: 8081
+            initialDelaySeconds: 15
+            periodSeconds: 20
+          readinessProbe:
+            httpGet:
+              path: /readyz
+              port: 8081
+            initialDelaySeconds: 5
+            periodSeconds: 10
+          resources:  # limits/requests come from operator.resources in values.yaml
+            {{- toYaml .Values.operator.resources | nindent 12 }}
diff --git a/deployment/network-operator/charts/nic-configuration-operator-chart/templates/role.yaml b/deployment/network-operator/charts/nic-configuration-operator-chart/templates/role.yaml
new file mode 100644
index 00000000..e1f46984
--- /dev/null
+++ b/deployment/network-operator/charts/nic-configuration-operator-chart/templates/role.yaml
@@ -0,0 +1,99 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ include "nic-configuration-operator.fullname" . }}-role
+  labels:
+    {{- include "nic-configuration-operator.labels" .
| nindent 4}}
+rules:
+- apiGroups:
+  - ""
+  resources:
+  - nodes
+  verbs:
+  - get
+  - list
+  - patch
+  - update
+  - watch
+- apiGroups:
+  - ""
+  resources:
+  - pods
+  verbs:
+  - list
+- apiGroups:
+  - ""
+  resources:
+  - pods/eviction
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - update
+  - watch
+- apiGroups:
+  - configuration.net.nvidia.com
+  resources:
+  - nicconfigurationtemplates
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - update
+  - watch
+- apiGroups:
+  - configuration.net.nvidia.com
+  resources:
+  - nicconfigurationtemplates/finalizers
+  verbs:
+  - update
+- apiGroups:
+  - configuration.net.nvidia.com
+  resources:
+  - nicconfigurationtemplates/status
+  verbs:
+  - get
+  - patch
+  - update
+- apiGroups:
+  - configuration.net.nvidia.com
+  resources:
+  - nicdevices
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - update
+  - watch
+- apiGroups:
+  - configuration.net.nvidia.com
+  resources:
+  - nicdevices/finalizers
+  verbs:
+  - update
+- apiGroups:
+  - configuration.net.nvidia.com
+  resources:
+  - nicdevices/status
+  verbs:
+  - get
+  - patch
+  - update
+- apiGroups:
+  - maintenance.nvidia.com
+  resources:
+  - nodemaintenances
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - update
+  - watch
diff --git a/deployment/network-operator/charts/nic-configuration-operator-chart/templates/role_binding.yaml b/deployment/network-operator/charts/nic-configuration-operator-chart/templates/role_binding.yaml
new file mode 100644
index 00000000..0d10563b
--- /dev/null
+++ b/deployment/network-operator/charts/nic-configuration-operator-chart/templates/role_binding.yaml
@@ -0,0 +1,16 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  labels:
+    app.kubernetes.io/component: rbac
+    app.kubernetes.io/created-by: nic-configuration-operator
+    app.kubernetes.io/part-of: nic-configuration-operator
+  name: {{ include "nic-configuration-operator.fullname" . }}-rolebinding
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: {{ include "nic-configuration-operator.fullname" . }}-role
+subjects:
+- kind: ServiceAccount
+  name: {{ include "nic-configuration-operator.serviceAccountName" . }}  # same helper the ServiceAccount and pod specs use
+  namespace: {{ .Release.Namespace }}
diff --git a/deployment/network-operator/charts/nic-configuration-operator-chart/templates/serviceaccount.yaml b/deployment/network-operator/charts/nic-configuration-operator-chart/templates/serviceaccount.yaml
new file mode 100644
index 00000000..4f8e5caf
--- /dev/null
+++ b/deployment/network-operator/charts/nic-configuration-operator-chart/templates/serviceaccount.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ include "nic-configuration-operator.serviceAccountName" . }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app.kubernetes.io/component: rbac
+    app.kubernetes.io/created-by: nic-configuration-operator
+    app.kubernetes.io/part-of: nic-configuration-operator
+    {{- include "nic-configuration-operator.labels" .
| nindent 4 }} + annotations: + {{- toYaml .Values.operator.serviceAccount.annotations | nindent 4 }} diff --git a/deployment/network-operator/charts/nic-configuration-operator-chart/templates/supported-nic-firmware-configmap.yaml b/deployment/network-operator/charts/nic-configuration-operator-chart/templates/supported-nic-firmware-configmap.yaml new file mode 100644 index 00000000..84f9a33b --- /dev/null +++ b/deployment/network-operator/charts/nic-configuration-operator-chart/templates/supported-nic-firmware-configmap.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: supported-nic-firmware +data: + Nvidia_mlx5_ConnectX-4: "1013 24.07-0.6.1 12.28.2006" + Nvidia_mlx5_ConnectX-5: "1017 24.07-0.6.1 16.35.4030" + Nvidia_mlx5_ConnectX-5_Ex: "1019 24.07-0.6.1 16.35.4030" + Nvidia_mlx5_ConnectX-6: "101b 24.07-0.6.1 20.42.1000" + Nvidia_mlx5_ConnectX-6_Dx: "101d 24.07-0.6.1 22.42.1000" + Nvidia_mlx5_ConnectX-6_Lx: "101f 24.07-0.6.1 26.42.1000" + Nvidia_mlx5_ConnectX-7: "1021 24.07-0.6.1 28.42.1000" + Nvidia_mlx5_MT42822_BlueField-2_integrated_ConnectX-6_Dx: "a2d6 24.07-0.6.1 24.42.1000" diff --git a/deployment/network-operator/charts/nic-configuration-operator-chart/values.yaml b/deployment/network-operator/charts/nic-configuration-operator-chart/values.yaml new file mode 100644 index 00000000..74989726 --- /dev/null +++ b/deployment/network-operator/charts/nic-configuration-operator-chart/values.yaml @@ -0,0 +1,52 @@ +operator: + image: + repository: ghcr.io/mellanox + name: nic-configuration-operator + tag: latest + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Exists" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/control-plane" + operator: "Exists" + effect: "NoSchedule" + nodeSelector: {} + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: "node-role.kubernetes.io/master" + operator: Exists + - weight: 1 + preference: + 
matchExpressions: + - key: "node-role.kubernetes.io/control-plane" + operator: Exists + resources: + limits: + cpu: 500m + memory: 128Mi + requests: + cpu: 10m + memory: 64Mi + replicas: 1 + serviceAccount: + annotations: {} + +configDaemon: + image: + repository: ghcr.io/mellanox + name: nic-configuration-operator-daemon + tag: latest + nodeSelector: {} + resources: + limits: + cpu: 500m + memory: 128Mi + requests: + cpu: 10m + memory: 64Mi + +imagePullSecrets: [] diff --git a/deployment/network-operator/values.yaml b/deployment/network-operator/values.yaml index f7c814c9..4cf5d394 100644 --- a/deployment/network-operator/values.yaml +++ b/deployment/network-operator/values.yaml @@ -29,6 +29,10 @@ sriovNetworkOperator: # -- Deploy SR-IOV Network Operator. enabled: false +nicConfigurationOperator: + # -- Deploy NIC Configuration Operator. + enabled: false + # Set both enableNodeFeatureApi and NodeFeatureAPI feature gate to false to disable. node-feature-discovery: # -- The Node Feature API enable communication between nfd master and worker @@ -164,6 +168,19 @@ sriov-network-operator: beta.kubernetes.io/os: "linux" network.nvidia.com/operator.mofed.wait: "false" +# Nic Configuration Operator chart related values. 
+nic-configuration-operator-chart: + operator: + image: + repository: ghcr.io/mellanox + name: nic-configuration-operator + tag: v0.1.1 + configDaemon: + image: + repository: ghcr.io/mellanox + name: nic-configuration-operator-daemon + tag: v0.1.1 + # General Operator related values # The operator element allows to deploy network operator from an alternate location operator: diff --git a/hack/release.go b/hack/release.go index 2311c27c..5576117b 100644 --- a/hack/release.go +++ b/hack/release.go @@ -75,6 +75,8 @@ type Release struct { DOCATelemetryService *ReleaseImageSpec OVSCni *ReleaseImageSpec RDMACni *ReleaseImageSpec + NicConfigurationOperator *ReleaseImageSpec + NicConfigurationConfigDaemon *ReleaseImageSpec } func readDefaults(releaseDefaults string) Release { @@ -123,6 +125,8 @@ func readEnvironmentVariables(release *Release) { initWithEnvVariale("DOCA_TELEMETRY_SERVICE", release.DOCATelemetryService) initWithEnvVariale("OVS_CNI", release.OVSCni) initWithEnvVariale("RDMA_CNI", release.RDMACni) + initWithEnvVariale("NIC_CONFIGURATION_OPERATOR", release.NicConfigurationOperator) + initWithEnvVariale("NIC_CONFIGURATION_CONFIG_DAEMON", release.NicConfigurationConfigDaemon) } func main() { diff --git a/hack/release.yaml b/hack/release.yaml index 83b49a28..8fb8341e 100644 --- a/hack/release.yaml +++ b/hack/release.yaml @@ -78,3 +78,11 @@ rdmaCni: image: rdma-cni repository: ghcr.io/k8snetworkplumbingwg version: v1.2.0 +nicConfigurationOperator: + image: nic-configuration-operator + repository: ghcr.io/mellanox + version: v0.1.1 +nicConfigurationConfigDaemon: + image: nic-configuration-operator-daemon + repository: ghcr.io/mellanox + version: v0.1.1 diff --git a/hack/templates/values/values.template b/hack/templates/values/values.template index 4f25331d..7de0e6cd 100644 --- a/hack/templates/values/values.template +++ b/hack/templates/values/values.template @@ -29,6 +29,10 @@ sriovNetworkOperator: # -- Deploy SR-IOV Network Operator. 
enabled: false +nicConfigurationOperator: + # -- Deploy NIC Configuration Operator. + enabled: false + # Set both enableNodeFeatureApi and NodeFeatureAPI feature gate to false to disable. node-feature-discovery: # -- The Node Feature API enable communication between nfd master and worker @@ -164,6 +168,19 @@ sriov-network-operator: beta.kubernetes.io/os: "linux" network.nvidia.com/operator.mofed.wait: "false" +# Nic Configuration Operator chart related values. +nic-configuration-operator-chart: + operator: + image: + repository: {{ .NicConfigurationOperator.Repository }} + name: {{ .NicConfigurationOperator.Image }} + tag: {{ .NicConfigurationOperator.Version }} + configDaemon: + image: + repository: {{ .NicConfigurationConfigDaemon.Repository }} + name: {{ .NicConfigurationConfigDaemon.Image }} + tag: {{ .NicConfigurationConfigDaemon.Version }} + # General Operator related values # The operator element allows to deploy network operator from an alternate location operator: