diff --git a/charts/kubeslice-controller-egs/Chart.yaml b/charts/kubeslice-controller-egs/Chart.yaml index 7817767..9175e4c 100644 --- a/charts/kubeslice-controller-egs/Chart.yaml +++ b/charts/kubeslice-controller-egs/Chart.yaml @@ -38,5 +38,5 @@ dependencies: repository: file://./charts/prometheus version: 25.*.* condition: kubeslice.prometheus.enabled -version: 0.7.2 -appVersion: 0.7.2 +version: 0.8.0 +appVersion: 0.8.0 diff --git a/charts/kubeslice-controller-egs/templates/crd.clustergpuallocations.inventory.kubeslice.io.yaml b/charts/kubeslice-controller-egs/templates/crd.clustergpuallocations.inventory.kubeslice.io.yaml index 2f2ddba..6c8596b 100644 --- a/charts/kubeslice-controller-egs/templates/crd.clustergpuallocations.inventory.kubeslice.io.yaml +++ b/charts/kubeslice-controller-egs/templates/crd.clustergpuallocations.inventory.kubeslice.io.yaml @@ -1,3 +1,4 @@ +--- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: @@ -102,9 +103,54 @@ spec: type: integer type: object type: array + unmanagedNodeInventory: + items: + properties: + cloudProvider: + type: string + free: + type: integer + gpuModelName: + type: string + gpuNodeShape: + type: string + gpuPowerThreshold: + type: string + gpuSharingType: + type: string + gpuSlicingProfile: + items: + properties: + memory: + type: string + numGPUs: + type: integer + profileName: + type: string + type: object + type: array + gpuTempThreshold: + type: string + instanceType: + type: string + memory: + type: integer + nodeHealth: + enum: + - Healthy + - Unhealthy + type: string + nodeName: + type: string + region: + type: string + totalGPUs: + type: integer + type: object + type: array type: object type: object served: true storage: true subresources: - status: {} \ No newline at end of file + status: {} diff --git a/charts/kubeslice-controller-egs/templates/crd.workerclustergpuallocations.inventory.kubeslice.io.yaml 
b/charts/kubeslice-controller-egs/templates/crd.workerclustergpuallocations.inventory.kubeslice.io.yaml index 48777a9..2cb9c5f 100644 --- a/charts/kubeslice-controller-egs/templates/crd.workerclustergpuallocations.inventory.kubeslice.io.yaml +++ b/charts/kubeslice-controller-egs/templates/crd.workerclustergpuallocations.inventory.kubeslice.io.yaml @@ -1,3 +1,4 @@ +--- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: @@ -103,9 +104,54 @@ spec: type: integer type: object type: array + unmanagedNodeInventory: + items: + properties: + cloudProvider: + type: string + free: + type: integer + gpuModelName: + type: string + gpuNodeShape: + type: string + gpuPowerThreshold: + type: string + gpuSharingType: + type: string + gpuSlicingProfile: + items: + properties: + memory: + type: string + numGPUs: + type: integer + profileName: + type: string + type: object + type: array + gpuTempThreshold: + type: string + instanceType: + type: string + memory: + type: integer + nodeHealth: + enum: + - Healthy + - Unhealthy + type: string + nodeName: + type: string + region: + type: string + totalGPUs: + type: integer + type: object + type: array type: object type: object served: true storage: true subresources: - status: {} \ No newline at end of file + status: {} diff --git a/charts/kubeslice-controller-egs/templates/gpr-manager-deployment.yaml b/charts/kubeslice-controller-egs/templates/gpr-manager-deployment.yaml index ed70596..1f19bfa 100644 --- a/charts/kubeslice-controller-egs/templates/gpr-manager-deployment.yaml +++ b/charts/kubeslice-controller-egs/templates/gpr-manager-deployment.yaml @@ -3,7 +3,7 @@ kind: Deployment metadata: labels: control-plane: gpr-controller-manager - name: gpr-manager + name: egs-gpr-manager namespace: {{ .Release.Namespace }} spec: replicas: 1 @@ -35,6 +35,8 @@ spec: value: "{{ .Values.egs.gprManager.qmgrCheckPeriod }}" - name: MAX_DELAYED_COUNT value: "{{ .Values.egs.gprManager.maxDelayedCount }}" + - name: 
WAIT_TIME_REFRESH_PERIOD + value: "{{ .Values.egs.gprManager.refreshWaitTimeInterval }}" image: "{{ .Values.global.imageRegistry }}/{{ .Values.egs.gprManager.image }}:{{ .Values.egs.gprManager.tag }}" imagePullPolicy: "{{ .Values.egs.gprManager.pullPolicy }}" livenessProbe: @@ -75,3 +77,9 @@ spec: secret: defaultMode: 420 secretName: gpr-admission-webhook-certs + {{- if and .Values.imagePullSecrets .Values.imagePullSecrets.repository .Values.imagePullSecrets.username .Values.imagePullSecrets.password }} + imagePullSecrets: + - name: kubeslice-image-pull-secret + {{- end }} + tolerations: + - operator: "Exists" # no key or effect given: this toleration matches every taint diff --git a/charts/kubeslice-controller-egs/templates/gpr-manager-service.yaml b/charts/kubeslice-controller-egs/templates/gpr-manager-service.yaml index 0132de6..22116c8 100644 --- a/charts/kubeslice-controller-egs/templates/gpr-manager-service.yaml +++ b/charts/kubeslice-controller-egs/templates/gpr-manager-service.yaml @@ -3,7 +3,7 @@ kind: Service metadata: labels: app: gpr-manager - name: gpr-manager + name: egs-gpr-manager namespace: {{ .Release.Namespace }} spec: ports: diff --git a/charts/kubeslice-controller-egs/templates/inventory-manager-deployment.yaml b/charts/kubeslice-controller-egs/templates/inventory-manager-deployment.yaml index f51ab43..93c392c 100644 --- a/charts/kubeslice-controller-egs/templates/inventory-manager-deployment.yaml +++ b/charts/kubeslice-controller-egs/templates/inventory-manager-deployment.yaml @@ -3,7 +3,7 @@ kind: Deployment metadata: labels: control-plane: inventory-controller-manager - name: inventory-controller-manager + name: egs-inventory-controller-manager namespace: {{ .Release.Namespace }} spec: replicas: 1 @@ -60,3 +60,9 @@ spec: runAsNonRoot: true serviceAccountName: inventory-manager-access terminationGracePeriodSeconds: 10 + {{- if and .Values.imagePullSecrets .Values.imagePullSecrets.repository .Values.imagePullSecrets.username .Values.imagePullSecrets.password }} + imagePullSecrets: + - name: 
kubeslice-image-pull-secret + {{- end }} + tolerations: + - operator: "Exists" # no key or effect given: this toleration matches every taint diff --git a/charts/kubeslice-controller-egs/templates/inventory-manager-service.yaml b/charts/kubeslice-controller-egs/templates/inventory-manager-service.yaml index c6049c8..bb6b321 100644 --- a/charts/kubeslice-controller-egs/templates/inventory-manager-service.yaml +++ b/charts/kubeslice-controller-egs/templates/inventory-manager-service.yaml @@ -3,7 +3,7 @@ kind: Service metadata: labels: app: inventory-manager - name: inventory-manager + name: egs-inventory-manager namespace: {{ .Release.Namespace }} spec: ports: diff --git a/charts/kubeslice-controller-egs/templates/queue-manager.yaml b/charts/kubeslice-controller-egs/templates/queue-manager.yaml index 68f43e3..c46216d 100644 --- a/charts/kubeslice-controller-egs/templates/queue-manager.yaml +++ b/charts/kubeslice-controller-egs/templates/queue-manager.yaml @@ -1,7 +1,7 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: queue-manager + name: egs-queue-manager namespace: {{ .Release.Namespace }} labels: app: "queue-manager" @@ -43,6 +43,8 @@ spec: - name: kubeslice-image-pull-secret {{- end }} schedulerName: default-scheduler + tolerations: + - operator: "Exists" # no key or effect given: this toleration matches every taint strategy: type: RollingUpdate rollingUpdate: diff --git a/charts/kubeslice-controller-egs/values.yaml b/charts/kubeslice-controller-egs/values.yaml index 52e0a53..72f20fa 100644 --- a/charts/kubeslice-controller-egs/values.yaml +++ b/charts/kubeslice-controller-egs/values.yaml @@ -2,11 +2,11 @@ # This is a YAML-formatted file. # Declare variables to be passed into your templates. 
-# if you're installing in openshift cluster make this variable true global: imageRegistry: docker.io/aveshasystems # Profile settings (e.g., for OpenShift) profile: +# Set this to true when installing on an OpenShift cluster openshift: false kubeTally: # Enable or disable KubeTally @@ -55,7 +55,7 @@ kubeslice: # Image name for the KubeSlice controller image: kubeslice-controller-ent-egs # Image tag for the KubeSlice controller - tag: 0.7.0 + tag: 0.8.0 # Image pull policy for the KubeSlice controller pullPolicy: IfNotPresent # Configuration for kubeTally, which handles chargeback and metrics @@ -107,21 +107,24 @@ egs: # Image name for the gpr-manager image: gpr-manager # Image tag for the gpr-manager - tag: 0.7.1 + tag: 0.8.0 # NOTE(review): templates/gpr-manager-deployment.yaml reads .Values.egs.gprManager.pullPolicy, but the key below is imagePullPolicy - confirm imagePullPolicy: IfNotPresent # Period for checking if there is queue processing needed (in seconds) qmgrCheckPeriod: 60 # Number of times a GPR can be delayed if there is not enough inventory maxDelayedCount: 10 + # Period for refreshing the GPRs (passed to gpr-manager as WAIT_TIME_REFRESH_PERIOD; presumably seconds - TODO confirm) + refreshWaitTimeInterval: 300 queueManager: + imageRegistry: aveshasystems # global registry will be chosen if removed # Image name for the gpr-manager image: queue-manager # Image tag for the gpr-manager - tag: 0.7.0 + tag: 0.8.0 imagePullPolicy: IfNotPresent service: - name: queue-manager + name: egs-queue-manager port: 80 # turn off reflection appEnv: "production" @@ -130,11 +133,11 @@ egs: # Image name for inventory-manager image: inventory-manager # Image tag for inventory-manager - tag: 0.7.1 + tag: 0.8.0 imagePullPolicy: IfNotPresent logLevel: INFO service: - name: inventory-manager + name: egs-inventory-manager port: 80 prometheus: diff --git a/charts/kubeslice-ui-egs/Chart.yaml b/charts/kubeslice-ui-egs/Chart.yaml index 726ff0b..3998686 100644 --- a/charts/kubeslice-ui-egs/Chart.yaml +++ b/charts/kubeslice-ui-egs/Chart.yaml @@ -28,5 +28,5 @@ keywords: - application kubeVersion: '>= 1.19.0-0' home: https://avesha.io/products/product-slice -version: 0.7.2 -appVersion: 0.7.2 
+version: 0.8.0 +appVersion: 0.8.0 diff --git a/charts/kubeslice-ui-egs/templates/egs-core-apis.yaml b/charts/kubeslice-ui-egs/templates/egs-core-apis.yaml new file mode 100644 index 0000000..65e955f --- /dev/null +++ b/charts/kubeslice-ui-egs/templates/egs-core-apis.yaml @@ -0,0 +1,115 @@ +{{- if $.Values.kubeslice.egsCoreApis.enabled }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: egs-core-apis + namespace: {{ .Release.Namespace }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole # cluster-scoped: bound through the ClusterRoleBinding below, so these rules apply in every namespace +metadata: + name: egs-core-apis +rules: + - apiGroups: [ "" ] + resources: [ "serviceaccounts/token" ] + verbs: [ "create" ] + - apiGroups: [ "" ] + resources: [ "serviceaccounts" ] + verbs: [ "get", "list" ] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: egs-core-apis +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: egs-core-apis +subjects: + - kind: ServiceAccount + name: egs-core-apis + namespace: {{ .Release.Namespace }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role # namespaced: secret reads are limited to the release namespace +metadata: + name: egs-core-apis + namespace: {{ .Release.Namespace }} +rules: + - apiGroups: [ "" ] + resources: [ "secrets" ] + verbs: [ "get" ] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: egs-core-apis + namespace: {{ .Release.Namespace }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: egs-core-apis +subjects: + - kind: ServiceAccount + name: egs-core-apis + namespace: {{ .Release.Namespace }} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: egs-core-apis + namespace: {{ .Release.Namespace }} + labels: + app: egs-core-apis + version: v1 +spec: + selector: + matchLabels: + app: egs-core-apis + version: v1 + template: + metadata: + labels: + version: v1 + app: egs-core-apis + spec: + containers: + - name: egs-core-apis + env: + - name: EGS_CORE_APIS_API_KEY_SECRET_NAMESPACE + valueFrom: + fieldRef: + fieldPath: 
metadata.namespace + - name: EGS_CORE_APIS_PORT + value: "8080" + - name: EGS_CORE_APIS_EGS_API_GATEWAY_ENDPOINT + value: "http://kubeslice-api-gw:8080/" + ports: + - containerPort: 8080 + image: '{{ .Values.global.imageRegistry }}/{{ .Values.kubeslice.egsCoreApis.image }}:{{ .Values.kubeslice.egsCoreApis.tag }}' + imagePullPolicy: '{{ .Values.kubeslice.egsCoreApis.pullPolicy }}' + restartPolicy: Always + {{- if and .Values.imagePullSecrets .Values.imagePullSecrets.repository .Values.imagePullSecrets.username .Values.imagePullSecrets.password }} + imagePullSecrets: + - name: kubeslice-ui-image-pull-secret + {{- end }} + serviceAccountName: egs-core-apis + replicas: 1 +--- +apiVersion: v1 +kind: Service +metadata: + name: egs-core-apis + namespace: {{ .Release.Namespace }} +spec: + ports: + - port: 8080 + name: http + protocol: TCP + targetPort: 8080 + selector: + version: v1 + app: egs-core-apis + type: {{ $.Values.kubeslice.egsCoreApis.service.type }} +{{- end }} diff --git a/charts/kubeslice-ui-egs/templates/kubeslice-api-gw-gpu-specification-config.yaml b/charts/kubeslice-ui-egs/templates/kubeslice-api-gw-gpu-specification-config.yaml new file mode 100644 index 0000000..b66dda2 --- /dev/null +++ b/charts/kubeslice-ui-egs/templates/kubeslice-api-gw-gpu-specification-config.yaml @@ -0,0 +1,22 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: gpu-specification-config + namespace: {{ .Release.Namespace }} # follow the release namespace, like the other namespaced resources in this chart, instead of a hard-coded one +data: + gpuDetails: | + [ + { "gpuShape": "a100", "maxPower": 400 }, + { "gpuShape": "a10", "maxPower": 150 }, + { "gpuShape": "h100", "maxPower": 700 }, + { "gpuShape": "p100", "maxPower": 250 }, + { "gpuShape": "v100", "maxPower": 300 }, + { "gpuShape": "tesla t4", "maxPower": 70 }, + { "gpuShape": "p4", "maxPower": 75 }, + { "gpuShape": "k80", "maxPower": 300 }, + { "gpuShape": "l40", "maxPower": 300 } + ] + +--- + diff --git a/charts/kubeslice-ui-egs/values.yaml b/charts/kubeslice-ui-egs/values.yaml index 387be02..b740258 100644 --- 
a/charts/kubeslice-ui-egs/values.yaml +++ b/charts/kubeslice-ui-egs/values.yaml @@ -11,11 +11,11 @@ kubeTally: kubeslice: ui: image: kubeslice-ui-ent-egs - tag: 0.7.0 + tag: 0.8.0 pullPolicy: IfNotPresent uiv2: image: kubeslice-ui-v2-ent-egs - tag: 0.7.1 + tag: 0.8.0 pullPolicy: IfNotPresent dashboard: image: kubeslice-kubernetes-dashboard @@ -23,7 +23,7 @@ kubeslice: pullPolicy: IfNotPresent uiproxy: image: kubeslice-ui-proxy-egs - tag: 0.7.2 + tag: 0.8.0 pullPolicy: IfNotPresent service: ## For kind, set this to NodePort, elsewhere use LoadBalancer or NodePort @@ -34,9 +34,17 @@ kubeslice: # nodePort: apigw: image: kubeslice-api-gw-ent-egs - tag: 0.7.1 + tag: 0.8.0 pullPolicy: IfNotPresent + egsCoreApis: + enabled: false # gates every resource in templates/egs-core-apis.yaml + image: egs-core-apis + tag: 0.8.0 + pullPolicy: Always + service: + type: LoadBalancer # rendered as the egs-core-apis Service type + workerinstaller: image: worker-installer tag: 1.2.0 @@ -46,8 +54,8 @@ kubeslice: url: http://kubeslice-controller-prometheus-service:9090 egs: - InventoryManagerAddress: inventory-manager:80 - QueueManagerAddress: queue-manager:80 + InventoryManagerAddress: egs-inventory-manager:80 # host must match the inventory-manager Service name in the kubeslice-controller-egs chart + QueueManagerAddress: egs-queue-manager:80 # host must match the queue-manager service name set in the kubeslice-controller-egs values # username & password & email values for imagePullSecrets has to provided to create a secret imagePullSecrets: diff --git a/charts/kubeslice-worker-egs/Chart.yaml b/charts/kubeslice-worker-egs/Chart.yaml index 70defb9..7d3d1ee 100644 --- a/charts/kubeslice-worker-egs/Chart.yaml +++ b/charts/kubeslice-worker-egs/Chart.yaml @@ -37,5 +37,5 @@ dependencies: repository: file://./charts/netop version: 0.2.0 condition: kubesliceNetworking.enabled -version: 0.7.2 -appVersion: 0.7.2 +version: 0.8.0 +appVersion: 0.8.0 diff --git a/charts/kubeslice-worker-egs/templates/aiops-daemonset-toleration-config.yaml b/charts/kubeslice-worker-egs/templates/aiops-daemonset-toleration-config.yaml new file mode 100644 index 0000000..ec73ffd --- /dev/null +++ b/charts/kubeslice-worker-egs/templates/aiops-daemonset-toleration-config.yaml @@ -0,0 +1,25 @@ 
+apiVersion: v1 +kind: ConfigMap +metadata: + name: daemonset-toleration-config + namespace: {{ .Release.Namespace }} + labels: + component: daemonset-toleration-config +data: # daemonsets holds one entry per line; entries appear to be <namespace>/<daemonset-name> - TODO confirm + daemonsets: | + gpu-operator/gpu-feature-discovery + gpu-operator/gpu-operator-release-node-feature-discovery-worker + gpu-operator/nvidia-container-toolkit-daemonset + gpu-operator/nvidia-dcgm-exporter + gpu-operator/nvidia-device-plugin-daemonset + gpu-operator/nvidia-device-plugin-mps-control-daemon + gpu-operator/nvidia-mig-manager + gpu-operator/nvidia-operator-validator + monitoring/prometheus-prometheus-node-exporter + kube-system/nvidia-driver-installer + kube-system/nvidia-gpu-device-plugin-large-cos + kube-system/nvidia-gpu-device-plugin-large-ubuntu + kube-system/nvidia-gpu-device-plugin-medium-cos + kube-system/nvidia-gpu-device-plugin-medium-ubuntu + kube-system/nvidia-gpu-device-plugin-small-cos + kube-system/nvidia-gpu-device-plugin-small-ubuntu diff --git a/charts/kubeslice-worker-egs/templates/aiops-operator-rbac.yaml b/charts/kubeslice-worker-egs/templates/aiops-operator-rbac.yaml index 89f7381..f225374 100644 --- a/charts/kubeslice-worker-egs/templates/aiops-operator-rbac.yaml +++ b/charts/kubeslice-worker-egs/templates/aiops-operator-rbac.yaml @@ -104,6 +104,24 @@ rules: - get - list - watch +- apiGroups: # NOTE(review): presumably lets the operator patch the daemonsets named in daemonset-toleration-config - confirm + - apps + resources: + - daemonsets + verbs: + - get + - list + - patch + - update + - watch +- apiGroups: # read-only access to configmaps + - "" + resources: + - configmaps + verbs: + - get + - list + - watch - apiGroups: - "" resources: diff --git a/charts/kubeslice-worker-egs/templates/crd.clustergpuallocations.aiops.kubeslice.io.yaml b/charts/kubeslice-worker-egs/templates/crd.clustergpuallocations.aiops.kubeslice.io.yaml index 5036adc..4e2e93f 100644 --- a/charts/kubeslice-worker-egs/templates/crd.clustergpuallocations.aiops.kubeslice.io.yaml +++ b/charts/kubeslice-worker-egs/templates/crd.clustergpuallocations.aiops.kubeslice.io.yaml @@ -69,6 +69,31 @@ spec: type: integer type: object type: 
array + unmanagedNodeInventory: + items: + properties: + free: + type: integer + gpuModelName: + type: string + gpuNodeShape: + type: string + gpuPowerThresold: # NOTE(review): spelled "Thresold" here but gpuPowerThreshold in the controller inventory CRDs - confirm against the operator's field names before renaming + type: string + gpuTempThresold: # NOTE(review): same spelling difference (gpuTempThreshold in the controller inventory CRDs) + type: string + instanceType: + type: string + memory: + type: integer + nodeHealth: + type: string + nodeName: + type: string + totalGPUs: + type: integer + type: object + type: array type: object type: object served: true diff --git a/charts/kubeslice-worker-egs/values.yaml b/charts/kubeslice-worker-egs/values.yaml index cd469ea..638eda5 100644 --- a/charts/kubeslice-worker-egs/values.yaml +++ b/charts/kubeslice-worker-egs/values.yaml @@ -18,7 +18,7 @@ cluster: aiops: image: kube-aiops-operator - tag: 0.7.1 + tag: 0.8.0 pullPolicy: IfNotPresent logLevel: INFO