feat: NVIDIA NIM Updates (#631)
vara-bonthu authored Sep 3, 2024
1 parent 2be29e5 commit 5ddd136
Showing 9 changed files with 204 additions and 34 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -49,7 +49,7 @@ In this repository, you'll find a variety of deployment blueprints for creating

🚀 [JupyterHub on EKS](https://awslabs.github.io/data-on-eks/docs/blueprints/ai-ml/jupyterhub) 👈 This blueprint deploys a self-managed JupyterHub on EKS with Amazon Cognito authentication.

🚀 [Generative AI on EKS](https://awslabs.github.io/data-on-eks/docs/gen-ai) 👈 Collection of Generative AI Trianing and Inference LLM deployment patterns
🚀 [Generative AI on EKS](https://awslabs.github.io/data-on-eks/docs/gen-ai) 👈 Collection of Generative AI Training and Inference LLM deployment patterns

### 📊 Data

1 change: 1 addition & 0 deletions ai-ml/nvidia-triton-server/README.md
@@ -77,6 +77,7 @@
| <a name="input_huggingface_token"></a> [huggingface\_token](#input\_huggingface\_token) | Hugging Face Secret Token | `string` | `"DUMMY_TOKEN_REPLACE_ME"` | no |
| <a name="input_name"></a> [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"nvidia-triton-server"` | no |
| <a name="input_ngc_api_key"></a> [ngc\_api\_key](#input\_ngc\_api\_key) | NGC API Key | `string` | `"DUMMY_NGC_API_KEY_REPLACE_ME"` | no |
| <a name="input_nim_models"></a> [nim\_models](#input\_nim\_models) | NVIDIA NIM Models | <pre>list(object({<br> name = string<br> id = string<br> enable = bool<br> num_gpu = string<br> }))</pre> | <pre>[<br> {<br> "enable": false,<br> "id": "nvcr.io/nim/meta/llama-3.1-8b-instruct",<br> "name": "llama-3-1-8b-instruct",<br> "num_gpu": "4"<br> },<br> {<br> "enable": true,<br> "id": "nvcr.io/nim/meta/llama3-8b-instruct",<br> "name": "llama3-8b-instruct",<br> "num_gpu": "1"<br> }<br>]</pre> | no |
| <a name="input_region"></a> [region](#input\_region) | region | `string` | `"us-west-2"` | no |
| <a name="input_secondary_cidr_blocks"></a> [secondary\_cidr\_blocks](#input\_secondary\_cidr\_blocks) | Secondary CIDR blocks to be attached to VPC | `list(string)` | <pre>[<br> "100.64.0.0/16"<br>]</pre> | no |
| <a name="input_vpc_cidr"></a> [vpc\_cidr](#input\_vpc\_cidr) | VPC CIDR. This should be a valid private (RFC 1918) CIDR range | `string` | `"10.1.0.0/21"` | no |
18 changes: 12 additions & 6 deletions ai-ml/nvidia-triton-server/helm-values/nim-llm.yaml
@@ -1,15 +1,20 @@
# ref: https://github.com/NVIDIA/nim-deploy/blob/main/helm/nim-llm/values.yaml
fullname: ${name}
image:
repository: nvcr.io/nim/meta/llama3-8b-instruct
repository: ${model_id}
tag: latest
imagePullSecrets:
- name: ngc-secret-${name}
model:
name: ${name}
ngcAPISecret: ngc-api-${name}
ngcAPIKey: ${ngc_api_key}
nimCache: /model-store
resources:
limits:
nvidia.com/gpu: 1
nvidia.com/gpu: ${num_gpu}
requests:
nvidia.com/gpu: 1
nvidia.com/gpu: ${num_gpu}
statefulSet:
enabled: true
persistence:
@@ -45,9 +50,10 @@ autoscaling:
ingress:
enabled: true
className: nginx
annotations: {}
annotations:
kubernetes.io/ingress.class: nginx
hosts:
- paths:
- path: /
pathType: ImplementationSpecific
- path: /${name}
pathType: Prefix
serviceType: openai
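With these values, each model's ingress is routed under its own `/<model name>` prefix through the nginx ingress class. A quick way to review the rendered ingress objects and their paths after deployment (a sketch; resource names depend on the chart's fullname template):

```bash
# List the per-model ingress objects in the nim namespace and inspect their routed paths
kubectl get ingress -n nim
kubectl describe ingress -n nim
```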
20 changes: 16 additions & 4 deletions ai-ml/nvidia-triton-server/nvidia-nim.tf
@@ -104,21 +104,33 @@ resource "null_resource" "download_nim_deploy" {
}
}

#--------------------------------------------------------------------
# Helm Chart for deploying NIM models
#--------------------------------------------------------------------
locals {
enabled_models = var.enable_nvidia_nim ? {
for model in var.nim_models : model.name => model
if model.enable
} : {}
}

resource "helm_release" "nim_llm" {
count = var.enable_nvidia_nim ? 1 : 0
name = "nim-llm"
for_each = local.enabled_models
name = "nim-llm-${each.key}"
chart = "${path.module}/nim-llm"
create_namespace = true
namespace = kubernetes_namespace.nim[count.index].metadata[0].name
namespace = kubernetes_namespace.nim[0].metadata[0].name
timeout = 360
wait = false
values = [
templatefile(
"${path.module}/helm-values/nim-llm.yaml",
{
model_id = each.value.id
name = each.value.name
num_gpu = each.value.num_gpu
ngc_api_key = var.ngc_api_key
pvc_name = kubernetes_persistent_volume_claim_v1.efs_pvc[count.index].metadata[0].name
pvc_name = kubernetes_persistent_volume_claim_v1.efs_pvc[0].metadata[0].name
}
)
]
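Because `helm_release.nim_llm` now iterates over `local.enabled_models` with `for_each`, every enabled entry in `nim_models` becomes its own Helm release named `nim-llm-<model name>`. A quick check after `terraform apply` (a sketch; assumes the `nim` namespace created by this blueprint):

```bash
# Expect one release per enabled model, e.g. nim-llm-llama3-8b-instruct
helm list -n nim
```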
30 changes: 30 additions & 0 deletions ai-ml/nvidia-triton-server/variables.tf
@@ -75,3 +75,33 @@ variable "ngc_api_key" {
default = "DUMMY_NGC_API_KEY_REPLACE_ME"
sensitive = true
}

variable "nim_models" {
description = "NVIDIA NIM Models"
type = list(object({
name = string
id = string
enable = bool
num_gpu = string
}))
# Ensure you check the NVIDIA NIM support matrix for models and the required GPUs:
# https://docs.nvidia.com/nim/large-language-models/latest/support-matrix.html#
#
# For identifying the right EC2 instances with the supported GPUs, refer to:
# https://docs.aws.amazon.com/dlami/latest/devguide/gpu.html
default = [
{
name = "llama-3-1-8b-instruct"
num_gpu = "4"
id = "nvcr.io/nim/meta/llama-3.1-8b-instruct"
enable = false
},
{
name = "llama3-8b-instruct"
num_gpu = "1"
id = "nvcr.io/nim/meta/llama3-8b-instruct"
enable = true
}
# Add more models as needed
]
}
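To enable more than one of the bundled models, the `nim_models` default can be overridden rather than edited in place. A minimal sketch using a `terraform.tfvars` file (size the GPU counts against the NVIDIA NIM support matrix linked above):

```bash
# Hypothetical override enabling both bundled models
cat > terraform.tfvars <<'EOF'
nim_models = [
  {
    name    = "llama-3-1-8b-instruct"
    id      = "nvcr.io/nim/meta/llama-3.1-8b-instruct"
    enable  = true
    num_gpu = "4"
  },
  {
    name    = "llama3-8b-instruct"
    id      = "nvcr.io/nim/meta/llama3-8b-instruct"
    enable  = true
    num_gpu = "1"
  }
]
EOF
```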
70 changes: 70 additions & 0 deletions gen-ai/inference/nvidia-nim/openai-webui-deployment.yaml
@@ -0,0 +1,70 @@
#-----------------------------------------------------------------------------
# Deployment Instructions
# kubectl apply -f gen-ai/inference/nvidia-nim/openai-webui-deployment.yaml
# kubectl port-forward svc/open-webui 8080:80 -n openai-webui
# Open WebUI at http://localhost:8080
#-----------------------------------------------------------------------------
---
apiVersion: v1
kind: Namespace
metadata:
name: openai-webui

---
apiVersion: apps/v1
kind: Deployment
metadata:
name: open-webui
namespace: openai-webui
labels:
app: open-webui
spec:
replicas: 1
selector:
matchLabels:
app: open-webui
template:
metadata:
labels:
app: open-webui
spec:
containers:
- name: open-webui
image: ghcr.io/open-webui/open-webui:main
ports:
- containerPort: 8080
resources:
requests:
cpu: "500m"
memory: "500Mi"
limits:
cpu: "1000m"
memory: "1Gi"
env:
- name: OPENAI_API_BASE_URLS
value: "http://nim-llm-llama3-8b-instruct.nim.svc.cluster.local:8000/v1"
- name: OPENAI_API_KEY
value: "dummy" # Replace with actual API key if required
volumeMounts:
- name: webui-volume
mountPath: /app/backend/data
volumes:
- name: webui-volume
emptyDir: {}

---
apiVersion: v1
kind: Service
metadata:
name: open-webui
namespace: openai-webui
labels:
app: open-webui
spec:
type: ClusterIP
selector:
app: open-webui
ports:
- protocol: TCP
port: 80
targetPort: 8080
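The `OPENAI_API_BASE_URLS` value above is hard-coded to the default `llama3-8b-instruct` release. If you enable a different model, point the WebUI at that model's service instead (a sketch using `kubectl set env`; confirm the exact service name with `kubectl get svc -n nim` first):

```bash
# Verify the target NIM service exists
kubectl get svc -n nim nim-llm-llama3-8b-instruct

# Re-point the WebUI at another NIM service if needed (the URL below is the default)
kubectl -n openai-webui set env deployment/open-webui \
  OPENAI_API_BASE_URLS=http://nim-llm-llama3-8b-instruct.nim.svc.cluster.local:8000/v1
```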
97 changes: 74 additions & 23 deletions website/docs/gen-ai/inference/GPUs/nvidia-nim-llama3.md
@@ -124,6 +124,20 @@ Important Note: Ensure that you update the region in the variables.tf file befor

Run the installation script:

:::info


This pattern deploys a model called `nvcr.io/nim/meta/llama3-8b-instruct`. You can modify the `nim_models` variable in the `variables.tf` file to add more models. Multiple models can be deployed simultaneously using this pattern.
:::

:::caution

Before enabling additional models through these variables, make sure each model has enough GPUs specified and that your AWS account has access to sufficient GPU capacity.
This pattern uses Karpenter to scale GPU nodes and is restricted to G5 instances by default. You can modify the Karpenter node pool to include other instance types such as p4 and p5 if needed (a quick way to inspect the current node pool is shown after this note).

:::
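To see which instance types Karpenter is currently allowed to provision before enabling larger models, inspect the node pool requirements (a sketch; the node pool names and requirement keys depend on the blueprint's Karpenter configuration):

```bash
# List the Karpenter node pools and look for the instance-family/instance-type requirements
kubectl get nodepools
kubectl get nodepools -o yaml | grep -B2 -A6 "instance-family"
```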


```bash
cd data-on-eks/ai-ml/nvidia-triton-server
export TF_VAR_enable_nvidia_nim=true
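
# The NGC API key maps to the ngc_api_key variable in variables.tf and can be
# exported up front instead of leaving the DUMMY default in place:
export TF_VAR_ngc_api_key="REPLACE_WITH_YOUR_NGC_API_KEY"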
@@ -145,38 +159,30 @@ aws eks --region us-west-2 update-kubeconfig --name nvidia-triton-server
Check the status of the deployed pods:

```bash
kubectl get po -n nim
kubectl get all -n nim
```

You should see output similar to the following:
<details>
<summary>Click to expand the deployment details</summary>

```text
NAME READY STATUS RESTARTS AGE
pod/nim-llm-0 1/1 Running 0 105s
NAME READY STATUS RESTARTS AGE
pod/nim-llm-llama3-8b-instruct-0 1/1 Running 0 4h2m
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
service/nim-llm ClusterIP 172.20.63.25 <none> 8000/TCP 107s
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
service/nim-llm-llama3-8b-instruct ClusterIP 172.20.5.230 <none> 8000/TCP 4h2m
service/nim-llm-llama3-8b-instruct-sts ClusterIP None <none> 8000/TCP 4h2m
NAME READY AGE
statefulset.apps/nim-llm 1/4 106s
NAME READY AGE
statefulset.apps/nim-llm-llama3-8b-instruct 1/1 4h2m
NAME REFERENCE TARGETS MINPODS MAXPODS REPLICAS AGE
horizontalpodautoscaler.autoscaling/nim-llm StatefulSet/nim-llm 1/5 1 5 4 107s
NAME REFERENCE TARGETS MINPODS MAXPODS REPLICAS AGE
horizontalpodautoscaler.autoscaling/nim-llm-llama3-8b-instruct StatefulSet/nim-llm-llama3-8b-instruct 2/5 1 5 1 4h2m
```
</details>

:::info
The `Llama3` model deployed is specified in `ai-ml/nvidia-triton-server/helm-values/nim-llm.yaml` with below config. Please visit [this page](https://build.nvidia.com/explore/discover) to explore more. You may simply update this image configuration if you want to change to deploy another model.
:::

```yaml
image:
repository: nvcr.io/nim/meta/llama3-8b-instruct
tag: latest
```
The Llama3 model is deployed with a StatefulSet in nim-llm namespace. As it is running, Karpenter provisioned a GPU
The `llama3-8b-instruct` model is deployed as a StatefulSet in the `nim` namespace. While it starts up, Karpenter provisions a GPU node for it.
Check the Karpenter-provisioned node:

```bash
@@ -193,7 +199,7 @@ ip-100-64-77-39.us-west-2.compute.internal Ready <none> 4m46s v1.30.0-e
Once all pods in the `nim` namespace are ready with `1/1` status, verify that the model is ready to serve traffic by exposing the model-serving service with a kubectl port-forward.

```bash
kubectl port-forward -n nim svc/nim-llm 8000
kubectl port-forward -n nim service/nim-llm-llama3-8b-instruct 8000
```

Then you can invoke the deployed model with a simple HTTP request using curl.
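The exact request from the guide is collapsed in this diff; a minimal sketch against the NIM OpenAI-compatible endpoint exposed by the port-forward looks like the following (the model name is an assumption; confirm it first with `curl localhost:8000/v1/models`):

```bash
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "meta/llama3-8b-instruct",
        "messages": [{"role": "user", "content": "What is Amazon EKS?"}],
        "max_tokens": 128
      }'
```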
@@ -315,6 +321,50 @@ By applying these optimizations, TensorRT can significantly accelerate LLM infer
```
</details>

## Open WebUI Deployment

:::info

[Open WebUI](https://github.com/open-webui/open-webui) is compatible only with models that work with the OpenAI API server and Ollama.

:::

**1. Deploy the WebUI**

Deploy the [Open WebUI](https://github.com/open-webui/open-webui) by running the following command:

```sh
kubectl apply -f gen-ai/inference/nvidia-nim/openai-webui-deployment.yaml
```

**2. Port Forward to Access WebUI**

Use kubectl port-forward to access the WebUI locally:

```sh
kubectl port-forward svc/open-webui 8081:80 -n openai-webui
```

**3. Access the WebUI**

Open your browser and go to http://localhost:8081

**4. Sign Up**

Sign up using your name, email, and a dummy password.

**5. Start a New Chat**

Click on New Chat and select the model from the dropdown menu, as shown in the screenshot below:

![Open WebUI model selection from the dropdown](../img/openweb-ui-nim-1.png)

**6. Enter Test Prompt**

Enter your prompt, and you will see the streaming results, as shown below:

![Open WebUI streaming chat response](../img/openweb-ui-nim-2.png)

## Performance Testing with NVIDIA GenAI-Perf Tool

[GenAI-Perf](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/client/src/c%2B%2B/perf_analyzer/genai-perf/README.html) is a command line tool for measuring the throughput and latency of generative AI models as served through an inference server.
@@ -351,7 +401,7 @@ genai-perf \
--concurrency 10 \
--measurement-interval 4000 \
--profile-export-file my_profile_export.json \
--url nim-llm.nim:8000
--url nim-llm-llama3-8b-instruct.nim:8000
```

You should see similar output like the following
@@ -397,10 +447,11 @@ prometheus-adapter ClusterIP 172.20.171.163 <n
prometheus-operated ClusterIP None <none> 9090/TCP 10m
```

The NVIDIA NIM LLM service expose metrics via `/metrics` endpoint from `nim-llm` service at port `8000`. Verify it by running
The NVIDIA NIM LLM service exposes metrics via the `/metrics` endpoint of the `nim-llm-llama3-8b-instruct` service on port `8000`. Verify it by running

```bash
kubectl get svc -n nim
kubectl port-forward -n nim svc/nim-llm 8000
kubectl port-forward -n nim svc/nim-llm-llama3-8b-instruct 8000

curl localhost:8000/metrics # run this in another terminal
```
