
Commit 6771d6e

Adding files
1 parent 1d84fcf commit 6771d6e


3 files changed: +107, -17 lines changed

README.md

Lines changed: 1 addition & 0 deletions
@@ -463,6 +463,7 @@ The following configurations are llama-stack config examples from production dep
 - [Granite on vLLM example](examples/vllm-granite-run.yaml)
 - [Qwen3 on vLLM example](examples/vllm-qwen3-run.yaml)
 - [Gemini example](examples/gemini-run.yaml)
+- [VertexAI example](examples/vertexai-run.yaml)
 
 > [!NOTE]
 > RAG functionality is **not tested** for these configurations.

docs/rag_guide.md

Lines changed: 15 additions & 17 deletions
@@ -61,7 +61,7 @@ Update the `run.yaml` file used by Llama Stack to point to:
 * Your downloaded **embedding model**
 * Your generated **vector database**
 
-### FAISS Example
+### FAISS example
 
 ```yaml
 models:
@@ -102,11 +102,11 @@ Where:
 
 See the full working [config example](examples/openai-faiss-run.yaml) for more details.
 
-### pgvector Example
+### pgvector example
 
 This example shows how to configure a remote PostgreSQL database with the [pgvector](https://github.com/pgvector/pgvector) extension for storing embeddings.
 
-> You will need to install PostgreSQL, the matching version of pgvector, then log in with `psql` and enable the extension with:
+> You will need to install PostgreSQL with a matching version to pgvector, then log in with `psql` and enable the extension with:
 > ```sql
 > CREATE EXTENSION IF NOT EXISTS vector;
 > ```
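
As an aside to the hunk above: a minimal sketch of preparing such a database from the shell, assuming PostgreSQL and a matching pgvector build are already installed and that the database is named `rag` (the database name and local authentication are illustrative assumptions, not part of the guide):

```bash
# Sketch only: database name and local authentication are assumptions.
createdb rag                                                # database that will hold the embeddings
psql -d rag -c "CREATE EXTENSION IF NOT EXISTS vector;"     # enable pgvector in that database
psql -d rag -c "SELECT extversion FROM pg_extension WHERE extname = 'vector';"  # confirm it is active
```
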
@@ -117,10 +117,10 @@ Each pgvector-backed table follows this schema:
 
 - `id` (`text`): UUID identifier of the chunk
 - `document` (`jsonb`): json containing content and metadata associated with the embedding
-- `embedding` (`vector(n)`): the embedding vector, where `n` is the embedding dimension and must match the model's output size (e.g. 768 for `all-mpnet-base-v2`)
+- `embedding` (`vector(n)`): the embedding vector, where `n` is the embedding dimension and will match the model's output size (e.g. 768 for `all-mpnet-base-v2`)
 
 > [!NOTE]
-> The vector_db_id (e.g. rhdocs) is used to point to the table named vector_store_rhdocs in the specified database, which stores the vector embeddings.
+> The `vector_db_id` (e.g. `rhdocs`) is used to point to the table named `vector_store_rhdocs` in the specified database, which stores the vector embeddings.
 
 
 ```yaml
@@ -146,6 +146,7 @@ vector_dbs:
   provider_id: pgvector-example
   # A unique ID that becomes the PostgreSQL table name, prefixed with 'vector_store_'.
   # e.g., 'rhdocs' will create the table 'vector_store_rhdocs'.
+  # If the table was already created, this value must match the ID used at creation.
   vector_db_id: rhdocs
 ```
 
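Once documents have been ingested, the table naming described in the comments above can be checked directly from `psql`; a small sketch, with `rag` as an assumed database name:

```bash
# Sketch: confirm that the pgvector-backed table exists under the expected name.
# 'rag' is an assumed database name; 'rhdocs' matches the vector_db_id above.
psql -d rag -c "\d vector_store_rhdocs"
# Expected columns, per the schema described earlier in the guide:
#   id         text
#   document   jsonb
#   embedding  vector(768)    # 768 = output dimension of all-mpnet-base-v2
```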

@@ -155,7 +156,10 @@ See the full working [config example](examples/openai-pgvector-run.yaml) for mor
 
 ## Add an Inference Model (LLM)
 
-### vLLM on RHEL AI (Llama 3.1) Example
+### vLLM on RHEL AI (Llama 3.1) example
+
+> [!NOTE]
+> The following example assumes that podman's CDI has been properly configured to [enable GPU support](https://podman-desktop.io/docs/podman/gpu).
 
 The [`vllm-openai`](https://hub.docker.com/r/vllm/vllm-openai) Docker image is used to serve the Llama-3.1-8B-Instruct model.
 The following example shows how to run it on **RHEL AI** with `podman`:
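
The `podman run` command itself is unchanged by this commit and appears only as context in the next hunk's header. Purely as an illustration, a sketch of such an invocation is shown below; the image tag, cache mount, port, and vLLM flags are assumptions, not the guide's exact command:

```bash
# Illustrative sketch only: image tag, volume mount, and port are assumptions.
# Assumes podman's CDI exposes the NVIDIA GPU (see the note above) and that
# HF_TOKEN holds a Hugging Face token with access to the gated Llama model.
podman run \
  --device nvidia.com/gpu=all \
  -e HUGGING_FACE_HUB_TOKEN="$HF_TOKEN" \
  -v ~/.cache/huggingface:/root/.cache/huggingface:Z \
  -p 8000:8000 \
  docker.io/vllm/vllm-openai:latest \
  --model meta-llama/Llama-3.1-8B-Instruct
```
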
@@ -178,19 +182,13 @@ podman run \
 > For other supported models and configuration options, see the vLLM documentation:
 > [vLLM: Tool Calling](https://docs.vllm.ai/en/stable/features/tool_calling.html)
 
-After starting the container, you can check which model is being served by running:
-
-```bash
-curl http://localhost:8000/v1/models # Replace localhost with the url of the vLLM instance
-```
-
-The response will include the `model_id`, which you can then use in your `run.yaml` configuration.
+After starting the container edit your `run.yaml` file, matching `model_id` with the model provided in the `podman run` command.
 
 ```yaml
 [...]
 models:
 [...]
-- model_id: meta-llama/Llama-3.1-8B-Instruct
+- model_id: meta-llama/Llama-3.1-8B-Instruct # Same as the model name in the 'podman run' command
   provider_id: vllm
   model_type: llm
   provider_model_id: null
@@ -201,13 +199,13 @@ providers:
   - provider_id: vllm
     provider_type: remote::vllm
     config:
-      url: http://localhost:8000/v1/ # Replace localhost with the url of the vLLM instance
-      api_token: <your-key-here>
+      url: http://localhost:${env.EXPORTED_PORT:=8000}/v1/ # Replace localhost with the url of the vLLM instance
+      api_token: <your-key-here> # if any
 ```
 
 See the full working [config example](examples/vllm-llama-faiss-run.yaml) for more details.
 
-### OpenAI Example
+### OpenAI example
 
 Add a provider for your language model (e.g., OpenAI):
 
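The `${env.EXPORTED_PORT:=8000}` substitution added above falls back to port 8000 when the variable is unset. A small sketch of overriding it at launch time; the `llama stack run` invocation is an assumption about how the stack is started in a given deployment:

```bash
# Sketch: override the vLLM port consumed by the env substitution in run.yaml.
# Adjust the launch command to however the stack is actually run
# (llama stack CLI, container entrypoint, systemd unit, ...).
export EXPORTED_PORT=8080      # omit to fall back to the default of 8000
llama stack run ./run.yaml
```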

examples/vertexai-run.yaml

Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
+# Example llama-stack configuration for VertexAI inference
+#
+# Contributed by @eloycoto (2025-08). See https://github.com/rhdhorchestrator/LS-core-test/blob/master/run-llama-stack.yaml
+# This file shows how to integrate VertexAI with LCS.
+#
+# Notes:
+# - You will need to configure Gemini inference on VertexAI.
+#
+version: '3'
+image_name: ollama-llama-stack-config
+apis:
+- agents
+- inference
+- safety
+- telemetry
+- tool_runtime
+- vector_io
+logging:
+  level: DEBUG # Set root logger to DEBUG
+  category_levels:
+    llama_stack: DEBUG # Enable DEBUG for all llama_stack modules
+    llama_stack.providers.remote.inference.vllm: DEBUG
+    llama_stack.providers.inline.agents.meta_reference: DEBUG
+    llama_stack.providers.inline.agents.meta_reference.agent_instance: DEBUG
+    llama_stack.providers.inline.vector_io.faiss: DEBUG
+    llama_stack.providers.inline.telemetry.meta_reference: DEBUG
+    llama_stack.core: DEBUG
+    llama_stack.apis: DEBUG
+    uvicorn: DEBUG
+    uvicorn.access: INFO # Keep HTTP requests at INFO to reduce noise
+    fastapi: DEBUG
+
+providers:
+  vector_io:
+  - config:
+      kvstore:
+        db_path: /tmp/faiss_store.db
+        type: sqlite
+    provider_id: faiss
+    provider_type: inline::faiss
+
+  agents:
+  - config:
+      persistence_store:
+        db_path: /tmp/agents_store.db
+        namespace: null
+        type: sqlite
+      responses_store:
+        db_path: /tmp/responses_store.db
+        type: sqlite
+    provider_id: meta-reference
+    provider_type: inline::meta-reference
+
+
+  inference:
+  - provider_id: vllm-inference
+    provider_type: remote::vllm
+    config:
+      url: ${env.VLLM_URL:=http://localhost:8000/v1}
+      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
+      api_token: ${env.VLLM_API_TOKEN:=fake}
+      tls_verify: ${env.VLLM_TLS_VERIFY:=false}
+
+  - provider_id: google-vertex
+    provider_type: remote::vertexai
+    config:
+      project: ${env.VERTEXAI_PROJECT}
+      region: ${env.VERTEXAI_REGION:=us-east5}
+
+  tool_runtime:
+  - provider_id: model-context-protocol
+    provider_type: remote::model-context-protocol
+    config: {}
+    module: null
+
+  telemetry:
+  - config:
+      service_name: 'llama-stack'
+      sinks: console,sqlite
+      sqlite_db_path: /tmp/trace_store.db
+    provider_id: meta-reference
+    provider_type: inline::meta-reference
+
+metadata_store:
+  type: sqlite
+  db_path: /tmp/registry.db
+  namespace: null
+
+inference_store:
+  type: sqlite
+  db_path: /tmp/inference_store.db
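
The configuration above reads `VERTEXAI_PROJECT` and `VERTEXAI_REGION` (plus the vLLM variables) from the environment. A minimal launch sketch follows; the use of Application Default Credentials for Google authentication and the `llama stack run` command are assumptions about the deployment, not something this commit specifies:

```bash
# Sketch: environment expected by examples/vertexai-run.yaml.
# Google credentials are assumed to come from Application Default Credentials,
# e.g. 'gcloud auth application-default login' or a service-account key file.
export VERTEXAI_PROJECT=my-gcp-project      # required; no default in the config
export VERTEXAI_REGION=us-east5             # optional; defaults to us-east5
export VLLM_URL=http://localhost:8000/v1    # optional; default shown in the config
llama stack run examples/vertexai-run.yaml
```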
