Commit ac1e39e

(feat) rhoai-lls does not require kubernetes to run
1 parent 24b2f4e commit ac1e39e

File tree: 8 files changed (+384, -32 lines)

README.md

Lines changed: 65 additions & 3 deletions
````diff
@@ -8,14 +8,36 @@ This directory contains the necessary files to build a Red Hat compatible contai
 - `llama` CLI tool installed: `pip install llama-stack`
 - Podman or Docker installed
 
+## Build Modes
+
+The build script supports three modes:
+
+### 1. Full Mode (Default)
+Includes all features, including the TrustyAI providers that require Kubernetes/OpenShift:
+```bash
+./distribution/build.py
+```
+
+### 2. Standalone Mode
+Builds a version without Kubernetes dependencies, using Llama Guard for safety:
+```bash
+./distribution/build.py --standalone
+```
+
+### 3. Unified Mode (Recommended)
+Builds a single container that supports both modes via environment variables:
+```bash
+./distribution/build.py --unified
+```
+
 ## Generating the Containerfile
 
 The Containerfile is auto-generated from a template. To generate it:
 
 1. Make sure you have the `llama` CLI tool installed
-2. Run the build script from root of this git repo:
+2. Run the build script from the root of this git repo with your desired mode:
 ```bash
-./distribution/build.py
+./distribution/build.py [--standalone] [--unified]
 ```
 
 This will:
@@ -35,7 +57,47 @@ Once the Containerfile is generated, you can build the image using either Podman
 ### Using Podman build image for x86_64
 
 ```bash
-podman build --platform linux/amd64 -f distribution/Containerfile -t rh .
+podman build --platform linux/amd64 -f distribution/Containerfile -t llama-stack-rh .
+```
+
+### Using Docker
+
+```bash
+docker build -f distribution/Containerfile -t llama-stack-rh .
+```
+
+## Running the Container
+
+### Running in Standalone Mode (No Kubernetes)
+
+To run the container in standalone mode without Kubernetes dependencies, set the `STANDALONE` environment variable:
+
+```bash
+# Using Docker
+docker run -e STANDALONE=true \
+  -e VLLM_URL=http://host.docker.internal:8000/v1 \
+  -e INFERENCE_MODEL=your-model-name \
+  -p 8321:8321 \
+  llama-stack-rh
+
+# Using Podman
+podman run -e STANDALONE=true \
+  -e VLLM_URL=http://host.docker.internal:8000/v1 \
+  -e INFERENCE_MODEL=your-model-name \
+  -p 8321:8321 \
+  llama-stack-rh
+```
+
+### Running in Full Mode (With Kubernetes)
+
+To run with all features, including the TrustyAI providers (requires Kubernetes/OpenShift):
+
+```bash
+# Using Docker
+docker run -p 8321:8321 llama-stack-rh
+
+# Using Podman
+podman run -p 8321:8321 llama-stack-rh
 ```
 
 ## Notes
````
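
A quick way to confirm either mode is serving after `docker run`/`podman run` is to hit the published port. This is only a sketch: the `/v1/health` path is an assumption about the llama-stack server, not something defined in this commit.

```bash
# Probe the published port once the container reports it has started.
# NOTE: /v1/health is an assumed llama-stack endpoint; adjust for your version.
curl -sf http://localhost:8321/v1/health && echo "llama-stack server is up"
```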

distribution/Containerfile

Lines changed: 12 additions & 3 deletions
```diff
@@ -1,5 +1,5 @@
 # WARNING: This file is auto-generated. Do not modify it manually.
-# Generated by: distribution/build.py
+# Generated by: distribution/build.py --unified
 
 FROM registry.access.redhat.com/ubi9/python-312@sha256:95ec8d3ee9f875da011639213fd254256c29bc58861ac0b11f290a291fa04435
 WORKDIR /opt/app-root
@@ -8,6 +8,7 @@ RUN pip install sqlalchemy # somehow sqlalchemy[asyncio] is not sufficient
 RUN pip install \
     aiosqlite \
     autoevals \
+    blobfile \
     chardet \
     datasets \
     fastapi \
@@ -42,7 +43,15 @@ RUN pip install --index-url https://download.pytorch.org/whl/cpu torch torchvisi
 RUN pip install --no-deps sentence-transformers
 RUN pip install --no-cache llama-stack==0.2.18
 RUN mkdir -p ${HOME}/.llama/providers.d ${HOME}/.cache
-COPY distribution/run.yaml ${APP_ROOT}/run.yaml
+
+# Copy both configurations
+COPY distribution/run.yaml ${APP_ROOT}/run-full.yaml
+COPY distribution/run-standalone.yaml ${APP_ROOT}/run-standalone.yaml
+
+# Copy the entrypoint script
+COPY --chmod=755 distribution/entrypoint.sh ${APP_ROOT}/entrypoint.sh
+
+# Copy providers directory (will be filtered by entrypoint script)
 COPY distribution/providers.d/ ${HOME}/.llama/providers.d/
 
-ENTRYPOINT ["python", "-m", "llama_stack.core.server.server", "/opt/app-root/run.yaml"]
+ENTRYPOINT ["/opt/app-root/entrypoint.sh"]
```

distribution/Containerfile.in

Lines changed: 10 additions & 2 deletions
```diff
@@ -5,7 +5,15 @@ RUN pip install sqlalchemy # somehow sqlalchemy[asyncio] is not sufficient
 {dependencies}
 RUN pip install --no-cache llama-stack==0.2.18
 RUN mkdir -p ${{HOME}}/.llama/providers.d ${{HOME}}/.cache
-COPY distribution/run.yaml ${{APP_ROOT}}/run.yaml
+
+# Copy both configurations
+COPY distribution/run.yaml ${{APP_ROOT}}/run-full.yaml
+COPY distribution/run-standalone.yaml ${{APP_ROOT}}/run-standalone.yaml
+
+# Copy the entrypoint script
+COPY --chmod=755 distribution/entrypoint.sh ${{APP_ROOT}}/entrypoint.sh
+
+# Copy providers directory (will be filtered by entrypoint script)
 COPY distribution/providers.d/ ${{HOME}}/.llama/providers.d/
 
-ENTRYPOINT ["python", "-m", "llama_stack.core.server.server", "/opt/app-root/run.yaml"]
+ENTRYPOINT ["/opt/app-root/entrypoint.sh"]
```

distribution/build-standalone.yaml

Lines changed: 35 additions & 0 deletions
```diff
@@ -0,0 +1,35 @@
+version: "2"
+distribution_spec:
+  description: Red Hat distribution of Llama Stack (Standalone Docker)
+  providers:
+    inference:
+    - "remote::vllm"
+    - "inline::sentence-transformers"
+    vector_io:
+    - "inline::milvus"
+    safety:
+    - "inline::llama-guard"
+    agents:
+    - "inline::meta-reference"
+    # eval: removed trustyai_lmeval provider for standalone Docker
+    datasetio:
+    - "remote::huggingface"
+    - "inline::localfs"
+    scoring:
+    - "inline::basic"
+    - "inline::llm-as-judge"
+    - "inline::braintrust"
+    telemetry:
+    - "inline::meta-reference"
+    tool_runtime:
+    - "remote::brave-search"
+    - "remote::tavily-search"
+    - "inline::rag-runtime"
+    - "remote::model-context-protocol"
+  container_image: registry.redhat.io/ubi9/python-311:9.6-1749631027
+additional_pip_packages:
+- aiosqlite
+- sqlalchemy[asyncio]
+image_type: container
+image_name: llama-stack-rh-standalone
+# external_providers_dir: distribution/providers.d # Disabled for standalone mode
```
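
This config is consumed by `build.py` through the `llama` CLI; the same command it constructs can be run by hand to preview the dependency set the standalone image will pull in:

```bash
# Print the dependencies resolved from the standalone build config, without building anything.
llama stack build --config distribution/build-standalone.yaml --print-deps-only
```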

distribution/build.py

Lines changed: 61 additions & 7 deletions
```diff
@@ -5,11 +5,14 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-# Usage: ./distribution/build.py
+# Usage: ./distribution/build.py [--standalone] [--unified]
+# Or set STANDALONE=true or UNIFIED=true environment variables
 
+import os
 import shutil
 import subprocess
 import sys
+import argparse
 from pathlib import Path
 
 BASE_REQUIREMENTS = [
@@ -57,9 +60,10 @@ def check_llama_stack_version():
         print("Continuing without version validation...")
 
 
-def get_dependencies():
+def get_dependencies(standalone=False):
     """Execute the llama stack build command and capture dependencies."""
-    cmd = "llama stack build --config distribution/build.yaml --print-deps-only"
+    config_file = "distribution/build-standalone.yaml" if standalone else "distribution/build.yaml"
+    cmd = f"llama stack build --config {config_file} --print-deps-only"
     try:
         result = subprocess.run(
             cmd, shell=True, capture_output=True, text=True, check=True
@@ -112,7 +116,7 @@ def get_dependencies():
         sys.exit(1)
 
 
-def generate_containerfile(dependencies):
+def generate_containerfile(dependencies, standalone=False, unified=False):
     """Generate Containerfile from template with dependencies."""
     template_path = Path("distribution/Containerfile.in")
     output_path = Path("distribution/Containerfile")
@@ -126,7 +130,13 @@ def generate_containerfile(dependencies):
         template_content = f.read()
 
     # Add warning message at the top
-    warning = "# WARNING: This file is auto-generated. Do not modify it manually.\n# Generated by: distribution/build.py\n\n"
+    if unified:
+        mode = "unified"
+    elif standalone:
+        mode = "standalone"
+    else:
+        mode = "full"
+    warning = f"# WARNING: This file is auto-generated. Do not modify it manually.\n# Generated by: distribution/build.py --{mode}\n\n"
 
     # Process template using string formatting
     containerfile_content = warning + template_content.format(
@@ -141,19 +151,63 @@
 
 
 def main():
+    parser = argparse.ArgumentParser(
+        description="Build Llama Stack distribution",
+        epilog="""
+Examples:
+  %(prog)s # Build full version (default)
+  %(prog)s --standalone # Build standalone version (no Kubernetes deps)
+  %(prog)s --unified # Build unified version (supports both modes)
+  STANDALONE=true %(prog)s # Build standalone via environment variable
+  UNIFIED=true %(prog)s # Build unified via environment variable
+        """,
+        formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument("--standalone", action="store_true",
+                        help="Build standalone version without Kubernetes dependencies")
+    parser.add_argument("--unified", action="store_true",
+                        help="Build unified version that supports both modes via environment variables")
+    args = parser.parse_args()
+
+    # Check environment variable as fallback
+    standalone = args.standalone or os.getenv("STANDALONE", "false").lower() in ("true", "1", "yes")
+    unified = args.unified or os.getenv("UNIFIED", "false").lower() in ("true", "1", "yes")
+
+    if unified:
+        mode = "unified"
+        print("Building unified version (supports both full and standalone modes)...")
+    else:
+        mode = "standalone" if standalone else "full"
+        print(f"Building {mode} version...")
+
     print("Checking llama installation...")
     check_llama_installed()
 
     print("Checking llama-stack version...")
     check_llama_stack_version()
 
     print("Getting dependencies...")
-    dependencies = get_dependencies()
+    dependencies = get_dependencies(standalone)
 
     print("Generating Containerfile...")
-    generate_containerfile(dependencies)
+    generate_containerfile(dependencies, standalone, unified)
 
     print("Done!")
+    print(f"\nTo build the Docker image:")
+    if unified:
+        print(" docker build -f distribution/Containerfile -t llama-stack-unified .")
+        print("\nTo run in standalone mode:")
+        print(" docker run -e STANDALONE=true -e VLLM_URL=http://host.docker.internal:8000/v1 -e INFERENCE_MODEL=your-model -p 8321:8321 llama-stack-unified")
+        print("\nTo run in full mode (requires Kubernetes):")
+        print(" docker run -p 8321:8321 llama-stack-unified")
+    elif standalone:
+        print(" docker build -f distribution/Containerfile -t llama-stack-standalone .")
+        print("\nTo run in standalone mode:")
+        print(" docker run -e VLLM_URL=http://host.docker.internal:8000/v1 -e INFERENCE_MODEL=your-model -p 8321:8321 llama-stack-standalone")
+    else:
+        print(" docker build -f distribution/Containerfile -t llama-stack-full .")
+        print("\nTo run with full features (requires Kubernetes):")
+        print(" docker run -p 8321:8321 llama-stack-full")
 
 
 if __name__ == "__main__":
```
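
The environment-variable fallback accepts a few truthy spellings (`true`, `1`, `yes`, case-insensitive), so the following invocations are equivalent to passing the flags:

```bash
# Standalone build: flag or env var, any truthy spelling works
./distribution/build.py --standalone
STANDALONE=true ./distribution/build.py
STANDALONE=1 ./distribution/build.py
STANDALONE=yes ./distribution/build.py

# Unified build via the environment variable
UNIFIED=true ./distribution/build.py
```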

distribution/build.yaml

Lines changed: 17 additions & 17 deletions
```diff
@@ -1,32 +1,32 @@
-version: 2
+version: "2"
 distribution_spec:
   description: Red Hat distribution of Llama Stack
   providers:
     inference:
-    - provider_type: remote::vllm
-    - provider_type: inline::sentence-transformers
+    - "remote::vllm"
+    - "inline::sentence-transformers"
     vector_io:
-    - provider_type: inline::milvus
+    - "inline::milvus"
     safety:
-    - provider_type: remote::trustyai_fms
+    - "remote::trustyai_fms"
     agents:
-    - provider_type: inline::meta-reference
+    - "inline::meta-reference"
     eval:
-    - provider_type: remote::trustyai_lmeval
+    - "remote::trustyai_lmeval"
     datasetio:
-    - provider_type: remote::huggingface
-    - provider_type: inline::localfs
+    - "remote::huggingface"
+    - "inline::localfs"
     scoring:
-    - provider_type: inline::basic
-    - provider_type: inline::llm-as-judge
-    - provider_type: inline::braintrust
+    - "inline::basic"
+    - "inline::llm-as-judge"
+    - "inline::braintrust"
     telemetry:
-    - provider_type: inline::meta-reference
+    - "inline::meta-reference"
     tool_runtime:
-    - provider_type: remote::brave-search
-    - provider_type: remote::tavily-search
-    - provider_type: inline::rag-runtime
-    - provider_type: remote::model-context-protocol
+    - "remote::brave-search"
+    - "remote::tavily-search"
+    - "inline::rag-runtime"
+    - "remote::model-context-protocol"
   container_image: registry.redhat.io/ubi9/python-311:9.6-1749631027
 additional_pip_packages:
 - aiosqlite
```
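
The provider lists switch from `provider_type:` mappings to plain quoted provider IDs (the same form used by the new `build-standalone.yaml`), and the version becomes a quoted string. A quick, optional sanity check that the edited file still parses (assumes PyYAML is installed):

```bash
# Validate the YAML syntax of the updated build config.
python -c "import yaml; yaml.safe_load(open('distribution/build.yaml')); print('ok')"
```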

distribution/entrypoint.sh

Lines changed: 54 additions & 0 deletions
```diff
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+# Unified entrypoint script for Llama Stack distribution
+# Supports both full and standalone modes via STANDALONE environment variable
+
+set -e
+
+echo "=== Llama Stack Distribution Entrypoint ==="
+
+# Check if we should run in standalone mode
+if [ "${STANDALONE:-false}" = "true" ]; then
+    echo "Running in STANDALONE mode (no Kubernetes dependencies)"
+
+    # Use standalone configuration
+    CONFIG_FILE="/opt/app-root/run-standalone.yaml"
+
+    # Filter out TrustyAI providers from providers.d directory
+    echo "Filtering out TrustyAI providers for standalone mode..."
+    mkdir -p ${HOME}/.llama/providers.d
+
+    # Copy only non-TrustyAI providers
+    find /opt/app-root/.llama/providers.d -name "*.yaml" ! -name "*trustyai*" -exec cp {} ${HOME}/.llama/providers.d/ \; 2>/dev/null || true
+
+    # Remove the external_providers_dir from the config to prevent loading TrustyAI providers
+    echo "Disabling external providers directory for standalone mode..."
+    sed -i 's|external_providers_dir:.*|# external_providers_dir: disabled for standalone mode|' "$CONFIG_FILE"
+
+    echo "✓ Standalone configuration ready"
+    echo "✓ TrustyAI providers excluded"
+else
+    echo "Running in FULL mode (with Kubernetes dependencies)"
+
+    # Use full configuration
+    CONFIG_FILE="/opt/app-root/run-full.yaml"
+
+    # Copy all providers
+    echo "Setting up all providers..."
+    mkdir -p ${HOME}/.llama/providers.d
+    cp -r /opt/app-root/.llama/providers.d/* ${HOME}/.llama/providers.d/ 2>/dev/null || true
+
+    echo "✓ Full configuration ready"
+    echo "✓ All providers available"
+fi
+
+echo "Configuration file: $CONFIG_FILE"
+echo "APIs enabled: $(grep -A 20 '^apis:' $CONFIG_FILE | grep '^-' | wc -l) APIs"
+
+# Show which APIs are available
+echo "Available APIs:"
+grep -A 20 '^apis:' $CONFIG_FILE | grep '^-' | sed 's/^- / - /' || echo " (none listed)"
+
+# Start the server
+echo "Starting Llama Stack server..."
+exec python -m llama_stack.core.server.server "$CONFIG_FILE"
```
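
Because the script echoes a distinct marker for each mode, the container logs make it easy to verify which path was taken. A sketch reusing the image name and environment values from the README examples above:

```bash
# Start a container in standalone mode and grep the entrypoint output for its mode markers.
podman run -d --name lls-standalone \
  -e STANDALONE=true \
  -e VLLM_URL=http://host.docker.internal:8000/v1 \
  -e INFERENCE_MODEL=your-model-name \
  -p 8321:8321 llama-stack-rh

podman logs lls-standalone | grep -E "STANDALONE mode|TrustyAI providers excluded"
```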
