52 changes: 52 additions & 0 deletions docker/compose/docker-compose.gemma-4b-gpu.yml
@@ -0,0 +1,52 @@
services:
  gemma_4b_it_gpu:
    image: nillion/nilai-vllm:latest
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    ipc: host
    ulimits:
      memlock: -1
      stack: 67108864
    env_file:
      - .env # must include HUGGINGFACE_HUB_TOKEN
    restart: unless-stopped
    depends_on:
      etcd:
        condition: service_healthy
    command: >
      --model "google/gemma-3-4b-it"
      --trust-remote-code
      --gpu-memory-utilization 0.85
      --max-model-len 60000
      --tensor-parallel-size 1
      --uvicorn-log-level warning
      --chat-template-content-format "openai"
      --dtype "bfloat16"
    environment:
      - SVC_HOST=gemma_4b_it_gpu
      - SVC_PORT=8000
      - ETCD_HOST=etcd
      - ETCD_PORT=2379
      - TOOL_SUPPORT=true
      - ENABLE_MULTIMODAL=true
    volumes:
      - hugging_face_models:/root/.cache/huggingface
    networks:
      - backend_net
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      retries: 3
      start_period: 90s
      timeout: 10s

volumes:
  hugging_face_models:

networks:
  backend_net:
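A minimal smoke test for this service (a sketch, not part of the PR): the compose file attaches the container to backend_net without publishing any port to the host, so the vLLM OpenAI-compatible endpoint is only reachable at the SVC_HOST/SVC_PORT values above from another container on that network. The snippet below assumes it runs from such a container, and that the etcd service referenced in depends_on is defined in the base compose file.

# Minimal sketch: call the Gemma service from another container on backend_net.
# Hostname and port come from SVC_HOST/SVC_PORT above; everything else is illustrative.
from openai import OpenAI

client = OpenAI(base_url="http://gemma_4b_it_gpu:8000/v1", api_key="<not-needed>")
completion = client.chat.completions.create(
    model="google/gemma-3-4b-it",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=32,
)
print(completion.choices[0].message.content)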
10 changes: 7 additions & 3 deletions docker/vllm.Dockerfile
@@ -1,4 +1,4 @@
-FROM vllm/vllm-openai:v0.7.3
+FROM vllm/vllm-openai:latest

# # Specify model name and path during build
# ARG MODEL_NAME=llama_1b_cpu
@@ -14,13 +14,17 @@ COPY --link . /daemon/
WORKDIR /daemon/nilai-models/

RUN apt-get update && \
-    apt-get install build-essential -y && \
-    pip install uv && \
+    apt-get install -y ffmpeg libsm6 libxext6 libgl1 build-essential && \
+    pip install uv pillow torchvision torchaudio && \
    uv sync && \
    apt-get clean && \
    apt-get autoremove && \
    rm -rf /var/lib/apt/lists/*

+# Install dependencies for multimodal models
+RUN pip install pillow ftfy regex
+RUN pip install git+https://github.com/huggingface/[email protected]

# Expose port 8000 for incoming requests
EXPOSE 8000

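A quick way to confirm the multimodal dependencies landed in the image (an illustrative check, not part of this PR; running it through an overridden python entrypoint of the nillion/nilai-vllm:latest image is an assumption):

# Minimal import check, intended to run inside the built image,
# e.g. `docker run --rm --entrypoint python nillion/nilai-vllm:latest ...` (assumed invocation).
import PIL
import torchvision
import transformers

print("pillow:", PIL.__version__)
print("torchvision:", torchvision.__version__)
print("transformers:", transformers.__version__)  # expect the 4.49.0-based Gemma-3 build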
1 change: 1 addition & 0 deletions nilai-api/src/nilai_api/config/mainnet.py
@@ -10,6 +10,7 @@
    "cognitivecomputations/Dolphin3.0-Llama3.1-8B": 30,
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B": 5,
    "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4": 5,
+    "google/gemma-3-4b-it": 5,
}

# It defines the number of requests allowed for each user for a given time frame.
1 change: 1 addition & 0 deletions nilai-api/src/nilai_api/config/testnet.py
@@ -10,6 +10,7 @@
    "cognitivecomputations/Dolphin3.0-Llama3.1-8B": 5,
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B": 5,
    "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4": 5,
+    "google/gemma-3-4b-it": 5,
}

# It defines the number of requests allowed for each user for a given time frame.
30 changes: 21 additions & 9 deletions nilai-api/src/nilai_api/routers/private.py
@@ -238,16 +238,28 @@ async def chat_completion_stream_generator() -> AsyncGenerator[str, None]:
            chat_completion_stream_generator(),
            media_type="text/event-stream", # Ensure client interprets as Server-Sent Events
        )

    client = OpenAI(base_url=model_url, api_key="<not-needed>")
-    response = client.chat.completions.create(
-        model=req.model,
-        messages=req.messages, # type: ignore
-        stream=req.stream,
-        top_p=req.top_p,
-        temperature=req.temperature,
-        max_tokens=req.max_tokens,
-        tools=req.tools, # type: ignore
-    ) # type: ignore
+    if req.response_format:
+        response = client.beta.chat.completions.parse(
+            model=req.model,
+            messages=req.messages,
+            top_p=req.top_p,
+            temperature=req.temperature,
+            max_tokens=req.max_tokens,
+            tools=req.tools,
+            response_format=req.response_format,
+        )
+    else:
+        response = client.chat.completions.create(
+            model=req.model,
+            messages=req.messages, # type: ignore
+            stream=req.stream,
+            top_p=req.top_p,
+            temperature=req.temperature,
+            max_tokens=req.max_tokens,
+            tools=req.tools, # type: ignore
+        ) # type: ignore

    model_response = SignedChatCompletion(
        **response.model_dump(),
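For reference, a hedged sketch of a client request that would exercise the new response_format branch above. The /v1/chat/completions route, host, and bearer token are assumptions; the payload fields mirror ChatRequest in nilai_common.api_model.

# Hypothetical client-side request asking for structured JSON output.
# URL, path, and credentials are placeholders, not part of this PR.
import requests

payload = {
    "model": "google/gemma-3-4b-it",
    "messages": [{"role": "user", "content": "Give me a city and its country."}],
    "max_tokens": 200,
    "response_format": {
        "type": "json_schema",
        "json_schema": {
            "name": "city",
            "schema": {
                "type": "object",
                "properties": {"city": {"type": "string"}, "country": {"type": "string"}},
                "required": ["city", "country"],
            },
        },
    },
}
resp = requests.post(
    "https://nilai.example.com/v1/chat/completions",  # placeholder endpoint
    json=payload,
    headers={"Authorization": "Bearer <api-key>"},  # placeholder credentials
    timeout=60,
)
print(resp.json()["choices"][0]["message"]["content"])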
24 changes: 20 additions & 4 deletions packages/nilai-common/src/nilai_common/api_model.py
@@ -1,5 +1,5 @@
import uuid
-from typing import List, Optional, Literal, Iterable
+from typing import List, Optional, Literal, Union

from openai.types.chat import ChatCompletion, ChatCompletionMessage
from openai.types.chat.chat_completion import Choice as OpenaAIChoice
@@ -21,8 +21,23 @@
]


+# Define ImageUrl for image content
+class ImageUrl(BaseModel):
+    url: str


+# Define MessageContent for multimodal content
+class MessageContent(BaseModel):
+    type: Literal["text", "image_url"]
+    text: Optional[str] = None
+    image_url: Optional[ImageUrl] = None


# Define Message as a standalone class
class Message(ChatCompletionMessage):
-    role: Literal["system", "user", "assistant", "tool"] # type: ignore
+    role: Literal["system", "user", "assistant", "tool"]
+    content: Union[str, List[MessageContent]]
+    name: Optional[str] = None


class Choice(OpenaAIChoice):
@@ -34,10 +49,11 @@ class ChatRequest(BaseModel):
    messages: List[Message] = Field(..., min_length=1)
    temperature: Optional[float] = Field(default=0.2, ge=0.0, le=5.0)
    top_p: Optional[float] = Field(default=0.95, ge=0.0, le=1.0)
-    max_tokens: Optional[int] = Field(default=2048, ge=1, le=100000)
+    max_tokens: Optional[int] = Field(default=10000, ge=1, le=100000)
    stream: Optional[bool] = False
-    tools: Optional[Iterable[ChatCompletionToolParam]] = None
+    tools: List[ChatCompletionToolParam] = Field(default_factory=list)
    nilrag: Optional[dict] = {}
+    response_format: Optional[dict] = {}


class SignedChatCompletion(ChatCompletion):
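Finally, a minimal sketch of how the new multimodal types compose into a ChatRequest. The import path nilai_common.api_model is assumed from the package layout, and the model name and image URL are placeholders.

# Minimal sketch: build a multimodal ChatRequest with the new models.
# Import path and values are illustrative assumptions, not part of this PR.
from nilai_common.api_model import ChatRequest, ImageUrl, Message, MessageContent

req = ChatRequest(
    model="google/gemma-3-4b-it",
    messages=[
        Message(
            role="user",
            content=[
                MessageContent(type="text", text="Describe this image in one sentence."),
                MessageContent(
                    type="image_url",
                    image_url=ImageUrl(url="https://example.com/sample.png"),  # placeholder image
                ),
            ],
        )
    ],
    max_tokens=256,
)
print(req.model_dump_json(indent=2))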