52 changes: 52 additions & 0 deletions docker/compose/docker-compose.gemma-4b-gpu.yml
@@ -0,0 +1,52 @@
services:
  gemma_4b_it_gpu:
    image: nillion/nilai-vllm:latest
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    ipc: host
    ulimits:
      memlock: -1
      stack: 67108864
    env_file:
      - .env # must include HUGGINGFACE_HUB_TOKEN
    restart: unless-stopped
    depends_on:
      etcd:
        condition: service_healthy
    command: >
      --model "google/gemma-3-4b-it"
      --trust-remote-code
      --gpu-memory-utilization 0.85
      --max-model-len 60000
      --tensor-parallel-size 1
      --uvicorn-log-level warning
      --chat-template-content-format "openai"
      --dtype "bfloat16"
    environment:
      - SVC_HOST=gemma_4b_it_gpu
      - SVC_PORT=8000
      - ETCD_HOST=etcd
      - ETCD_PORT=2379
      - TOOL_SUPPORT=true
      - ENABLE_MULTIMODAL=true
    volumes:
      - hugging_face_models:/root/.cache/huggingface
    networks:
      - backend_net
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      retries: 3
      start_period: 90s
      timeout: 10s

volumes:
  hugging_face_models:

networks:
  backend_net:
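A minimal smoke test for this service (a sketch, not part of the PR): the compose file attaches the container to backend_net without publishing any port to the host, so the vLLM OpenAI-compatible endpoint is only reachable at the SVC_HOST/SVC_PORT values above from another container on that network. The snippet below assumes it runs from such a container, and that the etcd service referenced in depends_on is defined in the base compose file.

# Minimal sketch: call the Gemma service from another container on backend_net.
# Hostname and port come from SVC_HOST/SVC_PORT above; everything else is illustrative.
from openai import OpenAI

client = OpenAI(base_url="http://gemma_4b_it_gpu:8000/v1", api_key="<not-needed>")
completion = client.chat.completions.create(
    model="google/gemma-3-4b-it",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=32,
)
print(completion.choices[0].message.content)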
10 changes: 7 additions & 3 deletions docker/vllm.Dockerfile
@@ -1,4 +1,4 @@
-FROM vllm/vllm-openai:v0.7.3
+FROM vllm/vllm-openai:latest

# # Specify model name and path during build
# ARG MODEL_NAME=llama_1b_cpu
@@ -14,13 +14,17 @@ COPY --link . /daemon/
WORKDIR /daemon/nilai-models/

RUN apt-get update && \
-    apt-get install build-essential -y && \
-    pip install uv && \
+    apt-get install -y ffmpeg libsm6 libxext6 libgl1 build-essential && \
+    pip install uv pillow torchvision torchaudio && \
    uv sync && \
    apt-get clean && \
    apt-get autoremove && \
    rm -rf /var/lib/apt/lists/*

+# Install dependencies for multimodal models
+RUN pip install pillow ftfy regex
+RUN pip install git+https://github.com/huggingface/[email protected]

# Expose port 8000 for incoming requests
EXPOSE 8000

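A quick way to confirm the multimodal dependencies landed in the image (an illustrative check, not part of this PR; running it through an overridden python entrypoint of the nillion/nilai-vllm:latest image is an assumption):

# Minimal import check, intended to run inside the built image,
# e.g. `docker run --rm --entrypoint python nillion/nilai-vllm:latest ...` (assumed invocation).
import PIL
import torchvision
import transformers

print("pillow:", PIL.__version__)
print("torchvision:", torchvision.__version__)
print("transformers:", transformers.__version__)  # expect the 4.49.0-based Gemma-3 build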
1 change: 1 addition & 0 deletions nilai-api/src/nilai_api/config/mainnet.py
@@ -10,6 +10,7 @@
    "cognitivecomputations/Dolphin3.0-Llama3.1-8B": 30,
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B": 5,
    "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4": 5,
+    "google/gemma-3-4b-it": 5,
}

# It defines the number of requests allowed for each user for a given time frame.
1 change: 1 addition & 0 deletions nilai-api/src/nilai_api/config/testnet.py
@@ -10,6 +10,7 @@
    "cognitivecomputations/Dolphin3.0-Llama3.1-8B": 5,
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B": 5,
    "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4": 5,
+    "google/gemma-3-4b-it": 5,
}

# It defines the number of requests allowed for each user for a given time frame.
30 changes: 21 additions & 9 deletions nilai-api/src/nilai_api/routers/private.py
@@ -238,16 +238,28 @@ async def chat_completion_stream_generator() -> AsyncGenerator[str, None]:
            chat_completion_stream_generator(),
            media_type="text/event-stream", # Ensure client interprets as Server-Sent Events
        )

    client = OpenAI(base_url=model_url, api_key="<not-needed>")
-    response = client.chat.completions.create(
-        model=req.model,
-        messages=req.messages, # type: ignore
-        stream=req.stream,
-        top_p=req.top_p,
-        temperature=req.temperature,
-        max_tokens=req.max_tokens,
-        tools=req.tools, # type: ignore
-    ) # type: ignore
+    if req.response_format:
+        response = client.beta.chat.completions.parse(
+            model=req.model,
+            messages=req.messages,
+            top_p=req.top_p,
+            temperature=req.temperature,
+            max_tokens=req.max_tokens,
+            tools=req.tools,
+            response_format=req.response_format,
+        )
+    else:
+        response = client.chat.completions.create(
+            model=req.model,
+            messages=req.messages, # type: ignore
+            stream=req.stream,
+            top_p=req.top_p,
+            temperature=req.temperature,
+            max_tokens=req.max_tokens,
+            tools=req.tools, # type: ignore
+        ) # type: ignore

    model_response = SignedChatCompletion(
        **response.model_dump(),
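For reference, a hedged sketch of a client request that would exercise the new response_format branch above. The /v1/chat/completions route, host, and bearer token are assumptions; the payload fields mirror ChatRequest in nilai_common.api_model.

# Hypothetical client-side request asking for structured JSON output.
# URL, path, and credentials are placeholders, not part of this PR.
import requests

payload = {
    "model": "google/gemma-3-4b-it",
    "messages": [{"role": "user", "content": "Give me a city and its country."}],
    "max_tokens": 200,
    "response_format": {
        "type": "json_schema",
        "json_schema": {
            "name": "city",
            "schema": {
                "type": "object",
                "properties": {"city": {"type": "string"}, "country": {"type": "string"}},
                "required": ["city", "country"],
            },
        },
    },
}
resp = requests.post(
    "https://nilai.example.com/v1/chat/completions",  # placeholder endpoint
    json=payload,
    headers={"Authorization": "Bearer <api-key>"},  # placeholder credentials
    timeout=60,
)
print(resp.json()["choices"][0]["message"]["content"])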
24 changes: 20 additions & 4 deletions packages/nilai-common/src/nilai_common/api_model.py
@@ -1,5 +1,5 @@
import uuid
-from typing import List, Optional, Literal, Iterable
+from typing import List, Optional, Literal, Union

from openai.types.chat import ChatCompletion, ChatCompletionMessage
from openai.types.chat.chat_completion import Choice as OpenaAIChoice
@@ -21,8 +21,23 @@
]


+# Define ImageUrl for image content
+class ImageUrl(BaseModel):
+    url: str


+# Define MessageContent for multimodal content
+class MessageContent(BaseModel):
+    type: Literal["text", "image_url"]
+    text: Optional[str] = None
+    image_url: Optional[ImageUrl] = None


# Define Message as a standalone class
class Message(ChatCompletionMessage):
-    role: Literal["system", "user", "assistant", "tool"] # type: ignore
+    role: Literal["system", "user", "assistant", "tool"]
+    content: Union[str, List[MessageContent]]
+    name: Optional[str] = None


class Choice(OpenaAIChoice):
@@ -34,10 +49,11 @@ class ChatRequest(BaseModel):
    messages: List[Message] = Field(..., min_length=1)
    temperature: Optional[float] = Field(default=0.2, ge=0.0, le=5.0)
    top_p: Optional[float] = Field(default=0.95, ge=0.0, le=1.0)
-    max_tokens: Optional[int] = Field(default=2048, ge=1, le=100000)
+    max_tokens: Optional[int] = Field(default=10000, ge=1, le=100000)
    stream: Optional[bool] = False
-    tools: Optional[Iterable[ChatCompletionToolParam]] = None
+    tools: List[ChatCompletionToolParam] = Field(default_factory=list)
    nilrag: Optional[dict] = {}
+    response_format: Optional[dict] = {}


class SignedChatCompletion(ChatCompletion):
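Finally, a minimal sketch of how the new multimodal types compose into a ChatRequest. The import path nilai_common.api_model is assumed from the package layout, and the model name and image URL are placeholders.

# Minimal sketch: build a multimodal ChatRequest with the new models.
# Import path and values are illustrative assumptions, not part of this PR.
from nilai_common.api_model import ChatRequest, ImageUrl, Message, MessageContent

req = ChatRequest(
    model="google/gemma-3-4b-it",
    messages=[
        Message(
            role="user",
            content=[
                MessageContent(type="text", text="Describe this image in one sentence."),
                MessageContent(
                    type="image_url",
                    image_url=ImageUrl(url="https://example.com/sample.png"),  # placeholder image
                ),
            ],
        )
    ],
    max_tokens=256,
)
print(req.model_dump_json(indent=2))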