Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 85 additions & 0 deletions .github/workflows/publish-inference-dev.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
name: publish-inference-dev

# Publishes the FlatClaw dev-inference image to ghcr.io.
#
# Same pattern as `publish-inference.yml` (the prod 31B image): pull the SGLang
# base from DockerHub, append the dev entrypoint as a single layer via
# `crane mutate`, and push to GHCR. Server-to-server, no local Docker daemon,
# ~5–10 min wall-clock.
#
# The two pipelines are intentionally identical except for entrypoint contents
# and destination repo, so dev and prod runtimes stay byte-for-byte aligned on
# everything except the GEMMA_DIR_NAME default and the bf16-vs-fp8 quant flag.

on:
  # Manual run: lets the caller pick the base tag and the published tag.
  workflow_dispatch:
    inputs:
      sglang_tag:
        description: "lmsysorg/sglang tag to base on"
        default: "dev"
      image_tag:
        description: "Destination tag to publish as"
        default: "latest"
  # Automatic run: republish when anything that defines the image changes.
  push:
    branches: [main]
    paths:
      - "infra/inference-dev/entrypoint.sh"
      - "infra/inference-dev/Dockerfile"
      - ".github/workflows/publish-inference-dev.yml"

permissions:
  contents: read
  # Required to push the image to ghcr.io with GITHUB_TOKEN.
  packages: write

jobs:
  publish:
    runs-on: ubuntu-latest
    env:
      # `inputs.*` is empty on push events, so fall back to the same defaults
      # the workflow_dispatch form advertises.
      SGLANG_TAG: ${{ inputs.sglang_tag || 'dev' }}
      IMAGE_TAG: ${{ inputs.image_tag || 'latest' }}
      SRC: docker.io/lmsysorg/sglang
      DST: ghcr.io/${{ github.repository_owner }}/flatclaw-inference-dev
    steps:
      - uses: actions/checkout@v4

      - name: Install crane
        env:
          # Pinned to avoid rate-limiting on GH's unauthenticated releases API.
          # Bump manually if a newer crane is needed.
          CRANE_VER: v0.21.5
        run: |
          set -euo pipefail
          curl -fsSL "https://github.com/google/go-containerregistry/releases/download/${CRANE_VER}/go-containerregistry_Linux_x86_64.tar.gz" \
            | tar -xzC /usr/local/bin crane
          crane version

      - name: Login to GHCR
        env:
          # Pass the token and actor through the environment instead of
          # interpolating `${{ }}` into the script text, so their values are
          # never parsed as shell source.
          GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GHCR_USER: ${{ github.actor }}
        run: |
          set -euo pipefail
          printf '%s\n' "${GHCR_TOKEN}" \
            | crane auth login ghcr.io -u "${GHCR_USER}" --password-stdin

      - name: Stage entrypoint layer
        run: |
          set -euo pipefail
          mkdir -p layer/usr/local/bin
          install -m 0755 infra/inference-dev/entrypoint.sh layer/usr/local/bin/entrypoint.sh
          # The tarball is appended to the image verbatim, including the
          # usr/, usr/local/ and usr/local/bin/ directory entries. Force
          # root ownership so the layer does not record the CI runner's
          # uid/gid on those paths inside the published image.
          tar -C layer --owner=0 --group=0 --numeric-owner \
            -czf /tmp/entrypoint-layer.tgz usr
          ls -la /tmp/entrypoint-layer.tgz

      - name: Copy base + mutate + push
        run: |
          set -euo pipefail
          echo "Source: ${SRC}:${SGLANG_TAG}"
          echo "Dest: ${DST}:${IMAGE_TAG}"
          # Env defaults mirror the ENV block in infra/inference-dev/Dockerfile.
          crane mutate \
            "${SRC}:${SGLANG_TAG}" \
            --append /tmp/entrypoint-layer.tgz \
            --entrypoint /usr/local/bin/entrypoint.sh \
            --env MODEL_DIR=/workspace/models \
            --env GEMMA_DIR_NAME=gemma-4-e4b-it \
            --env PYTHONUNBUFFERED=1 \
            --tag "${DST}:${IMAGE_TAG}"

      - name: Summary
        run: |
          echo "### Published" >> "$GITHUB_STEP_SUMMARY"
          echo "- \`${DST}:${IMAGE_TAG}\`" >> "$GITHUB_STEP_SUMMARY"
          echo "- Base: \`${SRC}:${SGLANG_TAG}\`" >> "$GITHUB_STEP_SUMMARY"
          echo "- Entrypoint: \`/usr/local/bin/entrypoint.sh\`" >> "$GITHUB_STEP_SUMMARY"
30 changes: 30 additions & 0 deletions infra/inference-dev/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Local build definition for the FlatClaw dev-inference image: the SGLang base
# image plus the dev entrypoint script. No model weights are included.
#
# The layout mirrors `infra/inference/Dockerfile` (the prod 31B image). What
# differs between the two lives in entrypoint.sh: dev runs BF16 rather than FP8
# quant, and GEMMA_DIR_NAME defaults to the dev model. Prod and dev therefore
# share the same runtime and the same pipeline, diverging only on the model
# binary and the GPU plan they run on.
#
# Weights (Gemma 4 E4B-it from Kaggle) sit on the per-tenant Northflank weights
# volume, served by the in-project `weights-server` pod. Because they are NOT
# baked in, a pod cold-boots in 60–90 s: it pulls the SGLang base and the small
# entrypoint layer, then fetches weights over the project's internal HTTP.
#
# The published image is not built from this file. In production,
# .github/workflows/publish-inference-dev.yml assembles it registry-to-registry
# with `crane mutate`, without a local Docker daemon, which avoids pushing a
# 16+ GB base from a developer laptop. Use this Dockerfile for ad-hoc local
# rebuilds of the equivalent image.

# Base image tag; override with --build-arg SGLANG_TAG=<tag>.
ARG SGLANG_TAG=dev
FROM lmsysorg/sglang:${SGLANG_TAG}

# Same defaults the CI workflow passes to `crane mutate` via --env.
ENV MODEL_DIR=/workspace/models \
    GEMMA_DIR_NAME=gemma-4-e4b-it \
    PYTHONUNBUFFERED=1

COPY --chmod=755 entrypoint.sh /usr/local/bin/entrypoint.sh

# Matches the entrypoint's default PORT.
EXPOSE 8000

ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
60 changes: 60 additions & 0 deletions infra/inference-dev/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/usr/bin/env bash
# FlatClaw dev-inference entrypoint.
#
# Same pattern as the prod 31B entrypoint (`infra/inference/entrypoint.sh`):
# weights are NOT baked into the image, they live on a per-tenant Northflank
# volume served by the in-project `weights-server` pod, populated once by a
# Northflank stager job from
# `google/gemma-4/transformers/gemma-4-e4b-it/1` on Kaggle.
#
# At pod boot the dev Northflank manifest's customEntrypoint fetches weights
# into $MODEL_DIR before invoking this script. When weights are present, this
# entrypoint launches SGLang against $MODEL_DIR/$GEMMA_DIR_NAME on the L4 GPU.
# /v1/chat/completions and /v1/embeddings respond once warm-up completes
# (~60s typical on L4 once weights are local).
#
# Environment (all optional; defaults below):
#   MODEL_DIR          directory that contains the model directory
#   GEMMA_DIR_NAME     name of the model directory inside MODEL_DIR
#   PORT               port SGLang listens on
#   MAX_CONTEXT        value passed to --context-length
#   TP                 value passed to --tp
#   SGLANG_EXTRA_ARGS  extra launch_server flags, split on whitespace
#
# Exits 1 without starting SGLang if the model directory is missing.

set -euo pipefail

: "${MODEL_DIR:=/workspace/models}"
: "${PORT:=8000}"
: "${MAX_CONTEXT:=32768}"
: "${TP:=1}"
: "${GEMMA_DIR_NAME:=gemma-4-e4b-it}"
: "${SGLANG_EXTRA_ARGS:=}"

# Print a cyan, tagged status line to stdout.
say() { printf '\n\033[36m[entrypoint-dev] %s\033[0m\n' "$*"; }

model_path="$MODEL_DIR/$GEMMA_DIR_NAME"
if [ ! -d "$model_path" ]; then
  echo "FATAL: $model_path does not exist." >&2
  echo "The dev manifest's customEntrypoint should fetch weights from the" >&2
  echo "in-project weights-server before invoking this script. If running" >&2
  echo "locally, mount a directory holding the model files at" >&2
  echo "$MODEL_DIR/$GEMMA_DIR_NAME." >&2
  # Best-effort listing to show what IS mounted. Redirections apply left to
  # right, so the previous `2>&1 >&2` sent both streams to stdout; `>&2`
  # keeps the listing (and any ls error) on stderr with the rest of the
  # diagnostic. `|| true` keeps a missing $MODEL_DIR from masking exit 1.
  ls -la "$MODEL_DIR" >&2 || true
  exit 1
fi

say "launching SGLang on :$PORT (context=$MAX_CONTEXT, tp=$TP)"
say "model_path=$model_path"

# Gemma 4 E4B ships in BF16; on a 24 GB L4 we don't need FP8 quant — the model
# is ~8 GB BF16, leaves ample headroom for KV cache at 32k context.
#
# Same SGLang parsers as prod (--tool-call-parser gemma4, --reasoning-parser
# gemma4) so the response shape and parsing path are byte-identical between
# dev and prod. Per Google's chat template, thinking is OFF by default —
# callers must pass `extra_body.chat_template_kwargs.enable_thinking=true`
# to activate it on a per-request basis.
#
# $SGLANG_EXTRA_ARGS is left unquoted on purpose so that it word-splits into
# separate flags; an empty value expands to nothing.
# `exec` replaces this shell so SGLang receives signals directly.
# shellcheck disable=SC2086
exec python3 -m sglang.launch_server \
  --model-path "$model_path" \
  --host 0.0.0.0 \
  --port "$PORT" \
  --context-length "$MAX_CONTEXT" \
  --tp "$TP" \
  --dtype bfloat16 \
  --served-model-name "gemma-4-e4b-it" \
  --tool-call-parser gemma4 \
  --reasoning-parser gemma4 \
  --enable-metrics \
  $SGLANG_EXTRA_ARGS
47 changes: 29 additions & 18 deletions infra/inference/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -1,45 +1,55 @@
#!/usr/bin/env bash
# FlatClaw inference entrypoint.
#
# Weights are NOT baked into the image. They live on a per-tenant persistent
# disk that the inference pod mounts at $MODEL_DIR (default /workspace/models).
# The disk is populated once by stage-disk.sh, which spins up a tiny CPU
# instance, attaches the disk, and runs the Kaggle CLI to download Gemma 4 31B
# from `google/gemma-4/transformers/gemma-4-31b-it/1`.
# Weights are NOT baked into the image. They live on a per-tenant Northflank
# volume served by an in-project `weights-server` pod. The volume is populated
# once by a Northflank stager job that runs the Kaggle CLI to download Gemma 4
# 31B from `google/gemma-4/transformers/gemma-4-31b-it/1`.
#
# At pod boot:
# 1. The volume mount makes /workspace/models/<modelname>/ available.
# 2. SGLang launches against $MODEL_DIR/$GEMMA_DIR_NAME.
# 3. /v1/chat/completions and /v1/embeddings respond once warm-up completes
# (60-90s typical on RTX PRO 6000 Blackwell).
# At pod boot the production Northflank manifest's customEntrypoint fetches
# weights into $MODEL_DIR before invoking this script. When weights are
# present, this entrypoint just launches SGLang against
# $MODEL_DIR/$GEMMA_DIR_NAME. /v1/chat/completions and /v1/embeddings respond
# once warm-up completes (60-90s typical on H100).

set -euo pipefail

: "${MODEL_DIR:=/workspace/models}"
: "${PORT:=8000}"
: "${MAX_CONTEXT:=32768}"
: "${TP:=1}"
: "${GEMMA_DIR_NAME:=gemma-4-31B-it}"
: "${GEMMA_DIR_NAME:=gemma-4-31b-it}"
: "${SGLANG_EXTRA_ARGS:=}"

say() { printf '\n\033[36m[entrypoint] %s\033[0m\n' "$*"; }

model_path="$MODEL_DIR/$GEMMA_DIR_NAME"
if [ ! -d "$model_path" ]; then
echo "FATAL: $model_path does not exist." >&2
echo "Mount the model-weights disk at $MODEL_DIR before starting." >&2
echo "Populate the disk once via stage-disk.sh (see infra/inference/README.md)." >&2
echo "The production manifest's customEntrypoint should fetch weights from" >&2
echo "the in-project weights-server before invoking this script." >&2
echo "If running locally, mount a directory holding the model files at" >&2
echo "$MODEL_DIR/$GEMMA_DIR_NAME." >&2
ls -la "$MODEL_DIR" 2>&1 >&2 || true
exit 1
fi

say "launching SGLang on :$PORT (context=$MAX_CONTEXT, tp=$TP)"
say "model_path=$model_path"

# FP8 quant on RTX PRO 6000 Blackwell (sm_100+) runs through native cutlass /
# deep_gemm — no Marlin fallback (Marlin's 8608-tile constraint kills Gemma 4
# 31B on Ampere sm_80). 96 GB VRAM holds Gemma weights + KV cache + co-resident
# bge-m3, with headroom for VoxCPM2 and SDXL when those land in Spike B2.
# FP8 quant on H100 (Hopper, sm_90) runs through native cutlass / deep_gemm —
# no Marlin fallback (Marlin's 8608-tile constraint kills Gemma 4 31B on
# Ampere sm_80). 80 GB VRAM holds Gemma weights (~33 GB FP8) + KV cache +
# co-resident bge-m3, with ~25 GB headroom for the v0.3 cascade (small Gemma
# + voice + image co-resident).
#
# `--tool-call-parser gemma4` and `--reasoning-parser gemma4` are the
# Gemma-4-specific parsers SGLang ships (added in PR #21952). They extract
# Gemma 4's `<|tool_call|>...<tool_call|>` envelope into structured
# `tool_calls`, and split the `<channel|>` thinking section out as
# `reasoning_content`. Per Google's chat template, thinking is OFF by
# default — callers pass `extra_body.chat_template_kwargs.enable_thinking=true`
# to activate it on a per-request basis.
exec python3 -m sglang.launch_server \
--model-path "$model_path" \
--host 0.0.0.0 \
Expand All @@ -48,6 +58,7 @@ exec python3 -m sglang.launch_server \
--tp "$TP" \
--quantization fp8 \
--served-model-name "gemma-4-31b-it" \
--tool-call-parser pythonic \
--tool-call-parser gemma4 \
--reasoning-parser gemma4 \
--enable-metrics \
$SGLANG_EXTRA_ARGS