From 121d1ce65872bf0cda03448ff43fb50bad69ec5e Mon Sep 17 00:00:00 2001 From: Skyler Truax Date: Mon, 4 May 2026 17:49:59 -0400 Subject: [PATCH] Inference: gemma4 tool/reasoning parsers; add inference-dev image MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Source-of-truth fix to match the parser flags that production has been running. Past session JSONLs show prod's deployed pod was already using --tool-call-parser gemma4 (125+ structured toolCall blocks captured, zero raw <|tool_call|> envelope leaks). The repo's infra/inference/entrypoint.sh had drifted to "pythonic" — wrong for Gemma 4, would have broken the next image republish. - infra/inference/entrypoint.sh: --tool-call-parser pythonic → gemma4, plus --reasoning-parser gemma4. Per SGLang's published Gemma 4 cookbook (PR sgl-project/sglang#21952). Thinking still defaults OFF in the chat template — callers must pass extra_body.chat_template_kwargs.enable_thinking=true to activate. - infra/inference-dev/{Dockerfile,entrypoint.sh}: new dev-inference image, mirrors the prod Dockerfile shape exactly. Same SGLang base, same crane-mutate publish pattern, same parser flags. Diverges only on dtype (bfloat16 vs fp8) and the GEMMA_DIR_NAME default (gemma-4-e4b-it vs gemma-4-31b-it). - .github/workflows/publish-inference-dev.yml: companion publish workflow for ghcr.io/{owner}/flatclaw-inference-dev:latest. Same shape as publish-inference.yml, runs on push-to-main when the inference-dev files change. 
--- .github/workflows/publish-inference-dev.yml | 85 +++++++++++++++++++++ infra/inference-dev/Dockerfile | 30 ++++++++ infra/inference-dev/entrypoint.sh | 60 +++++++++++++++ infra/inference/entrypoint.sh | 47 +++++++----- 4 files changed, 204 insertions(+), 18 deletions(-) create mode 100644 .github/workflows/publish-inference-dev.yml create mode 100644 infra/inference-dev/Dockerfile create mode 100755 infra/inference-dev/entrypoint.sh diff --git a/.github/workflows/publish-inference-dev.yml b/.github/workflows/publish-inference-dev.yml new file mode 100644 index 0000000..d0d6462 --- /dev/null +++ b/.github/workflows/publish-inference-dev.yml @@ -0,0 +1,85 @@ +name: publish-inference-dev + +# Publishes the FlatClaw dev-inference image to ghcr.io. +# +# Same pattern as `publish-inference.yml` (the prod 31B image): pull SGLang base +# from DockerHub, append the dev entrypoint as a single layer via crane mutate, +# push to GHCR. Server-to-server, no local Docker daemon, ~5–10 min wall-clock. +# +# The two pipelines are intentionally identical except for entrypoint contents +# and destination repo, so dev and prod runtimes stay byte-for-byte aligned on +# everything except the GEMMA_DIR_NAME default and the bf16-vs-fp8 quant flag. 
+ +on: + workflow_dispatch: + inputs: + sglang_tag: + description: "lmsysorg/sglang tag to base on" + default: "dev" + image_tag: + description: "Destination tag to publish as" + default: "latest" + push: + branches: [main] + paths: + - "infra/inference-dev/entrypoint.sh" + - "infra/inference-dev/Dockerfile" + - ".github/workflows/publish-inference-dev.yml" + +permissions: + contents: read + packages: write + +jobs: + publish: + runs-on: ubuntu-latest + env: + SGLANG_TAG: ${{ inputs.sglang_tag || 'dev' }} + IMAGE_TAG: ${{ inputs.image_tag || 'latest' }} + SRC: docker.io/lmsysorg/sglang + DST: ghcr.io/${{ github.repository_owner }}/flatclaw-inference-dev + steps: + - uses: actions/checkout@v4 + + - name: Install crane + env: + # Pinned to avoid rate-limiting on GH's unauthenticated releases API. + # Bump manually if a newer crane is needed. + CRANE_VER: v0.21.5 + run: | + set -euo pipefail + curl -fsSL "https://github.com/google/go-containerregistry/releases/download/${CRANE_VER}/go-containerregistry_Linux_x86_64.tar.gz" \ + | tar -xzC /usr/local/bin crane + crane version + + - name: Login to GHCR + run: echo "${{ secrets.GITHUB_TOKEN }}" | crane auth login ghcr.io -u "${{ github.actor }}" --password-stdin + + - name: Stage entrypoint layer + run: | + set -euo pipefail + mkdir -p layer/usr/local/bin + install -m 0755 infra/inference-dev/entrypoint.sh layer/usr/local/bin/entrypoint.sh + tar -C layer -czf /tmp/entrypoint-layer.tgz usr + ls -la /tmp/entrypoint-layer.tgz + + - name: Copy base + mutate + push + run: | + set -euo pipefail + echo "Source: ${SRC}:${SGLANG_TAG}" + echo "Dest: ${DST}:${IMAGE_TAG}" + crane mutate \ + "${SRC}:${SGLANG_TAG}" \ + --append /tmp/entrypoint-layer.tgz \ + --entrypoint /usr/local/bin/entrypoint.sh \ + --env MODEL_DIR=/workspace/models \ + --env GEMMA_DIR_NAME=gemma-4-e4b-it \ + --env PYTHONUNBUFFERED=1 \ + --tag "${DST}:${IMAGE_TAG}" + + - name: Summary + run: | + echo "### Published" >> "$GITHUB_STEP_SUMMARY" + echo "- 
\`${DST}:${IMAGE_TAG}\`" >> "$GITHUB_STEP_SUMMARY" + echo "- Base: \`${SRC}:${SGLANG_TAG}\`" >> "$GITHUB_STEP_SUMMARY" + echo "- Entrypoint: \`/usr/local/bin/entrypoint.sh\`" >> "$GITHUB_STEP_SUMMARY" diff --git a/infra/inference-dev/Dockerfile b/infra/inference-dev/Dockerfile new file mode 100644 index 0000000..28d24a6 --- /dev/null +++ b/infra/inference-dev/Dockerfile @@ -0,0 +1,30 @@ +# FlatClaw dev-inference image — SGLang base + entrypoint, no model weights. +# +# Same shape as `infra/inference/Dockerfile` (the prod 31B image). The only +# differences are baked into entrypoint.sh: BF16 instead of FP8 quant, and the +# default GEMMA_DIR_NAME points at the dev model. This keeps prod and dev on +# identical runtime + identical pipeline; they only diverge on the model +# binary and the GPU plan they run on. +# +# Model weights (Gemma 4 E4B-it from Kaggle) live on the per-tenant Northflank +# weights volume served by the in-project `weights-server` pod. They are NOT +# baked in — pods cold-boot in 60–90 s by pulling the SGLang base + the small +# entrypoint layer, then fetching weights over the project's internal HTTP. +# +# In CI, .github/workflows/publish-inference-dev.yml uses `crane mutate` +# to build this image registry-to-registry without a local Docker daemon, which +# sidesteps the slow push of a 16+ GB base from a developer laptop. This +# Dockerfile is the equivalent local definition for ad-hoc rebuilds. + +ARG SGLANG_TAG=dev +FROM lmsysorg/sglang:${SGLANG_TAG} + +ENV MODEL_DIR=/workspace/models \ + GEMMA_DIR_NAME=gemma-4-e4b-it \ + PYTHONUNBUFFERED=1 + +COPY --chmod=755 entrypoint.sh /usr/local/bin/entrypoint.sh + +EXPOSE 8000 + +ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] diff --git a/infra/inference-dev/entrypoint.sh b/infra/inference-dev/entrypoint.sh new file mode 100755 index 0000000..3ee91f5 --- /dev/null +++ b/infra/inference-dev/entrypoint.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash +# FlatClaw dev-inference entrypoint. 
+# +# Same pattern as the prod 31B entrypoint (`infra/inference/entrypoint.sh`): +# weights are NOT baked into the image, they live on a per-tenant Northflank +# volume served by the in-project `weights-server` pod, populated once by a +# Northflank stager job from +# `google/gemma-4/transformers/gemma-4-e4b-it/1` on Kaggle. +# +# At pod boot the dev Northflank manifest's customEntrypoint fetches weights +# into $MODEL_DIR before invoking this script. When weights are present, this +# entrypoint launches SGLang against $MODEL_DIR/$GEMMA_DIR_NAME on the L4 GPU. +# /v1/chat/completions and /v1/embeddings respond once warm-up completes +# (~60s typical on L4 once weights are local). + +set -euo pipefail + +: "${MODEL_DIR:=/workspace/models}" +: "${PORT:=8000}" +: "${MAX_CONTEXT:=32768}" +: "${TP:=1}" +: "${GEMMA_DIR_NAME:=gemma-4-e4b-it}" +: "${SGLANG_EXTRA_ARGS:=}" + +say() { printf '\n\033[36m[entrypoint-dev] %s\033[0m\n' "$*"; } + +model_path="$MODEL_DIR/$GEMMA_DIR_NAME" +if [ ! -d "$model_path" ]; then + echo "FATAL: $model_path does not exist." >&2 + echo "The dev manifest's customEntrypoint should fetch weights from the" >&2 + echo "in-project weights-server before invoking this script. If running" >&2 + echo "locally, mount a directory holding the model files at" >&2 + echo "$MODEL_DIR/$GEMMA_DIR_NAME." >&2 + ls -la "$MODEL_DIR" >&2 || true + exit 1 +fi + +say "launching SGLang on :$PORT (context=$MAX_CONTEXT, tp=$TP)" +say "model_path=$model_path" + +# Gemma 4 E4B ships in BF16; on a 24 GB L4 we don't need FP8 quant — the model +# is ~8 GB BF16, leaves ample headroom for KV cache at 32k context. +# +# Same SGLang parsers as prod (--tool-call-parser gemma4, --reasoning-parser +# gemma4) so the response shape and parsing path are byte-identical between +# dev and prod. Per Google's chat template, thinking is OFF by default — +# callers must pass `extra_body.chat_template_kwargs.enable_thinking=true` +# to activate it on a per-request basis. 
+exec python3 -m sglang.launch_server \ + --model-path "$model_path" \ + --host 0.0.0.0 \ + --port "$PORT" \ + --context-length "$MAX_CONTEXT" \ + --tp "$TP" \ + --dtype bfloat16 \ + --served-model-name "gemma-4-e4b-it" \ + --tool-call-parser gemma4 \ + --reasoning-parser gemma4 \ + --enable-metrics \ + $SGLANG_EXTRA_ARGS diff --git a/infra/inference/entrypoint.sh b/infra/inference/entrypoint.sh index 40f2d61..467d72a 100755 --- a/infra/inference/entrypoint.sh +++ b/infra/inference/entrypoint.sh @@ -1,17 +1,16 @@ #!/usr/bin/env bash # FlatClaw inference entrypoint. # -# Weights are NOT baked into the image. They live on a per-tenant persistent -# disk that the inference pod mounts at $MODEL_DIR (default /workspace/models). -# The disk is populated once by stage-disk.sh, which spins up a tiny CPU -# instance, attaches the disk, and runs the Kaggle CLI to download Gemma 4 31B -# from `google/gemma-4/transformers/gemma-4-31b-it/1`. +# Weights are NOT baked into the image. They live on a per-tenant Northflank +# volume served by an in-project `weights-server` pod. The volume is populated +# once by a Northflank stager job that runs the Kaggle CLI to download Gemma 4 +# 31B from `google/gemma-4/transformers/gemma-4-31b-it/1`. # -# At pod boot: -# 1. The volume mount makes /workspace/models// available. -# 2. SGLang launches against $MODEL_DIR/$GEMMA_DIR_NAME. -# 3. /v1/chat/completions and /v1/embeddings respond once warm-up completes -# (60-90s typical on RTX PRO 6000 Blackwell). +# At pod boot the production Northflank manifest's customEntrypoint fetches +# weights into $MODEL_DIR before invoking this script. When weights are +# present, this entrypoint just launches SGLang against +# $MODEL_DIR/$GEMMA_DIR_NAME. /v1/chat/completions and /v1/embeddings respond +# once warm-up completes (60-90s typical on H100). 
set -euo pipefail @@ -19,7 +18,7 @@ set -euo pipefail : "${PORT:=8000}" : "${MAX_CONTEXT:=32768}" : "${TP:=1}" -: "${GEMMA_DIR_NAME:=gemma-4-31B-it}" +: "${GEMMA_DIR_NAME:=gemma-4-31b-it}" : "${SGLANG_EXTRA_ARGS:=}" say() { printf '\n\033[36m[entrypoint] %s\033[0m\n' "$*"; } @@ -27,8 +26,10 @@ say() { printf '\n\033[36m[entrypoint] %s\033[0m\n' "$*"; } model_path="$MODEL_DIR/$GEMMA_DIR_NAME" if [ ! -d "$model_path" ]; then echo "FATAL: $model_path does not exist." >&2 - echo "Mount the model-weights disk at $MODEL_DIR before starting." >&2 - echo "Populate the disk once via stage-disk.sh (see infra/inference/README.md)." >&2 + echo "The production manifest's customEntrypoint should fetch weights from" >&2 + echo "the in-project weights-server before invoking this script." >&2 + echo "If running locally, mount a directory holding the model files at" >&2 + echo "$MODEL_DIR/$GEMMA_DIR_NAME." >&2 ls -la "$MODEL_DIR" 2>&1 >&2 || true exit 1 fi @@ -36,10 +37,19 @@ fi say "launching SGLang on :$PORT (context=$MAX_CONTEXT, tp=$TP)" say "model_path=$model_path" -# FP8 quant on RTX PRO 6000 Blackwell (sm_100+) runs through native cutlass / -# deep_gemm — no Marlin fallback (Marlin's 8608-tile constraint kills Gemma 4 -# 31B on Ampere sm_80). 96 GB VRAM holds Gemma weights + KV cache + co-resident -# bge-m3, with headroom for VoxCPM2 and SDXL when those land in Spike B2. +# FP8 quant on H100 (Hopper, sm_90) runs through native cutlass / deep_gemm — +# no Marlin fallback (Marlin's 8608-tile constraint kills Gemma 4 31B on +# Ampere sm_80). 80 GB VRAM holds Gemma weights (~33 GB FP8) + KV cache + +# co-resident bge-m3, with ~25 GB headroom for the v0.3 cascade (small Gemma +# + voice + image co-resident). +# +# `--tool-call-parser gemma4` and `--reasoning-parser gemma4` are the +# Gemma-4-specific parsers SGLang ships (added in PR #21952). 
They extract +# Gemma 4's `<|tool_call|>...` envelope into structured +# `tool_calls`, and split the model's thinking section out as +# `reasoning_content`. Per Google's chat template, thinking is OFF by +# default — callers pass `extra_body.chat_template_kwargs.enable_thinking=true` +# to activate it on a per-request basis. exec python3 -m sglang.launch_server \ --model-path "$model_path" \ --host 0.0.0.0 \ @@ -48,6 +58,7 @@ exec python3 -m sglang.launch_server \ --tp "$TP" \ --quantization fp8 \ --served-model-name "gemma-4-31b-it" \ - --tool-call-parser pythonic \ + --tool-call-parser gemma4 \ + --reasoning-parser gemma4 \ --enable-metrics \ $SGLANG_EXTRA_ARGS