From 121d1ce65872bf0cda03448ff43fb50bad69ec5e Mon Sep 17 00:00:00 2001
From: Skyler Truax <skyler@bidfind.ai>
Date: Mon, 4 May 2026 17:49:59 -0400
Subject: [PATCH] Inference: gemma4 tool/reasoning parsers; add inference-dev
 image
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Source-of-truth fix to match the parser flags that production has been
running. Past session JSONLs show prod's deployed pod was already using
--tool-call-parser gemma4 (125+ structured toolCall blocks captured, zero
raw <|tool_call|> envelope leaks). The repo's infra/inference/entrypoint.sh
had drifted to "pythonic" — wrong for Gemma 4, would have broken the next
image republish.

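- infra/inference/entrypoint.sh: --tool-call-parser pythonic →
  gemma4, plus --reasoning-parser gemma4. Per SGLang's published
  Gemma 4 cookbook (PR sgl-project/sglang#21952). Thinking still
  defaults OFF in the chat template — callers must pass
  extra_body.chat_template_kwargs.enable_thinking=true to activate.
  Also lowercases the GEMMA_DIR_NAME default (gemma-4-31B-it →
  gemma-4-31b-it; on a case-sensitive filesystem the staged directory
  must use the lowercase name) and updates the header comments and the
  missing-weights message for the Northflank weights-server / H100
  deployment.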

- infra/inference-dev/{Dockerfile,entrypoint.sh}: new dev-inference
  image, mirrors the prod Dockerfile shape exactly. Same SGLang base,
  same crane-mutate publish pattern, same parser flags. Diverges only
  on dtype (bfloat16 vs fp8) and the GEMMA_DIR_NAME default
  (gemma-4-e4b-it vs gemma-4-31b-it).

- .github/workflows/publish-inference-dev.yml: companion publish
  workflow for ghcr.io/<owner>/flatclaw-inference-dev:latest. Same
  shape as publish-inference.yml, runs on push-to-main when the
  inference-dev files change.
---
 .github/workflows/publish-inference-dev.yml | 92 +++++++++++++++++++++
 infra/inference-dev/Dockerfile              | 30 ++++++++
 infra/inference-dev/entrypoint.sh           | 60 +++++++++++++++
 infra/inference/entrypoint.sh               | 47 +++++++-----
 4 files changed, 211 insertions(+), 18 deletions(-)
 create mode 100644 .github/workflows/publish-inference-dev.yml
 create mode 100644 infra/inference-dev/Dockerfile
 create mode 100755 infra/inference-dev/entrypoint.sh
diff --git a/.github/workflows/publish-inference-dev.yml b/.github/workflows/publish-inference-dev.yml
new file mode 100644
index 0000000..d0d6462
--- /dev/null
+++ b/.github/workflows/publish-inference-dev.yml
@@ -0,0 +1,92 @@
+name: publish-inference-dev
+
+# Publishes the FlatClaw dev-inference image to ghcr.io.
+#
+# Same pattern as `publish-inference.yml` (the prod 31B image): pull SGLang base
+# from DockerHub, append the dev entrypoint as a single layer via crane mutate,
+# push to GHCR. Server-to-server, no local Docker daemon, ~5–10 min wall-clock.
+#
+# The two pipelines are intentionally identical except for entrypoint contents
+# and destination repo, so dev and prod runtimes stay byte-for-byte aligned on
+# everything except the GEMMA_DIR_NAME default and the bf16-vs-fp8 quant flag.
+
+on:
+  workflow_dispatch:
+    inputs:
+      sglang_tag:
+        description: "lmsysorg/sglang tag to base on"
+        default: "dev"
+      image_tag:
+        description: "Destination tag to publish as"
+        default: "latest"
+  push:
+    branches: [main]
+    paths:
+      - "infra/inference-dev/entrypoint.sh"
+      - "infra/inference-dev/Dockerfile"
+      - ".github/workflows/publish-inference-dev.yml"
+
+permissions:
+  contents: read
+  packages: write
+
+jobs:
+  publish:
+    runs-on: ubuntu-latest
+    env:
+      SGLANG_TAG: ${{ inputs.sglang_tag || 'dev' }}
+      IMAGE_TAG: ${{ inputs.image_tag || 'latest' }}
+      SRC: docker.io/lmsysorg/sglang
+      DST: ghcr.io/${{ github.repository_owner }}/flatclaw-inference-dev
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install crane
+        env:
+          # Pinned to avoid rate-limiting on GH's unauthenticated releases API.
+          # Bump manually if a newer crane is needed.
+          CRANE_VER: v0.21.5
+        run: |
+          set -euo pipefail
+          curl -fsSL "https://github.com/google/go-containerregistry/releases/download/${CRANE_VER}/go-containerregistry_Linux_x86_64.tar.gz" \
+            | tar -xzC /usr/local/bin crane
+          crane version
+
+      - name: Login to GHCR
+        env:
+          # Hand the token and actor to the script via env vars instead of
+          # expanding them into the script text, so their values are never
+          # parsed as shell source. NOTE(review): publish-inference.yml
+          # should get the same change to keep the two pipelines aligned.
+          GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GHCR_USER: ${{ github.actor }}
+        run: echo "$GHCR_TOKEN" | crane auth login ghcr.io -u "$GHCR_USER" --password-stdin
+
+      - name: Stage entrypoint layer
+        run: |
+          set -euo pipefail
+          mkdir -p layer/usr/local/bin
+          install -m 0755 infra/inference-dev/entrypoint.sh layer/usr/local/bin/entrypoint.sh
+          tar -C layer -czf /tmp/entrypoint-layer.tgz usr
+          ls -la /tmp/entrypoint-layer.tgz
+
+      - name: Copy base + mutate + push
+        run: |
+          set -euo pipefail
+          echo "Source: ${SRC}:${SGLANG_TAG}"
+          echo "Dest:   ${DST}:${IMAGE_TAG}"
+          crane mutate \
+            "${SRC}:${SGLANG_TAG}" \
+            --append /tmp/entrypoint-layer.tgz \
+            --entrypoint /usr/local/bin/entrypoint.sh \
+            --env MODEL_DIR=/workspace/models \
+            --env GEMMA_DIR_NAME=gemma-4-e4b-it \
+            --env PYTHONUNBUFFERED=1 \
+            --tag "${DST}:${IMAGE_TAG}"
+
+      - name: Summary
+        run: |
+          echo "### Published" >> "$GITHUB_STEP_SUMMARY"
+          echo "- \`${DST}:${IMAGE_TAG}\`" >> "$GITHUB_STEP_SUMMARY"
+          echo "- Base: \`${SRC}:${SGLANG_TAG}\`" >> "$GITHUB_STEP_SUMMARY"
+          echo "- Entrypoint: \`/usr/local/bin/entrypoint.sh\`" >> "$GITHUB_STEP_SUMMARY"
diff --git a/infra/inference-dev/Dockerfile b/infra/inference-dev/Dockerfile
new file mode 100644
index 0000000..28d24a6
--- /dev/null
+++ b/infra/inference-dev/Dockerfile
@@ -0,0 +1,30 @@
+# FlatClaw dev-inference image — SGLang base + entrypoint, no model weights.
+#
+# Same shape as `infra/inference/Dockerfile` (the prod 31B image). The only
+# differences are the entrypoint.sh contents (BF16 instead of FP8 quant) and
+# the GEMMA_DIR_NAME default, which is set both in entrypoint.sh and in ENV
+# below. This keeps prod and dev on identical runtime + identical pipeline;
+# they only diverge on the model binary and the GPU plan they run on.
+#
+# Model weights (Gemma 4 E4B-it from Kaggle) live on the per-tenant Northflank
+# weights volume served by the in-project `weights-server` pod. They are NOT
+# baked in — pods cold-boot in 60–90 s by pulling the SGLang base + the small
+# entrypoint layer, then fetching weights over the project's internal HTTP.
+#
+# In CI, .github/workflows/publish-inference-dev.yml uses `crane mutate` to
+# build this image registry-to-registry without a local Docker daemon, which
+# sidesteps the slow push of a 16+ GB base from a developer laptop. This
+# Dockerfile is the equivalent local definition for ad-hoc rebuilds.
+
+ARG SGLANG_TAG=dev
+FROM lmsysorg/sglang:${SGLANG_TAG}
+
+ENV MODEL_DIR=/workspace/models \
+    GEMMA_DIR_NAME=gemma-4-e4b-it \
+    PYTHONUNBUFFERED=1
+
+COPY --chmod=755 entrypoint.sh /usr/local/bin/entrypoint.sh
+# Matches the PORT default (8000) in entrypoint.sh.
+EXPOSE 8000
+
+ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
diff --git a/infra/inference-dev/entrypoint.sh b/infra/inference-dev/entrypoint.sh
new file mode 100755
index 0000000..3ee91f5
--- /dev/null
+++ b/infra/inference-dev/entrypoint.sh
@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+# FlatClaw dev-inference entrypoint.
+#
+# Same pattern as the prod 31B entrypoint (`infra/inference/entrypoint.sh`):
+# weights are NOT baked into the image, they live on a per-tenant Northflank
+# volume served by the in-project `weights-server` pod, populated once by a
+# Northflank stager job from
+# `google/gemma-4/transformers/gemma-4-e4b-it/1` on Kaggle.
+#
+# At pod boot the dev Northflank manifest's customEntrypoint fetches weights
+# into $MODEL_DIR before invoking this script. When weights are present, this
+# entrypoint launches SGLang against $MODEL_DIR/$GEMMA_DIR_NAME on the L4 GPU.
+# /v1/chat/completions and /v1/embeddings respond once warm-up completes
+# (~60s typical on L4 once weights are local).
+
+set -o errexit -o nounset -o pipefail
+
+MODEL_DIR="${MODEL_DIR:-/workspace/models}"          # parent dir of the model folder
+PORT="${PORT:-8000}"                                 # passed to --port
+MAX_CONTEXT="${MAX_CONTEXT:-32768}"                  # passed to --context-length
+TP="${TP:-1}"                                        # passed to --tp
+GEMMA_DIR_NAME="${GEMMA_DIR_NAME:-gemma-4-e4b-it}"   # model folder under $MODEL_DIR
+SGLANG_EXTRA_ARGS="${SGLANG_EXTRA_ARGS:-}"           # appended to the launch command
+
+say() { local msg="$*"; printf '\n\033[36m[entrypoint-dev] %s\033[0m\n' "$msg"; }
+
+model_path="$MODEL_DIR/$GEMMA_DIR_NAME"  # fail fast below if this dir is missing
+if [ ! -d "$model_path" ]; then
+  echo "FATAL: $model_path does not exist." >&2
+  echo "The dev manifest's customEntrypoint should fetch weights from the" >&2
+  echo "in-project weights-server before invoking this script. If running" >&2
+  echo "locally, mount a directory holding the model files at" >&2
+  echo "$MODEL_DIR/$GEMMA_DIR_NAME." >&2
+  ls -la "$MODEL_DIR" >&2 || true  # best-effort listing, on stderr with the rest
+  exit 1
+fi
+
+say "launching SGLang on :$PORT (context=$MAX_CONTEXT, tp=$TP)"
+say "model_path=$model_path"
+
+# `exec` replaces this shell with the SGLang server process. --dtype bfloat16
+# (no FP8 quant): Gemma 4 E4B ships in BF16 and is ~8 GB, which leaves ample
+# headroom for KV cache at 32k context on a 24 GB L4.
+#
+# Same SGLang parsers as prod (--tool-call-parser gemma4, --reasoning-parser
+# gemma4) so the response shape and parsing path match between dev and prod.
+# Per Google's chat template, thinking is OFF by default — callers must pass
+# `extra_body.chat_template_kwargs.enable_thinking=true` per request.
+exec python3 -m sglang.launch_server \
+  --model-path "$model_path" \
+  --host 0.0.0.0 \
+  --port "$PORT" \
+  --context-length "$MAX_CONTEXT" \
+  --tp "$TP" \
+  --dtype bfloat16 \
+  --served-model-name "gemma-4-e4b-it" \
+  --tool-call-parser gemma4 \
+  --reasoning-parser gemma4 \
+  --enable-metrics \
+  $SGLANG_EXTRA_ARGS  # unquoted: word-splits into separate flags (globs expand too)
diff --git a/infra/inference/entrypoint.sh b/infra/inference/entrypoint.sh
index 40f2d61..467d72a 100755
--- a/infra/inference/entrypoint.sh
+++ b/infra/inference/entrypoint.sh
@@ -1,17 +1,16 @@
 #!/usr/bin/env bash
 # FlatClaw inference entrypoint.
 #
-# Weights are NOT baked into the image. They live on a per-tenant persistent
-# disk that the inference pod mounts at $MODEL_DIR (default /workspace/models).
-# The disk is populated once by stage-disk.sh, which spins up a tiny CPU
-# instance, attaches the disk, and runs the Kaggle CLI to download Gemma 4 31B
-# from `google/gemma-4/transformers/gemma-4-31b-it/1`.
+# Weights are NOT baked into the image. They live on a per-tenant Northflank
+# volume served by an in-project `weights-server` pod. The volume is populated
+# once by a Northflank stager job that runs the Kaggle CLI to download Gemma 4
+# 31B from `google/gemma-4/transformers/gemma-4-31b-it/1`.
 #
-# At pod boot:
-#   1. The volume mount makes /workspace/models/<modelname>/ available.
-#   2. SGLang launches against $MODEL_DIR/$GEMMA_DIR_NAME.
-#   3. /v1/chat/completions and /v1/embeddings respond once warm-up completes
-#      (60-90s typical on RTX PRO 6000 Blackwell).
+# At pod boot the production Northflank manifest's customEntrypoint fetches
+# weights into $MODEL_DIR before invoking this script. When weights are
+# present, this entrypoint just launches SGLang against
+# $MODEL_DIR/$GEMMA_DIR_NAME. /v1/chat/completions and /v1/embeddings respond
+# once warm-up completes (60-90s typical on H100).
 
 set -euo pipefail
 
@@ -19,7 +18,7 @@ set -euo pipefail
 : "${PORT:=8000}"
 : "${MAX_CONTEXT:=32768}"
 : "${TP:=1}"
-: "${GEMMA_DIR_NAME:=gemma-4-31B-it}"
+: "${GEMMA_DIR_NAME:=gemma-4-31b-it}"
 : "${SGLANG_EXTRA_ARGS:=}"
 
 say() { printf '\n\033[36m[entrypoint] %s\033[0m\n' "$*"; }
@@ -27,8 +26,10 @@ say() { printf '\n\033[36m[entrypoint] %s\033[0m\n' "$*"; }
 model_path="$MODEL_DIR/$GEMMA_DIR_NAME"
 if [ ! -d "$model_path" ]; then
   echo "FATAL: $model_path does not exist." >&2
-  echo "Mount the model-weights disk at $MODEL_DIR before starting." >&2
-  echo "Populate the disk once via stage-disk.sh (see infra/inference/README.md)." >&2
-  ls -la "$MODEL_DIR" 2>&1 >&2 || true
+  echo "The production manifest's customEntrypoint should fetch weights from" >&2
+  echo "the in-project weights-server before invoking this script." >&2
+  echo "If running locally, mount a directory holding the model files at" >&2
+  echo "$MODEL_DIR/$GEMMA_DIR_NAME." >&2
+  ls -la "$MODEL_DIR" >&2 || true  # best-effort listing, on stderr with the rest
   exit 1
 fi
@@ -36,10 +37,19 @@ fi
 say "launching SGLang on :$PORT (context=$MAX_CONTEXT, tp=$TP)"
 say "model_path=$model_path"
 
-# FP8 quant on RTX PRO 6000 Blackwell (sm_100+) runs through native cutlass /
-# deep_gemm — no Marlin fallback (Marlin's 8608-tile constraint kills Gemma 4
-# 31B on Ampere sm_80). 96 GB VRAM holds Gemma weights + KV cache + co-resident
-# bge-m3, with headroom for VoxCPM2 and SDXL when those land in Spike B2.
+# FP8 quant on H100 (Hopper, sm_90) runs through native cutlass / deep_gemm —
+# no Marlin fallback (Marlin's 8608-tile constraint kills Gemma 4 31B on
+# Ampere sm_80). 80 GB VRAM holds Gemma weights (~33 GB FP8) + KV cache +
+# co-resident bge-m3, with ~25 GB headroom for the v0.3 cascade (small Gemma
+# + voice + image co-resident).
+#
+# `--tool-call-parser gemma4` and `--reasoning-parser gemma4` are the
+# Gemma-4-specific parsers SGLang ships (added in PR #21952). They extract
+# Gemma 4's `<|tool_call|>...<tool_call|>` envelope into structured
+# `tool_calls`, and split the `<channel|>` thinking section out as
+# `reasoning_content`. Per Google's chat template, thinking is OFF by
+# default — callers pass `extra_body.chat_template_kwargs.enable_thinking=true`
+# to activate it on a per-request basis.
 exec python3 -m sglang.launch_server \
   --model-path "$model_path" \
   --host 0.0.0.0 \
@@ -48,6 +58,7 @@ exec python3 -m sglang.launch_server \
   --tp "$TP" \
   --quantization fp8 \
   --served-model-name "gemma-4-31b-it" \
-  --tool-call-parser pythonic \
+  --tool-call-parser gemma4 \
+  --reasoning-parser gemma4 \
   --enable-metrics \
   $SGLANG_EXTRA_ARGS