Navi-AI-Lab · Natfii · May 7, 2026 · May 3, 2026 · May 7, 2026 · May 7, 2026
diff --git a/benchmarks/nvllm/traces/upstream_stabilization_tier1/2026-05-07-tier1cp-smoke/metadata.json b/benchmarks/nvllm/traces/upstream_stabilization_tier1/2026-05-07-tier1cp-smoke/metadata.json
@@ -0,0 +1,10 @@
+{
+  "commit": "a131443ff5d5d4ccf7928b4a930a191c718fe59f",
+  "branch": "main",
+  "image_id": "sha256:c051b0a8a84fd70e73807d50c4ab9342521dcd4b1202af37883b0bba198bbd83",
+  "model": "ig1/Qwen3.5-27B-NVFP4",
+  "wo_splits": "1",
+  "replays": 1,
+  "sharegpt_max_tokens": 128,
+  "longdecode_max_tokens": 2048
+}
diff --git a/benchmarks/nvllm/traces/upstream_stabilization_tier1/2026-05-07-tier1cp-smoke/runner.log b/benchmarks/nvllm/traces/upstream_stabilization_tier1/2026-05-07-tier1cp-smoke/runner.log
@@ -0,0 +1,150 @@
+[2026-05-07 11:11:29] output: /tmp/tier1cp-soak
+[2026-05-07 11:11:29] starting server wo_split=1 region_timing=0 profiler=0
+[2026-05-07 11:15:08] server ready after ~210s
+[2026-05-07 11:15:08] GSM8K wo1
+[1/50] WRONG (gold=2280 pred=2180) 108.5s
+[2/50] OK (gold=1 pred=1) 50.4s
+[3/50] OK (gold=5 pred=5) 51.5s
+[4/50] OK (gold=12 pred=12) 84.6s
+[5/50] OK (gold=273 pred=273) 76.2s
+[6/50] OK (gold=45 pred=45) 53.5s
+[7/50] OK (gold=21 pred=21) 46.2s
+[8/50] OK (gold=145 pred=145) 76.7s
+[9/50] OK (gold=60 pred=60) 40.3s
+[10/50] OK (gold=122 pred=122) 60.0s
+[11/50] OK (gold=29 pred=29) 186.5s
+[12/50] OK (gold=80 pred=80) 91.8s
+[13/50] OK (gold=36 pred=36) 52.3s
+[14/50] OK (gold=1430 pred=1430) 74.0s
+[15/50] OK (gold=5 pred=5) 63.0s
+[16/50] OK (gold=5 pred=5) 43.8s
+[17/50] OK (gold=5 pred=5) 45.5s
+[18/50] OK (gold=66 pred=66) 41.5s
+[19/50] OK (gold=15 pred=15) 42.4s
+[20/50] OK (gold=40 pred=40) 38.9s
+[21/50] OK (gold=93 pred=93) 51.3s
+[22/50] OK (gold=2000 pred=2000) 100.9s
+[23/50] OK (gold=1520 pred=1520) 38.7s
+[24/50] OK (gold=11050 pred=11050) 61.9s
+[25/50] OK (gold=90 pred=90) 73.8s
+[26/50] OK (gold=40000 pred=40000) 67.6s
+[27/50] OK (gold=21 pred=21) 49.5s
+[28/50] OK (gold=18 pred=18) 133.0s
+[29/50] OK (gold=14 pred=14) 68.9s
+[30/50] OK (gold=23 pred=23) 41.3s
+[31/50] OK (gold=145 pred=145) 78.1s
+[32/50] OK (gold=123 pred=123) 95.1s
+[33/50] OK (gold=98 pred=98) 78.1s
+[34/50] OK (gold=7 pred=7) 96.8s
+[35/50] OK (gold=34 pred=34) 125.7s
+[36/50] OK (gold=38 pred=38) 49.2s
+[37/50] OK (gold=320 pred=320) 74.6s
+[38/50] OK (gold=50 pred=50) 89.6s
+[39/50] OK (gold=50 pred=50) 72.9s
+[40/50] OK (gold=84 pred=84) 49.5s
+[41/50] OK (gold=50 pred=50) 114.3s
+[42/50] OK (gold=8000 pred=8000) 67.3s
+[43/50] OK (gold=280 pred=280) 50.2s
+[44/50] OK (gold=30 pred=30) 36.0s
+[45/50] WRONG (gold=192 pred=1) 223.6s
+[46/50] OK (gold=276 pred=276) 93.1s
+[47/50] OK (gold=32 pred=32) 57.7s
+[48/50] OK (gold=25 pred=25) 117.2s
+[49/50] OK (gold=10 pred=10) 54.2s
+[50/50] OK (gold=84 pred=84) 83.0s
+{
+  "label": "wo1_primary_gsm8k",
+  "model": "default",
+  "api": "http://localhost:8000/v1",
+  "n": 50,
+  "seed": 42,
+  "correct": 48,
+  "errors": 0,
+  "accuracy": "48/50 (96.0%)",
+  "total_seconds": 3720.6
+}
+[2026-05-07 12:17:09] ShareGPT wo1 run01
+[sharegpt] conv=0 turn=0 chunks=128 wall=55.44s
+[sharegpt] conv=0 turn=2 chunks=128 wall=59.58s
+[sharegpt] conv=0 turn=4 chunks=128 wall=72.45s
+[sharegpt] conv=0 turn=6 chunks=128 wall=90.96s
+[sharegpt] conv=0 turn=8 chunks=128 wall=114.45s
+[sharegpt] conv=0 turn=10 chunks=128 wall=146.12s
+[sharegpt] conv=1 turn=0 chunks=128 wall=54.80s
+[sharegpt] conv=2 turn=0 chunks=128 wall=55.17s
+[sharegpt] conv=3 turn=0 chunks=128 wall=54.81s
+[sharegpt] conv=3 turn=2 chunks=128 wall=57.28s
+[sharegpt] conv=3 turn=4 chunks=128 wall=73.12s
+[sharegpt] conv=3 turn=6 chunks=128 wall=91.96s
+[sharegpt] conv=4 turn=0 chunks=128 wall=54.82s
+[sharegpt] conv=4 turn=2 chunks=128 wall=61.19s
+[sharegpt] conv=4 turn=4 chunks=128 wall=75.59s
+[sharegpt] conv=5 turn=0 chunks=128 wall=55.00s
+[sharegpt] conv=5 turn=2 chunks=128 wall=73.92s
+[sharegpt] conv=6 turn=0 chunks=128 wall=54.69s
+[sharegpt] conv=6 turn=2 chunks=128 wall=57.46s
+[sharegpt] conv=6 turn=4 chunks=128 wall=64.79s
+[sharegpt] conv=6 turn=6 chunks=128 wall=77.99s
+[sharegpt] conv=6 turn=8 chunks=128 wall=90.43s
+[sharegpt] conv=6 turn=10 chunks=128 wall=106.24s
+[sharegpt] conv=6 turn=12 chunks=128 wall=129.20s
+[sharegpt] conv=6 turn=14 chunks=128 wall=167.69s
+[sharegpt] conv=7 turn=1 chunks=128 wall=62.27s
+[sharegpt] conv=7 turn=3 chunks=128 wall=110.60s
+[sharegpt] conv=7 turn=5 chunks=128 wall=164.55s
+[sharegpt] conv=7 turn=7 chunks=128 wall=173.32s
+[sharegpt] conv=8 turn=1 chunks=128 wall=64.38s
+[sharegpt] conv=8 turn=3 chunks=128 wall=94.53s
+[sharegpt] conv=8 turn=5 chunks=128 wall=119.68s
+[sharegpt] conv=8 turn=7 chunks=128 wall=147.03s
+[sharegpt] conv=8 turn=9 chunks=128 wall=174.39s
+[sharegpt] conv=9 turn=0 chunks=128 wall=54.98s
+[sharegpt] conv=10 turn=0 chunks=128 wall=62.05s
+[sharegpt] conv=10 turn=2 chunks=128 wall=77.23s
+[sharegpt] conv=10 turn=4 chunks=128 wall=92.29s
+[sharegpt] conv=10 turn=6 chunks=128 wall=123.66s
+[sharegpt] conv=10 turn=8 chunks=128 wall=154.44s
+[sharegpt] conv=11 turn=0 chunks=128 wall=59.49s
+[sharegpt] conv=11 turn=2 chunks=128 wall=67.43s
+[sharegpt] conv=12 turn=0 chunks=128 wall=62.49s
+[sharegpt] conv=12 turn=2 chunks=128 wall=81.57s
+[sharegpt] conv=13 turn=0 chunks=128 wall=60.08s
+[sharegpt] conv=13 turn=2 chunks=128 wall=71.94s
+[sharegpt] conv=13 turn=4 chunks=128 wall=103.65s
+[sharegpt] conv=13 turn=6 chunks=128 wall=144.94s
+[sharegpt] conv=13 turn=8 chunks=128 wall=182.02s
+[sharegpt] conv=13 turn=10 chunks=128 wall=194.95s
+[sharegpt] conv=14 turn=0 chunks=128 wall=62.76s
+[sharegpt] conv=14 turn=2 chunks=128 wall=68.24s
+[sharegpt] conv=14 turn=4 chunks=128 wall=102.64s
+[sharegpt] conv=14 turn=6 chunks=128 wall=154.97s
+[sharegpt] conv=15 turn=0 chunks=128 wall=63.16s
+[sharegpt] conv=15 turn=2 chunks=128 wall=69.45s
+[sharegpt] conv=16 turn=1 chunks=128 wall=94.84s
+[sharegpt] conv=16 turn=3 chunks=128 wall=141.74s
+[sharegpt] conv=16 turn=5 chunks=128 wall=158.70s
+[sharegpt] conv=17 turn=1 chunks=128 wall=74.41s
+[sharegpt] conv=17 turn=3 chunks=128 wall=97.06s
+[sharegpt] conv=17 turn=5 chunks=128 wall=101.80s
+[sharegpt] conv=17 turn=7 chunks=128 wall=147.47s
+[sharegpt] conv=17 turn=9 chunks=128 wall=154.17s
+[sharegpt] conv=18 turn=0 chunks=128 wall=60.78s
+[sharegpt] conv=18 turn=2 chunks=128 wall=112.96s
+[sharegpt] conv=19 turn=0 chunks=128 wall=64.41s
+[sharegpt] conv=20 turn=0 chunks=128 wall=265.75s
+[sharegpt] conv=21 turn=0 chunks=128 wall=212.39s
+[sharegpt] conv=22 turn=0 chunks=128 wall=210.82s
+[sharegpt] conv=23 turn=0 chunks=128 wall=97.50s
+[sharegpt] conv=24 turn=0 chunks=128 wall=189.35s
+[sharegpt] conv=25 turn=0 chunks=128 wall=89.30s
+[sharegpt] conv=25 turn=2 chunks=128 wall=168.29s
+[sharegpt] conv=26 turn=1 chunks=128 wall=130.77s
+[sharegpt] conv=27 turn=0 chunks=128 wall=92.26s
+[sharegpt] conv=28 turn=0 chunks=128 wall=92.64s
+[sharegpt] conv=29 turn=0 chunks=128 wall=115.18s
+[2026-05-07 14:32:02] Long decode wo1 run01
+[longdecode] chunks=2048 wall=986.67s finish=length
+[2026-05-07 14:48:28] 2-concurrent probe wo1
+[concurrent] wall=57.37s
+wrote /tmp/tier1cp-soak/summary.md
+[2026-05-07 14:49:27] done: /tmp/tier1cp-soak/summary.md
diff --git a/...s/nvllm/traces/upstream_stabilization_tier1/2026-05-07-tier1cp-smoke/summary.md b/...s/nvllm/traces/upstream_stabilization_tier1/2026-05-07-tier1cp-smoke/summary.md
@@ -0,0 +1,99 @@
+# Upstream stabilization tier-1 — correctness smoke
+
+## What this is
+
+Apples-to-apples smoke (GSM8K + ShareGPT + long-decode + 2-concurrent) of the
+**tier-1 cherry-pick stack** against the wo1 baseline that produced the wo_split
+production-soak evidence. Only GSM8K is compared numerically against the baseline below; the other phases are reported for tier1cp alone.
+**This is a correctness smoke, not a performance claim** — the 4 picks land in code paths that should not move SM120 decode latency.
+
+## Picks under test (PR #10)
+
+| sha | summary |
+|---|---|
+| `884b5ae34` | Disable flashinfer autotune temporarily due to correctness issues (vllm-project/vllm#41524) |
+| `b383774ad` | fix(FLA): tighten write-side guard against `NULL_BLOCK_ID=0` (partial of upstream `d4cb783c1`) |
+| `9e3a48cd8` | KV cache stride canonicalization for TMA alignment (manual port of upstream `66dfee712`) |
+| `f3b4d3d09` | Gemma4 EAGLE-3 mixin + sliding-window cache realignment (manual port of upstream `e7cfd7c5b`) |
+
+## Build
+
+| | image | code commit (HEAD) | version |
+|---|---|---|---|
+| baseline | `nvllm:gb10` | `f79cf418b` (pre-soak main) | `0.3.1.dev69+gf79cf418b` |
+| tier1cp | `nvllm:gb10-tier1cp` | `f3b4d3d09` (cherry-pick stack tip) | `0.3.1.dev102+gf3b4d3d09` |
+
+Baseline image was the production image at the time of the wo_split soak.
+Tier1cp image was built from a non-worktree clone of the cherry-pick branch at
+`/tmp/tier1cp-build-ctx` (see `feedback_docker_build_worktree` and "How to reproduce" below).
+
+## Model & config
+
+- Model: `ig1/Qwen3.5-27B-NVFP4`
+- KV cache dtype: `fp8_e4m3`
+- Attention backend: CUTE_PAGED
+- `CUTE_WO_SPLIT=1` (wo1 arm only — same as baseline)
+- `--kernel-config '{"enable_flashinfer_autotune":false}'`
+- `--gpu-memory-utilization 0.85`
+
+## GSM8K 50 (seed=42, max_tokens=512, /v1/completions)
+
+| | correct | acc | total wall | mean | median |
+|---|---|---|---|---|---|
+| baseline (wo1) | 48 | 96.0% | 3737.8 s | 74.8 s | 67.7 s |
+| tier1cp        | 48 | 96.0% | 3720.6 s | 74.4 s | 67.4 s |
+| Δ              | **0** | **0 pp** | **−17.2 s (−0.46 %)** | −0.4 s | −0.3 s |
+
+**Answer-level divergences vs baseline: 0/50.** The same two questions are wrong in both runs (Q1: gold 2280, pred 2180,
+an arithmetic miss; Q45: gold 192, pred 1, a reasoning miss); every other extracted answer is identical to the baseline's.
+Sub-1 % wall delta is well within thermal noise on a workstation GPU and is **not**
+claimed as a perf improvement.
+
+## ShareGPT slice (30 conversations, max_tokens=128, seed=42)
+
+Completed 30 conversations with no errors. Per-turn walltimes are recorded in
+`wo1/primary/run01/sharegpt.json` and `sharegpt_wall_tpot.csv`.
+
+## Long decode (max_tokens=2048, seed=42)
+
+| | wall | finish | chunks |
+|---|---|---|---|
+| tier1cp | 986.7 s | length | 2048 |
+
+## 2-concurrent probe
+
+| request | wall | ttft | tpot p95 | finish |
+|---|---|---|---|---|
+| req_a | 57.37 s | 2882 ms | 433.40 ms | length |
+| req_b | 54.81 s | 2882 ms | 433.25 ms | stop   |
+
+## How to reproduce
+
+```bash
+# Build (from a non-worktree clone)
+git clone --branch cherry-pick/upstream-stabilization-tier1 --single-branch \
+  /home/natfii/docker/nvllm /tmp/tier1cp-build-ctx
+cd /tmp/tier1cp-build-ctx
+docker build -f docker/Dockerfile.gb10 -t nvllm:gb10-tier1cp .
+
+# Smoke (run from main checkout, evidence-PR runner)
+cd /home/natfii/docker/nvllm
+NVLLM_IMAGE=nvllm:gb10-tier1cp WO_SPLITS=1 REPLAYS=1 PHASES=primary \
+  OUT_DIR=/tmp/tier1cp-soak \
+  bash docs/research/2026-05-04-wo-split-prod-soak/runner.sh
+```
+
+## Verdict
+
+Cherry-pick stack is **answer-clean against the wo1 baseline** on GSM8K: zero answer divergences,
+sub-1 % wall delta (within thermal noise), and no errors in ShareGPT / long decode /
+concurrent. Safe to merge PR #10.
+
+## Caveats
+
+- No nsys trace was captured. AGENTS.md §4 requires nsys for **performance claims**;
+  this smoke makes none. If a perf claim is later attached to this stack, capture
+  per-kernel evidence at that time.
+- `metadata.json` records the host-side script commit (`a131443ff` on `main`),
+  not the image-side cherry-pick HEAD. The image-side commit is `f3b4d3d09`,
+  surfaced via `vllm --version` inside the container.
diff --git a/.../traces/upstream_stabilization_tier1/2026-05-07-tier1cp-smoke/wo1/primary/concurrent/DONE b/.../traces/upstream_stabilization_tier1/2026-05-07-tier1cp-smoke/wo1/primary/concurrent/DONE
diff --git a/...stream_stabilization_tier1/2026-05-07-tier1cp-smoke/wo1/primary/concurrent/concurrent.log b/...stream_stabilization_tier1/2026-05-07-tier1cp-smoke/wo1/primary/concurrent/concurrent.log
@@ -0,0 +1 @@
+[concurrent] wall=57.37s
diff --git a/..._stabilization_tier1/2026-05-07-tier1cp-smoke/wo1/primary/concurrent/request_a_output.txt b/..._stabilization_tier1/2026-05-07-tier1cp-smoke/wo1/primary/concurrent/request_a_output.txt
@@ -0,0 +1,11 @@
+<think>: Here's a thinking process that leads to the suggested summary:
+
+1.  **Analyze the Request:**
+    *   **Source Material:** Jeff Walker's "Product Launch Formula" (PLF).
+    *   **Target Audience/Context:** A growth marketing agency implementing these strategies for *their clients*.
+    *   **Format:** Bullet points.
+    *   **Goal:** Summarize main ideas adapted for an agency-client relationship.
+
+2.  **Deconstruct the Product Launch Formula (PLF):**
+    *   What are the core pillars of PLF?
+        *   Pre
diff --git a/..._stabilization_tier1/2026-05-07-tier1cp-smoke/wo1/primary/concurrent/request_b_output.txt b/..._stabilization_tier1/2026-05-07-tier1cp-smoke/wo1/primary/concurrent/request_b_output.txt
@@ -0,0 +1,7 @@
+<think>
+
+</think>
+
+- **Measurable and Actionable**: The segment must be large enough to be profitable and distinct enough that you can design specific marketing strategies, products, or pricing models tailored directly to its unique needs.
+- **Homogeneous Internally, Heterogeneous Externally**: Customers within the same segment should share similar characteristics, behaviors, and needs, while clearly differing from customers in other segments to ensure targeted relevance.
+- **Accessible and Reachable**: You must be able to effectively identify, contact, and serve the segment through existing distribution channels and communication methods without prohibitive costs.