From dd9b48d348059a878b7e7d58fd48829a720dfba7 Mon Sep 17 00:00:00 2001
From: Fumitaka Tokumitsu <3142849+toku345@users.noreply.github.com>
Date: Sun, 1 Mar 2026 18:11:09 +0900
Subject: [PATCH 1/2] Switch vllm-qwen35 image to cu130-nightly for SM 12.1
 compatibility

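The previous vllm/vllm-openai:qwen3_5-cu130 image (2026-02-23 build)
crashed with a Triton kernel error on GB10 (SM 12.1). The cu130-nightly
image as of the 2026-03-01 build (commit afd089f2) includes fixes for
both the Triton issue and the RMSNormGated bug
(vllm-project/vllm#35423). Note that cu130-nightly is a moving tag, so
a later pull may resolve to a different build than the one tested here.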

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 backends/vllm/compose.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/backends/vllm/compose.yml b/backends/vllm/compose.yml
index 2092ee6..151d5e7 100644
--- a/backends/vllm/compose.yml
+++ b/backends/vllm/compose.yml
@@ -63,8 +63,9 @@ services:
 
   vllm-qwen35:
     <<: *common
-    # Qwen3.5 は vLLM upstream の専用イメージが必要（NGC 26.01 は transformers が古く qwen3_5_moe 未対応）
-    image: vllm/vllm-openai:qwen3_5-cu130
+    # Qwen3.5 は vLLM upstream イメージが必要（NGC 26.01 は transformers が古く qwen3_5_moe 未対応）
+    # cu130-nightly は最新 Triton + PyTorch を含み、SM 12.1 の Gated DeltaNet FLA カーネル問題を回避（動作確認済み: 03-01 ビルド / commit afd089f2。nightly は移動タグのため pull のたびに内容が変わり得る点に注意）
+    image: vllm/vllm-openai:cu130-nightly
     profiles: ["qwen35"]
     # vllm/vllm-openai は ENTRYPOINT ["vllm", "serve"] がプリセット済みのため、command にはモデルパス以降のみ指定
     command:

From 3722b6447330e3ece3af6ce18fa0fdebcac3ad2b Mon Sep 17 00:00:00 2001
From: Fumitaka Tokumitsu <3142849+toku345@users.noreply.github.com>
Date: Sun, 1 Mar 2026 18:14:03 +0900
Subject: [PATCH 2/2] Update CLAUDE.md: reflect cu130-nightly image and MoE
 backend info

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 CLAUDE.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 6ea70bb..c2d275b 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -64,7 +64,7 @@ curl -X POST http://localhost:8000/v1/chat/completions \
 - クライアント側で `<think>...</think>` タグの除去が必要
 
 ### vLLM
-- Qwen3.5-35B-A3B-FP8: `qwen35` プロファイル。vLLM upstream イメージ使用（NGC 26.01 は `qwen3_5_moe` 未対応）。`--reasoning-parser qwen3` で thinking を `reasoning_content` に分離。`--language-model-only` でビジョンエンコーダーを無効化（テキスト専用モード）
+- Qwen3.5-35B-A3B-FP8: `qwen35` プロファイル。`vllm/vllm-openai:cu130-nightly` 使用（NGC 26.01 は `qwen3_5_moe` 未対応、旧 `vllm/vllm-openai:qwen3_5-cu130` イメージは SM 12.1 で Triton カーネルエラーが発生し、RMSNormGated バグ (vllm-project/vllm#35423) も未修正）。`--reasoning-parser qwen3` で thinking を `reasoning_content` に分離。`--language-model-only` でビジョンエンコーダーを無効化（テキスト専用モード）。SM 12.1 では TRITON Fp8 MoE バックエンドが自動選択される
 - ツール呼び出し対応（Qwen3-Coder）
 - 内部プロンプト確認: `echo: true` パラメータを使用
 - 設定パラメータ: `--gpu-memory-utilization 0.9`, `--max-model-len 32768`