# Add openai/gpt-oss-20b recipe and Laguna-XS.2 DFlash spec decoding (#447)
@@ -0,0 +1,212 @@
meta:
  title: "GPT-OSS 20B"
  slug: "gpt-oss-20b"
  provider: "OpenAI"
  description: "OpenAI's gpt-oss-20b — 21B-total / 3.6B-active MoE reasoning model with native MXFP4 quant; fits in 16GB VRAM"
  date_updated: 2026-05-08
  difficulty: beginner
  tasks:
    - text
  performance_headline: "21B/3.6B-A MoE reasoning model with native MXFP4 — runs on 16GB"
  related_recipes:
    - "openai/gpt-oss-120b"
  hardware:
    h100: verified
    h200: verified
    b200: verified
    mi300x: verified
    mi325x: verified
    mi355x: verified

model:
  model_id: "openai/gpt-oss-20b"
  min_vllm_version: "0.10.0"
  architecture: moe
  parameter_count: "21B"
  active_parameters: "3.6B"
  context_length: 131072
  base_args: []
  base_env: {}

features:
  tool_calling:
    description: "OpenAI harmony tool-call parser with automatic tool choice"
    args:
      - "--tool-call-parser"
      - "openai"
      - "--enable-auto-tool-choice"

opt_in_features: []

variants:
  default:
    precision: mxfp4
    vram_minimum_gb: 16
    description: "MXFP4 MoE weights — fits in 16GB VRAM on a single consumer or datacenter GPU"

compatible_strategies:
  - single_node_tp

hardware_overrides:
  blackwell:
    extra_args:
      - "--kv-cache-dtype"
      - "fp8"
      - "--no-enable-prefix-caching"
      - "--max-cudagraph-capture-size"
      - "2048"
      - "--max-num-batched-tokens"
      - "8192"
      - "--stream-interval"
      - "20"
    extra_env:
      VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: "1"
  hopper:
    extra_args:
      - "--no-enable-prefix-caching"
      - "--max-cudagraph-capture-size"
      - "2048"
      - "--max-num-batched-tokens"
      - "8192"
      - "--stream-interval"
      - "20"
    extra_env: {}
  amd:
    extra_args:
      - "--attention-backend"
      - "ROCM_AITER_UNIFIED_ATTN"
      - "-cc.pass_config.fuse_rope_kvcache=True"
      - "-cc.use_inductor_graph_partition=True"
      - "--gpu-memory-utilization"
      - "0.95"
      - "--block-size=64"
    extra_env:
      VLLM_ROCM_USE_AITER: "1"
      VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: "INT4"
      HSA_NO_SCRATCH_RECLAIM: "1"
      AMDGCN_USE_BUFFER_OPS: "0"

strategy_overrides:
  single_node_tp:
    tp: 1

guide: |
  ## Overview

  [`gpt-oss-20b`](https://huggingface.co/openai/gpt-oss-20b) is OpenAI's smaller open-weight reasoning model: 21B total parameters with 3.6B activated per token across 32 experts (top-4 routing), shipped with native MXFP4 quantization on the MoE weights. It targets lower-latency and on-device use cases — the model loads in ~16GB of VRAM, runs on a single H100/H200/B200 or AMD MI300X/MI325X/MI355X, and supports the same harmony chat format, configurable reasoning effort (low / medium / high), and built-in tools (browser, python, function calling) as its larger sibling [`gpt-oss-120b`](https://huggingface.co/openai/gpt-oss-120b).

  Architectural notes:
  - 24 layers alternating sliding-window (window=128) and full attention.
  - YaRN rope scaling (factor=32) extending 4K → 131K context.
  - MXFP4 quant on `model.layers.*.mlp` experts; attention, router, embeddings stay in BF16.
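
  These values can be sanity-checked against the Hugging Face config. A minimal sketch, assuming the `transformers` GptOss config exposes the usual attribute names:

  ```python
  from transformers import AutoConfig

  # Needs network access to the HF Hub (or a local snapshot of the model).
  cfg = AutoConfig.from_pretrained("openai/gpt-oss-20b")
  print(cfg.num_hidden_layers)    # expect 24
  print(cfg.num_local_experts)    # expect 32
  print(cfg.num_experts_per_tok)  # expect 4 (top-4 routing)
  print(cfg.sliding_window)       # expect 128
  print(cfg.rope_scaling)         # expect YaRN with factor 32
  ```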

  ## Prerequisites

  - Hardware: NVIDIA H100/H200/B200 or AMD MI300X/MI325X/MI355X (also runs on Ada/Ampere consumer cards with sufficient VRAM).
  - vLLM >= 0.10.0.
  - CUDA >= 12.8 if building from source (must match between install and serving).

  ### Install vLLM

  ```bash
  uv venv
  source .venv/bin/activate
  uv pip install vllm --torch-backend=auto
  ```

  Docker quickstart:

  ```bash
  docker run --gpus all -p 8000:8000 --ipc=host vllm/vllm-openai --model openai/gpt-oss-20b
  ```
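
  Once the container is up, you can confirm the server is live via the standard OpenAI-compatible models endpoint:

  ```bash
  curl http://localhost:8000/v1/models
  ```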

  AMD ROCm wheels:

  ```bash
  uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm
  ```

  ## Launch commands

  Single GPU (default — works on any 16GB+ card):

  ```bash
  vllm serve openai/gpt-oss-20b
  ```
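
  To smoke-test the server, send a request to the standard chat completions endpoint (localhost and port 8000 assumed from the defaults above):

  ```bash
  curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "openai/gpt-oss-20b", "messages": [{"role": "user", "content": "Say hello."}]}'
  ```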

  Blackwell (B200) with FlashInfer MXFP4+MXFP8 MoE:

  ```bash
  export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1

  vllm serve openai/gpt-oss-20b \
    --kv-cache-dtype fp8 \
    --no-enable-prefix-caching \
    --max-cudagraph-capture-size 2048 \
    --max-num-batched-tokens 8192 \
    --stream-interval 20
  ```

  Hopper (H100/H200): same as Blackwell minus `--kv-cache-dtype fp8` and the FlashInfer env var, spelled out below:
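
  ```bash
  vllm serve openai/gpt-oss-20b \
    --no-enable-prefix-caching \
    --max-cudagraph-capture-size 2048 \
    --max-num-batched-tokens 8192 \
    --stream-interval 20
  ```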

  AMD MI300X/MI325X/MI355X:

  ```bash
  export HSA_NO_SCRATCH_RECLAIM=1
  export AMDGCN_USE_BUFFER_OPS=0
  export VLLM_ROCM_USE_AITER=1
  export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4

  vllm serve openai/gpt-oss-20b \
    --attention-backend ROCM_AITER_UNIFIED_ATTN \
    -cc.pass_config.fuse_rope_kvcache=True \
    -cc.use_inductor_graph_partition=True \
    --gpu-memory-utilization 0.95 \
    --block-size 64
  ```

  ## Tool use

  The `/v1/responses` endpoint supports built-in tools (browsing, python, MCP). Setup requires `uv pip install gpt-oss` and either Docker (for the Python sandbox) or `PYTHON_EXECUTION_BACKEND=dangerously_use_uv`. For demo tools:

  ```bash
  vllm serve openai/gpt-oss-20b --tool-server demo
  ```
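
  A minimal client-side check of the Responses API, assuming a local server on the default port (the `EMPTY` key is a placeholder; vLLM does not require auth by default):

  ```python
  from openai import OpenAI

  client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")
  response = client.responses.create(
      model="openai/gpt-oss-20b",
      input="Briefly: what does MXFP4 quantization trade off?",
  )
  print(response.output_text)
  ```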

  For user-defined function calling (toggle the **Tool Calling** feature above, or pass manually):

  ```bash
  vllm serve openai/gpt-oss-20b --tool-call-parser openai --enable-auto-tool-choice
  ```
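
  With the parser enabled, standard OpenAI-style function calling works. A sketch with a hypothetical `get_weather` tool (the schema is illustrative, not part of the recipe):

  ```python
  from openai import OpenAI

  client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")

  tools = [{
      "type": "function",
      "function": {
          "name": "get_weather",  # hypothetical tool for illustration
          "description": "Get the current weather for a city.",
          "parameters": {
              "type": "object",
              "properties": {"city": {"type": "string"}},
              "required": ["city"],
          },
      },
  }]

  response = client.chat.completions.create(
      model="openai/gpt-oss-20b",
      messages=[{"role": "user", "content": "What's the weather in Paris?"}],
      tools=tools,
      tool_choice="auto",
  )
  # With auto tool choice the model should emit a tool call instead of text.
  print(response.choices[0].message.tool_calls)
  ```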

  ## Reasoning effort

  gpt-oss exposes three reasoning levels — low, medium, high — selected via the system prompt:

  ```python
  from openai import OpenAI

  client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")
  response = client.chat.completions.create(
      model="openai/gpt-oss-20b",
      messages=[
          {"role": "system", "content": "Reasoning: high"},
          {"role": "user", "content": "Explain why eigenvalues matter."},
      ],
  )
  print(response.choices[0].message.content)
  ```

  ## Troubleshooting

  - **Attention sinks dtype error on Blackwell:** ensure `VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1` and `--kv-cache-dtype fp8`.
  - **`tl.language not defined`:** make sure no extra Triton (e.g., `pytorch-triton`) is installed alongside vLLM's bundled Triton.
  - **Harmony vocab download failure:** pre-download tiktoken files and set `TIKTOKEN_ENCODINGS_BASE` (see the sketch after this list).
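
  One possible offline setup, assuming harmony uses the `o200k_base` vocabulary and that `TIKTOKEN_ENCODINGS_BASE` points at a directory of plain `.tiktoken` files (the URL below is the public tiktoken encodings host):

  ```bash
  mkdir -p /opt/tiktoken
  curl -L -o /opt/tiktoken/o200k_base.tiktoken \
    https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken
  export TIKTOKEN_ENCODINGS_BASE=/opt/tiktoken
  ```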

  ## References

  - [Model card — gpt-oss-20b](https://huggingface.co/openai/gpt-oss-20b)
  - [Sibling — gpt-oss-120b](https://huggingface.co/openai/gpt-oss-120b)
  - [OpenAI announcement](https://openai.com/index/introducing-gpt-oss/)
  - [gpt-oss model card paper (arXiv:2508.10925)](https://arxiv.org/abs/2508.10925)
  - [vLLM gpt-oss cookbook](https://cookbook.openai.com/articles/gpt-oss/run-vllm)

---

@@ -37,8 +37,14 @@ features:
     args:
       - "--reasoning-parser"
       - "poolside_v1"
+  spec_decoding:
+    description: "DFlash speculative decoding with the Laguna-XS.2 draft model (7 tokens, greedy)"
+    args:
+      - "--speculative-config"
+      - '{"model":"poolside/Laguna-XS.2-speculator.dflash","num_speculative_tokens":7,"method":"dflash"}'

-opt_in_features: []
+opt_in_features:
+  - spec_decoding

 variants:
   default:

@@ -48,6 +54,8 @@ variants:

 compatible_strategies:
   - single_node_tp
+  - single_node_tep
+  - single_node_dep

> **Contributor** (on lines +57 to +58): The strategies

 hardware_overrides: {}

@@ -135,8 +143,30 @@ guide: |

   Or default-on with `--default-chat-template-kwargs '{"enable_thinking": true}'`.
+
+  ## Speculative decoding (DFlash)
+
+  Enable the **Spec Decoding** toggle above to attach Poolside's [DFlash draft model](https://huggingface.co/poolside/Laguna-XS.2-speculator.dflash) — a 5-layer Llama-style speculator that proposes up to 7 tokens per step. Reported per-position acceptance with reasoning enabled is ~70% at position 1 across coding, math, QA, and writing workloads.
+
+  Requires:
+  - vLLM built from [PR #41880](https://github.com/vllm-project/vllm/pull/41880) (extends the base Laguna PR with DFlash support).
+  - `VLLM_USE_DEEP_GEMM=0` in the launch environment — DeepGEMM is currently incompatible with the DFlash draft path.
+
+  Example:
+
+  ```bash
+  VLLM_USE_DEEP_GEMM=0 vllm serve poolside/Laguna-XS.2 \
+    --trust-remote-code \
+    --max-model-len 16384 \

> **Contributor** (on the line above): There is an inconsistency in the

+    --enable-auto-tool-choice \
+    --tool-call-parser poolside_v1 \
+    --reasoning-parser poolside_v1 \
+    --speculative-config '{"model":"poolside/Laguna-XS.2-speculator.dflash","num_speculative_tokens":7,"method":"dflash"}'
+  ```
+
   ## References

   - [Model card](https://huggingface.co/poolside/Laguna-XS.2)
   - [Release blog post](https://poolside.ai/blog/laguna-a-deeper-dive)
   - [vLLM support PR #41129](https://github.com/vllm-project/vllm/pull/41129)
+  - [vLLM DFlash spec-decoding PR #41880](https://github.com/vllm-project/vllm/pull/41880)
+  - [DFlash draft model](https://huggingface.co/poolside/Laguna-XS.2-speculator.dflash)

> **Comment:** This `spec_decoding` feature only adds `--speculative-config`, but the same recipe explicitly documents that DFlash requires `VLLM_USE_DEEP_GEMM=0` to work. In this codebase, feature toggles are rendered as CLI args only, so users who enable this toggle via the command builder will get a command that is missing the required environment setting and can fail at runtime unless they manually edit it. Please wire the required env guard into the generated configuration path, not only the guide text.
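
One possible shape for that fix, assuming the recipe schema can accept an env map on feature entries the way `hardware_overrides` carries `extra_env` (hypothetical; the schema shown in this diff only demonstrates `args`):

```yaml
spec_decoding:
  description: "DFlash speculative decoding with the Laguna-XS.2 draft model (7 tokens, greedy)"
  env:
    VLLM_USE_DEEP_GEMM: "0"  # assumed field; the required guard documented in the guide
  args:
    - "--speculative-config"
    - '{"model":"poolside/Laguna-XS.2-speculator.dflash","num_speculative_tokens":7,"method":"dflash"}'
```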