4 changes: 4 additions & 0 deletions .gitignore
@@ -253,6 +253,10 @@ benchmarks/nvllm/results/*.sqlite
# nvllm evaluation results (evals/)
benchmarks/nvllm/results/evals/

# nvllm trace full-logs (keep trimmed decode_log.txt tracked, full dumps local-only)
benchmarks/nvllm/traces/**/*.full.txt
benchmarks/nvllm/traces/**/*.full.log

# nvllm: excluded from fork
CLAUDE.md
.claude/
@@ -0,0 +1,32 @@
(EngineCore pid=144) INFO 04-17 20:20:36 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.3.self_attn.attn nat=1 phaseB ref: absmax=220.6929 mean=5.4770e-02 kernel: absmax=220.6929 mean=5.4770e-02 diff: max=0.0000 mean=2.0900e-07 close=True
(EngineCore pid=144) INFO 04-17 20:20:36 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.3.self_attn.attn nat=1 phaseC hidden_ref_absmax=0.0000 hidden_kernel_absmax=0.0000 h_max_diff=0.0000 res_ref_absmax=170141183460469231731687303715884105728.0000 res_kernel_absmax=170141183460469231731687303715884105728.0000 r_max_diff=224.0000 close_h=True close_r=True
(EngineCore pid=144) INFO 04-17 20:20:36 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.7.self_attn.attn nat=1 phaseB ref: absmax=50.6014 mean=1.1364e-02 kernel: absmax=50.6014 mean=1.1364e-02 diff: max=0.0000 mean=1.4823e-07 close=True
(EngineCore pid=144) INFO 04-17 20:20:36 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.7.self_attn.attn nat=1 phaseC hidden_ref_absmax=65.5932 hidden_kernel_absmax=65.5000 h_max_diff=0.0932 res_ref_absmax=50.6014 res_kernel_absmax=50.5000 r_max_diff=0.1014 close_h=True close_r=True
(EngineCore pid=144) INFO 04-17 20:20:36 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.11.self_attn.attn nat=1 phaseB ref: absmax=39.5711 mean=1.0581e-03 kernel: absmax=39.5711 mean=1.0581e-03 diff: max=0.0000 mean=2.0968e-07 close=True
(EngineCore pid=144) INFO 04-17 20:20:36 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.11.self_attn.attn nat=1 phaseC hidden_ref_absmax=53.4998 hidden_kernel_absmax=53.5000 h_max_diff=0.0063 res_ref_absmax=39.5711 res_kernel_absmax=39.5000 r_max_diff=0.0711 close_h=True close_r=True
(EngineCore pid=144) INFO 04-17 20:20:36 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.15.self_attn.attn nat=1 phaseB ref: absmax=13.8859 mean=3.0576e-03 kernel: absmax=13.8859 mean=3.0576e-03 diff: max=0.0000 mean=1.3743e-07 close=True
(EngineCore pid=144) INFO 04-17 20:20:36 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.15.self_attn.attn nat=1 phaseC hidden_ref_absmax=36.4917 hidden_kernel_absmax=36.5000 h_max_diff=0.0083 res_ref_absmax=13.8859 res_kernel_absmax=13.8750 r_max_diff=0.0109 close_h=True close_r=True
(EngineCore pid=144) INFO 04-17 20:20:36 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.19.self_attn.attn nat=1 phaseB ref: absmax=3.6525 mean=1.5460e-03 kernel: absmax=3.6525 mean=1.5460e-03 diff: max=0.0000 mean=1.5882e-07 close=True
(EngineCore pid=144) INFO 04-17 20:20:36 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.19.self_attn.attn nat=1 phaseC hidden_ref_absmax=9.1445 hidden_kernel_absmax=9.1250 h_max_diff=0.0195 res_ref_absmax=3.6525 res_kernel_absmax=3.6562 r_max_diff=0.0038 close_h=True close_r=True
(EngineCore pid=144) INFO 04-17 20:20:36 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.23.self_attn.attn nat=1 phaseB ref: absmax=10.7538 mean=-4.6950e-04 kernel: absmax=10.7538 mean=-4.6950e-04 diff: max=0.0000 mean=1.8470e-07 close=True
(EngineCore pid=144) INFO 04-17 20:20:36 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.23.self_attn.attn nat=1 phaseC hidden_ref_absmax=19.5012 hidden_kernel_absmax=19.5000 h_max_diff=0.0073 res_ref_absmax=10.7538 res_kernel_absmax=10.7500 r_max_diff=0.0065 close_h=True close_r=True
(EngineCore pid=144) INFO 04-17 20:20:36 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.27.self_attn.attn nat=1 phaseB ref: absmax=3.7602 mean=-7.8647e-04 kernel: absmax=3.7602 mean=-7.8647e-04 diff: max=0.0000 mean=2.5609e-07 close=True
(EngineCore pid=144) INFO 04-17 20:20:36 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.27.self_attn.attn nat=1 phaseC hidden_ref_absmax=5.6586 hidden_kernel_absmax=5.6562 h_max_diff=0.0039 res_ref_absmax=3.7602 res_kernel_absmax=3.7656 r_max_diff=0.0070 close_h=True close_r=True
(EngineCore pid=144) INFO 04-17 20:20:36 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.31.self_attn.attn nat=1 phaseB ref: absmax=7.0185 mean=-4.4733e-03 kernel: absmax=7.0185 mean=-4.4733e-03 diff: max=0.0000 mean=2.2128e-07 close=True
(EngineCore pid=144) INFO 04-17 20:20:36 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.31.self_attn.attn nat=1 phaseC hidden_ref_absmax=10.2285 hidden_kernel_absmax=10.2500 h_max_diff=0.0215 res_ref_absmax=7.0185 res_kernel_absmax=7.0312 r_max_diff=0.0128 close_h=True close_r=True
(EngineCore pid=144) INFO 04-17 20:20:36 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.35.self_attn.attn nat=1 phaseB ref: absmax=3.9329 mean=2.2475e-04 kernel: absmax=3.9329 mean=2.2475e-04 diff: max=0.0000 mean=2.4080e-07 close=True
(EngineCore pid=144) INFO 04-17 20:20:36 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.35.self_attn.attn nat=1 phaseC hidden_ref_absmax=4.7694 hidden_kernel_absmax=4.7812 h_max_diff=0.0118 res_ref_absmax=3.9329 res_kernel_absmax=3.9375 r_max_diff=0.0065 close_h=True close_r=True
(EngineCore pid=144) INFO 04-17 20:20:36 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.39.self_attn.attn nat=1 phaseB ref: absmax=2.0009 mean=-7.4135e-03 kernel: absmax=2.0009 mean=-7.4135e-03 diff: max=0.0000 mean=2.2953e-07 close=True
(EngineCore pid=144) INFO 04-17 20:20:36 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.39.self_attn.attn nat=1 phaseC hidden_ref_absmax=1.7291 hidden_kernel_absmax=1.7266 h_max_diff=0.0034 res_ref_absmax=2.0009 res_kernel_absmax=2.0000 r_max_diff=0.0039 close_h=True close_r=True
(EngineCore pid=144) INFO 04-17 20:20:37 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.43.self_attn.attn nat=1 phaseB ref: absmax=7.0806 mean=1.8172e-03 kernel: absmax=7.0806 mean=1.8172e-03 diff: max=0.0000 mean=3.5171e-07 close=True
(EngineCore pid=144) INFO 04-17 20:20:37 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.43.self_attn.attn nat=1 phaseC hidden_ref_absmax=2.4871 hidden_kernel_absmax=2.4844 h_max_diff=0.0039 res_ref_absmax=7.0806 res_kernel_absmax=7.0938 r_max_diff=0.0131 close_h=True close_r=True
(EngineCore pid=144) INFO 04-17 20:20:37 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.47.self_attn.attn nat=1 phaseB ref: absmax=11.1461 mean=2.6451e-04 kernel: absmax=11.1461 mean=2.6450e-04 diff: max=0.0000 mean=3.2844e-07 close=True
(EngineCore pid=144) INFO 04-17 20:20:37 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.47.self_attn.attn nat=1 phaseC hidden_ref_absmax=8.4472 hidden_kernel_absmax=8.4375 h_max_diff=0.0097 res_ref_absmax=11.1461 res_kernel_absmax=11.1250 r_max_diff=0.0211 close_h=True close_r=True
(EngineCore pid=144) INFO 04-17 20:20:37 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.51.self_attn.attn nat=1 phaseB ref: absmax=9.8056 mean=-1.3065e-03 kernel: absmax=9.8056 mean=-1.3065e-03 diff: max=0.0000 mean=4.1282e-07 close=True
(EngineCore pid=144) INFO 04-17 20:20:37 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.51.self_attn.attn nat=1 phaseC hidden_ref_absmax=4.8361 hidden_kernel_absmax=4.8438 h_max_diff=0.0076 res_ref_absmax=9.8056 res_kernel_absmax=9.8125 r_max_diff=0.0154 close_h=True close_r=True
(EngineCore pid=144) INFO 04-17 20:20:37 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.55.self_attn.attn nat=1 phaseB ref: absmax=35.0521 mean=-2.5966e-02 kernel: absmax=35.0521 mean=-2.5966e-02 diff: max=0.0000 mean=3.9317e-07 close=True
(EngineCore pid=144) INFO 04-17 20:20:37 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.55.self_attn.attn nat=1 phaseC hidden_ref_absmax=19.2748 hidden_kernel_absmax=19.2500 h_max_diff=0.0248 res_ref_absmax=35.0521 res_kernel_absmax=35.0000 r_max_diff=0.0521 close_h=True close_r=True
(EngineCore pid=144) INFO 04-17 20:20:37 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.59.self_attn.attn nat=1 phaseB ref: absmax=53.3233 mean=-8.7635e-02 kernel: absmax=53.3234 mean=-8.7635e-02 diff: max=0.0000 mean=8.4114e-07 close=True
(EngineCore pid=144) INFO 04-17 20:20:37 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.59.self_attn.attn nat=1 phaseC hidden_ref_absmax=7.9966 hidden_kernel_absmax=8.0000 h_max_diff=0.0037 res_ref_absmax=53.3234 res_kernel_absmax=53.2500 r_max_diff=0.0734 close_h=True close_r=True
(EngineCore pid=144) INFO 04-17 20:20:37 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.63.self_attn.attn nat=1 phaseB ref: absmax=75.9949 mean=-1.0771e-01 kernel: absmax=75.9949 mean=-1.0771e-01 diff: max=0.0000 mean=2.0806e-06 close=True
(EngineCore pid=144) INFO 04-17 20:20:37 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.63.self_attn.attn nat=1 phaseC hidden_ref_absmax=1.2595 hidden_kernel_absmax=1.2578 h_max_diff=0.0030 res_ref_absmax=75.9949 res_kernel_absmax=76.0000 r_max_diff=0.1223 close_h=True close_r=True
86 changes: 86 additions & 0 deletions benchmarks/nvllm/traces/cute_fusion/2026-04-17-phase-c/summary.md
@@ -0,0 +1,86 @@
# Phase C ship-gate evidence

**Date:** 2026-04-17
**Commit:** `cbfadb6a9` (`cbfadb6a9104ea49da1c5a8cbd118cb157662978`)
**Branch:** `feat/own-the-stack-phase-c`
**Image:** `nvllm:gb10-ots-phaseC` (ID `6b6c71c9daa3`, 20GB)
**Model:** `natfii/Qwen3.5-27B-NVFP4-Opus-GB10`
**Backend:** CuTe paged, PIECEWISE CUDA graphs
**KV cache:** fp8_e4m3
**Context:** 65536 tokens, max_num_seqs=4
**Env:** `CUTE_DEBUG_FUSION=1`

## Ship gate

GSM8K sanity: **8/8 correct (100%) — PASS**

```
Q1: OK expected=72 got=72 (36.6s — first-request warmup)
Q2: OK expected=10 got=10 (6.1s)
Q3: OK expected=5 got=5 (6.4s)
Q4: OK expected=42 got=42 (6.3s)
Q5: OK expected=624 got=624 (6.1s)
Q6: OK expected=35 got=35 (6.4s)
Q7: OK expected=48 got=48 (6.2s)
Q8: OK expected=16 got=16 (6.2s)
```

Matches Phase B baseline (commit `4110dc77a`, image `nvllm:gb10-ots`) — same 8/8, same ~6 s warm latency per prompt after first-request JIT warmup.
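
For context, a minimal sketch of the kind of check the ship gate runs (this is not `scripts/gsm8k_sanity.py` itself; the real prompt set, flags, and answer parsing live in that script): each question is POSTed to the OpenAI-compatible `/v1/chat/completions` endpoint and the last integer in the reply is compared against the expected answer. The question/answer pair below is illustrative.

```python
# Illustrative sanity-check loop (not the actual scripts/gsm8k_sanity.py).
import re
import time

import requests

API = "http://localhost:8000/v1"
CASES = [
    # (question, expected final answer) -- one well-known GSM8K-style example
    ("Natalia sold clips to 48 of her friends in April, and then she sold "
     "half as many clips in May. How many clips did Natalia sell altogether "
     "in April and May?", 72),
]

for i, (question, expected) in enumerate(CASES, start=1):
    t0 = time.time()
    resp = requests.post(
        f"{API}/chat/completions",
        json={
            "model": "default",
            "messages": [{"role": "user", "content": question}],
            "temperature": 0,
            "max_tokens": 512,
        },
        timeout=300,
    )
    text = resp.json()["choices"][0]["message"]["content"]
    # Take the last integer in the reply as the model's final answer.
    nums = re.findall(r"-?\d+", text.replace(",", ""))
    got = int(nums[-1]) if nums else None
    status = "OK" if got == expected else "FAIL"
    print(f"Q{i}: {status} expected={expected} got={got} ({time.time() - t0:.1f}s)")
```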

## Tier-1 host-side evidence

`notebooks/nvllm/layers_smoke_tests.py` — **5/5 passed** on host CPU:
- `test_rms_block_level_equivalence` — `Qwen3_5RMSNorm(5120)` vs `GemmaRMSNorm(5120)`, `rtol=0 atol=0` on bf16 input.
- `test_rms_head_dim_equivalence` — same at `hidden_size=256` (q/k_norm shape).
- `test_rms_fused_residual_equivalence` — fused add+norm path.
- `test_customop_registry_no_collision` — `qwen3_5_rms_norm` and `gemma_rms_norm` coexist in `op_registry`.
- `test_mlp_class_surface` — `Qwen3_5MLP.__init__` has exactly `(hidden_size, intermediate_size, hidden_act, quant_config, prefix)` — no `expert_gate`, no `reduce_results`.

## Tier-2 lint evidence

- `pre-commit run --files ...` — all hooks pass (ruff, format, typos, SPDX, etc.).
The SPDX-header hook auto-added the canonical vLLM-project copyright line to the 4 new files on the first run; the re-run is clean.
- `pre-commit run mypy-3.10 --all-files --hook-stage manual` — zero errors in Phase C files.
11 pre-existing errors remain in `vllm/v1/attention/backends/cute_paged/{kernel.py,_backend.py}` and `vllm/v1/core/kv_cache_utils.py` — identical to Phase B baseline, not introduced by this refactor.

## Tier-3 image-content verification

```bash
docker run --rm --gpus all --entrypoint python nvllm:gb10-ots-phaseC \
-c "from vllm.nvllm.layers import Qwen3_5RMSNorm, Qwen3_5MLP; \
from vllm.model_executor.custom_op import op_registry; \
print('IMPORT_OK'); \
print('qwen3_5_rms_norm in registry:', 'qwen3_5_rms_norm' in op_registry)"
# → IMPORT_OK
# → qwen3_5_rms_norm in registry: True
```

## Repro

```bash
git checkout feat/own-the-stack-phase-c
docker build -f docker/Dockerfile.gb10 -t nvllm:gb10-ots-phaseC .
NVLLM_IMAGE=nvllm:gb10-ots-phaseC CUTE_DEBUG_FUSION=1 ./scripts/serve-cute.sh
# wait for "Application startup complete."
.venv/bin/python scripts/gsm8k_sanity.py --api http://localhost:8000/v1 --model default
```

## Rollback

Phase B image `nvllm:gb10-ots` stays on disk as the rollback snapshot.
`nvllm:gb10-preshim-phaseC-20260417` tag also points at the same Phase B image ID (`240c48497d10`).

```bash
docker tag nvllm:gb10-ots nvllm:gb10
git checkout main
```

No data migration, no weight-format changes, no config-schema changes — rollback is instant.

## Kernel durations

No kernel changes in Phase C — this is a pure layer-ownership refactor.
Decode kernel durations match the Phase B baseline
(see [`../2026-04-17-own-the-stack/summary.md`](../2026-04-17-own-the-stack/summary.md)).
No new `.nsys-rep` required per AGENTS.md §4 — new traces are only mandatory for perf claims,
not for semantically equivalent refactors.
148 changes: 148 additions & 0 deletions notebooks/nvllm/layers_smoke_tests.py
@@ -0,0 +1,148 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the nvllm fork
"""Tier-1 host-side smoke tests for nvllm Phase C owned layer classes.

Run: .venv/bin/python notebooks/nvllm/layers_smoke_tests.py
Runs in ~5 s on CPU. No GPU required.

Tests:
1. Qwen3_5RMSNorm block-level equivalence vs upstream GemmaRMSNorm(5120).
2. Qwen3_5RMSNorm head-dim equivalence vs upstream GemmaRMSNorm(256).
3. Qwen3_5RMSNorm fused-residual equivalence vs upstream GemmaRMSNorm.
4. CustomOp registry: 'qwen3_5_rms_norm' exists and does not clobber
   'gemma_rms_norm'.
5. Qwen3_5MLP constructor surface: no 'expert_gate', no 'reduce_results'.

Notes:
- A Qwen3_5MLP vs Qwen2MoeMLP numerical-equivalence test is SKIPPED here
  because MergedColumnParallelLinear / RowParallelLinear require an
  initialized distributed backend (torch.distributed). Only the
  constructor-surface check above is included; full correctness is
  covered by the Tier-3 GSM8K 8/8 result.
"""

import sys

import torch

from vllm.config import VllmConfig, set_current_vllm_config
from vllm.model_executor.custom_op import op_registry
from vllm.model_executor.layers.layernorm import GemmaRMSNorm
from vllm.nvllm.layers.layernorm import Qwen3_5RMSNorm
from vllm.nvllm.layers.mlp import Qwen3_5MLP


def _seed_weights(rms_upstream: GemmaRMSNorm, rms_owned: Qwen3_5RMSNorm) -> None:
"""Copy identical weights into both norms so outputs can be compared."""
torch.manual_seed(42)
w = torch.randn_like(rms_upstream.weight.data)
rms_upstream.weight.data.copy_(w)
rms_owned.weight.data.copy_(w)


def test_rms_block_level_equivalence() -> None:
"""Block-level (hidden_size=5120) equivalence — no residual."""
hidden = 5120
upstream = GemmaRMSNorm(hidden, eps=1e-6)
owned = Qwen3_5RMSNorm(hidden, eps=1e-6)
_seed_weights(upstream, owned)

torch.manual_seed(0)
x = torch.randn(8, hidden, dtype=torch.bfloat16)
y_up = upstream(x.clone())
y_own = owned(x.clone())

torch.testing.assert_close(y_own, y_up, rtol=0, atol=0)
print(" [PASS] test_rms_block_level_equivalence")


def test_rms_head_dim_equivalence() -> None:
"""Head-dim (hidden_size=256) equivalence — the q_norm / k_norm shape."""
hidden = 256
upstream = GemmaRMSNorm(hidden, eps=1e-6)
owned = Qwen3_5RMSNorm(hidden, eps=1e-6)
_seed_weights(upstream, owned)

torch.manual_seed(1)
x = torch.randn(8, 24, hidden, dtype=torch.bfloat16)
y_up = upstream(x.clone())
y_own = owned(x.clone())

torch.testing.assert_close(y_own, y_up, rtol=0, atol=0)
print(" [PASS] test_rms_head_dim_equivalence")


def test_rms_fused_residual_equivalence() -> None:
"""Block-level fused-add-residual forward equivalence."""
hidden = 5120
upstream = GemmaRMSNorm(hidden, eps=1e-6)
owned = Qwen3_5RMSNorm(hidden, eps=1e-6)
_seed_weights(upstream, owned)

torch.manual_seed(2)
x = torch.randn(8, hidden, dtype=torch.bfloat16)
r = torch.randn(8, hidden, dtype=torch.bfloat16)

y_up, r_up = upstream(x.clone(), r.clone())
y_own, r_own = owned(x.clone(), r.clone())

torch.testing.assert_close(y_own, y_up, rtol=0, atol=0)
torch.testing.assert_close(r_own, r_up, rtol=0, atol=0)
print(" [PASS] test_rms_fused_residual_equivalence")


def test_customop_registry_no_collision() -> None:
"""Both 'qwen3_5_rms_norm' and 'gemma_rms_norm' must coexist."""
assert "qwen3_5_rms_norm" in op_registry, "owned CustomOp name missing"
assert "gemma_rms_norm" in op_registry, "upstream CustomOp name was clobbered"
assert op_registry["qwen3_5_rms_norm"] is Qwen3_5RMSNorm
assert op_registry["gemma_rms_norm"] is GemmaRMSNorm
print(" [PASS] test_customop_registry_no_collision")


def test_mlp_class_surface() -> None:
"""Qwen3_5MLP constructor surface matches spec (no expert_gate / reduce_results)."""
import inspect

params = list(inspect.signature(Qwen3_5MLP.__init__).parameters)
assert "expert_gate" not in params, "expert_gate should have been dropped"
assert "reduce_results" not in params, "reduce_results should have been dropped"
# Expected surface:
assert params == [
"self",
"hidden_size",
"intermediate_size",
"hidden_act",
"quant_config",
"prefix",
], f"unexpected Qwen3_5MLP signature: {params}"
print(" [PASS] test_mlp_class_surface")


def main() -> int:
print("Running Tier-1 smoke tests for nvllm Phase C owned layers:")
tests = [
test_rms_block_level_equivalence,
test_rms_head_dim_equivalence,
test_rms_fused_residual_equivalence,
test_customop_registry_no_collision,
test_mlp_class_surface,
]
# CustomOp.__init__ calls get_current_vllm_config(); wrap the whole run
# in a bare VllmConfig context so module-import-time instantiations work
# without standing up a full engine. Same pattern as tests/conftest.py
# `default_vllm_config` fixture.
failed = 0
with set_current_vllm_config(VllmConfig()):
for fn in tests:
try:
fn()
except Exception as exc: # noqa: BLE001
print(f" [FAIL] {fn.__name__}: {exc}")
failed += 1
total = len(tests)
print(f"\nResult: {total - failed}/{total} passed.")
return 0 if failed == 0 else 1


if __name__ == "__main__":
sys.exit(main())
1 change: 1 addition & 0 deletions tools/pre_commit/mypy.py
@@ -36,6 +36,7 @@
EXCLUDE = [
"vllm/model_executor/models",
"vllm/nvllm/models",
"vllm/nvllm/layers",
"vllm/model_executor/layers/fla/ops",
# Ignore triton kernels in ops.
"vllm/v1/attention/ops",
15 changes: 15 additions & 0 deletions vllm/nvllm/layers/__init__.py
@@ -0,0 +1,15 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the nvllm fork
"""nvllm layer primitives — fork-owned RMSNorm, MLP, etc.

Classes here are full copies of upstream class bodies (not subclasses), so
upstream renames cannot silently break fusion wiring. Only layer primitives
the uber-kernel will fuse against live here — embedding, MoE-block, parallel-
linear, and activation primitives remain upstream (see Phase C spec §Non-goals).
"""

from vllm.nvllm.layers.layernorm import Qwen3_5RMSNorm
from vllm.nvllm.layers.mlp import Qwen3_5MLP

__all__ = ["Qwen3_5RMSNorm", "Qwen3_5MLP"]
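
No consumer of these primitives appears in this diff. As a hypothetical usage sketch (the real Qwen3.5 model wiring lives elsewhere in the fork), model code would import the fork-owned class directly and instantiate it inside a vLLM config context, the same pattern the Tier-1 smoke tests use:

```python
# Hypothetical consumer sketch -- not part of this PR. Model code imports the
# fork-owned primitive by name, so upstream changes to GemmaRMSNorm cannot
# silently alter the fusion wiring these classes are pinned to.
import torch

from vllm.config import VllmConfig, set_current_vllm_config
from vllm.nvllm.layers import Qwen3_5RMSNorm

with set_current_vllm_config(VllmConfig()):  # CustomOp reads the current config
    q_norm = Qwen3_5RMSNorm(256, eps=1e-6)   # q/k head-dim norm shape
    x = torch.randn(2, 24, 256, dtype=torch.bfloat16)
    print(q_norm(x).shape)  # torch.Size([2, 24, 256])
```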