From e421959f86478b54f148861d5042856a166e1cc0 Mon Sep 17 00:00:00 2001
From: Jake Writer <writer.j@northeastern.edu>
Date: Fri, 6 Mar 2026 22:07:19 -0700
Subject: [PATCH 1/3] [BugFix] Fix Qwen3.5 LoRA IndexError in GDN fused
 projections

Fix IndexError: list index out of range when using LoRA adapters with
Qwen3.5 models (dense, MoE, and multimodal variants).

The root cause was a mismatch between the number of output_sizes slices
in the MergedColumnParallelLinear for in_proj_qkvz (4 slices) and the
number of entries in packed_modules_mapping (2 entries). When LoRA's
set_lora iterated over n_slices=4, it only had 2 LoRA weights available,
causing an IndexError at index 2.

Changes:
- Change create_qkvz_proj output_sizes from [key_dim, key_dim, value_dim,
  value_dim] to [key_dim * 2 + value_dim, value_dim] to match the 2-entry
  packed_modules_mapping ["in_proj_qkv", "in_proj_z"]
- Update stacked_params_mapping shard IDs from (0,1,2)/3 to 0/1 to match
  the new 2-slice layout
- Add regression tests validating the alignment between output_sizes and
  packed_modules_mapping for all Qwen3.5 variants

The fix preserves the total projection dimension (key_dim*2 + value_dim*2)
and matches the HuggingFace checkpoint structure where in_proj_qkv and
in_proj_z are stored as separate weight tensors.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Jake Writer <writer.j@northeastern.edu>
---
 tests/lora/test_qwen3_5_lora.py       | 156 ++++++++++++++++++++++++++
 vllm/model_executor/models/qwen3_5.py |   6 +-
 2 files changed, 159 insertions(+), 3 deletions(-)
 create mode 100644 tests/lora/test_qwen3_5_lora.py

diff --git a/tests/lora/test_qwen3_5_lora.py b/tests/lora/test_qwen3_5_lora.py
new file mode 100644
index 000000000000..7323bf990469
--- /dev/null
+++ b/tests/lora/test_qwen3_5_lora.py
@@ -0,0 +1,156 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Tests for Qwen3.5 LoRA support.
+
+Validates that the packed_modules_mapping and MergedColumnParallelLinear
+output_sizes are aligned for Qwen3.5's GDN (Gated Delta Network) layers,
+which use fused projections (in_proj_qkvz and in_proj_ba).
+
+Regression test for: IndexError in set_lora when output_sizes had 4 slices
+but packed_modules_mapping only had 2 entries.
+"""
+
+from vllm.model_executor.models.qwen3_5 import (
+    Qwen3_5ForCausalLMBase,
+    Qwen3_5ForConditionalGeneration,
+    Qwen3_5GatedDeltaNet,
+    Qwen3_5Model,
+    Qwen3_5MoeForCausalLM,
+)
+
+
+def test_qwen3_5_packed_modules_output_sizes_alignment():
+    """Verify packed_modules_mapping entries match output_sizes slices count.
+
+    The LoRA system requires that len(packed_modules_mapping[key]) equals
+    len(output_sizes) for the corresponding MergedColumnParallelLinear.
+    A mismatch causes IndexError in set_lora.
+    """
+    mapping = Qwen3_5ForCausalLMBase.packed_modules_mapping
+
+    # in_proj_qkvz should map to 2 sub-modules: in_proj_qkv, in_proj_z
+    assert "in_proj_qkvz" in mapping
+    assert mapping["in_proj_qkvz"] == ["in_proj_qkv", "in_proj_z"]
+
+    # in_proj_ba should map to 2 sub-modules: in_proj_b, in_proj_a
+    assert "in_proj_ba" in mapping
+    assert mapping["in_proj_ba"] == ["in_proj_b", "in_proj_a"]
+
+
+def test_qwen3_5_create_qkvz_proj_output_sizes():
+    """Verify create_qkvz_proj produces output_sizes with 2 slices.
+
+    The key_dim * 2 + value_dim formula represents the combined Q+K+V
+    projection (in_proj_qkv in the HuggingFace checkpoint), and value_dim
+    represents the Z projection (in_proj_z).
+
+    For the Qwen3.5-9B model: key_dim=2048, value_dim=4096.
+    The HF checkpoint has in_proj_qkv.weight shape [8192, 4096] and
+    in_proj_z.weight shape [4096, 4096].
+    """
+    import ast
+    import inspect
+    import textwrap
+
+    source = inspect.getsource(Qwen3_5GatedDeltaNet.create_qkvz_proj)
+    source = textwrap.dedent(source)
+    tree = ast.parse(source)
+
+    # Find the output_sizes keyword argument in MergedColumnParallelLinear()
+    output_sizes_node = None
+    for node in ast.walk(tree):
+        if isinstance(node, ast.keyword) and node.arg == "output_sizes":
+            output_sizes_node = node.value
+            break
+
+    assert output_sizes_node is not None, "Could not find output_sizes kwarg"
+    assert isinstance(output_sizes_node, ast.List), (
+        "output_sizes should be a list literal"
+    )
+
+    # Must have exactly 2 elements to match packed_modules_mapping
+    num_slices = len(output_sizes_node.elts)
+    num_packed = len(Qwen3_5ForCausalLMBase.packed_modules_mapping["in_proj_qkvz"])
+    assert num_slices == num_packed, (
+        f"output_sizes has {num_slices} slices but packed_modules_mapping "
+        f"has {num_packed} entries for in_proj_qkvz. These must match."
+    )
+
+    # Verify with concrete values: key_dim=2048, value_dim=4096
+    key_dim, value_dim = 2048, 4096
+    expected_sizes = [key_dim * 2 + value_dim, value_dim]
+    assert expected_sizes == [8192, 4096]
+    assert sum(expected_sizes) == key_dim * 2 + value_dim * 2
+
+
+def test_qwen3_5_conditional_generation_packed_mapping():
+    """Verify multimodal variant also has correct GDN packed mapping."""
+    mapping = Qwen3_5ForConditionalGeneration.packed_modules_mapping
+
+    # Should include the GDN mappings
+    assert mapping["in_proj_qkvz"] == ["in_proj_qkv", "in_proj_z"]
+    assert mapping["in_proj_ba"] == ["in_proj_b", "in_proj_a"]
+
+    # Should also include standard attention and MLP mappings
+    assert "qkv_proj" in mapping
+    assert "gate_up_proj" in mapping
+
+
+def test_qwen3_5_moe_inherits_packed_mapping():
+    """Verify MoE variant inherits the same packed_modules_mapping."""
+    dense_mapping = Qwen3_5ForCausalLMBase.packed_modules_mapping
+    moe_mapping = Qwen3_5MoeForCausalLM.packed_modules_mapping
+
+    # MoE should have identical GDN mappings as dense
+    assert moe_mapping["in_proj_qkvz"] == dense_mapping["in_proj_qkvz"]
+    assert moe_mapping["in_proj_ba"] == dense_mapping["in_proj_ba"]
+
+
+def test_qwen3_5_stacked_params_shard_ids():
+    """Verify load_weights stacked_params_mapping uses correct shard IDs.
+
+    With output_sizes=[qkv_size, z_size], shard IDs must be integers 0 and 1.
+    Tuple shard IDs like (0, 1, 2) would indicate a mismatch with output_sizes.
+    """
+    import ast
+    import inspect
+    import textwrap
+
+    # stacked_params_mapping is in Qwen3_5Model.load_weights (the inner model)
+    source = inspect.getsource(Qwen3_5Model.load_weights)
+    source = textwrap.dedent(source)
+    tree = ast.parse(source)
+
+    # Find the stacked_params_mapping list assignment
+    stacked_mapping = None
+    for node in ast.walk(tree):
+        if isinstance(node, ast.Assign):
+            for target in node.targets:
+                if (
+                    isinstance(target, ast.Name)
+                    and target.id == "stacked_params_mapping"
+                ):
+                    stacked_mapping = ast.literal_eval(node.value)
+                    break
+
+    assert stacked_mapping is not None, "Could not find stacked_params_mapping"
+
+    # Find the in_proj_qkvz entries
+    qkvz_entries = [e for e in stacked_mapping if e[0] == "in_proj_qkvz"]
+    assert len(qkvz_entries) == 2, (
+        f"Expected 2 in_proj_qkvz entries, got {qkvz_entries}"
+    )
+
+    # Shard IDs must be simple integers 0 and 1 (not tuples)
+    shard_ids = {e[2] for e in qkvz_entries}
+    assert shard_ids == {0, 1}, (
+        f"in_proj_qkvz shard IDs should be {{0, 1}}, got {shard_ids}. "
+        "Tuple shard IDs like (0,1,2) indicate a mismatch with output_sizes."
+    )
+
+    # Verify in_proj_ba shard IDs
+    ba_entries = [e for e in stacked_mapping if e[0] == "in_proj_ba"]
+    assert len(ba_entries) == 2
+    ba_shard_ids = {e[2] for e in ba_entries}
+    assert ba_shard_ids == {0, 1}
diff --git a/vllm/model_executor/models/qwen3_5.py b/vllm/model_executor/models/qwen3_5.py
index 30823ada1ee7..b6bce079d861 100644
--- a/vllm/model_executor/models/qwen3_5.py
+++ b/vllm/model_executor/models/qwen3_5.py
@@ -139,7 +139,7 @@ def create_qkvz_proj(
     ) -> MergedColumnParallelLinear:
         return MergedColumnParallelLinear(
             input_size=hidden_size,
-            output_sizes=[key_dim, key_dim, value_dim, value_dim],
+            output_sizes=[key_dim * 2 + value_dim, value_dim],
             bias=False,
             quant_config=quant_config,
             prefix=prefix,
@@ -372,8 +372,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
             ("gate_up_proj", "gate_proj", 0),
             ("gate_up_proj", "up_proj", 1),
             # GDN
-            ("in_proj_qkvz", "in_proj_qkv", (0, 1, 2)),
-            ("in_proj_qkvz", "in_proj_z", 3),
+            ("in_proj_qkvz", "in_proj_qkv", 0),
+            ("in_proj_qkvz", "in_proj_z", 1),
             ("in_proj_ba", "in_proj_b", 0),
             ("in_proj_ba", "in_proj_a", 1),
         ]

From ce82f3d9f16da8ae7903362b6cb1da1e4470ca40 Mon Sep 17 00:00:00 2001
From: Jake Writer <writer.j@northeastern.edu>
Date: Wed, 18 Mar 2026 22:21:58 -0600
Subject: [PATCH 2/3] [Bugfix] Fix gdn_in_proj output size for quantized models
 (AWQ/GPTQ) with LoRA

The `gdn_in_proj` custom op (introduced in f1740006e / PR #36795) uses
`self.in_proj_qkvz.weight.shape[0]` to communicate the output tensor
size to torch.compile's fake implementation. With LoRA + AWQ/GPTQ
quantization, `.weight` returns the quantized `qweight` whose shape is
packed (e.g. input_size // 8 for 4-bit), causing a dimension mismatch
in the subsequent `.split()` call.

Fix: compute output sizes analytically from model dimensions
(key_dim, value_dim, num_v_heads, tp_size) instead of reading from
the weight tensor shape. These computed values are identical to
weight.shape[0] for non-quantized models, so there is no regression.

Tested with:
- cyankiwi/Qwen3.5-9B-AWQ-4bit + LoRA adapters (torch.compile)
- Qwen/Qwen3.5-9B without quantization (torch.compile)
- Qwen/Qwen3.5-9B + LoRA adapters without quantization (eager)
- Qwen/Qwen3.5-35B-A3B-GPTQ-Int4 (torch.compile)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Jake Writer <writer.j@northeastern.edu>
---
 vllm/model_executor/models/qwen3_5.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/models/qwen3_5.py b/vllm/model_executor/models/qwen3_5.py
index 9f0238a6d10b..d74e07dde62f 100644
--- a/vllm/model_executor/models/qwen3_5.py
+++ b/vllm/model_executor/models/qwen3_5.py
@@ -180,14 +180,15 @@ def forward(
         # ============================================================
         # Part 1: Input Projection
         # ============================================================
+        qkv_size = (self.key_dim * 2 + self.value_dim) // self.tp_size
+        z_size = self.value_dim // self.tp_size
+        ba_size = (self.num_v_heads * 2) // self.tp_size
         mixed_qkvz, ba = torch.ops.vllm.gdn_in_proj(
             hidden_states,
-            self.in_proj_qkvz.weight.shape[0],
-            self.in_proj_ba.weight.shape[0],
+            qkv_size + z_size,
+            ba_size,
             self.prefix,
         )
-        qkv_size = (self.key_dim * 2 + self.value_dim) // self.tp_size
-        z_size = self.value_dim // self.tp_size
         mixed_qkv, z = mixed_qkvz.split([qkv_size, z_size], dim=-1)
         z = z.reshape(z.size(0), -1, self.head_v_dim)
         b, a = ba.chunk(2, dim=-1)

From 08d7b09b2cc1911b395163a8220a44bd5e8d8aa3 Mon Sep 17 00:00:00 2001
From: Jake Writer <writer.j@northeastern.edu>
Date: Wed, 18 Mar 2026 22:34:46 -0600
Subject: [PATCH 3/3] [Test] Add regression tests for gdn_in_proj quantization
 fix

Add two tests to prevent future regressions:

1. test_qwen3_5_forward_does_not_use_weight_shape_for_gdn_in_proj:
   Verifies the forward method computes gdn_in_proj output sizes from
   model dimensions instead of .weight.shape[0], which returns wrong
   values for quantized models (AWQ/GPTQ) with LoRA.

2. test_qwen3_5_gdn_output_sizes_match_model_dims:
   Validates the computed output size formulas against known Qwen3.5-9B
   dimensions, including TP sharding correctness.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Jake Writer <writer.j@northeastern.edu>
---
 tests/lora/test_qwen3_5_lora.py | 93 +++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)

diff --git a/tests/lora/test_qwen3_5_lora.py b/tests/lora/test_qwen3_5_lora.py
index 7323bf990469..37e968c48c6e 100644
--- a/tests/lora/test_qwen3_5_lora.py
+++ b/tests/lora/test_qwen3_5_lora.py
@@ -154,3 +154,96 @@ def test_qwen3_5_stacked_params_shard_ids():
     assert len(ba_entries) == 2
     ba_shard_ids = {e[2] for e in ba_entries}
     assert ba_shard_ids == {0, 1}
+
+
+def test_qwen3_5_forward_does_not_use_weight_shape_for_gdn_in_proj():
+    """Verify forward() computes gdn_in_proj output sizes from model dims.
+
+    Regression test for: when using quantized models (AWQ/GPTQ) with LoRA,
+    self.in_proj_qkvz.weight.shape[0] returns the packed quantized weight
+    dimension (e.g. input_size // 8 for 4-bit) instead of the actual output
+    size. This caused torch.compile to trace with wrong tensor shapes,
+    leading to a split size mismatch error.
+
+    The fix computes output sizes from key_dim, value_dim, num_v_heads, and
+    tp_size instead of reading weight.shape[0].
+
+    Introduced by commit f1740006e ([Perf] Enable dual stream execution of
+    input projection for Qwen3 #36795) which added the gdn_in_proj custom op.
+    """
+    import ast
+    import inspect
+    import textwrap
+
+    source = inspect.getsource(Qwen3_5GatedDeltaNet.forward)
+    source = textwrap.dedent(source)
+    tree = ast.parse(source)
+
+    # Ensure the forward method does NOT reference .weight.shape
+    # (which breaks for quantized models with LoRA)
+    source_text = source
+    assert ".weight.shape" not in source_text, (
+        "Qwen3_5GatedDeltaNet.forward() must not use .weight.shape to "
+        "determine gdn_in_proj output sizes. For quantized models "
+        "(AWQ/GPTQ) with LoRA, .weight returns the packed qweight whose "
+        "shape does not reflect the actual output dimension. Use computed "
+        "sizes from model dimensions (key_dim, value_dim, etc.) instead."
+    )
+
+    # Verify gdn_in_proj is called with computed size expressions,
+    # not attribute accesses on weight
+    gdn_call_found = False
+    for node in ast.walk(tree):
+        if isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute):
+            func_str = ast.dump(node.func)
+            if "gdn_in_proj" in func_str:
+                gdn_call_found = True
+                # The 2nd and 3rd positional args (index 1,2) should be
+                # computed sizes, not .weight.shape[0] attribute accesses
+                for arg in node.args[1:3]:
+                    arg_source = ast.get_source_segment(source, arg)
+                    if arg_source:
+                        assert "weight" not in arg_source, (
+                            f"gdn_in_proj argument '{arg_source}' must not "
+                            "reference .weight - use computed model dims"
+                        )
+    assert gdn_call_found, "Could not find gdn_in_proj call in forward()"
+
+
+def test_qwen3_5_gdn_output_sizes_match_model_dims():
+    """Verify computed output sizes match expected values for Qwen3.5-9B.
+
+    For Qwen3.5-9B:
+    - key_dim = num_k_heads * head_k_dim = 16 * 128 = 2048
+    - value_dim = num_v_heads * head_v_dim = 32 * 128 = 4096
+    - num_v_heads = 32
+
+    With tp_size=1:
+    - qkvz_output = (2048*2 + 4096) + 4096 = 12288
+    - ba_output = 32 * 2 = 64
+    """
+    key_dim = 2048
+    value_dim = 4096
+    num_v_heads = 32
+    tp_size = 1
+
+    qkv_size = (key_dim * 2 + value_dim) // tp_size  # 8192
+    z_size = value_dim // tp_size  # 4096
+    ba_size = (num_v_heads * 2) // tp_size  # 64
+
+    # These are the values passed to gdn_in_proj
+    qkvz_output = qkv_size + z_size
+    assert qkvz_output == 12288, f"Expected 12288, got {qkvz_output}"
+    assert ba_size == 64, f"Expected 64, got {ba_size}"
+
+    # The split after gdn_in_proj must consume the full qkvz output
+    assert qkv_size + z_size == qkvz_output
+
+    # Verify with tp_size=2
+    tp_size = 2
+    qkv_size_tp2 = (key_dim * 2 + value_dim) // tp_size  # 4096
+    z_size_tp2 = value_dim // tp_size  # 2048
+    ba_size_tp2 = (num_v_heads * 2) // tp_size  # 32
+
+    assert qkv_size_tp2 + z_size_tp2 == 6144
+    assert ba_size_tp2 == 32