From e421959f86478b54f148861d5042856a166e1cc0 Mon Sep 17 00:00:00 2001 From: Jake Writer Date: Fri, 6 Mar 2026 22:07:19 -0700 Subject: [PATCH 1/3] [BugFix] Fix Qwen3.5 LoRA IndexError in GDN fused projections Fix IndexError: list index out of range when using LoRA adapters with Qwen3.5 models (dense, MoE, and multimodal variants). The root cause was a mismatch between the number of output_sizes slices in the MergedColumnParallelLinear for in_proj_qkvz (4 slices) and the number of entries in packed_modules_mapping (2 entries). When LoRA's set_lora iterated over n_slices=4, it only had 2 LoRA weights available, causing an IndexError at index 2. Changes: - Change create_qkvz_proj output_sizes from [key_dim, key_dim, value_dim, value_dim] to [key_dim * 2 + value_dim, value_dim] to match the 2-entry packed_modules_mapping ["in_proj_qkv", "in_proj_z"] - Update stacked_params_mapping shard IDs from (0,1,2)/3 to 0/1 to match the new 2-slice layout - Add regression tests validating the alignment between output_sizes and packed_modules_mapping for all Qwen3.5 variants The fix preserves the total projection dimension (key_dim*2 + value_dim*2) and matches the HuggingFace checkpoint structure where in_proj_qkv and in_proj_z are stored as separate weight tensors. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Jake Writer --- tests/lora/test_qwen3_5_lora.py | 156 ++++++++++++++++++++++++++ vllm/model_executor/models/qwen3_5.py | 6 +- 2 files changed, 159 insertions(+), 3 deletions(-) create mode 100644 tests/lora/test_qwen3_5_lora.py diff --git a/tests/lora/test_qwen3_5_lora.py b/tests/lora/test_qwen3_5_lora.py new file mode 100644 index 000000000000..7323bf990469 --- /dev/null +++ b/tests/lora/test_qwen3_5_lora.py @@ -0,0 +1,156 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Tests for Qwen3.5 LoRA support. + +Validates that the packed_modules_mapping and MergedColumnParallelLinear +output_sizes are aligned for Qwen3.5's GDN (Gated Delta Network) layers, +which use fused projections (in_proj_qkvz and in_proj_ba). + +Regression test for: IndexError in set_lora when output_sizes had 4 slices +but packed_modules_mapping only had 2 entries. +""" + +from vllm.model_executor.models.qwen3_5 import ( + Qwen3_5ForCausalLMBase, + Qwen3_5ForConditionalGeneration, + Qwen3_5GatedDeltaNet, + Qwen3_5Model, + Qwen3_5MoeForCausalLM, +) + + +def test_qwen3_5_packed_modules_output_sizes_alignment(): + """Verify packed_modules_mapping entries match output_sizes slices count. + + The LoRA system requires that len(packed_modules_mapping[key]) equals + len(output_sizes) for the corresponding MergedColumnParallelLinear. + A mismatch causes IndexError in set_lora. + """ + mapping = Qwen3_5ForCausalLMBase.packed_modules_mapping + + # in_proj_qkvz should map to 2 sub-modules: in_proj_qkv, in_proj_z + assert "in_proj_qkvz" in mapping + assert mapping["in_proj_qkvz"] == ["in_proj_qkv", "in_proj_z"] + + # in_proj_ba should map to 2 sub-modules: in_proj_b, in_proj_a + assert "in_proj_ba" in mapping + assert mapping["in_proj_ba"] == ["in_proj_b", "in_proj_a"] + + +def test_qwen3_5_create_qkvz_proj_output_sizes(): + """Verify create_qkvz_proj produces output_sizes with 2 slices. + + The key_dim * 2 + value_dim formula represents the combined Q+K+V + projection (in_proj_qkv in the HuggingFace checkpoint), and value_dim + represents the Z projection (in_proj_z). + + For the Qwen3.5-9B model: key_dim=2048, value_dim=4096. + The HF checkpoint has in_proj_qkv.weight shape [8192, 4096] and + in_proj_z.weight shape [4096, 4096]. + """ + import ast + import inspect + import textwrap + + source = inspect.getsource(Qwen3_5GatedDeltaNet.create_qkvz_proj) + source = textwrap.dedent(source) + tree = ast.parse(source) + + # Find the output_sizes keyword argument in MergedColumnParallelLinear() + output_sizes_node = None + for node in ast.walk(tree): + if isinstance(node, ast.keyword) and node.arg == "output_sizes": + output_sizes_node = node.value + break + + assert output_sizes_node is not None, "Could not find output_sizes kwarg" + assert isinstance(output_sizes_node, ast.List), ( + "output_sizes should be a list literal" + ) + + # Must have exactly 2 elements to match packed_modules_mapping + num_slices = len(output_sizes_node.elts) + num_packed = len(Qwen3_5ForCausalLMBase.packed_modules_mapping["in_proj_qkvz"]) + assert num_slices == num_packed, ( + f"output_sizes has {num_slices} slices but packed_modules_mapping " + f"has {num_packed} entries for in_proj_qkvz. These must match." + ) + + # Verify with concrete values: key_dim=2048, value_dim=4096 + key_dim, value_dim = 2048, 4096 + expected_sizes = [key_dim * 2 + value_dim, value_dim] + assert expected_sizes == [8192, 4096] + assert sum(expected_sizes) == key_dim * 2 + value_dim * 2 + + +def test_qwen3_5_conditional_generation_packed_mapping(): + """Verify multimodal variant also has correct GDN packed mapping.""" + mapping = Qwen3_5ForConditionalGeneration.packed_modules_mapping + + # Should include the GDN mappings + assert mapping["in_proj_qkvz"] == ["in_proj_qkv", "in_proj_z"] + assert mapping["in_proj_ba"] == ["in_proj_b", "in_proj_a"] + + # Should also include standard attention and MLP mappings + assert "qkv_proj" in mapping + assert "gate_up_proj" in mapping + + +def test_qwen3_5_moe_inherits_packed_mapping(): + """Verify MoE variant inherits the same packed_modules_mapping.""" + dense_mapping = Qwen3_5ForCausalLMBase.packed_modules_mapping + moe_mapping = Qwen3_5MoeForCausalLM.packed_modules_mapping + + # MoE should have identical GDN mappings as dense + assert moe_mapping["in_proj_qkvz"] == dense_mapping["in_proj_qkvz"] + assert moe_mapping["in_proj_ba"] == dense_mapping["in_proj_ba"] + + +def test_qwen3_5_stacked_params_shard_ids(): + """Verify load_weights stacked_params_mapping uses correct shard IDs. + + With output_sizes=[qkv_size, z_size], shard IDs must be integers 0 and 1. + Tuple shard IDs like (0, 1, 2) would indicate a mismatch with output_sizes. + """ + import ast + import inspect + import textwrap + + # stacked_params_mapping is in Qwen3_5Model.load_weights (the inner model) + source = inspect.getsource(Qwen3_5Model.load_weights) + source = textwrap.dedent(source) + tree = ast.parse(source) + + # Find the stacked_params_mapping list assignment + stacked_mapping = None + for node in ast.walk(tree): + if isinstance(node, ast.Assign): + for target in node.targets: + if ( + isinstance(target, ast.Name) + and target.id == "stacked_params_mapping" + ): + stacked_mapping = ast.literal_eval(node.value) + break + + assert stacked_mapping is not None, "Could not find stacked_params_mapping" + + # Find the in_proj_qkvz entries + qkvz_entries = [e for e in stacked_mapping if e[0] == "in_proj_qkvz"] + assert len(qkvz_entries) == 2, ( + f"Expected 2 in_proj_qkvz entries, got {qkvz_entries}" + ) + + # Shard IDs must be simple integers 0 and 1 (not tuples) + shard_ids = {e[2] for e in qkvz_entries} + assert shard_ids == {0, 1}, ( + f"in_proj_qkvz shard IDs should be {{0, 1}}, got {shard_ids}. " + "Tuple shard IDs like (0,1,2) indicate a mismatch with output_sizes." + ) + + # Verify in_proj_ba shard IDs + ba_entries = [e for e in stacked_mapping if e[0] == "in_proj_ba"] + assert len(ba_entries) == 2 + ba_shard_ids = {e[2] for e in ba_entries} + assert ba_shard_ids == {0, 1} diff --git a/vllm/model_executor/models/qwen3_5.py b/vllm/model_executor/models/qwen3_5.py index 30823ada1ee7..b6bce079d861 100644 --- a/vllm/model_executor/models/qwen3_5.py +++ b/vllm/model_executor/models/qwen3_5.py @@ -139,7 +139,7 @@ def create_qkvz_proj( ) -> MergedColumnParallelLinear: return MergedColumnParallelLinear( input_size=hidden_size, - output_sizes=[key_dim, key_dim, value_dim, value_dim], + output_sizes=[key_dim * 2 + value_dim, value_dim], bias=False, quant_config=quant_config, prefix=prefix, @@ -372,8 +372,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), # GDN - ("in_proj_qkvz", "in_proj_qkv", (0, 1, 2)), - ("in_proj_qkvz", "in_proj_z", 3), + ("in_proj_qkvz", "in_proj_qkv", 0), + ("in_proj_qkvz", "in_proj_z", 1), ("in_proj_ba", "in_proj_b", 0), ("in_proj_ba", "in_proj_a", 1), ] From ce82f3d9f16da8ae7903362b6cb1da1e4470ca40 Mon Sep 17 00:00:00 2001 From: Jake Writer Date: Wed, 18 Mar 2026 22:21:58 -0600 Subject: [PATCH 2/3] [Bugfix] Fix gdn_in_proj output size for quantized models (AWQ/GPTQ) with LoRA The `gdn_in_proj` custom op (introduced in f1740006e / PR #36795) uses `self.in_proj_qkvz.weight.shape[0]` to communicate the output tensor size to torch.compile's fake implementation. With LoRA + AWQ/GPTQ quantization, `.weight` returns the quantized `qweight` whose shape is packed (e.g. input_size // 8 for 4-bit), causing a dimension mismatch in the subsequent `.split()` call. Fix: compute output sizes analytically from model dimensions (key_dim, value_dim, num_v_heads, tp_size) instead of reading from the weight tensor shape. These computed values are identical to weight.shape[0] for non-quantized models, so there is no regression. Tested with: - cyankiwi/Qwen3.5-9B-AWQ-4bit + LoRA adapters (torch.compile) - Qwen/Qwen3.5-9B without quantization (torch.compile) - Qwen/Qwen3.5-9B + LoRA adapters without quantization (eager) - Qwen/Qwen3.5-35B-A3B-GPTQ-Int4 (torch.compile) Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Jake Writer --- vllm/model_executor/models/qwen3_5.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/qwen3_5.py b/vllm/model_executor/models/qwen3_5.py index 9f0238a6d10b..d74e07dde62f 100644 --- a/vllm/model_executor/models/qwen3_5.py +++ b/vllm/model_executor/models/qwen3_5.py @@ -180,14 +180,15 @@ def forward( # ============================================================ # Part 1: Input Projection # ============================================================ + qkv_size = (self.key_dim * 2 + self.value_dim) // self.tp_size + z_size = self.value_dim // self.tp_size + ba_size = (self.num_v_heads * 2) // self.tp_size mixed_qkvz, ba = torch.ops.vllm.gdn_in_proj( hidden_states, - self.in_proj_qkvz.weight.shape[0], - self.in_proj_ba.weight.shape[0], + qkv_size + z_size, + ba_size, self.prefix, ) - qkv_size = (self.key_dim * 2 + self.value_dim) // self.tp_size - z_size = self.value_dim // self.tp_size mixed_qkv, z = mixed_qkvz.split([qkv_size, z_size], dim=-1) z = z.reshape(z.size(0), -1, self.head_v_dim) b, a = ba.chunk(2, dim=-1) From 08d7b09b2cc1911b395163a8220a44bd5e8d8aa3 Mon Sep 17 00:00:00 2001 From: Jake Writer Date: Wed, 18 Mar 2026 22:34:46 -0600 Subject: [PATCH 3/3] [Test] Add regression tests for gdn_in_proj quantization fix Add two tests to prevent future regressions: 1. test_qwen3_5_forward_does_not_use_weight_shape_for_gdn_in_proj: Verifies the forward method computes gdn_in_proj output sizes from model dimensions instead of .weight.shape[0], which returns wrong values for quantized models (AWQ/GPTQ) with LoRA. 2. test_qwen3_5_gdn_output_sizes_match_model_dims: Validates the computed output size formulas against known Qwen3.5-9B dimensions, including TP sharding correctness. Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Jake Writer --- tests/lora/test_qwen3_5_lora.py | 93 +++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/tests/lora/test_qwen3_5_lora.py b/tests/lora/test_qwen3_5_lora.py index 7323bf990469..37e968c48c6e 100644 --- a/tests/lora/test_qwen3_5_lora.py +++ b/tests/lora/test_qwen3_5_lora.py @@ -154,3 +154,96 @@ def test_qwen3_5_stacked_params_shard_ids(): assert len(ba_entries) == 2 ba_shard_ids = {e[2] for e in ba_entries} assert ba_shard_ids == {0, 1} + + +def test_qwen3_5_forward_does_not_use_weight_shape_for_gdn_in_proj(): + """Verify forward() computes gdn_in_proj output sizes from model dims. + + Regression test for: when using quantized models (AWQ/GPTQ) with LoRA, + self.in_proj_qkvz.weight.shape[0] returns the packed quantized weight + dimension (e.g. input_size // 8 for 4-bit) instead of the actual output + size. This caused torch.compile to trace with wrong tensor shapes, + leading to a split size mismatch error. + + The fix computes output sizes from key_dim, value_dim, num_v_heads, and + tp_size instead of reading weight.shape[0]. + + Introduced by commit f1740006e ([Perf] Enable dual stream execution of + input projection for Qwen3 #36795) which added the gdn_in_proj custom op. + """ + import ast + import inspect + import textwrap + + source = inspect.getsource(Qwen3_5GatedDeltaNet.forward) + source = textwrap.dedent(source) + tree = ast.parse(source) + + # Ensure the forward method does NOT reference .weight.shape + # (which breaks for quantized models with LoRA) + source_text = source + assert ".weight.shape" not in source_text, ( + "Qwen3_5GatedDeltaNet.forward() must not use .weight.shape to " + "determine gdn_in_proj output sizes. For quantized models " + "(AWQ/GPTQ) with LoRA, .weight returns the packed qweight whose " + "shape does not reflect the actual output dimension. Use computed " + "sizes from model dimensions (key_dim, value_dim, etc.) instead." + ) + + # Verify gdn_in_proj is called with computed size expressions, + # not attribute accesses on weight + gdn_call_found = False + for node in ast.walk(tree): + if isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute): + func_str = ast.dump(node.func) + if "gdn_in_proj" in func_str: + gdn_call_found = True + # The 2nd and 3rd positional args (index 1,2) should be + # computed sizes, not .weight.shape[0] attribute accesses + for arg in node.args[1:3]: + arg_source = ast.get_source_segment(source, arg) + if arg_source: + assert "weight" not in arg_source, ( + f"gdn_in_proj argument '{arg_source}' must not " + "reference .weight - use computed model dims" + ) + assert gdn_call_found, "Could not find gdn_in_proj call in forward()" + + +def test_qwen3_5_gdn_output_sizes_match_model_dims(): + """Verify computed output sizes match expected values for Qwen3.5-9B. + + For Qwen3.5-9B: + - key_dim = num_k_heads * head_k_dim = 16 * 128 = 2048 + - value_dim = num_v_heads * head_v_dim = 32 * 128 = 4096 + - num_v_heads = 32 + + With tp_size=1: + - qkvz_output = (2048*2 + 4096) + 4096 = 12288 + - ba_output = 32 * 2 = 64 + """ + key_dim = 2048 + value_dim = 4096 + num_v_heads = 32 + tp_size = 1 + + qkv_size = (key_dim * 2 + value_dim) // tp_size # 8192 + z_size = value_dim // tp_size # 4096 + ba_size = (num_v_heads * 2) // tp_size # 64 + + # These are the values passed to gdn_in_proj + qkvz_output = qkv_size + z_size + assert qkvz_output == 12288, f"Expected 12288, got {qkvz_output}" + assert ba_size == 64, f"Expected 64, got {ba_size}" + + # The split after gdn_in_proj must consume the full qkvz output + assert qkv_size + z_size == qkvz_output + + # Verify with tp_size=2 + tp_size = 2 + qkv_size_tp2 = (key_dim * 2 + value_dim) // tp_size # 4096 + z_size_tp2 = value_dim // tp_size # 2048 + ba_size_tp2 = (num_v_heads * 2) // tp_size # 32 + + assert qkv_size_tp2 + z_size_tp2 == 6144 + assert ba_size_tp2 == 32