From ac1c83417f0624142410971f3e7a55b6560f3515 Mon Sep 17 00:00:00 2001
From: lesj0610
Date: Fri, 10 Apr 2026 07:38:48 +0900
Subject: [PATCH 1/2] Fix GPTQMarlin group sizing for row-parallel layers

Signed-off-by: lesj0610
---
 tests/quantization/test_gptq_marlin.py          | 58 +++++++++++++++++++
 .../layers/quantization/gptq_marlin.py          |  5 +-
 2 files changed, 61 insertions(+), 2 deletions(-)
 create mode 100644 tests/quantization/test_gptq_marlin.py

diff --git a/tests/quantization/test_gptq_marlin.py b/tests/quantization/test_gptq_marlin.py
new file mode 100644
index 000000000000..590d5209b9c4
--- /dev/null
+++ b/tests/quantization/test_gptq_marlin.py
@@ -0,0 +1,58 @@
+import torch
+
+import vllm.model_executor.layers.quantization.gptq_marlin as gptq_marlin_mod
+import vllm.model_executor.parameter as parameter_mod
+from vllm.model_executor.layers.quantization.gptq_marlin import (
+    GPTQMarlinConfig,
+    GPTQMarlinLinearMethod,
+)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+
+
+class _DummyKernel:
+    def __init__(self, *args, **kwargs):
+        pass
+
+    def process_weights_after_loading(self, layer):
+        return None
+
+    def apply_weights(self, layer, x, bias):
+        raise NotImplementedError
+
+
+def test_gptq_marlin_create_weights_uses_ceil_groups_for_row_parallel(
+    monkeypatch,
+):
+    monkeypatch.setattr(
+        gptq_marlin_mod, "verify_marlin_supported", lambda **kwargs: None
+    )
+    monkeypatch.setattr(
+        gptq_marlin_mod, "choose_mp_linear_kernel", lambda _: _DummyKernel
+    )
+    monkeypatch.setattr(parameter_mod, "get_tensor_model_parallel_rank", lambda: 0)
+    monkeypatch.setattr(parameter_mod, "get_tensor_model_parallel_world_size", lambda: 1)
+
+    config = GPTQMarlinConfig(
+        weight_bits=4,
+        group_size=128,
+        desc_act=False,
+        is_sym=True,
+        lm_head_quantized=False,
+        dynamic={},
+        full_config={},
+    )
+    method = GPTQMarlinLinearMethod(config)
+    layer = torch.nn.Module()
+
+    method.create_weights(
+        layer=layer,
+        input_size_per_partition=2112,
+        output_partition_sizes=[2816],
+        input_size=4224,
+        output_size=2816,
+        params_dtype=torch.float16,
+        weight_loader=default_weight_loader,
+    )
+
+    assert layer.scales.shape == (17, 2816)
+    assert layer.qzeros.shape == (17, 352)
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index ce0dc0f4e052..29df6e8e839b 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -60,6 +60,7 @@
 from vllm.scalar_type import scalar_types
 from vllm.transformers_utils.config import get_safetensors_params_metadata
 from vllm.utils.collection_utils import is_list_of
+from vllm.utils.math_utils import cdiv
 
 logger = init_logger(__name__)
 
@@ -395,12 +396,12 @@ def create_weights(
             # By setting scale_dim == None, weight_loader will
             # repeat the scales on each GPU in TP>1 case.
             scales_and_zp_input_dim = None
-            scales_and_zp_size = input_size // group_size
+            scales_and_zp_size = cdiv(input_size, group_size)
         else:
             # By setting scale_dim == 0, weight_loader will
             # shard the scales in TP>1 case.
             scales_and_zp_input_dim = 0
-            scales_and_zp_size = input_size_per_partition // group_size
+            scales_and_zp_size = cdiv(input_size_per_partition, group_size)
 
         # Quantized weights
         qweight = PackedvLLMParameter(
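
A short, self-contained sketch of the sizing arithmetic behind this first patch, using the dimensions from the new test (input_size=4224, input_size_per_partition=2112, group_size=128, 4-bit values packed eight per int32); the local cdiv helper mirrors vllm.utils.math_utils.cdiv and the rest is plain arithmetic, not vLLM code:

def cdiv(a: int, b: int) -> int:
    # Ceiling division, matching what the patch imports from vllm.utils.math_utils.
    return -(-a // b)

input_size_per_partition = 2112  # per-shard K of the row-parallel layer in the test
output_size = 2816
group_size = 128
pack_factor = 8                  # eight 4-bit zero-points per int32 column

# Floor division drops the trailing partial group of the shard ...
assert input_size_per_partition // group_size == 16
# ... while ceiling division keeps it, matching the 17 group rows the test expects.
assert cdiv(input_size_per_partition, group_size) == 17

# Shapes asserted by the test: one scale row per group, qzeros packed along the
# output dimension by the 4-bit pack factor.
scales_shape = (cdiv(input_size_per_partition, group_size), output_size)               # (17, 2816)
qzeros_shape = (cdiv(input_size_per_partition, group_size), output_size // pack_factor)  # (17, 352)
print(scales_shape, qzeros_shape)
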
From 653c034b816699ad1d493acc92c47db55afe0867 Mon Sep 17 00:00:00 2001
From: lesj0610
Date: Sat, 11 Apr 2026 12:23:32 +0900
Subject: [PATCH 2/2] Handle Gemma4 AutoRound serving gaps on top of GPTQMarlin fix

Signed-off-by: lesj0610
---
 tests/quantization/test_gptq_marlin.py          | 136 +++++++++++++++++-
 .../layers/fused_moe/router/gate_linear.py      |  18 ++-
 .../layers/quantization/gptq_marlin.py          |   6 +
 .../layers/quantization/moe_wna16.py            |   5 +-
 vllm/model_executor/models/gemma4.py            |   1 +
 vllm/model_executor/parameter.py                |  10 +-
 6 files changed, 163 insertions(+), 13 deletions(-)

diff --git a/tests/quantization/test_gptq_marlin.py b/tests/quantization/test_gptq_marlin.py
index 590d5209b9c4..f00935985795 100644
--- a/tests/quantization/test_gptq_marlin.py
+++ b/tests/quantization/test_gptq_marlin.py
@@ -1,11 +1,20 @@
+from types import SimpleNamespace
+
 import torch
 
+import vllm.model_executor.layers.fused_moe as fused_moe_mod
+import vllm.model_executor.layers.linear as linear_mod
 import vllm.model_executor.layers.quantization.gptq_marlin as gptq_marlin_mod
 import vllm.model_executor.parameter as parameter_mod
+from vllm.model_executor.layers.fused_moe.activation import MoEActivation
+from vllm.model_executor.layers.fused_moe.router.gate_linear import GateLinear
+from vllm.model_executor.layers.linear import RowParallelLinear
 from vllm.model_executor.layers.quantization.gptq_marlin import (
     GPTQMarlinConfig,
     GPTQMarlinLinearMethod,
 )
+from vllm.model_executor.layers.quantization.inc import INCConfig
+from vllm.model_executor.layers.quantization.moe_wna16 import MoeWNA16Method
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 
@@ -17,7 +26,10 @@ def process_weights_after_loading(self, layer):
         return None
 
     def apply_weights(self, layer, x, bias):
-        raise NotImplementedError
+        out = torch.zeros((*x.shape[:-1], layer.output_size), dtype=x.dtype)
+        if bias is not None:
+            out = out + bias
+        return out
 
 
 def test_gptq_marlin_create_weights_uses_ceil_groups_for_row_parallel(
@@ -56,3 +68,125 @@ def test_gptq_marlin_create_weights_uses_ceil_groups_for_row_parallel(
 
     assert layer.scales.shape == (17, 2816)
     assert layer.qzeros.shape == (17, 352)
+
+
+def test_inc_row_parallel_overlap_group_load_uses_global_group_offsets(
+    monkeypatch,
+):
+    monkeypatch.setattr(
+        gptq_marlin_mod, "verify_marlin_supported", lambda **kwargs: None
+    )
+    monkeypatch.setattr(
+        gptq_marlin_mod, "choose_mp_linear_kernel", lambda _: _DummyKernel
+    )
+    monkeypatch.setattr(linear_mod, "get_tensor_model_parallel_rank", lambda: 1)
+    monkeypatch.setattr(linear_mod, "get_tensor_model_parallel_world_size", lambda: 2)
+    monkeypatch.setattr(parameter_mod, "get_tensor_model_parallel_rank", lambda: 1)
+    monkeypatch.setattr(parameter_mod, "get_tensor_model_parallel_world_size", lambda: 2)
+
+    quant_config = INCConfig(
+        weight_bits=4,
+        group_size=128,
+        sym=True,
+        packing_format="auto_round:auto_gptq",
+        backend="marlin",
+    )
+    layer = RowParallelLinear(
+        2112,
+        2816,
+        bias=False,
+        quant_config=quant_config,
+        prefix="model.layers.0.mlp.down_proj",
+    )
+
+    scales = torch.arange(17 * 2816, dtype=layer.scales.dtype).reshape(17, 2816)
+    qzeros = torch.arange(17 * 352, dtype=layer.qzeros.dtype).reshape(17, 352)
+
+    layer.weight_loader_v2(layer.scales, scales)
+    layer.weight_loader_v2(layer.qzeros, qzeros)
+
+    assert torch.equal(layer.scales.data, scales[8:17])
+    assert torch.equal(layer.qzeros.data, qzeros[8:17])
+
+
+def test_gate_linear_quantized_forward_does_not_require_unquantized_weight(
+    monkeypatch,
+):
+    monkeypatch.setattr(
+        gptq_marlin_mod, "verify_marlin_supported", lambda **kwargs: None
+    )
+    monkeypatch.setattr(
+        gptq_marlin_mod, "choose_mp_linear_kernel", lambda _: _DummyKernel
+    )
+    monkeypatch.setattr(linear_mod, "get_tensor_model_parallel_rank", lambda: 0)
+    monkeypatch.setattr(linear_mod, "get_tensor_model_parallel_world_size", lambda: 1)
+    monkeypatch.setattr(parameter_mod, "get_tensor_model_parallel_rank", lambda: 0)
+    monkeypatch.setattr(parameter_mod, "get_tensor_model_parallel_world_size", lambda: 1)
+
+    quant_config = INCConfig(
+        weight_bits=4,
+        group_size=128,
+        sym=True,
+        packing_format="auto_round:auto_gptq",
+        backend="marlin",
+    )
+    gate = GateLinear(
+        2816,
+        128,
+        bias=False,
+        out_dtype=torch.float32,
+        quant_config=quant_config,
+        prefix="model.layers.0.router.proj",
+    )
+
+    assert not hasattr(gate, "weight")
+    assert hasattr(gate, "qweight")
+
+    x = torch.randn(3, 2816, dtype=torch.float16)
+    output, output_bias = gate(x)
+
+    assert output_bias is None
+    assert output.shape == (3, 128)
+    assert output.dtype == torch.float32
+
+
+def test_moe_wna16_apply_passes_layer_activation(monkeypatch):
+    captured: dict[str, object] = {}
+
+    def fake_fused_experts(x, w1, w2, **kwargs):
+        captured["activation"] = kwargs["activation"]
+        captured["quant_config"] = kwargs["quant_config"]
+        return torch.zeros_like(x)
+
+    monkeypatch.setattr(fused_moe_mod, "fused_experts", fake_fused_experts)
+
+    method = MoeWNA16Method(
+        SimpleNamespace(),
+        SimpleNamespace(disable_inplace=False),
+    )
+    method.moe_quant_config = object()
+
+    layer = SimpleNamespace(
+        activation=MoEActivation.GELU,
+        w13_qweight=torch.empty(2, 4, 4, dtype=torch.uint8),
+        w2_qweight=torch.empty(2, 4, 4, dtype=torch.uint8),
+        apply_router_weight_on_input=False,
+        global_num_experts=2,
+        expert_map=None,
+    )
+
+    x = torch.randn(3, 8, dtype=torch.float16)
+    topk_weights = torch.ones(3, 1, dtype=torch.float32)
+    topk_ids = torch.zeros(3, 1, dtype=torch.int32)
+
+    output = method.apply(
+        layer=layer,
+        x=x,
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        shared_experts_input=None,
+    )
+
+    assert captured["activation"] == MoEActivation.GELU
+    assert captured["quant_config"] is method.moe_quant_config
+    assert output.shape == x.shape
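
The slice boundaries asserted in test_inc_row_parallel_overlap_group_load_uses_global_group_offsets come from offsetting in global group rows rather than in local shard rows. A small standalone sketch of that arithmetic, with the numbers taken from the test (2112 input features split across two ranks, group_size 128); the local cdiv again mirrors vllm.utils.math_utils.cdiv:

def cdiv(a: int, b: int) -> int:
    return -(-a // b)

input_size = 2112
tp_size = 2
group_size = 128
input_size_per_partition = input_size // tp_size          # 1056

total_groups = cdiv(input_size, group_size)                # 17 rows in the checkpoint tensor
local_groups = cdiv(input_size_per_partition, group_size)  # 9 rows allocated per rank

for tp_rank in range(tp_size):
    # Offset in global group rows, as the patched loader computes it.
    start = (tp_rank * input_size_per_partition) // group_size
    assert start + local_groups <= total_groups
    print(f"rank {tp_rank}: rows {start}:{start + local_groups}")
# rank 0: rows 0:9
# rank 1: rows 8:17  (group 8 straddles the shard boundary, so both ranks load it)
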
diff --git a/vllm/model_executor/layers/fused_moe/router/gate_linear.py b/vllm/model_executor/layers/fused_moe/router/gate_linear.py
index 77d8e756026d..d5173bc5700b 100644
--- a/vllm/model_executor/layers/fused_moe/router/gate_linear.py
+++ b/vllm/model_executor/layers/fused_moe/router/gate_linear.py
@@ -5,6 +5,7 @@
 
 from vllm.model_executor.custom_op import PluggableLayer
 from vllm.model_executor.layers.linear import ReplicatedLinear
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.platforms import current_platform
 
 
@@ -33,6 +34,7 @@ def __init__(
         out_dtype: torch.dtype | None = None,
         params_dtype: torch.dtype | None = None,
         force_fp32_compute: bool = False,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         is_hopper_or_blackwell = current_platform.is_device_capability(
@@ -52,10 +54,12 @@ def __init__(
             output_size,
             bias=bias,
             params_dtype=params_dtype,
-            quant_config=None,
+            quant_config=quant_config,
             prefix=prefix,
         )
         self.out_dtype = out_dtype
+        weight = getattr(self, "weight", None)
+        weight_dtype = None if weight is None else weight.dtype
 
         # DSV3 specialized kernel eligibility (SM90+, exact dims)
         self.allow_specialized_router_gemm = can_use_specialized_kernels
@@ -68,7 +72,7 @@ def __init__(
         # cuBLAS bf16→fp32 eligibility
         self.allow_cublas_router_gemm = (
             self.allow_specialized_router_gemm
-            and self.weight.dtype == torch.bfloat16
+            and weight_dtype == torch.bfloat16
             and self.out_dtype == torch.float32
         )
 
@@ -87,7 +91,10 @@ def set_out_dtype(self, out_dtype: torch.dtype) -> None:
             and self.allow_specialized_router_gemm
             and out_dtype == torch.float32
         ):
-            self.allow_cublas_router_gemm = self.weight.dtype == torch.bfloat16
+            weight = getattr(self, "weight", None)
+            self.allow_cublas_router_gemm = (
+                weight is not None and weight.dtype == torch.bfloat16
+            )
 
     def forward(
         self, x: torch.Tensor
@@ -109,8 +116,9 @@ def forward(
             return output, None
 
         # Tier 3: F.linear (ReplicatedLinear)
-        if self.out_dtype is not None and x.dtype != self.weight.dtype:
-            x = x.to(self.weight.dtype)
+        weight = getattr(self, "weight", None)
+        if self.out_dtype is not None and weight is not None and x.dtype != weight.dtype:
+            x = x.to(weight.dtype)
         output, output_bias = super().forward(x)
         if self.out_dtype is not None and output.dtype != self.out_dtype:
             output = output.to(self.out_dtype)
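
The common thread in these GateLinear changes is that a quantized linear method registers packed parameters (qweight, scales, qzeros) on the module and no plain weight attribute, so every dtype probe has to tolerate its absence. A minimal illustration of that guard, with a hypothetical stub module standing in for a quantized gate:

import torch


class _QuantizedGateStub(torch.nn.Module):
    """Hypothetical stand-in: exposes qweight but no plain weight."""

    def __init__(self):
        super().__init__()
        self.qweight = torch.zeros(1, dtype=torch.int32)


gate = _QuantizedGateStub()
weight = getattr(gate, "weight", None)             # None instead of AttributeError
weight_dtype = None if weight is None else weight.dtype

# The bf16-to-fp32 cuBLAS fast path simply stays disabled for quantized gates;
# forward() then falls through to the quant method via ReplicatedLinear.
allow_cublas_router_gemm = weight_dtype == torch.bfloat16
print(weight_dtype, allow_cublas_router_gemm)      # None False
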
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 29df6e8e839b..52c348a4bd03 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -464,6 +464,12 @@ def create_weights(
                 packed_factor=self.quant_config.pack_factor,
                 **qzeros_args,
             )
+        row_group_attrs = {
+            "row_group_size": group_size,
+            "row_input_size_per_partition": input_size_per_partition,
+        }
+        set_weight_attrs(scales, row_group_attrs)
+        set_weight_attrs(qzeros, row_group_attrs)
 
         layer.register_parameter("qweight", qweight)
         layer.register_parameter("g_idx", g_idx)
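
The two attributes stamped onto scales and qzeros here are consumed at load time by the load_row_parallel_weight change in vllm/model_executor/parameter.py further down in this patch. A self-contained sketch of that offset selection (the helper name load_row_parallel_groups is illustrative, not vLLM API):

import torch


def load_row_parallel_groups(
    param_data, loaded_weight, tp_rank, input_dim=0,
    row_group_size=None, row_input_size_per_partition=None,
):
    # Mirror of the patched load_row_parallel_weight: when the group attributes
    # are present, offset in global group rows; otherwise keep the old
    # shard_size-based offset.
    shard_size = param_data.shape[input_dim]
    if row_group_size is not None and row_input_size_per_partition is not None:
        start_idx = (tp_rank * row_input_size_per_partition) // row_group_size
    else:
        start_idx = tp_rank * shard_size
    return loaded_weight.narrow(input_dim, start_idx, shard_size)


# Rank 1 of 2 for the 2112-wide layer from the tests: 9 local group rows taken
# from global rows 8..16 of the 17-row checkpoint tensor.
full = torch.arange(17 * 4, dtype=torch.float32).reshape(17, 4)
local = torch.empty(9, 4)
picked = load_row_parallel_groups(
    local, full, tp_rank=1, row_group_size=128, row_input_size_per_partition=1056
)
assert torch.equal(picked, full[8:17])
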
diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py
index a327ac17bbc9..71abd61eb0ab 100644
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -372,10 +372,6 @@ def apply(
     ) -> torch.Tensor:
         from vllm.model_executor.layers.fused_moe import fused_experts
 
-        assert layer.activation == MoEActivation.SILU, (
-            f"Only SiLU activation is supported, not {layer.activation}."
-        )
-
         return fused_experts(
             x,
             layer.w13_qweight,
@@ -383,6 +379,7 @@ def apply(
             topk_weights=topk_weights,
             topk_ids=topk_ids,
             inplace=not self.moe.disable_inplace,
+            activation=layer.activation,
             apply_router_weight_on_input=layer.apply_router_weight_on_input,
             global_num_experts=layer.global_num_experts,
             expert_map=layer.expert_map,
diff --git a/vllm/model_executor/models/gemma4.py b/vllm/model_executor/models/gemma4.py
index c41c0de52b01..455344ff930b 100644
--- a/vllm/model_executor/models/gemma4.py
+++ b/vllm/model_executor/models/gemma4.py
@@ -161,6 +161,7 @@ def __init__(
             config.num_experts,
             bias=False,
             out_dtype=torch.float32,
+            quant_config=quant_config,
             prefix=f"{prefix}.proj",
         )
 
diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py
index 410e277493b0..e1be0915a4e1 100644
--- a/vllm/model_executor/parameter.py
+++ b/vllm/model_executor/parameter.py
@@ -219,9 +219,13 @@ def input_dim(self):
 
     def load_row_parallel_weight(self, loaded_weight: torch.Tensor):
         shard_size = self.data.shape[self.input_dim]
-        loaded_weight = loaded_weight.narrow(
-            self.input_dim, self.tp_rank * shard_size, shard_size
-        )
+        group_size = getattr(self, "row_group_size", None)
+        input_partition_size = getattr(self, "row_input_size_per_partition", None)
+        if group_size is not None and input_partition_size is not None:
+            start_idx = (self.tp_rank * input_partition_size) // group_size
+        else:
+            start_idx = self.tp_rank * shard_size
+        loaded_weight = loaded_weight.narrow(self.input_dim, start_idx, shard_size)
 
         if len(loaded_weight.shape) == 0:
             loaded_weight = loaded_weight.reshape(1)
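
With the SiLU-only assert removed, MoeWNA16Method.apply simply forwards layer.activation to fused_experts, which is what the new test exercises with MoEActivation.GELU. A toy sketch of that kind of dispatch (fake_fused_experts is illustrative only, and plain strings stand in for the MoEActivation enum):

import torch
import torch.nn.functional as F


def fake_fused_experts(x: torch.Tensor, activation: str = "silu") -> torch.Tensor:
    # Illustrative stand-in: pick the expert activation from the argument the
    # caller forwards, instead of hard-coding SiLU.
    act = {"silu": F.silu, "gelu": F.gelu}[activation]
    return act(x)


x = torch.randn(4)
print(fake_fused_experts(x, activation="gelu"))
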