diff --git a/docker/versions.json b/docker/versions.json index ee23b5baf04f..6301c6bab81a 100644 --- a/docker/versions.json +++ b/docker/versions.json @@ -68,7 +68,7 @@ "default": "true" }, "FLASHINFER_VERSION": { - "default": "0.6.11.post2" + "default": "0.6.11.post3" }, "GDRCOPY_CUDA_VERSION": { "default": "12.8" diff --git a/requirements/cuda.txt b/requirements/cuda.txt index 20fec60f27d3..bc31330eee43 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -9,8 +9,8 @@ torchaudio==2.11.0 # These must be updated alongside torch torchvision==0.26.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version # FlashInfer should be updated together with the Dockerfile -flashinfer-python==0.6.11.post2 -flashinfer-cubin==0.6.11.post2 +flashinfer-python==0.6.11.post3 +flashinfer-cubin==0.6.11.post3 apache-tvm-ffi==0.1.9 tilelang==0.1.9 # Cap nvidia-cudnn-frontend (transitive dep of flashinfer) due to diff --git a/tests/kernels/moe/test_flashinfer_b12x_moe.py b/tests/kernels/moe/test_flashinfer_b12x_moe.py index ec0a9594fe12..733f7d2f0589 100644 --- a/tests/kernels/moe/test_flashinfer_b12x_moe.py +++ b/tests/kernels/moe/test_flashinfer_b12x_moe.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from types import SimpleNamespace + import pytest import torch @@ -8,8 +10,7 @@ if not current_platform.is_device_capability_family(120): pytest.skip( - reason="FlashInfer CuteDSL SM12x MoE requires SM120 " - "(RTX Pro 6000 / DGX Spark).", + reason="FlashInfer B12x MoE requires SM120 (RTX Pro 6000 / DGX Spark).", allow_module_level=True, ) @@ -18,8 +19,8 @@ if not has_flashinfer_b12x_moe(): pytest.skip( reason=( - "FlashInfer cute_dsl_fused_moe_nvfp4 / convert_sf_to_mma_layout " - "not available in installed FlashInfer (needs PRs #3051 and #3066)." + "FlashInfer B12xMoEWrapper not available in installed " + "FlashInfer (needs PR #3080)." ), allow_module_level=True, ) @@ -40,7 +41,6 @@ from vllm.model_executor.layers.fused_moe.experts.flashinfer_b12x_moe import ( FlashInferB12xExperts, ) -from vllm.utils.flashinfer import flashinfer_convert_sf_to_mma_layout from vllm.utils.torch_utils import set_random_seed # Dimensions chosen to satisfy FP4 alignment requirements (k multiple of 256, @@ -59,9 +59,9 @@ def _reorder_gate_up_to_up_gate( ) -> tuple[torch.Tensor, torch.Tensor]: """Swap gate and up-projection halves along dim=1 to [up, gate] order. - The SM12x kernel expects weights in [up (w3), gate (w1)] order while the + The B12x kernel expects weights in [up (w3), gate (w1)] order while the BF16 reference uses [gate (w1), up (w3)]. This replicates the reordering - done at model-load time by ``prepare_nvfp4_moe_layer_for_fi_or_cutlass``. + done at model-load time by the FP4 layer-prep helper. """ n = w.shape[1] // 2 return ( @@ -70,6 +70,22 @@ def _reorder_gate_up_to_up_gate( ) +def _process_b12x_weights( + experts: FlashInferB12xExperts, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + w1_scale_2: torch.Tensor, + w2_scale_2: torch.Tensor, +) -> None: + layer = SimpleNamespace( + w13_weight_scale=w1_scale, + w13_weight_scale_2=w1_scale_2, + w2_weight_scale=w2_scale, + w2_weight_scale_2=w2_scale_2, + ) + experts.process_weights_after_loading(layer) + + @pytest.mark.parametrize("m,n,k", MNK_FACTORS) @pytest.mark.parametrize("e", [8, 16]) @pytest.mark.parametrize("topk", [1, 2, 4]) @@ -174,22 +190,12 @@ def test_flashinfer_b12x_moe( moe_config=moe_config, quant_config=quant_config, ) - # In production, process_weights_after_loading computes these after - # normalizing block scales. In the test the scales are already in final - # form (global_scale=1.0), so we compute the MMA layouts directly. - num_experts_w1, m1, k1_sf = w1_blockscale.shape - experts.w1_sf_mma = flashinfer_convert_sf_to_mma_layout( - w1_blockscale.reshape(num_experts_w1 * m1, k1_sf), - m=m1, - k=k1_sf * 16, - num_groups=num_experts_w1, - ) - num_experts_w2, m2, k2_sf = w2_blockscale.shape - experts.w2_sf_mma = flashinfer_convert_sf_to_mma_layout( - w2_blockscale.reshape(num_experts_w2 * m2, k2_sf), - m=m2, - k=k2_sf * 16, - num_groups=num_experts_w2, + _process_b12x_weights( + experts, + w1_blockscale, + w2_blockscale, + ones_e, + ones_e, ) kernel = mk.FusedMoEKernel( @@ -225,5 +231,135 @@ def test_flashinfer_b12x_moe( torch.testing.assert_close(sm12x_output, torch_output, atol=2e-1, rtol=2e-1) +@pytest.mark.parametrize("m,n,k", MNK_FACTORS) +@pytest.mark.parametrize("e", [8, 16]) +@pytest.mark.parametrize("topk", [1, 2, 4]) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +@torch.inference_mode() +def test_flashinfer_b12x_moe_relu2( + m: int, + n: int, + k: int, + e: int, + topk: int, + dtype: torch.dtype, + workspace_init, +): + """Test FlashInferB12xExperts with ReLU2 (non-gated) activation. + + ReLU2 is used by Nemotron-H style models. Unlike the gated SiLU + path, w1 has shape [E, N, K] (not [E, 2N, K]) and the activation + is relu(x)^2 without a gate/up split. + """ + set_random_seed(7) + with set_current_vllm_config( + VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) + ): + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + + # Non-gated: w1 shape is (e, n, k), not (e, 2n, k). + w1_bf16 = torch.randn((e, n, k), device="cuda", dtype=dtype) / 15 + w2_bf16 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 15 + + gs = torch.ones(1, device="cuda", dtype=torch.float32) + sf_vec_size = 16 + + # W1: no gate/up reordering for non-gated. + w1_flat = w1_bf16.reshape(e * n, k) + w1_q_flat, w1_sf_flat = fp4_quantize( + w1_flat, + global_scale=gs, + sf_vec_size=sf_vec_size, + is_sf_swizzled_layout=True, + ) + w1_q = w1_q_flat.view(e, n, k // 2) + w1_blockscale = w1_sf_flat.view(e, n, w1_sf_flat.shape[1]) + + w2_flat = w2_bf16.reshape(e * k, n) + w2_q_flat, w2_sf_flat = fp4_quantize( + w2_flat, + global_scale=gs, + sf_vec_size=sf_vec_size, + is_sf_swizzled_layout=True, + ) + w2_q = w2_q_flat.view(e, k, n // 2) + w2_blockscale = w2_sf_flat.view(e, k, w2_sf_flat.shape[1]) + + ones_e = torch.ones(e, device="cuda", dtype=torch.float32) + + quant_config = nvfp4_moe_quant_config( + g1_alphas=ones_e, + g2_alphas=ones_e, + a1_gscale=ones_e, + a2_gscale=ones_e, + w1_scale=w1_blockscale, + w2_scale=w2_blockscale, + ) + + moe_config = make_dummy_moe_config( + num_experts=e, + experts_per_token=topk, + hidden_dim=k, + intermediate_size_per_partition=n, + in_dtype=dtype, + activation=MoEActivation.RELU2_NO_MUL, + is_act_and_mul=False, + ) + + experts = FlashInferB12xExperts( + moe_config=moe_config, + quant_config=quant_config, + ) + _process_b12x_weights( + experts, + w1_blockscale, + w2_blockscale, + ones_e, + ones_e, + ) + + kernel = mk.FusedMoEKernel( + maybe_make_prepare_finalize( + moe=moe_config, + quant_config=quant_config, + allow_new_interface=True, + use_monolithic=False, + ), + experts, + inplace=False, + ) + + score = torch.randn((m, e), device="cuda", dtype=dtype) + topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False) + + b12x_output = kernel.apply( + hidden_states=a, + w1=w1_q, + w2=w2_q, + topk_weights=topk_weights, + topk_ids=topk_ids, + global_num_experts=e, + activation=MoEActivation.RELU2_NO_MUL, + apply_router_weight_on_input=False, + expert_map=None, + ) + + torch_output = torch_moe( + a, + w1_bf16, + w2_bf16, + score, + topk, + activation=MoEActivation.RELU2_NO_MUL, + ) + + torch.testing.assert_close( + b12x_output, + torch_output, + atol=2e-1, + rtol=2e-1, + ) + + if __name__ == "__main__": test_flashinfer_b12x_moe(16, 128, 256, 8, 2, torch.bfloat16) diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index acb2c21b3896..9141586c0d08 100644 --- a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -53,6 +53,8 @@ def make_dummy_moe_config( hidden_dim: int = 1, intermediate_size_per_partition: int = 1, in_dtype: torch.dtype = torch.bfloat16, + activation: MoEActivation = MoEActivation.SILU, + is_act_and_mul: bool = True, ) -> FusedMoEConfig: """ This is a dummy config for the mk constructor interface @@ -69,7 +71,8 @@ def make_dummy_moe_config( num_local_experts=num_experts, num_logical_experts=num_experts, moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(), - activation=MoEActivation.SILU, + activation=activation, + is_act_and_mul=is_act_and_mul, in_dtype=in_dtype, device="cuda", routing_method=RoutingMethodType.TopK, diff --git a/vllm/model_executor/layers/fused_moe/experts/flashinfer_b12x_moe.py b/vllm/model_executor/layers/fused_moe/experts/flashinfer_b12x_moe.py index 6481434f2e78..cb7f1c060382 100644 --- a/vllm/model_executor/layers/fused_moe/experts/flashinfer_b12x_moe.py +++ b/vllm/model_executor/layers/fused_moe/experts/flashinfer_b12x_moe.py @@ -17,10 +17,11 @@ QuantKey, kNvfp4Dynamic, kNvfp4Static, + kNvfp4StaticGroupScale, + kStaticTensorScale, ) from vllm.platforms import current_platform from vllm.utils.flashinfer import ( - flashinfer_b12x_fused_moe, flashinfer_convert_sf_to_mma_layout, has_flashinfer_b12x_moe, ) @@ -42,6 +43,11 @@ class FlashInferB12xExperts(mk.FusedMoEExpertsModular): Only NVFP4 (kNvfp4Static/kNvfp4Dynamic) quantization is supported. """ + _ACTIVATION_MAP: dict[MoEActivation, str] = { + MoEActivation.SILU: "silu", + MoEActivation.RELU2_NO_MUL: "relu2", + } + def __init__( self, moe_config: FusedMoEConfig, @@ -55,6 +61,60 @@ def __init__( self.num_local_experts = moe_config.num_local_experts self.ep_rank = moe_config.moe_parallel_config.ep_rank + # Shape params for B12xMoEWrapper construction. + self.global_num_experts = moe_config.num_experts + self.topk = moe_config.experts_per_token + self.hidden_dim = moe_config.hidden_dim + self.intermediate_size_per_partition = ( + moe_config.intermediate_size_per_partition + ) + self.max_num_tokens = moe_config.max_num_tokens + self.local_expert_offset = self.ep_rank * self.num_local_experts + + activation = moe_config.activation + if activation not in self._ACTIVATION_MAP: + raise ValueError( + f"FlashInferB12xExperts does not support " + f"activation {activation!r}. " + f"Supported: {list(self._ACTIVATION_MAP.keys())}" + ) + self._activation_str = self._ACTIVATION_MAP[activation] + + self.activation_precision = ( + "fp4" if quant_config.a1_gscale is not None else "bf16" + ) + + self.source_format = self._detect_source_format() + + # Lazily created on first apply() call. + self._wrapper: object | None = None + # Populated in process_weights_after_loading. + self.w1_sf_mma: torch.Tensor | None = None + self.w2_sf_mma: torch.Tensor | None = None + + @staticmethod + def _detect_source_format() -> str: + """Walk the constructor's call stack to find the parent quant-method + class and map it to a FlashInfer ``source_format`` string. + + ``make_nvfp4_moe_kernel`` instantiates the experts class from the + parent method's ``create_weights`` (compressed-tensors) or equivalent + (modelopt) — so the parent ``self`` is reachable in an outer frame. + Fall back to "modelopt" if no recognized parent is found. + """ + import inspect + + for frame_info in inspect.stack(): + parent = frame_info.frame.f_locals.get("self") + if parent is None: + continue + cls_name = type(parent).__name__ + if "CompressedTensors" in cls_name: + return "compressed_tensors" + if "ModelOpt" in cls_name: + return "modelopt" + return "modelopt" + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # Normalise block scales to absorb the per-expert weight global scale # (w_gs). vLLM's NVFP4 convention stores: @@ -87,26 +147,27 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: if self.a2_gscale is not None: self.a2_gscale.fill_(1.0) - # Precompute MMA-layout views of the weight scale factors once here - # rather than recomputing on every forward pass. - assert self.w1_scale is not None - num_experts_w1, m1, k1_sf = self.w1_scale.shape - k1 = k1_sf * 16 + # Precompute MMA-layout views of the (now-rewritten) weight scale + # factors once here rather than recomputing on every forward pass. + # Converts swizzled 3D scale factors [E, M, K_sf] to the 6D MMA + # layout expected by the SM12x kernel's _get_weight_views(). + assert self.w1_scale is not None and self.w2_scale is not None + sf_vec_size = 16 + E_w1, M_w1, K_sf_w1 = self.w1_scale.shape self.w1_sf_mma = flashinfer_convert_sf_to_mma_layout( - self.w1_scale.reshape(num_experts_w1 * m1, k1_sf), - m=m1, - k=k1, - num_groups=num_experts_w1, + self.w1_scale.reshape(E_w1 * M_w1, K_sf_w1), + m=M_w1, + k=K_sf_w1 * sf_vec_size, + num_groups=E_w1, + sf_vec_size=sf_vec_size, ) - - assert self.w2_scale is not None - num_experts_w2, m2, k2_sf = self.w2_scale.shape - k2 = k2_sf * 16 + E_w2, M_w2, K_sf_w2 = self.w2_scale.shape self.w2_sf_mma = flashinfer_convert_sf_to_mma_layout( - self.w2_scale.reshape(num_experts_w2 * m2, k2_sf), - m=m2, - k=k2, - num_groups=num_experts_w2, + self.w2_scale.reshape(E_w2 * M_w2, K_sf_w2), + m=M_w2, + k=K_sf_w2 * sf_vec_size, + num_groups=E_w2, + sf_vec_size=sf_vec_size, ) @staticmethod @@ -124,18 +185,32 @@ def _supports_current_device() -> bool: @staticmethod def _supports_no_act_and_mul() -> bool: - return False + return True @staticmethod def _supports_quant_scheme( weight_key: QuantKey | None, activation_key: QuantKey | None, ) -> bool: - return (weight_key, activation_key) == (kNvfp4Static, kNvfp4Dynamic) + # Original W4A4 NVFP4 (modelopt format). + if (weight_key, activation_key) == (kNvfp4Static, kNvfp4Dynamic): + return True + + # W4A16 NVFP4 compressed-tensors `nvfp4-pack-quantized` + if ( + weight_key is not None + and weight_key.dtype == torch.uint8 + and weight_key.scale == kNvfp4StaticGroupScale + and weight_key.scale2 == kStaticTensorScale + and weight_key.symmetric + and activation_key is None + ): + return True + return False @staticmethod def _supports_activation(activation: MoEActivation) -> bool: - return activation == MoEActivation.SILU + return activation in (MoEActivation.SILU, MoEActivation.RELU2_NO_MUL) @staticmethod def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool: @@ -167,13 +242,31 @@ def workspace_shapes( @property def expects_unquantized_inputs(self) -> bool: - # b12x_fused_moe expects BF16 hidden states and performs its own FP4 + # B12xMoEWrapper expects BF16 hidden states and performs its own FP4 # quantization internally. Returning True prevents the modular kernel - # from pre-quantizing activations, which would produce an FP4-packed - # tensor with size(-1)=k//2 and break the scale-factor conversion that - # expects size(-1)=k. + # from pre-quantizing activations. return True + def _ensure_wrapper(self) -> None: + """Lazily create B12xMoEWrapper on first use.""" + if self._wrapper is not None: + return + + from flashinfer.fused_moe import B12xMoEWrapper + + self._wrapper = B12xMoEWrapper( + num_experts=self.global_num_experts, + top_k=self.topk, + hidden_size=self.hidden_dim, + intermediate_size=self.intermediate_size_per_partition, + use_cuda_graph=True, + max_num_tokens=self.max_num_tokens, + num_local_experts=self.num_local_experts, + activation=self._activation_str, + activation_precision=self.activation_precision, + source_format=self.source_format, + ) + def apply( self, output: torch.Tensor, @@ -201,13 +294,14 @@ def apply( assert self.a2_gscale is not None, ( "a2_gscale must not be None for FlashInferB12xExperts" ) + assert self.w1_sf_mma is not None and self.w2_sf_mma is not None, ( + "process_weights_after_loading must run before FlashInferB12xExperts.apply" + ) - top_k = topk_ids.shape[1] + self._ensure_wrapper() - flashinfer_b12x_fused_moe( + result = self._wrapper.run( x=hidden_states, - token_selected_experts=topk_ids.to(torch.int32), - token_final_scales=topk_weights, w1_weight=w1, w1_weight_sf=self.w1_sf_mma, w1_alpha=self.g1_alphas, @@ -215,9 +309,7 @@ def apply( w2_weight=w2, w2_weight_sf=self.w2_sf_mma, w2_alpha=self.g2_alphas, - num_experts=global_num_experts, - top_k=top_k, - num_local_experts=self.num_local_experts, - output_dtype=self.out_dtype, - output=output, + token_selected_experts=topk_ids.to(torch.int32), + token_final_scales=topk_weights, ) + output.copy_(result) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 4a3b7619b94c..5d0c951f8b8d 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -50,6 +50,7 @@ LinearMethodBase, UnquantizedLinearMethod, ) +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, @@ -2306,6 +2307,24 @@ def _resolve_quant_algo(self, prefix: str) -> str | None: if prefix in self.quantized_layers: return self.quantized_layers[prefix]["quant_algo"].upper() + # Qwen VLM wrappers can construct the LM head under a nested vLLM + # prefix while ModelOpt exports the tensor names as top-level lm_head.*. + if prefix.endswith(".lm_head") and "lm_head" in self.quantized_layers: + return self.quantized_layers["lm_head"]["quant_algo"].upper() + + # Qwen3.5/3.6-MoE VLM: vLLM's internal naming can be + # language_model.model.layers.X... while ModelOpt exports keys as + # model.language_model.layers.X... (swapped wrapper order). Try the + # swap as a direct fallback for any prefix that does not match. + if prefix.startswith("language_model.model."): + swapped = "model.language_model." + prefix[len("language_model.model.") :] + if swapped in self.quantized_layers: + return self.quantized_layers[swapped]["quant_algo"].upper() + elif prefix.startswith("model.language_model."): + swapped = "language_model.model." + prefix[len("model.language_model.") :] + if swapped in self.quantized_layers: + return self.quantized_layers[swapped]["quant_algo"].upper() + # 2. Packed / fused layer lookup proj_name = prefix.rsplit(".", 1)[-1] if self.packed_modules_mapping and proj_name in self.packed_modules_mapping: @@ -2357,6 +2376,13 @@ def get_quant_method( # Layer not in quantized_layers — leave unquantized return UnquantizedLinearMethod() + if isinstance(layer, ParallelLMHead): + if quant_algo == "FP8": + return ModelOptFp8LinearMethod(self.fp8_config) + if quant_algo == "NVFP4": + return ModelOptNvFp4LinearMethod(self.nvfp4_config) + return UnquantizedLinearMethod() + if isinstance(layer, RoutedExperts): if quant_algo == "FP8": return ModelOptFp8MoEMethod( diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index 082e42f964f4..df509adbfca8 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -388,5 +388,8 @@ def prepare_nvfp4_moe_layer_for_fi_or_cutlass( w2_scale = torch.nn.functional.pad(w2_scale, (0, pad_size // 16)) w2_scale = swizzle_blockscale(w2_scale) + layer.moe_config.intermediate_size_per_partition = ( + layer.moe_config.intermediate_size_per_partition + pad_size + ) return w13, w13_scale, w13_scale_2, a13_scale, w2, w2_scale, w2_scale_2, a2_scale diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py index c02d39c17a02..69b26c05a0be 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py @@ -216,7 +216,11 @@ def prepare_fp4_layer_for_marlin( part_size_n = layer.output_size_per_partition part_size_k = layer.input_size_per_partition - param_dtype = layer.params_dtype + # VocabParallelEmbedding / ParallelLMHead does not store params_dtype as an + # attribute + param_dtype = getattr(layer, "params_dtype", None) + if param_dtype is None: + param_dtype = torch.get_default_dtype() assert layer.weight.shape == (part_size_n, part_size_k // 2) diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index ddae01856da0..31dd59caa10e 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -438,6 +438,16 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): # If parameter does not have output dim, then it should # be copied onto all gpus (e.g. g_idx for act_order gptq). if output_dim is None: + # AutoQuantize-quantized lm_head emits FP4 scalar scales + # (input_scale, weight_scale_2). Their on-disk shape is () while + # PerTensorScaleParameter materializes them as (1,) -- same numel, + # different rank. Reshape rather than asserting. + if ( + param.data.shape != loaded_weight.shape + and param.data.numel() == loaded_weight.numel() + ): + param.data.copy_(loaded_weight.reshape(param.data.shape)) + return assert param.data.shape == loaded_weight.shape param.data.copy_(loaded_weight) return diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index 8915d8172c56..0d303e3eb8a4 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -875,6 +875,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.lm_head = ParallelLMHead( config.vocab_size, config.hidden_size, + quant_config=self.quant_config, prefix=maybe_prefix(prefix, "lm_head"), ) diff --git a/vllm/model_executor/models/nemotron_h_mtp.py b/vllm/model_executor/models/nemotron_h_mtp.py index fe737438c30f..1f0693c5ae5e 100644 --- a/vllm/model_executor/models/nemotron_h_mtp.py +++ b/vllm/model_executor/models/nemotron_h_mtp.py @@ -11,6 +11,7 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.config.parallel import ParallelConfig +from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import ( fused_moe_make_expert_params_mapping, ) @@ -36,6 +37,8 @@ NemotronHMoEDecoderLayer, ) +logger = init_logger(__name__) + class NemotronHMTPAttentionDecoderLayer(NemotronHAttentionDecoderLayer): def __init__( @@ -242,6 +245,35 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # Total number of physical layers = num_steps * pattern_len total_layers = self.num_mtp_layers * self.pattern_len + + quant_config = vllm_config.quant_config + if ( + quant_config is not None + and quant_config.get_name() == "compressed-tensors" + and hasattr(quant_config, "ignore") + ): + num_experts = getattr(config, "n_routed_experts", None) + if getattr(config, "model_type", None) == "nemotron_h_puzzle": + num_experts = getattr(config, "mtp_n_routed_experts", num_experts) + if num_experts: + extra: list[str] = [] + for i in range(total_layers): + if self.pattern_str[i % self.pattern_len] != "E": + continue + for eid in range(num_experts): + for proj in ("gate_proj", "up_proj", "down_proj"): + extra.append( + f"{prefix}.layers.{i}.mixer.experts.{eid}.{proj}" + ) + new_entries = [n for n in extra if n not in quant_config.ignore] + quant_config.ignore.extend(new_entries) + if new_entries: + logger.info( + "NemotronH-MTP: extended compressed-tensors ignore " + "with %d per-expert MTP linears (BF16 in the checkpoint)", + len(new_entries), + ) + for i in range(total_layers): step_rel_idx = i % self.pattern_len @@ -346,6 +378,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.lm_head = ParallelLMHead( self.config.vocab_size, self.config.hidden_size, + quant_config=self.quant_config, prefix=maybe_prefix(prefix, "lm_head"), ) diff --git a/vllm/model_executor/models/qwen3_5.py b/vllm/model_executor/models/qwen3_5.py index 86da03ced0b2..ce2e341dcf90 100644 --- a/vllm/model_executor/models/qwen3_5.py +++ b/vllm/model_executor/models/qwen3_5.py @@ -475,6 +475,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.lm_head = ParallelLMHead( config.vocab_size, config.hidden_size, + quant_config=self.quant_config, prefix=maybe_prefix(prefix, "lm_head"), ) else: diff --git a/vllm/model_executor/models/qwen3_5_mtp.py b/vllm/model_executor/models/qwen3_5_mtp.py index 5622065a7ff3..eff8ee584fbc 100644 --- a/vllm/model_executor/models/qwen3_5_mtp.py +++ b/vllm/model_executor/models/qwen3_5_mtp.py @@ -96,6 +96,28 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): prefix=f"{prefix}.fc", ) + if ( + quant_config is not None + and quant_config.get_name() == "compressed-tensors" + and hasattr(quant_config, "ignore") + ): + num_experts = getattr(config, "num_experts", 0) + extra: list[str] = [] + for idx in range(self.num_mtp_layers): + for eid in range(num_experts): + for proj in ("gate_proj", "up_proj", "down_proj"): + extra.append( + f"{prefix}.layers.{idx}.mlp.experts.{eid}.{proj}" + ) + new_entries = [n for n in extra if n not in quant_config.ignore] + quant_config.ignore.extend(new_entries) + if new_entries: + logger.info( + "Qwen3_5MTP: extended compressed-tensors ignore with " + "%d per-expert MTP linears (BF16 in the checkpoint)", + len(new_entries), + ) + self.layers = torch.nn.ModuleList( Qwen3_5DecoderLayer( vllm_config, @@ -381,6 +403,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.lm_head = ParallelLMHead( config.vocab_size, config.hidden_size, + quant_config=self.quant_config, prefix=maybe_prefix(prefix, "lm_head"), ) else: