diff --git a/docker/versions.json b/docker/versions.json
index ee23b5baf04f..6301c6bab81a 100644
--- a/docker/versions.json
+++ b/docker/versions.json
@@ -68,7 +68,7 @@
       "default": "true"
     },
     "FLASHINFER_VERSION": {
-      "default": "0.6.11.post2"
+      "default": "0.6.11.post3"
     },
     "GDRCOPY_CUDA_VERSION": {
       "default": "12.8"
diff --git a/requirements/cuda.txt b/requirements/cuda.txt
index 20fec60f27d3..bc31330eee43 100644
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -9,8 +9,8 @@ torchaudio==2.11.0
 # These must be updated alongside torch
 torchvision==0.26.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 # FlashInfer should be updated together with the Dockerfile
-flashinfer-python==0.6.11.post2
-flashinfer-cubin==0.6.11.post2
+flashinfer-python==0.6.11.post3
+flashinfer-cubin==0.6.11.post3
 apache-tvm-ffi==0.1.9
 tilelang==0.1.9
 # Cap nvidia-cudnn-frontend (transitive dep of flashinfer) due to
diff --git a/tests/kernels/moe/test_flashinfer_b12x_moe.py b/tests/kernels/moe/test_flashinfer_b12x_moe.py
index ec0a9594fe12..733f7d2f0589 100644
--- a/tests/kernels/moe/test_flashinfer_b12x_moe.py
+++ b/tests/kernels/moe/test_flashinfer_b12x_moe.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from types import SimpleNamespace
+
 import pytest
 import torch
 
@@ -8,8 +10,7 @@
 
 if not current_platform.is_device_capability_family(120):
     pytest.skip(
-        reason="FlashInfer CuteDSL SM12x MoE requires SM120 "
-        "(RTX Pro 6000 / DGX Spark).",
+        reason="FlashInfer B12x MoE requires SM120 (RTX Pro 6000 / DGX Spark).",
         allow_module_level=True,
     )
 
@@ -18,8 +19,8 @@
 if not has_flashinfer_b12x_moe():
     pytest.skip(
         reason=(
-            "FlashInfer cute_dsl_fused_moe_nvfp4 / convert_sf_to_mma_layout "
-            "not available in installed FlashInfer (needs PRs #3051 and #3066)."
+            "FlashInfer B12xMoEWrapper not available in installed "
+            "FlashInfer (needs PR #3080)."
         ),
         allow_module_level=True,
     )
@@ -40,7 +41,6 @@
 from vllm.model_executor.layers.fused_moe.experts.flashinfer_b12x_moe import (
     FlashInferB12xExperts,
 )
-from vllm.utils.flashinfer import flashinfer_convert_sf_to_mma_layout
 from vllm.utils.torch_utils import set_random_seed
 
 # Dimensions chosen to satisfy FP4 alignment requirements (k multiple of 256,
@@ -59,9 +59,9 @@ def _reorder_gate_up_to_up_gate(
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """Swap gate and up-projection halves along dim=1 to [up, gate] order.
 
-    The SM12x kernel expects weights in [up (w3), gate (w1)] order while the
+    The B12x kernel expects weights in [up (w3), gate (w1)] order while the
     BF16 reference uses [gate (w1), up (w3)].  This replicates the reordering
-    done at model-load time by ``prepare_nvfp4_moe_layer_for_fi_or_cutlass``.
+    done at model-load time by the FP4 layer-prep helper.
     """
     n = w.shape[1] // 2
     return (
@@ -70,6 +70,22 @@ def _reorder_gate_up_to_up_gate(
     )
 
 
+def _process_b12x_weights(
+    experts: FlashInferB12xExperts,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    w1_scale_2: torch.Tensor,
+    w2_scale_2: torch.Tensor,
+) -> None:
+    """Feed the scales to ``process_weights_after_loading`` via a stub layer."""
+    stub_layer = SimpleNamespace()
+    stub_layer.w13_weight_scale = w1_scale
+    stub_layer.w13_weight_scale_2 = w1_scale_2
+    stub_layer.w2_weight_scale = w2_scale
+    stub_layer.w2_weight_scale_2 = w2_scale_2
+    experts.process_weights_after_loading(stub_layer)
+
+
 @pytest.mark.parametrize("m,n,k", MNK_FACTORS)
 @pytest.mark.parametrize("e", [8, 16])
 @pytest.mark.parametrize("topk", [1, 2, 4])
@@ -174,22 +190,12 @@ def test_flashinfer_b12x_moe(
             moe_config=moe_config,
             quant_config=quant_config,
         )
-        # In production, process_weights_after_loading computes these after
-        # normalizing block scales. In the test the scales are already in final
-        # form (global_scale=1.0), so we compute the MMA layouts directly.
-        num_experts_w1, m1, k1_sf = w1_blockscale.shape
-        experts.w1_sf_mma = flashinfer_convert_sf_to_mma_layout(
-            w1_blockscale.reshape(num_experts_w1 * m1, k1_sf),
-            m=m1,
-            k=k1_sf * 16,
-            num_groups=num_experts_w1,
-        )
-        num_experts_w2, m2, k2_sf = w2_blockscale.shape
-        experts.w2_sf_mma = flashinfer_convert_sf_to_mma_layout(
-            w2_blockscale.reshape(num_experts_w2 * m2, k2_sf),
-            m=m2,
-            k=k2_sf * 16,
-            num_groups=num_experts_w2,
+        _process_b12x_weights(
+            experts,
+            w1_blockscale,
+            w2_blockscale,
+            ones_e,
+            ones_e,
         )
 
         kernel = mk.FusedMoEKernel(
@@ -225,5 +231,135 @@ def test_flashinfer_b12x_moe(
         torch.testing.assert_close(sm12x_output, torch_output, atol=2e-1, rtol=2e-1)
 
 
+@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
+@pytest.mark.parametrize("e", [8, 16])
+@pytest.mark.parametrize("topk", [1, 2, 4])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@torch.inference_mode()
+def test_flashinfer_b12x_moe_relu2(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    dtype: torch.dtype,
+    workspace_init,
+):
+    """Check FlashInferB12xExperts with ReLU2 (non-gated) against ``torch_moe``.
+
+    ReLU2 is used by Nemotron-H style models.  Unlike the gated SiLU path, w1
+    has shape [E, N, K] (not [E, 2N, K]) and the activation is relu(x)^2 with
+    no gate/up split.  The output must match the reference within atol=rtol=2e-1.
+    """
+    set_random_seed(7)
+    with set_current_vllm_config(
+        VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
+    ):
+        a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+
+        # Non-gated: w1 shape is (e, n, k), not (e, 2n, k).
+        w1_bf16 = torch.randn((e, n, k), device="cuda", dtype=dtype) / 15
+        w2_bf16 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 15
+
+        gs = torch.ones(1, device="cuda", dtype=torch.float32)
+        sf_vec_size = 16
+
+        # W1: quantized as-is; the non-gated path has no gate/up halves to reorder.
+        w1_flat = w1_bf16.reshape(e * n, k)
+        w1_q_flat, w1_sf_flat = fp4_quantize(
+            w1_flat,
+            global_scale=gs,
+            sf_vec_size=sf_vec_size,
+            is_sf_swizzled_layout=True,
+        )
+        w1_q = w1_q_flat.view(e, n, k // 2)
+        w1_blockscale = w1_sf_flat.view(e, n, w1_sf_flat.shape[1])
+
+        w2_flat = w2_bf16.reshape(e * k, n)
+        w2_q_flat, w2_sf_flat = fp4_quantize(
+            w2_flat,
+            global_scale=gs,
+            sf_vec_size=sf_vec_size,
+            is_sf_swizzled_layout=True,
+        )
+        w2_q = w2_q_flat.view(e, k, n // 2)
+        w2_blockscale = w2_sf_flat.view(e, k, w2_sf_flat.shape[1])
+
+        ones_e = torch.ones(e, device="cuda", dtype=torch.float32)
+
+        quant_config = nvfp4_moe_quant_config(
+            g1_alphas=ones_e,
+            g2_alphas=ones_e,
+            a1_gscale=ones_e,
+            a2_gscale=ones_e,
+            w1_scale=w1_blockscale,
+            w2_scale=w2_blockscale,
+        )
+
+        moe_config = make_dummy_moe_config(
+            num_experts=e,
+            experts_per_token=topk,
+            hidden_dim=k,
+            intermediate_size_per_partition=n,
+            in_dtype=dtype,
+            activation=MoEActivation.RELU2_NO_MUL,
+            is_act_and_mul=False,
+        )
+
+        experts = FlashInferB12xExperts(
+            moe_config=moe_config,
+            quant_config=quant_config,
+        )
+        _process_b12x_weights(
+            experts,
+            w1_blockscale,
+            w2_blockscale,
+            ones_e,
+            ones_e,
+        )
+
+        kernel = mk.FusedMoEKernel(
+            maybe_make_prepare_finalize(
+                moe=moe_config,
+                quant_config=quant_config,
+                allow_new_interface=True,
+                use_monolithic=False,
+            ),
+            experts,
+            inplace=False,
+        )
+
+        score = torch.randn((m, e), device="cuda", dtype=dtype)
+        topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
+
+        b12x_output = kernel.apply(
+            hidden_states=a,
+            w1=w1_q,
+            w2=w2_q,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            global_num_experts=e,
+            activation=MoEActivation.RELU2_NO_MUL,
+            apply_router_weight_on_input=False,
+            expert_map=None,
+        )
+
+        torch_output = torch_moe(
+            a,
+            w1_bf16,
+            w2_bf16,
+            score,
+            topk,
+            activation=MoEActivation.RELU2_NO_MUL,
+        )
+
+        torch.testing.assert_close(
+            b12x_output,
+            torch_output,
+            atol=2e-1,
+            rtol=2e-1,
+        )
+
+
 if __name__ == "__main__":
     test_flashinfer_b12x_moe(16, 128, 256, 8, 2, torch.bfloat16)
diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py
index acb2c21b3896..9141586c0d08 100644
--- a/tests/kernels/moe/utils.py
+++ b/tests/kernels/moe/utils.py
@@ -53,6 +53,8 @@ def make_dummy_moe_config(
     hidden_dim: int = 1,
     intermediate_size_per_partition: int = 1,
     in_dtype: torch.dtype = torch.bfloat16,
+    activation: MoEActivation = MoEActivation.SILU,
+    is_act_and_mul: bool = True,
 ) -> FusedMoEConfig:
     """
     This is a dummy config for the mk constructor interface
@@ -69,7 +71,8 @@ def make_dummy_moe_config(
         num_local_experts=num_experts,
         num_logical_experts=num_experts,
         moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
-        activation=MoEActivation.SILU,
+        activation=activation,
+        is_act_and_mul=is_act_and_mul,
         in_dtype=in_dtype,
         device="cuda",
         routing_method=RoutingMethodType.TopK,
diff --git a/vllm/model_executor/layers/fused_moe/experts/flashinfer_b12x_moe.py b/vllm/model_executor/layers/fused_moe/experts/flashinfer_b12x_moe.py
index 6481434f2e78..cb7f1c060382 100644
--- a/vllm/model_executor/layers/fused_moe/experts/flashinfer_b12x_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/flashinfer_b12x_moe.py
@@ -17,10 +17,11 @@
     QuantKey,
     kNvfp4Dynamic,
     kNvfp4Static,
+    kNvfp4StaticGroupScale,
+    kStaticTensorScale,
 )
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import (
-    flashinfer_b12x_fused_moe,
     flashinfer_convert_sf_to_mma_layout,
     has_flashinfer_b12x_moe,
 )
@@ -42,6 +43,11 @@ class FlashInferB12xExperts(mk.FusedMoEExpertsModular):
     Only NVFP4 (kNvfp4Static/kNvfp4Dynamic) quantization is supported.
     """
 
+    _ACTIVATION_MAP: dict[MoEActivation, str] = {
+        MoEActivation.SILU: "silu",
+        MoEActivation.RELU2_NO_MUL: "relu2",
+    }
+
     def __init__(
         self,
         moe_config: FusedMoEConfig,
@@ -55,6 +61,60 @@ def __init__(
         self.num_local_experts = moe_config.num_local_experts
         self.ep_rank = moe_config.moe_parallel_config.ep_rank
 
+        # Shape params for B12xMoEWrapper construction.
+        self.global_num_experts = moe_config.num_experts
+        self.topk = moe_config.experts_per_token
+        self.hidden_dim = moe_config.hidden_dim
+        self.intermediate_size_per_partition = (
+            moe_config.intermediate_size_per_partition
+        )
+        self.max_num_tokens = moe_config.max_num_tokens
+        self.local_expert_offset = self.ep_rank * self.num_local_experts
+
+        activation = moe_config.activation
+        if activation not in self._ACTIVATION_MAP:
+            raise ValueError(
+                f"FlashInferB12xExperts does not support "
+                f"activation {activation!r}. "
+                f"Supported: {list(self._ACTIVATION_MAP.keys())}"
+            )
+        self._activation_str = self._ACTIVATION_MAP[activation]
+
+        self.activation_precision = (
+            "fp4" if quant_config.a1_gscale is not None else "bf16"
+        )
+
+        self.source_format = self._detect_source_format()
+
+        # Lazily created on first apply() call.
+        self._wrapper: object | None = None
+        # Populated in process_weights_after_loading.
+        self.w1_sf_mma: torch.Tensor | None = None
+        self.w2_sf_mma: torch.Tensor | None = None
+
+    @staticmethod
+    def _detect_source_format() -> str:
+        """Infer the FlashInfer ``source_format`` from the calling quant method.
+
+        Walks the caller frames looking at each local ``self``: a class name
+        containing "CompressedTensors" maps to "compressed_tensors" and one
+        containing "ModelOpt" to "modelopt" (per the original author, the
+        parent quant method instantiates this class, so it is on the stack).
+        Returns "modelopt" when no recognized parent is found.
+        """
+        import inspect
+
+        # Follow f_back links; inspect.stack() would load source context per frame.
+        frame = inspect.currentframe()
+        while frame is not None:
+            cls_name = type(frame.f_locals.get("self")).__name__
+            if "CompressedTensors" in cls_name:
+                return "compressed_tensors"
+            if "ModelOpt" in cls_name:
+                return "modelopt"
+            frame = frame.f_back
+        return "modelopt"
+
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # Normalise block scales to absorb the per-expert weight global scale
         # (w_gs).  vLLM's NVFP4 convention stores:
@@ -87,26 +147,27 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         if self.a2_gscale is not None:
             self.a2_gscale.fill_(1.0)
 
-        # Precompute MMA-layout views of the weight scale factors once here
-        # rather than recomputing on every forward pass.
-        assert self.w1_scale is not None
-        num_experts_w1, m1, k1_sf = self.w1_scale.shape
-        k1 = k1_sf * 16
+        # Precompute MMA-layout views of the (now-rewritten) weight scale
+        # factors once here rather than recomputing on every forward pass.
+        # Converts swizzled 3D scale factors [E, M, K_sf] to the 6D MMA
+        # layout expected by the SM12x kernel's _get_weight_views().
+        assert self.w1_scale is not None and self.w2_scale is not None
+        sf_vec_size = 16
+        E_w1, M_w1, K_sf_w1 = self.w1_scale.shape
         self.w1_sf_mma = flashinfer_convert_sf_to_mma_layout(
-            self.w1_scale.reshape(num_experts_w1 * m1, k1_sf),
-            m=m1,
-            k=k1,
-            num_groups=num_experts_w1,
+            self.w1_scale.reshape(E_w1 * M_w1, K_sf_w1),
+            m=M_w1,
+            k=K_sf_w1 * sf_vec_size,
+            num_groups=E_w1,
+            sf_vec_size=sf_vec_size,
         )
-
-        assert self.w2_scale is not None
-        num_experts_w2, m2, k2_sf = self.w2_scale.shape
-        k2 = k2_sf * 16
+        E_w2, M_w2, K_sf_w2 = self.w2_scale.shape
         self.w2_sf_mma = flashinfer_convert_sf_to_mma_layout(
-            self.w2_scale.reshape(num_experts_w2 * m2, k2_sf),
-            m=m2,
-            k=k2,
-            num_groups=num_experts_w2,
+            self.w2_scale.reshape(E_w2 * M_w2, K_sf_w2),
+            m=M_w2,
+            k=K_sf_w2 * sf_vec_size,
+            num_groups=E_w2,
+            sf_vec_size=sf_vec_size,
         )
 
     @staticmethod
@@ -124,18 +185,32 @@ def _supports_current_device() -> bool:
 
     @staticmethod
     def _supports_no_act_and_mul() -> bool:
-        return False
+        return True
 
     @staticmethod
     def _supports_quant_scheme(
         weight_key: QuantKey | None,
         activation_key: QuantKey | None,
     ) -> bool:
-        return (weight_key, activation_key) == (kNvfp4Static, kNvfp4Dynamic)
+        """Return whether the (weight, activation) quant-key pair is supported."""
+        # W4A4 NVFP4 (modelopt format).
+        if (weight_key, activation_key) == (kNvfp4Static, kNvfp4Dynamic):
+            return True
+
+        # W4A16 NVFP4: compressed-tensors `nvfp4-pack-quantized` weights, no
+        # activation quantization key.
+        return (
+            activation_key is None
+            and weight_key is not None
+            and weight_key.dtype == torch.uint8
+            and weight_key.symmetric
+            and weight_key.scale == kNvfp4StaticGroupScale
+            and weight_key.scale2 == kStaticTensorScale
+        )
 
     @staticmethod
     def _supports_activation(activation: MoEActivation) -> bool:
-        return activation == MoEActivation.SILU
+        return activation in (MoEActivation.SILU, MoEActivation.RELU2_NO_MUL)
 
     @staticmethod
     def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
@@ -167,13 +242,31 @@ def workspace_shapes(
 
     @property
     def expects_unquantized_inputs(self) -> bool:
-        # b12x_fused_moe expects BF16 hidden states and performs its own FP4
+        # B12xMoEWrapper expects BF16 hidden states and performs its own FP4
         # quantization internally.  Returning True prevents the modular kernel
-        # from pre-quantizing activations, which would produce an FP4-packed
-        # tensor with size(-1)=k//2 and break the scale-factor conversion that
-        # expects size(-1)=k.
+        # from pre-quantizing activations.
         return True
 
+    def _ensure_wrapper(self) -> None:
+        """Build the FlashInfer ``B12xMoEWrapper`` once; later calls are no-ops."""
+        if self._wrapper is None:
+            # Deferred import: only needed when the wrapper is actually built.
+            from flashinfer.fused_moe import B12xMoEWrapper
+
+            # Shape and format arguments were captured in __init__.
+            self._wrapper = B12xMoEWrapper(
+                num_experts=self.global_num_experts,
+                top_k=self.topk,
+                hidden_size=self.hidden_dim,
+                intermediate_size=self.intermediate_size_per_partition,
+                use_cuda_graph=True,
+                max_num_tokens=self.max_num_tokens,
+                num_local_experts=self.num_local_experts,
+                activation=self._activation_str,
+                activation_precision=self.activation_precision,
+                source_format=self.source_format,
+            )
+
     def apply(
         self,
         output: torch.Tensor,
@@ -201,13 +294,14 @@ def apply(
         assert self.a2_gscale is not None, (
             "a2_gscale must not be None for FlashInferB12xExperts"
         )
+        assert self.w1_sf_mma is not None and self.w2_sf_mma is not None, (
+            "process_weights_after_loading must run before FlashInferB12xExperts.apply"
+        )
 
-        top_k = topk_ids.shape[1]
+        self._ensure_wrapper()
 
-        flashinfer_b12x_fused_moe(
+        result = self._wrapper.run(
             x=hidden_states,
-            token_selected_experts=topk_ids.to(torch.int32),
-            token_final_scales=topk_weights,
             w1_weight=w1,
             w1_weight_sf=self.w1_sf_mma,
             w1_alpha=self.g1_alphas,
@@ -215,9 +309,7 @@ def apply(
             w2_weight=w2,
             w2_weight_sf=self.w2_sf_mma,
             w2_alpha=self.g2_alphas,
-            num_experts=global_num_experts,
-            top_k=top_k,
-            num_local_experts=self.num_local_experts,
-            output_dtype=self.out_dtype,
-            output=output,
+            token_selected_experts=topk_ids.to(torch.int32),
+            token_final_scales=topk_weights,
         )
+        output.copy_(result)
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 4a3b7619b94c..5d0c951f8b8d 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -50,6 +50,7 @@
     LinearMethodBase,
     UnquantizedLinearMethod,
 )
+from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig,
@@ -2306,6 +2307,24 @@ def _resolve_quant_algo(self, prefix: str) -> str | None:
         if prefix in self.quantized_layers:
             return self.quantized_layers[prefix]["quant_algo"].upper()
 
+        # Qwen VLM wrappers can construct the LM head under a nested vLLM
+        # prefix while ModelOpt exports the tensor names as top-level lm_head.*.
+        if prefix.endswith(".lm_head") and "lm_head" in self.quantized_layers:
+            return self.quantized_layers["lm_head"]["quant_algo"].upper()
+
+        # Qwen3.5/3.6-MoE VLM: vLLM's internal naming can be
+        # language_model.model.layers.X... while ModelOpt exports keys as
+        # model.language_model.layers.X... (swapped wrapper order). Try the
+        # swap as a direct fallback for any prefix that does not match.
+        if prefix.startswith("language_model.model."):
+            swapped = "model.language_model." + prefix[len("language_model.model.") :]
+            if swapped in self.quantized_layers:
+                return self.quantized_layers[swapped]["quant_algo"].upper()
+        elif prefix.startswith("model.language_model."):
+            swapped = "language_model.model." + prefix[len("model.language_model.") :]
+            if swapped in self.quantized_layers:
+                return self.quantized_layers[swapped]["quant_algo"].upper()
+
         # 2. Packed / fused layer lookup
         proj_name = prefix.rsplit(".", 1)[-1]
         if self.packed_modules_mapping and proj_name in self.packed_modules_mapping:
@@ -2357,6 +2376,13 @@ def get_quant_method(
             # Layer not in quantized_layers — leave unquantized
             return UnquantizedLinearMethod()
 
+        if isinstance(layer, ParallelLMHead):
+            if quant_algo == "FP8":
+                return ModelOptFp8LinearMethod(self.fp8_config)
+            if quant_algo == "NVFP4":
+                return ModelOptNvFp4LinearMethod(self.nvfp4_config)
+            return UnquantizedLinearMethod()
+
         if isinstance(layer, RoutedExperts):
             if quant_algo == "FP8":
                 return ModelOptFp8MoEMethod(
diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
index 082e42f964f4..df509adbfca8 100644
--- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
+++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
@@ -388,5 +388,8 @@ def prepare_nvfp4_moe_layer_for_fi_or_cutlass(
             w2_scale = torch.nn.functional.pad(w2_scale, (0, pad_size // 16))
 
         w2_scale = swizzle_blockscale(w2_scale)
+        layer.moe_config.intermediate_size_per_partition = (
+            layer.moe_config.intermediate_size_per_partition + pad_size
+        )
 
     return w13, w13_scale, w13_scale_2, a13_scale, w2, w2_scale, w2_scale_2, a2_scale
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
index c02d39c17a02..69b26c05a0be 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
@@ -216,7 +216,11 @@ def prepare_fp4_layer_for_marlin(
 
     part_size_n = layer.output_size_per_partition
     part_size_k = layer.input_size_per_partition
-    param_dtype = layer.params_dtype
+    # VocabParallelEmbedding / ParallelLMHead does not store params_dtype as an
+    # attribute
+    param_dtype = getattr(layer, "params_dtype", None)
+    if param_dtype is None:
+        param_dtype = torch.get_default_dtype()
 
     assert layer.weight.shape == (part_size_n, part_size_k // 2)
 
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index ddae01856da0..31dd59caa10e 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -438,6 +438,16 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         # If parameter does not have output dim, then it should
         # be copied onto all gpus (e.g. g_idx for act_order gptq).
         if output_dim is None:
+            # AutoQuantize-quantized lm_head emits FP4 scalar scales
+            # (input_scale, weight_scale_2). Their on-disk shape is () while
+            # PerTensorScaleParameter materializes them as (1,) -- same numel,
+            # different rank. Reshape rather than asserting.
+            if (
+                param.data.shape != loaded_weight.shape
+                and param.data.numel() == loaded_weight.numel()
+            ):
+                param.data.copy_(loaded_weight.reshape(param.data.shape))
+                return
             assert param.data.shape == loaded_weight.shape
             param.data.copy_(loaded_weight)
             return
diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
index 8915d8172c56..0d303e3eb8a4 100644
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -875,6 +875,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.lm_head = ParallelLMHead(
             config.vocab_size,
             config.hidden_size,
+            quant_config=self.quant_config,
             prefix=maybe_prefix(prefix, "lm_head"),
         )
 
diff --git a/vllm/model_executor/models/nemotron_h_mtp.py b/vllm/model_executor/models/nemotron_h_mtp.py
index fe737438c30f..1f0693c5ae5e 100644
--- a/vllm/model_executor/models/nemotron_h_mtp.py
+++ b/vllm/model_executor/models/nemotron_h_mtp.py
@@ -11,6 +11,7 @@
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.config.parallel import ParallelConfig
+from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (
     fused_moe_make_expert_params_mapping,
 )
@@ -36,6 +37,8 @@
     NemotronHMoEDecoderLayer,
 )
 
+logger = init_logger(__name__)
+
 
 class NemotronHMTPAttentionDecoderLayer(NemotronHAttentionDecoderLayer):
     def __init__(
@@ -242,6 +245,35 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         # Total number of physical layers = num_steps * pattern_len
         total_layers = self.num_mtp_layers * self.pattern_len
+
+        quant_config = vllm_config.quant_config
+        if (
+            quant_config is not None
+            and quant_config.get_name() == "compressed-tensors"
+            and hasattr(quant_config, "ignore")
+        ):
+            num_experts = getattr(config, "n_routed_experts", None)
+            if getattr(config, "model_type", None) == "nemotron_h_puzzle":
+                num_experts = getattr(config, "mtp_n_routed_experts", num_experts)
+            if num_experts:
+                extra: list[str] = []
+                for i in range(total_layers):
+                    if self.pattern_str[i % self.pattern_len] != "E":
+                        continue
+                    for eid in range(num_experts):
+                        for proj in ("gate_proj", "up_proj", "down_proj"):
+                            extra.append(
+                                f"{prefix}.layers.{i}.mixer.experts.{eid}.{proj}"
+                            )
+                new_entries = [n for n in extra if n not in quant_config.ignore]
+                quant_config.ignore.extend(new_entries)
+                if new_entries:
+                    logger.info(
+                        "NemotronH-MTP: extended compressed-tensors ignore "
+                        "with %d per-expert MTP linears (BF16 in the checkpoint)",
+                        len(new_entries),
+                    )
+
         for i in range(total_layers):
             step_rel_idx = i % self.pattern_len
 
@@ -346,6 +378,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.lm_head = ParallelLMHead(
             self.config.vocab_size,
             self.config.hidden_size,
+            quant_config=self.quant_config,
             prefix=maybe_prefix(prefix, "lm_head"),
         )
 
diff --git a/vllm/model_executor/models/qwen3_5.py b/vllm/model_executor/models/qwen3_5.py
index 86da03ced0b2..ce2e341dcf90 100644
--- a/vllm/model_executor/models/qwen3_5.py
+++ b/vllm/model_executor/models/qwen3_5.py
@@ -475,6 +475,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 self.lm_head = ParallelLMHead(
                     config.vocab_size,
                     config.hidden_size,
+                    quant_config=self.quant_config,
                     prefix=maybe_prefix(prefix, "lm_head"),
                 )
         else:
diff --git a/vllm/model_executor/models/qwen3_5_mtp.py b/vllm/model_executor/models/qwen3_5_mtp.py
index 5622065a7ff3..eff8ee584fbc 100644
--- a/vllm/model_executor/models/qwen3_5_mtp.py
+++ b/vllm/model_executor/models/qwen3_5_mtp.py
@@ -96,6 +96,28 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             prefix=f"{prefix}.fc",
         )
 
+        if (
+            quant_config is not None
+            and quant_config.get_name() == "compressed-tensors"
+            and hasattr(quant_config, "ignore")
+        ):
+            num_experts = getattr(config, "num_experts", 0)
+            extra: list[str] = []
+            for idx in range(self.num_mtp_layers):
+                for eid in range(num_experts):
+                    for proj in ("gate_proj", "up_proj", "down_proj"):
+                        extra.append(
+                            f"{prefix}.layers.{idx}.mlp.experts.{eid}.{proj}"
+                        )
+            new_entries = [n for n in extra if n not in quant_config.ignore]
+            quant_config.ignore.extend(new_entries)
+            if new_entries:
+                logger.info(
+                    "Qwen3_5MTP: extended compressed-tensors ignore with "
+                    "%d per-expert MTP linears (BF16 in the checkpoint)",
+                    len(new_entries),
+                )
+
         self.layers = torch.nn.ModuleList(
             Qwen3_5DecoderLayer(
                 vllm_config,
@@ -381,6 +403,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 self.lm_head = ParallelLMHead(
                     config.vocab_size,
                     config.hidden_size,
+                    quant_config=self.quant_config,
                     prefix=maybe_prefix(prefix, "lm_head"),
                 )
         else: