vllm-project · ProExpertProg · May 10, 2026 · May 3, 2026 · May 4, 2026 · May 4, 2026
diff --git a/tests/compile/correctness_e2e/test_async_tp.py b/tests/compile/correctness_e2e/test_async_tp.py
@@ -13,6 +13,17 @@
 from vllm.config import (
     CompilationMode,
 )
+from vllm.platforms import current_platform
+from vllm.utils.flashinfer import has_flashinfer
+
+NVFP4_MODEL_ID = "nvidia/Llama-3.1-8B-Instruct-NVFP4"
+NVFP4_HF_OVERRIDES = {
+    "num_hidden_layers": 4,
+    "hidden_size": 512,
+    "intermediate_size": 800,
+    "num_attention_heads": 4,
+    "num_key_value_heads": 1,
+}
 
 
 @create_new_process_for_each_test()
@@ -82,3 +93,67 @@ def test_async_tp_pass_correctness(
     ]
 
     compare_two_settings(model_id, async_tp_args, tp_args, method="generate")
+
+
+@create_new_process_for_each_test()
+def test_async_tp_pass_nvfp4_correctness(num_gpus_available: int, monkeypatch):
+    """Check that NVFP4 generation matches with and without AsyncTP fusion.
+
+    Builds two argument lists for the same dummy-weight NVFP4 model at TP=2,
+    one with the sequence-parallel + GEMM/comm fusion compilation config and
+    one without it, and hands both to ``compare_two_settings``.
+    """
+    on_blackwell = (
+        current_platform.is_cuda()
+        and current_platform.is_device_capability_family(100)
+    )
+    if not on_blackwell:
+        pytest.skip("NVFP4 requires Blackwell")
+    if not has_flashinfer():
+        pytest.skip("FlashInfer is required for the NVFP4 AsyncTP path")
+
+    # Select the FlashInfer CUTLASS NVFP4 GEMM backend through the environment.
+    monkeypatch.setenv("VLLM_NVFP4_GEMM_BACKEND", "flashinfer-cutlass")
+
+    tp_size = 2
+    if num_gpus_available < tp_size:
+        pytest.skip(f"Need at least {tp_size} GPUs")
+
+    # Baseline: plain tensor parallelism without extra compilation settings.
+    tp_args = [
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "8",
+        "--load-format",
+        "dummy",
+        "--hf-overrides",
+        json.dumps(NVFP4_HF_OVERRIDES),
+        "--tensor-parallel-size",
+        str(tp_size),
+        "--distributed-executor-backend",
+        "mp",
+    ]
+
+    # Candidate: same launch arguments plus the AsyncTP compilation config.
+    fusion_passes = {
+        "enable_sp": True,
+        "fuse_gemm_comms": True,
+        "fuse_allreduce_rms": False,
+        "sp_min_token_num": 1,
+    }
+    async_tp_compilation = {
+        "mode": CompilationMode.VLLM_COMPILE,
+        "compile_sizes": [2, 4, 8],
+        "splitting_ops": [],
+        "pass_config": fusion_passes,
+    }
+    async_tp_args = [
+        *tp_args,
+        "--compilation_config",
+        json.dumps(async_tp_compilation),
+    ]
+
+    compare_two_settings(NVFP4_MODEL_ID, async_tp_args, tp_args, method="generate")
diff --git a/tests/compile/correctness_e2e/test_sequence_parallel.py b/tests/compile/correctness_e2e/test_sequence_parallel.py
@@ -21,12 +21,14 @@
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import is_torch_equal_or_newer
 
-from ...models.registry import HF_EXAMPLE_MODELS
+from ...models.registry import HF_EXAMPLE_MODELS, _HfExamplesInfo
 from ...utils import compare_two_settings, create_new_process_for_each_test
 
 logger = init_logger("test_sequence_parallel")
 
 VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
+NVFP4_MODEL_ID = "nvidia/Llama-3.1-8B-Instruct-NVFP4"
+NVFP4_MODEL_INFO = _HfExamplesInfo(NVFP4_MODEL_ID)
 
 
 class ParallelSetup(NamedTuple):
@@ -41,6 +43,7 @@ class ParallelSetup(NamedTuple):
 class SPTestOptions(NamedTuple):
     multi_node_only: bool
     load_format: str | None = None
+    model_info: _HfExamplesInfo | None = None
 
 
 @dataclass
@@ -170,6 +173,7 @@ def _compare_sp(
     *,
     method: Literal["generate", "encode"],
     is_multimodal: bool,
+    dtype: str = "float16",
 ):
     (
         tp_size,
@@ -180,14 +184,15 @@ def _compare_sp(
         chunked_prefill,
     ) = parallel_setup
 
-    multi_node_only, load_format = test_options
+    multi_node_only = test_options.multi_node_only
+    load_format = test_options.load_format
 
-    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
+    model_info = test_options.model_info or HF_EXAMPLE_MODELS.find_hf_info(model_id)
     model_info.check_transformers_version(on_fail="skip")
 
     trust_remote_code = model_info.trust_remote_code
     tokenizer_mode = model_info.tokenizer_mode
-    hf_overrides = model_info.hf_overrides
+    hf_overrides = dict(model_info.hf_overrides)
     require_embed_inputs = model_info.require_embed_inputs
 
     if load_format == "dummy":
@@ -220,7 +225,7 @@ def _compare_sp(
     common_args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
-        "float16",
+        dtype,
         "--max-model-len",
         "2048",
         "--max-num-seqs",
@@ -352,3 +357,47 @@ def test_tp_sp_generation(
         method="generate",
         is_multimodal=False,
     )
+
+
+@create_new_process_for_each_test()
+def test_tp_sp_nvfp4_generation(num_gpus_available: int):
+    """Run the sequence-parallel comparison on the NVFP4 Llama checkpoint.
+
+    Skipped unless the platform is CUDA with device capability family 100.
+    """
+    on_blackwell = (
+        current_platform.is_cuda()
+        and current_platform.is_device_capability_family(100)
+    )
+    if not on_blackwell:
+        pytest.skip("NVFP4 requires Blackwell")
+
+    parallel_setup = ParallelSetup(
+        tp_size=2,
+        pp_size=1,
+        fuse_norm_quant=True,
+        fuse_act_quant=True,
+        eager_mode=True,
+        chunked_prefill=False,
+    )
+    # Dummy weights with an explicit model info entry, so _compare_sp does not
+    # need to look the model up in HF_EXAMPLE_MODELS.
+    test_options = SPTestOptions(
+        multi_node_only=False,
+        load_format="dummy",
+        model_info=NVFP4_MODEL_INFO,
+    )
+
+    _compare_sp(
+        NVFP4_MODEL_ID,
+        parallel_setup,
+        "mp",
+        "auto",
+        test_options,
+        num_gpus_available,
+        use_inductor_graph_partition=False,
+        fuse_gemm_comms=False,
+        method="generate",
+        is_multimodal=False,
+        dtype="bfloat16",
+    )
diff --git a/tests/compile/fullgraph/test_toy_llama.py b/tests/compile/fullgraph/test_toy_llama.py
@@ -17,7 +17,6 @@
 import torch
 from torch import nn
 
-from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (
     CompilationConfig,
@@ -340,6 +339,8 @@ def run_model(llama_config, compile_config: CompilationConfig) -> torch.Tensor:
 def test_toy_llama(
     backend: str, use_inductor_graph_partition: bool, monkeypatch, tmp_path
 ):
+    from vllm.compilation.counter import compilation_counter
+
     # We disable the vLLM compile cache into a new tmp dir for 1 reason:
     # 1. To make sure we can properly track the number of Inductor compilations.
     monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")

diff --git a/tests/compile/fusions_e2e/test_tp2_async_tp.py b/tests/compile/fusions_e2e/test_tp2_async_tp.py
@@ -13,11 +13,13 @@
     AttentionBackendCase,
     Matches,
     custom_ops_combos,
+    is_blackwell,
 )
 from .models import (
     FLASHINFER_ATTN,
     TRITON_ATTN,
     llama3_8b,
+    llama3_8b_fp4,
     llama3_8b_fp8,
     llama4_scout_fp8,
     qwen3_a3b,
@@ -90,6 +92,75 @@ def test_tp2_async_tp_fp8_fusions(
     )
 
 
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "model_name, matches_fn, model_kwargs, hf_overrides",
+    [llama3_8b_fp4],
+)
+@pytest.mark.parametrize("attn_backend", [FLASHINFER_ATTN])
+@pytest.mark.parametrize("n_layers", [4])
+@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
+@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
+@pytest.mark.skipif(not is_blackwell(), reason="Blackwell required for fp4")
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="Only test CUDA")
+def test_tp2_async_tp_nvfp4_fusions(
+    model_name: str,
+    matches_fn: Callable[[int], Matches],
+    model_kwargs: dict,
+    hf_overrides: Callable[[int], dict],
+    attn_backend: AttentionBackendCase,
+    n_layers: int,
+    custom_ops: str,
+    inductor_graph_partition: bool,
+    run_e2e_fusion_test,
+):
+    """Check fusion match counts for the NVFP4 model with AsyncTP at TP=2."""
+    # NVFP4 currently wires the all-gather + GEMM path only.
+    matches = matches_fn(n_layers)._replace(async_tp=n_layers * 2)
+
+    # Work on a copy: ``model_kwargs`` comes from a parametrize entry that is
+    # shared by every generated test case, so it must not be mutated in place.
+    model_kwargs = {
+        **model_kwargs,
+        # Reduce size of model and skip weight loading time
+        "hf_overrides": hf_overrides(n_layers),
+        "load_format": "dummy",
+        "max_model_len": 1024,
+        "kernel_config": {"enable_flashinfer_autotune": False},
+    }
+
+    compilation_config = dict(
+        use_inductor_graph_partition=inductor_graph_partition,
+        custom_ops=custom_ops.split(","),
+        pass_config=PassConfig(
+            fuse_act_quant=True,
+            fuse_attn_quant=True,
+            enable_sp=True,
+            fuse_gemm_comms=True,
+            fuse_allreduce_rms=False,
+            # Override threshold for testing (models have small hidden_size)
+            sp_min_token_num=512,
+        ),
+    )
+
+    matches_check = [
+        "act_quant_fusion",
+        "attn_quant_fusion",
+        "sequence_parallel",
+        "async_tp",
+    ]
+
+    run_e2e_fusion_test(
+        model_name,
+        matches,
+        model_kwargs,
+        attn_backend,
+        compilation_config,
+        matches_check,
+        tp_size=2,
+    )
+
+
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize(
     "model_name, matches_fn, model_kwargs, hf_overrides",