[Misc] Fix `Current vLLM config is not set.` warnings, assert to avoid issues in the future #31747

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged

simon-mo merged 26 commits into vllm-project:main from neuralmagic:lwilkinson/fix-warnings

Jan 8, 2026

tests/compile/distributed/test_async_tp.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -15,6 +15,7 @@
  
        ModelConfig,

        PassConfig,

        VllmConfig,

        set_current_vllm_config,

    )

    from vllm.distributed import (

        tensor_model_parallel_all_gather,

    @@ -340,38 +341,42 @@ def async_tp_pass_on_test_model(
  
        )

        async_tp_pass = AsyncTPPass(vllm_config)

        backend = TestBackend(async_tp_pass)

        assert (

            async_tp_pass.compilation_config.splitting_ops

            == vllm_config.compilation_config.splitting_ops

        )

        assert (

            async_tp_pass.compilation_config.use_inductor_graph_partition

            == vllm_config.compilation_config.use_inductor_graph_partition

        )

        # Set the global vllm_config for TestBackend which calls

        # get_current_vllm_config()

        with set_current_vllm_config(vllm_config):

            backend = TestBackend(async_tp_pass)

        model = test_model_cls(hidden_size, dtype)  # Pass dtype to model constructor

            assert (

                async_tp_pass.compilation_config.splitting_ops

                == vllm_config.compilation_config.splitting_ops

            )

            assert (

                async_tp_pass.compilation_config.use_inductor_graph_partition

                == vllm_config.compilation_config.use_inductor_graph_partition

            )

        hidden_states = torch.randn(

            (batch_size * seq_len, hidden_size), dtype=dtype, requires_grad=False

        )

            model = test_model_cls(hidden_size, dtype)  # Pass dtype to model constructor

            hidden_states = torch.randn(

                (batch_size * seq_len, hidden_size), dtype=dtype, requires_grad=False

            )

        if dynamic:

            torch._dynamo.mark_dynamic(hidden_states, 0)

            if dynamic:

                torch._dynamo.mark_dynamic(hidden_states, 0)

        compiled_model = torch.compile(model, backend=backend)

        compiled_model(hidden_states)

            compiled_model = torch.compile(model, backend=backend)

            compiled_model(hidden_states)

        assert async_tp_pass.matched_count == 1

            assert async_tp_pass.matched_count == 1

        # In pre-nodes, all gather or reduce scatter should exist,

        # fused_matmul_reduce_scatter or fused_all_gather_matmul should not

        backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)

            # In pre-nodes, all gather or reduce scatter should exist,

            # fused_matmul_reduce_scatter or fused_all_gather_matmul should not

            backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)

        # In post-nodes, fused_matmul_reduce_scatter or \

        # fused_all_gather_matmul should exist

        backend.check_after_ops(model.ops_in_model_after())

            # In post-nodes, fused_matmul_reduce_scatter or \

            # fused_all_gather_matmul should exist

            backend.check_after_ops(model.ops_in_model_after())

    @create_new_process_for_each_test()

tests/compile/test_config.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -430,7 +430,7 @@ def test_cudagraph_sizes_post_init( @@
             )
-    def test_cached_compilation_config():
+    def test_cached_compilation_config(default_vllm_config):
         import torch
         from torch._inductor.utils import run_and_get_code
@@ Expand Down @@

tests/conftest.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -189,6 +189,17 @@ def dist_init(): @@
         cleanup_dist_env_and_memory()
+    @pytest.fixture
+    def default_vllm_config():
+        """Set a default VllmConfig for tests that directly test CustomOps or pathways
+        that use get_current_vllm_config() outside of a full engine context.
+        """
+        from vllm.config import VllmConfig, set_current_vllm_config
+        with set_current_vllm_config(VllmConfig()):
+            yield
     @pytest.fixture()
     def should_do_global_cleanup_after_test(request) -> bool:
         """Allow subdirectories to skip global cleanup by overriding this fixture.
@@ Expand Down @@

tests/kernels/attention/test_flashinfer_trtllm_attention.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -458,7 +458,7 @@ def test_flashinfer_trtllm_prefill_with_baseline( @@
         )
-    def test_trtllm_attention_rejects_num_kv_heads_1() -> None:
+    def test_trtllm_attention_rejects_num_kv_heads_1(default_vllm_config) -> None:
         """Test that TRTLLM attention correctly rejects num_kv_heads=1.
         When num_kv_heads=1 (MQA), the KV cache strides become degenerate
@@ Expand Down @@

tests/kernels/attention/test_mha_attn.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -36,7 +36,7 @@ def clear_cache(): @@
     @pytest.mark.parametrize("device", devices)
-    def test_mha_attn_platform(device: str):
+    def test_mha_attn_platform(default_vllm_config, device: str):
         """
         Test the attention selector between different platform and device.
         """
@@ Expand Down Expand Up / @@ -116,6 +116,7 @@ def ref_attention( @@
     @pytest.mark.parametrize("dtype", DTYPES)
     @pytest.mark.parametrize("device", CUDA_DEVICES)
     def test_mha_attn_forward(
+        default_vllm_config,
         batch_size: int,
         seq_len: int,
         num_heads: int,
@@ Expand Down Expand Up / @@ -162,6 +163,7 @@ def test_mha_attn_forward( @@
     @pytest.mark.parametrize("dtype", DTYPES)
     @pytest.mark.parametrize("device", CUDA_DEVICES)
     def test_mha_attn_varlen_forward(
+        default_vllm_config,
         var_seq_len: list[int],
         num_heads: int,
         num_kv_heads: int,
@@ Expand Down @@

tests/kernels/core/test_activation.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -45,6 +45,7 @@ @@
     @pytest.mark.parametrize("device", CUDA_DEVICES)
     @torch.inference_mode()
     def test_act_and_mul(
+        default_vllm_config,
         activation: str,
         num_tokens: int,
         d: int,
@@ Expand Down Expand Up / @@ -122,6 +123,7 @@ def _get_rtol(output) -> float: @@
     @pytest.mark.parametrize("device", CUDA_DEVICES)
     @torch.inference_mode()
     def test_activation(
+        default_vllm_config,
         activation: type[torch.nn.Module],
         num_tokens: int,
         d: int,
@@ Expand Down @@

tests/kernels/core/test_fused_qk_norm_rope.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -57,6 +57,7 @@ def _apply_qk_norm_rope( @@
     @pytest.mark.parametrize("rotary_ratio", [1.0, 0.5, 0.25])
     @torch.inference_mode()
     def test_fused_qk_norm_rope_matches_reference(
+        default_vllm_config,
         device: str,
         dtype: torch.dtype,
         is_neox: bool,
@@ Expand Down @@

tests/kernels/core/test_fused_quant_layernorm.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -147,6 +147,7 @@ def ops_impl( @@
     @pytest.mark.parametrize("device", CUDA_DEVICES)
     @torch.inference_mode()
     def test_rms_norm(
+        default_vllm_config,
         num_tokens: int,
         hidden_size: int,
         add_residual: bool,
@@ Expand Down @@

tests/kernels/core/test_layernorm.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -26,6 +26,7 @@ @@
     @pytest.mark.parametrize("strided_input", [False, True])
     @torch.inference_mode()
     def test_rms_norm(
+        default_vllm_config,
         num_tokens: int,
         hidden_size: int,
         add_residual: bool,
@@ Expand Down @@

tests/kernels/core/test_mrope.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -90,6 +90,7 @@ class MRoPETestInfo(NamedTuple): @@
     @pytest.mark.parametrize("dtype", [torch.bfloat16])
     @pytest.mark.parametrize("num_tokens", num_tokens_list)
     def test_mrope(
+        default_vllm_config,
         model_name: str,
         model_info: MRoPETestInfo,
         tp_size: int,
@@ Expand Down Expand Up / @@ -159,6 +160,7 @@ def test_mrope( @@
     @pytest.mark.parametrize("dtype", [torch.bfloat16])
     @pytest.mark.parametrize("num_tokens", num_tokens_list)
     def test_mrope_torch_compile_tracing(
+        default_vllm_config,
         model_name: str,
         model_info: MRoPETestInfo,
         tp_size: int,
@@ Expand Down @@

tests/kernels/core/test_pos_encoding.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -62,6 +62,7 @@ def _get_batch_tensor_shape( @@
     @pytest.mark.parametrize("use_key", USE_KEY)
     @torch.inference_mode()
     def test_rotary_embedding(
+        default_vllm_config,
         is_neox_style: bool,
         tensor_shape_fn: Callable[[int, int, int, int], tuple[int, ...]],
         batch_size: int,
@@ Expand Down Expand Up / @@ -123,7 +124,7 @@ def test_rotary_embedding( @@
     @torch.inference_mode()
-    def test_rope_module_cache():
+    def test_rope_module_cache(default_vllm_config):
         MAX_POSITIONS = [123, 1234]
         ROPE_THETAS = [10000, 1000000]
         ROPE_PARAMETERS = (
@@ Expand Down @@

tests/kernels/core/test_rotary_embedding.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -36,6 +36,7 @@ def rotary_embedding_opcheck( @@
     @pytest.mark.parametrize("use_key", [True, False])
     @pytest.mark.parametrize("head_stride_is_contiguous", [True, False])
     def test_rotary_embedding_opcheck(
+        default_vllm_config,
         dist_init,
         device,
         max_position,
@@ Expand Down @@

tests/kernels/moe/test_cpu_fused_moe.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -6,7 +6,7 @@
  
    from tests.kernels.allclose_default import get_default_atol, get_default_rtol

    from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight

    from vllm.model_executor.layers.activation import SiluAndMul, SwigluOAIAndMul

    from vllm.model_executor.layers.fused_moe.cpu_fused_moe import _CPU_MOE_ACT

    from vllm.platforms import current_platform

    from vllm.utils.torch_utils import set_random_seed

    @@ -24,11 +24,6 @@
  
    ISA = ["amx", "vec"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]

    DTYPE = [torch.bfloat16]

    _CPU_MOE_ACT = {

        "silu": SiluAndMul(),

        "swigluoai": SwigluOAIAndMul(),

    }

    def ref_fused_moe(

        input: torch.Tensor,

    @@ -106,6 +101,7 @@ def ref_fused_moe(
  
    @pytest.mark.parametrize("act", ACT)

    @pytest.mark.parametrize("isa", ISA)

    def test_cpu_fused_moe(

        default_vllm_config,

        batch_size: int,

        expert_num: int,

        hidden_size: int,

tests/kernels/moe/test_moe.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -468,7 +468,12 @@ def test_fused_moe_wn16( @@
     )
     @torch.inference_mode()
     def test_mixtral_moe(
-        dist_init, dtype: torch.dtype, padding: bool, use_rocm_aiter: bool, monkeypatch
+        default_vllm_config,
+        dist_init,
+        dtype: torch.dtype,
+        padding: bool,
+        use_rocm_aiter: bool,
+        monkeypatch,
     ):
         """Make sure our Mixtral MoE implementation agrees with the one from
         huggingface."""
@@ Expand Down @@

tests/kernels/quantization/test_fp8_quant_group.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -23,7 +23,12 @@ @@
     @pytest.mark.parametrize("use_ue8m0", [True, False])
     @torch.inference_mode()
     def test_quantfp8_group_functionality(
-        batch_size: int, hidden_dim: int, group_size: int, seed: int, use_ue8m0: bool
+        default_vllm_config,
+        batch_size: int,
+        hidden_dim: int,
+        group_size: int,
+        seed: int,
+        use_ue8m0: bool,
     ) -> None:
         """Test QuantFP8 group quantization with various configurations.
@@ Expand Down Expand Up / @@ -82,7 +87,9 @@ def test_quantfp8_group_functionality( @@
     @pytest.mark.parametrize("seed", [42])
     @pytest.mark.parametrize("use_ue8m0", [True, False])
     @torch.inference_mode()
-    def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
+    def test_quantfp8_group_multidimensional(
+        default_vllm_config, seed: int, use_ue8m0: bool
+    ) -> None:
         set_random_seed(seed)
         group_size = 64
@@ Expand Down Expand Up @@
     @pytest.mark.parametrize("seed", [42])
     @torch.inference_mode()
-    def test_quantfp8_group_edge_cases(seed: int) -> None:
+    def test_quantfp8_group_edge_cases(default_vllm_config, seed: int) -> None:
         set_random_seed(seed)
         batch_size = 16
@@ Expand Down @@

tests/kernels/quantization/test_int8_kernel.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -102,7 +102,7 @@ def setup_cuda(): @@
         itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS),
     )
     @torch.inference_mode()
-    def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed):
+    def test_w8a8_fp8_fused_moe(default_vllm_config, M, N, K, E, topk, dtype, seed):
         torch.manual_seed(seed)
         # Initialize int8 quantization parameters
         factor_for_scale = 1e-2
@@ Expand Down @@

tests/kernels/quantization/test_silu_mul_nvfp4_quant.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -31,6 +31,7 @@ @@
     @pytest.mark.parametrize("shape", SHAPES)
     @torch.inference_mode()
     def test_silu_mul_nvfp4_quant(
+        default_vllm_config,
         dtype: torch.dtype,
         shape: tuple[int, int],
     ) -> None:
@@ Expand Down @@

tests/kernels/test_fused_quant_activation.py

-Original file line number
+Diff line change
@@ Expand Up @@
     @pytest.mark.parametrize("device", CUDA_DEVICES)
     @torch.inference_mode()
     def test_silu_and_mul(
+        default_vllm_config,
         num_tokens: int,
         hidden_size: int,
         dtype: torch.dtype,
@@ Expand Down @@

tests/lora/conftest.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -82,7 +82,7 @@ class DummyLoRAModel(nn.Sequential, SupportsLoRA): @@
     @pytest.fixture
-    def dummy_model() -> nn.Module:
+    def dummy_model(default_vllm_config) -> nn.Module:
         model = DummyLoRAModel(
             OrderedDict(
                 [
@@ Expand Down Expand Up / @@ -115,7 +115,7 @@ def dummy_model() -> nn.Module: @@
     @pytest.fixture
-    def dummy_model_gate_up() -> nn.Module:
+    def dummy_model_gate_up(default_vllm_config) -> nn.Module:
         model = DummyLoRAModel(
             OrderedDict(
                 [
@@ Expand Down @@

tests/lora/test_layers.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -252,7 +252,9 @@ def check_punica_wrapper(punica_wrapper) -> bool:
  
    @pytest.mark.parametrize("device", DEVICES)

    @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])

    @pytest.mark.parametrize("stage", STAGES)

    def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:

    def test_embeddings(

        default_vllm_config, dist_init, num_loras, device, vocab_size, stage

    ) -> None:

        # For multi-GPU testing of Triton kernel, we must explicitly set the CUDA

        # device, see: https://github.com/triton-lang/triton/issues/2925

        # Same below.

    @@ -353,7 +355,7 @@ def create_random_embedding_layer():
  
    @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512])

    @pytest.mark.parametrize("stage", STAGES)

    def test_lm_head_logits_processor(

        dist_init, num_loras, device, vocab_size, stage

        default_vllm_config, dist_init, num_loras, device, vocab_size, stage

    ) -> None:

        if current_platform.is_cuda_alike():

            torch.cuda.set_device(device)

    @@ -470,6 +472,7 @@ def _pretest():
  
    @pytest.mark.parametrize("device", DEVICES)

    @pytest.mark.parametrize("stage", STAGES)

    def test_linear_replicated(

        default_vllm_config,

        dist_init,

        num_loras,

        device,

    @@ -580,7 +583,7 @@ def create_random_linear_replicated_layer():
  
    @pytest.mark.parametrize("device", DEVICES)

    @pytest.mark.parametrize("stage", STAGES)

    def test_linear_parallel(

        dist_init, num_loras, orientation, fully_shard, device, stage

        default_vllm_config, dist_init, num_loras, orientation, fully_shard, device, stage

    ) -> None:

        if current_platform.is_cuda_alike():

            torch.cuda.set_device(device)

    @@ -705,7 +708,7 @@ def create_random_linear_parallel_layer():
  
    @pytest.mark.parametrize("device", DEVICES)

    @pytest.mark.parametrize("stage", STAGES)

    def test_column_parallel_packed(

        dist_init, num_loras, repeats, fully_shard, device, stage

        default_vllm_config, dist_init, num_loras, repeats, fully_shard, device, stage

    ) -> None:

        if current_platform.is_cuda_alike():

            torch.cuda.set_device(device)

    @@ -851,7 +854,7 @@ class FakeConfig:
  
    @pytest.mark.parametrize(

        "seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS))

    )

    def test_vocab_parallel_embedding_indices(tp_size, seed):

    def test_vocab_parallel_embedding_indices(tp_size, seed, default_vllm_config):

        random.seed(seed)

        vocab_size = random.randint(4000, 64000)

        added_vocab_size = random.randint(0, 1024)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[Misc] Fix `Current vLLM config is not set.` warnings, assert to avoid issues in the future #31747

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Uh oh!

[Misc] Fix `Current vLLM config is not set.` warnings, assert to avoid issues in the future #31747

Uh oh!

[Misc] Fix `Current vLLM config is not set.` warnings, assert to avoid issues in the future #31747

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

[Misc] Fix `Current vLLM config is not set.` warnings, assert to avoid issues in the future #31747

[Misc] Fix `Current vLLM config is not set.` warnings, assert to avoid issues in the future #31747