16 changes: 3 additions & 13 deletions vllm/compilation/backends.py
@@ -292,7 +292,6 @@ def compile(
"from the cache, took %.3f s",
str(compile_range),
elapsed,
scope="local",
)
return compiled_graph

@@ -377,15 +376,13 @@ def autograd_cache_key(*args, **kwargs):
logger.info_once(
"Cache the graph of compile range %s for later use",
str(compile_range),
scope="local",
)
logger.debug_once(
"Store the %s-th graph for compile range%s from %s via handle %s",
graph_index,
str(compile_range),
self.compiler.name,
handle,
scope="local",
)

# after compiling the last graph, record the end time
@@ -399,7 +396,6 @@ def autograd_cache_key(*args, **kwargs):
"Compiling a graph for compile range %s takes %.2f s",
str(compile_range),
elapsed,
scope="local",
)

return compiled_graph
@@ -1072,12 +1068,11 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any:
disable_cache = disable_cache or is_ngram_gpu_enabled

if disable_cache:
logger.info_once("vLLM's torch.compile cache is disabled.", scope="local")
logger.info_once("vLLM's torch.compile cache is disabled.")
else:
logger.info_once(
"Using cache directory: %s for vLLM's torch.compile",
local_cache_dir,
scope="local",
)

self.compiler_manager.initialize_cache(
@@ -1134,9 +1129,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any:
from .monitor import torch_compile_start_time

dynamo_time = time.perf_counter() - torch_compile_start_time
logger.info_once(
"Dynamo bytecode transform time: %.2f s", dynamo_time, scope="local"
)
logger.info_once("Dynamo bytecode transform time: %.2f s", dynamo_time)
if self.is_encoder:
self.compilation_config.encoder_compilation_time += dynamo_time
else:
@@ -1215,7 +1208,6 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any:
logger.info_once(
"Saved compiler manager cache in %.2f seconds.",
elapsed,
scope="local",
)

from torch._guards import detect_fake_mode
@@ -1254,9 +1246,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any:
with open(graph_path, "w") as f:
f.write(src)

logger.debug_once(
"Computation graph saved to %s", graph_path, scope="local"
)
logger.debug_once("Computation graph saved to %s", graph_path)

self._called = True
graph_to_serialize = (
1 change: 0 additions & 1 deletion vllm/compilation/decorators.py
@@ -665,7 +665,6 @@ def save_aot_compiled_function(self: type[_T]) -> None:
logger.info_once(
"saved AOT compiled function to %s",
self._aot_compilation_path,
scope="local",
)
except Exception as e:
logger.warning(
3 changes: 1 addition & 2 deletions vllm/compilation/monitor.py
@@ -45,7 +45,7 @@ def monitor_torch_compile(
else:
total_compile_time = time.perf_counter() - torch_compile_start_time
if compilation_config.mode == CompilationMode.VLLM_COMPILE:
logger.info_once(message, total_compile_time, scope="local")
logger.info_once(message, total_compile_time)
finally:
if depyf_cm is not None:
try:
@@ -76,7 +76,6 @@ def monitor_profiling_run() -> Generator[None, None, None]:
logger.info_once(
"Initial profiling/warmup run took %.2f s",
elapsed,
scope="local",
)


1 change: 0 additions & 1 deletion vllm/config/scheduler.py
@@ -239,7 +239,6 @@ def __post_init__(self, max_model_len: int, is_encoder_decoder: bool) -> None:
logger.info_once(
"Chunked prefill is enabled with max_num_batched_tokens=%d.",
self.max_num_batched_tokens,
scope="local",
)

if self.max_num_partial_prefills > 1:
11 changes: 1 addition & 10 deletions vllm/config/vllm.py
@@ -716,9 +716,7 @@ def __post_init__(self):
self.instance_id = f"{time.time_ns()}"

if self.performance_mode != "balanced":
logger.info_once(
"Performance mode set to '%s'.", self.performance_mode, scope="local"
)
logger.info_once("Performance mode set to '%s'.", self.performance_mode)

self.try_verify_and_update_config()

@@ -818,7 +816,6 @@ def __post_init__(self):
"Async scheduling not supported with %s-based "
"speculative decoding and will be disabled.",
self.speculative_config.method,
scope="local",
)
self.scheduler_config.async_scheduling = False
elif (
@@ -828,15 +825,13 @@
logger.warning_once(
"Async scheduling is not compatible with "
"disable_padded_drafter_batch=True and will be disabled.",
scope="local",
)
self.scheduler_config.async_scheduling = False
elif not executor_supports_async_sched:
logger.warning_once(
"Async scheduling will be disabled because it is not supported "
"with the `%s` distributed executor backend. ",
executor_backend,
scope="local",
)
self.scheduler_config.async_scheduling = False
else:
@@ -855,7 +850,6 @@
logger.info_once(
"Disabling NCCL for DP synchronization "
"when using async scheduling.",
scope="local",
)
self.parallel_config.disable_nccl_for_dp_synchronization = True
else:
@@ -870,7 +864,6 @@
logger.warning_once(
"Disabling cascade attention (not yet compatible with "
"async speculative decoding).",
scope="local",
)
self.model_config.disable_cascade_attn = True

@@ -1231,7 +1224,6 @@ def has_blocked_weights():
self.model_config.disable_cascade_attn = True
logger.warning_once(
"Disabling cascade attention when VLLM_BATCH_INVARIANT is enabled.",
scope="local",
)

if self.parallel_config.use_ubatching:
@@ -1418,7 +1410,6 @@ def _set_max_num_scheduled_tokens(self):
" performance. Consider increasing max_num_batched_tokens to"
" accommodate the additional draft token slots, or decrease"
" num_speculative_tokens or max_num_seqs.",
scope="local",
)

max_num_scheduled_tokens = self.scheduler_config.max_num_scheduled_tokens
4 changes: 1 addition & 3 deletions vllm/distributed/device_communicators/pynccl.py
@@ -108,9 +108,7 @@ def __init__(
if self.rank == 0:
# get the unique id from NCCL
self.unique_id = self.nccl.ncclGetUniqueId()
logger.info_once(
"vLLM is using nccl==%s", self.nccl.ncclGetVersion(), scope="local"
)
logger.info_once("vLLM is using nccl==%s", self.nccl.ncclGetVersion())
else:
# construct an empty unique id
self.unique_id = ncclUniqueId()
3 changes: 0 additions & 3 deletions vllm/engine/arg_utils.py
@@ -2254,7 +2254,6 @@ def _set_default_chunked_prefill_and_prefix_caching_args(
"This model does not officially support disabling chunked prefill. "
"Disabling this manually may cause the engine to crash "
"or produce incorrect outputs.",
scope="local",
)
elif (
model_config.runner_type == "pooling"
@@ -2265,7 +2264,6 @@ def _set_default_chunked_prefill_and_prefix_caching_args(
"This model does not officially support chunked prefill. "
"Enabling this manually may cause the engine to crash "
"or produce incorrect outputs.",
scope="local",
)

if self.enable_prefix_caching is None:
@@ -2284,7 +2282,6 @@ def _set_default_chunked_prefill_and_prefix_caching_args(
"This model does not officially support prefix caching. "
"Enabling this manually may cause the engine to crash "
"or produce incorrect outputs.",
scope="local",
)

# Disable chunked prefill and prefix caching for:
1 change: 0 additions & 1 deletion vllm/lora/model_manager.py
@@ -387,7 +387,6 @@ def _parent_module(module_name: str) -> str:
"LoRA is not supported for non-gated MoE gate module."
" %s will be ignored.",
module_name,
scope="local",
)
continue

1 change: 0 additions & 1 deletion vllm/model_executor/layers/attention/attention.py
@@ -332,7 +332,6 @@ def __init__(
logger.warning_once(
"Disabling prefix caching for FLASHINFER/TRITON_MLA "
"with batch invariance, as it is not yet supported.",
scope="local",
)
cache_config.enable_prefix_caching = False

17 changes: 5 additions & 12 deletions vllm/model_executor/layers/attention/mla_attention.py
@@ -427,7 +427,6 @@ def __init__(
logger.warning_once(
"Disabling prefix caching for TRITON_MLA / FLASHINFER "
"with batch invariance, as it is not yet supported.",
scope="local",
)
cache_config.enable_prefix_caching = False

@@ -1523,17 +1522,14 @@ def determine_prefill_query_data_type(

if use_fp8:
fp8_dtype = current_platform.fp8_dtype()
logger.info_once(
"FP8 prefill attention enabled: query data type is FP8", scope="local"
)
logger.info_once("FP8 prefill attention enabled: query data type is FP8")
return fp8_dtype
elif vllm_config.attention_config.use_prefill_query_quantization:
logger.info_once(
"Unable to perform FP8 prefill attention when"
" use_prefill_query_quantization is enabled. Please"
" ensure that --kv-cache-dtype is set to fp8 and your prefill"
" backend is compatible with FP8 attention.",
scope="local",
)
return model_dtype
elif (
@@ -1547,7 +1543,6 @@
"prefill latency. To enable, add: "
'--attention-config \'{"use_prefill_query_quantization"'
": true}'",
scope="local",
)

return model_dtype
@@ -2225,21 +2220,19 @@ def __init__(
)

if use_trtllm_ragged_deepseek_prefill():
logger.info_once(
"Using TRT-LLM ragged DeepSeek prefill for MLA", scope="local"
)
logger.info_once("Using TRT-LLM ragged DeepSeek prefill for MLA")
self._run_prefill_context_chunk = (
self._run_prefill_context_chunk_trtllm_ragged
)
self._run_prefill_new_tokens = self._run_prefill_new_tokens_trtllm_ragged
self._pad_v = False
elif use_flashinfer_prefill():
logger.info_once("Using FlashInfer prefill for MLA", scope="local")
logger.info_once("Using FlashInfer prefill for MLA")
self._run_prefill_context_chunk = self._run_prefill_context_chunk_fi
self._run_prefill_new_tokens = self._run_prefill_new_tokens_fi
self._pad_v = False
elif use_cudnn_prefill():
logger.info_once("Using CUDNN prefill for MLA", scope="local")
logger.info_once("Using CUDNN prefill for MLA")
self._run_prefill_context_chunk = self._run_prefill_context_chunk_cudnn
self._run_prefill_new_tokens = self._run_prefill_new_tokens_cudnn
self._pad_v = False
@@ -2250,7 +2243,7 @@
"available. Please install flash_attn or use "
"--attention-backend ROCM_AITER_MLA."
)
logger.info_once("Using FlashAttention prefill for MLA", scope="local")
logger.info_once("Using FlashAttention prefill for MLA")
self._run_prefill_context_chunk = self._run_prefill_context_chunk_fa
self._run_prefill_new_tokens = self._run_prefill_new_tokens_fa

Expand Down
@@ -227,9 +227,7 @@ def __init__(
if self.attn_backend == AttentionBackendEnum.FLASHINFER:
_get_flashinfer_workspace_buffer()

logger.info_once(
f"Using {self.attn_backend} for MMEncoderAttention.", scope="local"
)
logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.")

@classmethod
def enabled(cls) -> bool:
2 changes: 1 addition & 1 deletion vllm/model_executor/layers/batch_invariant.py
@@ -1020,7 +1020,7 @@ def override_envs_for_invariance(
"You are using a non-decode-invariant form of batch invariance. "
"This will not be invariant between prefill and decode."
)
logger.warning_once(warning, scope="local")
logger.warning_once(warning)
os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"

os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
@@ -369,7 +369,6 @@ def estimate_expected_m(
logger.warning_once(
"DPMetadata unavailable. Defaulting expected_m to "
f"{max_tokens_per_expert}.",
scope="local",
)
return max_tokens_per_expert

1 change: 0 additions & 1 deletion vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1091,7 +1091,6 @@ def get_moe_configs(
"Using default MoE config. Performance might be sub-optimal! "
"Config file not found at %s",
", ".join(config_file_paths),
scope="local",
)
return None

@@ -123,7 +123,6 @@ def post_init_setup(self, fused_experts: mk.FusedMoEExperts):
"NixlEPPrepareAndFinalize is setup to dispatch raw/unquantized "
f"activations despite ({fused_experts.__class__.__name__}) being able "
"to support quantized activations.",
scope="local",
)

def num_dispatchers(self) -> int:
14 changes: 6 additions & 8 deletions vllm/model_executor/layers/fused_moe/oracle/fp8.py
@@ -266,7 +266,7 @@ def _return_or_raise(
k_cls, config, weight_key, activation_key, activation_format
)
if supported:
logger.info_once(_make_log_backend(backend), scope="local")
logger.info_once(_make_log_backend(backend))
return backend, k_cls
raise ValueError(_make_log_unsupported(backend, reason))

@@ -337,12 +337,10 @@ def _return_or_raise(
)

if supported:
logger.info_once(_make_log_backend(backend), scope="local")
logger.info_once(_make_log_backend(backend))
return backend, k_cls
else:
logger.debug_once(
_make_log_unsupported(backend, reason), scope="local"
)
logger.debug_once(_make_log_unsupported(backend, reason))

raise NotImplementedError(
"Found VLLM_USE_FLASHINFER_MOE_FP8=1, but no "
@@ -396,10 +394,10 @@ def _return_or_raise(
activation_format,
)
if supported:
logger.info_once(_make_log_backend(backend), scope="local")
logger.info_once(_make_log_backend(backend))
return backend, k_cls
else:
logger.debug_once(_make_log_unsupported(backend, reason), scope="local")
logger.debug_once(_make_log_unsupported(backend, reason))

# TODO(rob): per discussion with TPU team, we need a way to register
# MoE backends by OOT plugins, rather than having an explicit list
@@ -580,7 +578,7 @@ def make_fp8_moe_kernel(
)
assert prepare_finalize is not None

logger.info_once("Using %s", prepare_finalize.__class__.__name__, scope="local")
logger.info_once("Using %s", prepare_finalize.__class__.__name__)

# Create Experts.
if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts: