vllm-project · MatthewBonanni · Mar 18, 2026 · Mar 17, 2026
@@ -371,13 +371,15 @@ def autograd_cache_key(*args, **kwargs):
                 logger.info_once(
                     "Cache the graph of compile range %s for later use",
                     str(compile_range),
+                    scope="local",
                 )
-            logger.debug(
+            logger.debug_once(
                 "Store the %s-th graph for compile range%s from %s via handle %s",
                 graph_index,
                 str(compile_range),
                 self.compiler.name,
                 handle,
+                scope="local",
             )
 
         # after compiling the last graph, record the end time

@@ -228,9 +228,10 @@ def __post_init__(self, max_model_len: int, is_encoder_decoder: bool) -> None:
         self.encoder_cache_size = self.max_num_batched_tokens
 
         if self.enable_chunked_prefill:
-            logger.info(
+            logger.info_once(
                 "Chunked prefill is enabled with max_num_batched_tokens=%d.",
                 self.max_num_batched_tokens,
+                scope="local",
             )
 
         if self.max_num_partial_prefills > 1:

@@ -227,7 +227,9 @@ def __init__(
         if self.attn_backend == AttentionBackendEnum.FLASHINFER:
             _get_flashinfer_workspace_buffer()
 
-        logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.")
+        logger.info_once(
+            f"Using {self.attn_backend} for MMEncoderAttention.", scope="local"
+        )
 
     @classmethod
     def enabled(cls) -> bool:

@@ -188,14 +188,15 @@ def __init__(self) -> None:
             use_flashinfer = supports_flashinfer
 
         if use_flashinfer:
-            logger.info_once("Using FlashInfer GDN prefill kernel")
+            logger.info_once("Using FlashInfer GDN prefill kernel", scope="local")
             logger.info_once(
                 "FlashInfer GDN prefill kernel is JIT-compiled; first run may "
                 "take a while to compile. Set `--gdn-prefill-backend triton` to "
-                "avoid JIT compile time."
+                "avoid JIT compile time.",
+                scope="local",
             )
         else:
-            logger.info_once("Using Triton/FLA GDN prefill kernel")
+            logger.info_once("Using Triton/FLA GDN prefill kernel", scope="local")
 
         self._forward_method = (
             self.forward_cuda if use_flashinfer else self.forward_native

diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
@@ -371,7 +371,8 @@ def get_vit_attn_backend(
                     )
                 if is_backend_supported:
                     logger.info_once(
-                        f"Using backend {vit_attn_backend} for vit attention"
+                        f"Using backend {vit_attn_backend} for vit attention",
+                        scope="local",
                     )
                     return vit_attn_backend
             except ImportError:

@@ -994,12 +994,13 @@ def set_multiprocessing_worker_envs():
         "OMP_NUM_THREADS" not in os.environ
         and (current_parallelism := torch.get_num_threads()) > default_omp_num_threads
     ):
-        logger.warning(
+        logger.warning_once(
             "Reducing Torch parallelism from %d threads to %d to avoid "
             "unnecessary CPU contention. Set OMP_NUM_THREADS in the "
             "external environment to tune this value as needed.",
             current_parallelism,
             default_omp_num_threads,
+            scope="local",
         )
         os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
         torch.set_num_threads(default_omp_num_threads)
@@ -28,7 +28,8 @@ def _get_device_and_group(parallel_config: ParallelConfig):
     # this optimization if we run into this case.
     if parallel_config.disable_nccl_for_dp_synchronization:
         logger.info_once(
-            "Using CPU all reduce to synchronize DP padding between ranks."
+            "Using CPU all reduce to synchronize DP padding between ranks.",
+            scope="local",
         )
         device = "cpu"
         group = get_dp_group().cpu_group

@@ -5510,13 +5510,14 @@ def profile_run(self) -> None:
                             dummy_modality
                         ]
 
-                        logger.info(
+                        logger.info_once(
                             "Encoder cache will be initialized with a "
                             "budget of %s tokens, and profiled with "
                             "%s %s items of the maximum feature size.",
                             encoder_budget,
                             max_mm_items_per_batch,
                             dummy_modality,
+                            scope="local",
                         )
 
                         # Create dummy batch of multimodal inputs.