diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 51dff720b307..3526099dc7dc 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -371,13 +371,15 @@ def autograd_cache_key(*args, **kwargs): logger.info_once( "Cache the graph of compile range %s for later use", str(compile_range), + scope="local", ) - logger.debug( + logger.debug_once( "Store the %s-th graph for compile range%s from %s via handle %s", graph_index, str(compile_range), self.compiler.name, handle, + scope="local", ) # after compiling the last graph, record the end time diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 9f6284c4b389..584080ae12a0 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -228,9 +228,10 @@ def __post_init__(self, max_model_len: int, is_encoder_decoder: bool) -> None: self.encoder_cache_size = self.max_num_batched_tokens if self.enable_chunked_prefill: - logger.info( + logger.info_once( "Chunked prefill is enabled with max_num_batched_tokens=%d.", self.max_num_batched_tokens, + scope="local", ) if self.max_num_partial_prefills > 1: diff --git a/vllm/model_executor/layers/attention/mm_encoder_attention.py b/vllm/model_executor/layers/attention/mm_encoder_attention.py index 46d461c38b3f..6755e9af9e65 100644 --- a/vllm/model_executor/layers/attention/mm_encoder_attention.py +++ b/vllm/model_executor/layers/attention/mm_encoder_attention.py @@ -227,7 +227,9 @@ def __init__( if self.attn_backend == AttentionBackendEnum.FLASHINFER: _get_flashinfer_workspace_buffer() - logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.") + logger.info_once( + f"Using {self.attn_backend} for MMEncoderAttention.", scope="local" + ) @classmethod def enabled(cls) -> bool: diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index bbe30c71903f..d84b81cb12db 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -188,14 +188,15 @@ def __init__(self) -> None: use_flashinfer = supports_flashinfer if use_flashinfer: - logger.info_once("Using FlashInfer GDN prefill kernel") + logger.info_once("Using FlashInfer GDN prefill kernel", scope="local") logger.info_once( "FlashInfer GDN prefill kernel is JIT-compiled; first run may " "take a while to compile. Set `--gdn-prefill-backend triton` to " - "avoid JIT compile time." + "avoid JIT compile time.", + scope="local", ) else: - logger.info_once("Using Triton/FLA GDN prefill kernel") + logger.info_once("Using Triton/FLA GDN prefill kernel", scope="local") self._forward_method = ( self.forward_cuda if use_flashinfer else self.forward_native diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 2025c41ab8d9..a23309333058 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -371,7 +371,8 @@ def get_vit_attn_backend( ) if is_backend_supported: logger.info_once( - f"Using backend {vit_attn_backend} for vit attention" + f"Using backend {vit_attn_backend} for vit attention", + scope="local", ) return vit_attn_backend except ImportError: diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 95336034caf7..8c7257bfb136 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -994,12 +994,13 @@ def set_multiprocessing_worker_envs(): "OMP_NUM_THREADS" not in os.environ and (current_parallelism := torch.get_num_threads()) > default_omp_num_threads ): - logger.warning( + logger.warning_once( "Reducing Torch parallelism from %d threads to %d to avoid " "unnecessary CPU contention. Set OMP_NUM_THREADS in the " "external environment to tune this value as needed.", current_parallelism, default_omp_num_threads, + scope="local", ) os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads) torch.set_num_threads(default_omp_num_threads) diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py index 688c16a3133c..051fe42155ee 100644 --- a/vllm/v1/worker/dp_utils.py +++ b/vllm/v1/worker/dp_utils.py @@ -28,7 +28,8 @@ def _get_device_and_group(parallel_config: ParallelConfig): # this optimization if we run into this case. if parallel_config.disable_nccl_for_dp_synchronization: logger.info_once( - "Using CPU all reduce to synchronize DP padding between ranks." + "Using CPU all reduce to synchronize DP padding between ranks.", + scope="local", ) device = "cpu" group = get_dp_group().cpu_group diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 22459bc49ef7..77951a743500 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5510,13 +5510,14 @@ def profile_run(self) -> None: dummy_modality ] - logger.info( + logger.info_once( "Encoder cache will be initialized with a " "budget of %s tokens, and profiled with " "%s %s items of the maximum feature size.", encoder_budget, max_mm_items_per_batch, dummy_modality, + scope="local", ) # Create dummy batch of multimodal inputs.