diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index c3900ffc67d3..501436275a09 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -292,7 +292,6 @@ def compile( "from the cache, took %.3f s", str(compile_range), elapsed, - scope="local", ) return compiled_graph @@ -377,7 +376,6 @@ def autograd_cache_key(*args, **kwargs): logger.info_once( "Cache the graph of compile range %s for later use", str(compile_range), - scope="local", ) logger.debug_once( "Store the %s-th graph for compile range%s from %s via handle %s", @@ -385,7 +383,6 @@ def autograd_cache_key(*args, **kwargs): str(compile_range), self.compiler.name, handle, - scope="local", ) # after compiling the last graph, record the end time @@ -399,7 +396,6 @@ def autograd_cache_key(*args, **kwargs): "Compiling a graph for compile range %s takes %.2f s", str(compile_range), elapsed, - scope="local", ) return compiled_graph @@ -1072,12 +1068,11 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any: disable_cache = disable_cache or is_ngram_gpu_enabled if disable_cache: - logger.info_once("vLLM's torch.compile cache is disabled.", scope="local") + logger.info_once("vLLM's torch.compile cache is disabled.") else: logger.info_once( "Using cache directory: %s for vLLM's torch.compile", local_cache_dir, - scope="local", ) self.compiler_manager.initialize_cache( @@ -1134,9 +1129,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any: from .monitor import torch_compile_start_time dynamo_time = time.perf_counter() - torch_compile_start_time - logger.info_once( - "Dynamo bytecode transform time: %.2f s", dynamo_time, scope="local" - ) + logger.info_once("Dynamo bytecode transform time: %.2f s", dynamo_time) if self.is_encoder: self.compilation_config.encoder_compilation_time += dynamo_time else: @@ -1215,7 +1208,6 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any: logger.info_once( "Saved compiler manager cache in %.2f seconds.", elapsed, - scope="local", ) from torch._guards import detect_fake_mode @@ -1254,9 +1246,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any: with open(graph_path, "w") as f: f.write(src) - logger.debug_once( - "Computation graph saved to %s", graph_path, scope="local" - ) + logger.debug_once("Computation graph saved to %s", graph_path) self._called = True graph_to_serialize = ( diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 79daf00de66b..a9ecb321cb38 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -665,7 +665,6 @@ def save_aot_compiled_function(self: type[_T]) -> None: logger.info_once( "saved AOT compiled function to %s", self._aot_compilation_path, - scope="local", ) except Exception as e: logger.warning( diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py index f584f526f08f..a15f1d5fe73c 100644 --- a/vllm/compilation/monitor.py +++ b/vllm/compilation/monitor.py @@ -45,7 +45,7 @@ def monitor_torch_compile( else: total_compile_time = time.perf_counter() - torch_compile_start_time if compilation_config.mode == CompilationMode.VLLM_COMPILE: - logger.info_once(message, total_compile_time, scope="local") + logger.info_once(message, total_compile_time) finally: if depyf_cm is not None: try: @@ -76,7 +76,6 @@ def monitor_profiling_run() -> Generator[None, None, None]: logger.info_once( "Initial profiling/warmup run took %.2f s", elapsed, - scope="local", ) diff 
--git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index b9a48144ded4..fb6951ea7dd1 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -239,7 +239,6 @@ def __post_init__(self, max_model_len: int, is_encoder_decoder: bool) -> None: logger.info_once( "Chunked prefill is enabled with max_num_batched_tokens=%d.", self.max_num_batched_tokens, - scope="local", ) if self.max_num_partial_prefills > 1: diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 26506642561f..0726e93d2fef 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -716,9 +716,7 @@ def __post_init__(self): self.instance_id = f"{time.time_ns()}" if self.performance_mode != "balanced": - logger.info_once( - "Performance mode set to '%s'.", self.performance_mode, scope="local" - ) + logger.info_once("Performance mode set to '%s'.", self.performance_mode) self.try_verify_and_update_config() @@ -818,7 +816,6 @@ def __post_init__(self): "Async scheduling not supported with %s-based " "speculative decoding and will be disabled.", self.speculative_config.method, - scope="local", ) self.scheduler_config.async_scheduling = False elif ( @@ -828,7 +825,6 @@ def __post_init__(self): logger.warning_once( "Async scheduling is not compatible with " "disable_padded_drafter_batch=True and will be disabled.", - scope="local", ) self.scheduler_config.async_scheduling = False elif not executor_supports_async_sched: @@ -836,7 +832,6 @@ def __post_init__(self): "Async scheduling will be disabled because it is not supported " "with the `%s` distributed executor backend. ", executor_backend, - scope="local", ) self.scheduler_config.async_scheduling = False else: @@ -855,7 +850,6 @@ def __post_init__(self): logger.info_once( "Disabling NCCL for DP synchronization " "when using async scheduling.", - scope="local", ) self.parallel_config.disable_nccl_for_dp_synchronization = True else: @@ -870,7 +864,6 @@ def __post_init__(self): logger.warning_once( "Disabling cascade attention (not yet compatible with " "async speculative decoding).", - scope="local", ) self.model_config.disable_cascade_attn = True @@ -1231,7 +1224,6 @@ def has_blocked_weights(): self.model_config.disable_cascade_attn = True logger.warning_once( "Disabling cascade attention when VLLM_BATCH_INVARIANT is enabled.", - scope="local", ) if self.parallel_config.use_ubatching: @@ -1418,7 +1410,6 @@ def _set_max_num_scheduled_tokens(self): " performance. 
Consider increasing max_num_batched_tokens to" " accommodate the additional draft token slots, or decrease" " num_speculative_tokens or max_num_seqs.", - scope="local", ) max_num_scheduled_tokens = self.scheduler_config.max_num_scheduled_tokens diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 6ac3b9ea3c7c..990c808a9831 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -108,9 +108,7 @@ def __init__( if self.rank == 0: # get the unique id from NCCL self.unique_id = self.nccl.ncclGetUniqueId() - logger.info_once( - "vLLM is using nccl==%s", self.nccl.ncclGetVersion(), scope="local" - ) + logger.info_once("vLLM is using nccl==%s", self.nccl.ncclGetVersion()) else: # construct an empty unique id self.unique_id = ncclUniqueId() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7028b12dab32..e6528849b219 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -2254,7 +2254,6 @@ def _set_default_chunked_prefill_and_prefix_caching_args( "This model does not officially support disabling chunked prefill. " "Disabling this manually may cause the engine to crash " "or produce incorrect outputs.", - scope="local", ) elif ( model_config.runner_type == "pooling" @@ -2265,7 +2264,6 @@ def _set_default_chunked_prefill_and_prefix_caching_args( "This model does not officially support chunked prefill. " "Enabling this manually may cause the engine to crash " "or produce incorrect outputs.", - scope="local", ) if self.enable_prefix_caching is None: @@ -2284,7 +2282,6 @@ def _set_default_chunked_prefill_and_prefix_caching_args( "This model does not officially support prefix caching. " "Enabling this manually may cause the engine to crash " "or produce incorrect outputs.", - scope="local", ) # Disable chunked prefill and prefix caching for: diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index 3b58031dcbab..52ff8ebc91f3 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -387,7 +387,6 @@ def _parent_module(module_name: str) -> str: "LoRA is not supported for non-gated MoE gate module." 
" %s will be ignored.", module_name, - scope="local", ) continue diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py index d229e32be758..61fb687e4637 100644 --- a/vllm/model_executor/layers/attention/attention.py +++ b/vllm/model_executor/layers/attention/attention.py @@ -332,7 +332,6 @@ def __init__( logger.warning_once( "Disabling prefix caching for FLASHINFER/TRITON_MLA " "with batch invariance, as it is not yet supported.", - scope="local", ) cache_config.enable_prefix_caching = False diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py index 9d6ae6bf601e..5c7dc60fe15c 100644 --- a/vllm/model_executor/layers/attention/mla_attention.py +++ b/vllm/model_executor/layers/attention/mla_attention.py @@ -427,7 +427,6 @@ def __init__( logger.warning_once( "Disabling prefix caching for TRITON_MLA / FLASHINFER " "with batch invariance, as it is not yet supported.", - scope="local", ) cache_config.enable_prefix_caching = False @@ -1523,9 +1522,7 @@ def determine_prefill_query_data_type( if use_fp8: fp8_dtype = current_platform.fp8_dtype() - logger.info_once( - "FP8 prefill attention enabled: query data type is FP8", scope="local" - ) + logger.info_once("FP8 prefill attention enabled: query data type is FP8") return fp8_dtype elif vllm_config.attention_config.use_prefill_query_quantization: logger.info_once( @@ -1533,7 +1530,6 @@ def determine_prefill_query_data_type( " use_prefill_query_quantization is enabled. Please" " ensure that --kv-cache-dtype is set to fp8 and your prefill" " backend is compatible with FP8 attention.", - scope="local", ) return model_dtype elif ( @@ -1547,7 +1543,6 @@ def determine_prefill_query_data_type( "prefill latency. To enable, add: " '--attention-config \'{"use_prefill_query_quantization"' ": true}'", - scope="local", ) return model_dtype @@ -2225,21 +2220,19 @@ def __init__( ) if use_trtllm_ragged_deepseek_prefill(): - logger.info_once( - "Using TRT-LLM ragged DeepSeek prefill for MLA", scope="local" - ) + logger.info_once("Using TRT-LLM ragged DeepSeek prefill for MLA") self._run_prefill_context_chunk = ( self._run_prefill_context_chunk_trtllm_ragged ) self._run_prefill_new_tokens = self._run_prefill_new_tokens_trtllm_ragged self._pad_v = False elif use_flashinfer_prefill(): - logger.info_once("Using FlashInfer prefill for MLA", scope="local") + logger.info_once("Using FlashInfer prefill for MLA") self._run_prefill_context_chunk = self._run_prefill_context_chunk_fi self._run_prefill_new_tokens = self._run_prefill_new_tokens_fi self._pad_v = False elif use_cudnn_prefill(): - logger.info_once("Using CUDNN prefill for MLA", scope="local") + logger.info_once("Using CUDNN prefill for MLA") self._run_prefill_context_chunk = self._run_prefill_context_chunk_cudnn self._run_prefill_new_tokens = self._run_prefill_new_tokens_cudnn self._pad_v = False @@ -2250,7 +2243,7 @@ def __init__( "available. Please install flash_attn or use " "--attention-backend ROCM_AITER_MLA." 
) - logger.info_once("Using FlashAttention prefill for MLA", scope="local") + logger.info_once("Using FlashAttention prefill for MLA") self._run_prefill_context_chunk = self._run_prefill_context_chunk_fa self._run_prefill_new_tokens = self._run_prefill_new_tokens_fa diff --git a/vllm/model_executor/layers/attention/mm_encoder_attention.py b/vllm/model_executor/layers/attention/mm_encoder_attention.py index 6755e9af9e65..46d461c38b3f 100644 --- a/vllm/model_executor/layers/attention/mm_encoder_attention.py +++ b/vllm/model_executor/layers/attention/mm_encoder_attention.py @@ -227,9 +227,7 @@ def __init__( if self.attn_backend == AttentionBackendEnum.FLASHINFER: _get_flashinfer_workspace_buffer() - logger.info_once( - f"Using {self.attn_backend} for MMEncoderAttention.", scope="local" - ) + logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.") @classmethod def enabled(cls) -> bool: diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 4a88421e3b51..152333beecb7 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -1020,7 +1020,7 @@ def override_envs_for_invariance( "You are using a non-decode-invariant form of batch invariance. " "This will not be invariant between prefill and decode." ) - logger.warning_once(warning, scope="local") + logger.warning_once(warning) os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0" os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" diff --git a/vllm/model_executor/layers/fused_moe/experts/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/experts/batched_deep_gemm_moe.py index fad39b3e9d4a..7bd383b9cdac 100644 --- a/vllm/model_executor/layers/fused_moe/experts/batched_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/experts/batched_deep_gemm_moe.py @@ -369,7 +369,6 @@ def estimate_expected_m( logger.warning_once( "DPMetadata unavailable. Defaulting expected_m to " f"{max_tokens_per_expert}.", - scope="local", ) return max_tokens_per_expert diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index bf083eb9b55d..cf53907e2c3f 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1091,7 +1091,6 @@ def get_moe_configs( "Using default MoE config. Performance might be sub-optimal! 
" "Config file not found at %s", ", ".join(config_file_paths), - scope="local", ) return None diff --git a/vllm/model_executor/layers/fused_moe/nixl_ep_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/nixl_ep_prepare_finalize.py index dbc54e2c9def..a1068a752427 100644 --- a/vllm/model_executor/layers/fused_moe/nixl_ep_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/nixl_ep_prepare_finalize.py @@ -123,7 +123,6 @@ def post_init_setup(self, fused_experts: mk.FusedMoEExperts): "NixlEPPrepareAndFinalize is setup to dispatch raw/unquantized " f"activations despite ({fused_experts.__class__.__name__}) being able " "to support quantized activations.", - scope="local", ) def num_dispatchers(self) -> int: diff --git a/vllm/model_executor/layers/fused_moe/oracle/fp8.py b/vllm/model_executor/layers/fused_moe/oracle/fp8.py index 4420bb38731a..584c2bf79285 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py +++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py @@ -266,7 +266,7 @@ def _return_or_raise( k_cls, config, weight_key, activation_key, activation_format ) if supported: - logger.info_once(_make_log_backend(backend), scope="local") + logger.info_once(_make_log_backend(backend)) return backend, k_cls raise ValueError(_make_log_unsupported(backend, reason)) @@ -337,12 +337,10 @@ def _return_or_raise( ) if supported: - logger.info_once(_make_log_backend(backend), scope="local") + logger.info_once(_make_log_backend(backend)) return backend, k_cls else: - logger.debug_once( - _make_log_unsupported(backend, reason), scope="local" - ) + logger.debug_once(_make_log_unsupported(backend, reason)) raise NotImplementedError( "Found VLLM_USE_FLASHINFER_MOE_FP8=1, but no " @@ -396,10 +394,10 @@ def _return_or_raise( activation_format, ) if supported: - logger.info_once(_make_log_backend(backend), scope="local") + logger.info_once(_make_log_backend(backend)) return backend, k_cls else: - logger.debug_once(_make_log_unsupported(backend, reason), scope="local") + logger.debug_once(_make_log_unsupported(backend, reason)) # TODO(rob): per discussion with TPU team, we need a way to register # MoE backends by OOT plugins, rather than having an explicit list @@ -580,7 +578,7 @@ def make_fp8_moe_kernel( ) assert prepare_finalize is not None - logger.info_once("Using %s", prepare_finalize.__class__.__name__, scope="local") + logger.info_once("Using %s", prepare_finalize.__class__.__name__) # Create Experts. 
if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts: diff --git a/vllm/model_executor/layers/fused_moe/oracle/int8.py b/vllm/model_executor/layers/fused_moe/oracle/int8.py index efa2792b420b..cdb1be108b5d 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/int8.py +++ b/vllm/model_executor/layers/fused_moe/oracle/int8.py @@ -117,7 +117,7 @@ def _return_or_raise( k_cls, config, weight_key, activation_key, activation_format ) if supported: - logger.info_once(_make_log_backend(backend), scope="local") + logger.info_once(_make_log_backend(backend)) return backend, k_cls raise ValueError(_make_log_unsupported(backend, reason)) @@ -138,10 +138,10 @@ def _return_or_raise( activation_format, ) if supported: - logger.info_once(_make_log_backend(backend), scope="local") + logger.info_once(_make_log_backend(backend)) return backend, k_cls else: - logger.debug_once(_make_log_unsupported(backend, reason), scope="local") + logger.debug_once(_make_log_unsupported(backend, reason)) raise NotImplementedError( "No Int8 MoE backend supports the deployment configuration." @@ -193,7 +193,7 @@ def make_int8_moe_kernel( ) assert prepare_finalize is not None - logger.info_once("Using %s", prepare_finalize.__class__.__name__, scope="local") + logger.info_once("Using %s", prepare_finalize.__class__.__name__) # Create Experts. if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts: diff --git a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py index 13d7a902c30e..6306d0e2e9d7 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py +++ b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py @@ -269,7 +269,7 @@ def _return_or_raise( k_cls, config, weight_key, activation_key, activation_format ) if supported: - logger.info_once(_make_log_backend(backend), scope="local") + logger.info_once(_make_log_backend(backend)) return backend, k_cls raise ValueError(_make_log_unsupported(backend, reason)) @@ -363,10 +363,10 @@ def _return_or_raise( k_cls, config, kMxfp4Static, activation_key, activation_format ) if supported: - logger.info_once(_make_log_backend(backend), scope="local") + logger.info_once(_make_log_backend(backend)) return backend, k_cls else: - logger.debug_once(_make_log_unsupported(backend, reason), scope="local") + logger.debug_once(_make_log_unsupported(backend, reason)) if current_platform.is_xpu(): backend = Mxfp4MoeBackend.XPU @@ -861,7 +861,7 @@ def make_mxfp4_moe_kernel( ) assert prepare_finalize is not None - logger.info_once("Using %s", prepare_finalize.__class__.__name__, scope="local") + logger.info_once("Using %s", prepare_finalize.__class__.__name__) # Create Experts. 
if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts: diff --git a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py index 6d0b66cb9f53..724f6d5399bf 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py +++ b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py @@ -252,12 +252,10 @@ def _return_or_raise( activation_format, ) if supported: - logger.info_once(_make_log_backend(backend), scope="local") + logger.info_once(_make_log_backend(backend)) return backend, k_cls else: - logger.debug_once( - _make_log_unsupported(backend, reason), scope="local" - ) + logger.debug_once(_make_log_unsupported(backend, reason)) raise NotImplementedError( "Found VLLM_USE_FLASHINFER_MOE_FP4=1, but no " @@ -282,10 +280,10 @@ def _return_or_raise( ) if supported: - logger.info_once(_make_log_backend(backend), scope="local") + logger.info_once(_make_log_backend(backend)) return backend, k_cls else: - logger.debug_once(_make_log_unsupported(backend, reason), scope="local") + logger.debug_once(_make_log_unsupported(backend, reason)) raise NotImplementedError( "No NvFp4 MoE backend supports the deployment configuration." diff --git a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py index af7cb7baf963..cdfd6bb8c027 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py +++ b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py @@ -210,7 +210,7 @@ def _return_or_raise( k_cls, config, None, None, activation_format ) if supported: - logger.info_once(_make_log_backend(backend), scope="local") + logger.info_once(_make_log_backend(backend)) return backend, k_cls raise ValueError(_make_log_unsupported(backend, reason)) @@ -271,12 +271,10 @@ def _return_or_raise( k_cls, moe_config, None, None, activation_format ) if supported: - logger.info_once(_make_log_backend(backend), scope="local") + logger.info_once(_make_log_backend(backend)) return backend, k_cls else: - logger.debug_once( - _make_log_unsupported(backend, reason), scope="local" - ) + logger.debug_once(_make_log_unsupported(backend, reason)) raise NotImplementedError( "Found VLLM_USE_FLASHINFER_MOE_FP16=1, but no " @@ -298,10 +296,10 @@ def _return_or_raise( k_cls, moe_config, None, None, activation_format ) if supported: - logger.info_once(_make_log_backend(backend), scope="local") + logger.info_once(_make_log_backend(backend)) return backend, k_cls - logger.debug_once(_make_log_unsupported(backend, reason), scope="local") + logger.debug_once(_make_log_unsupported(backend, reason)) raise NotImplementedError( "No Unquantized MoE backend supports the deployment configuration." 
@@ -355,7 +353,7 @@ def make_unquantized_moe_kernel( ) assert prepare_finalize is not None - logger.info_once("Using %s", prepare_finalize.__class__.__name__, scope="local") + logger.info_once("Using %s", prepare_finalize.__class__.__name__) # Create Experts if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts: diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize/deepep_ll.py b/vllm/model_executor/layers/fused_moe/prepare_finalize/deepep_ll.py index 0c6e32ae4a53..058d09d23bf2 100644 --- a/vllm/model_executor/layers/fused_moe/prepare_finalize/deepep_ll.py +++ b/vllm/model_executor/layers/fused_moe/prepare_finalize/deepep_ll.py @@ -135,7 +135,6 @@ def post_init_setup(self, fused_experts: mk.FusedMoEExperts): "DeepEPLLPrepareAndFinalize is setup to dispatch raw/unquantized " f"activations despite ({fused_experts.__class__.__name__}) being able " "to support quantized activations.", - scope="local", ) def num_dispatchers(self) -> int: diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index c105badabcb4..227014e23973 100644 --- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -69,16 +69,14 @@ def __init__( # TODO: Remove this after more extensive testings with TP/DP # and other execution modes if envs.VLLM_DISABLE_SHARED_EXPERTS_STREAM: - logger.debug_once("Disabling MoE shared_experts cuda stream", scope="local") + logger.debug_once("Disabling MoE shared_experts cuda stream") self._stream = None else: # TODO(rob): enable shared expert overlap with non-cuda-alike. # aux_stream() returns None on non-cuda-alike platforms. self._stream = aux_stream() if self._stream is not None: - logger.debug_once( - "Enabled separate cuda stream for MoE shared_experts", scope="local" - ) + logger.debug_once("Enabled separate cuda stream for MoE shared_experts") @property def _disable_shared_experts_overlap(self) -> bool: diff --git a/vllm/model_executor/layers/mamba/gdn_linear_attn.py b/vllm/model_executor/layers/mamba/gdn_linear_attn.py index c74ca13024a8..7a0b54335baa 100644 --- a/vllm/model_executor/layers/mamba/gdn_linear_attn.py +++ b/vllm/model_executor/layers/mamba/gdn_linear_attn.py @@ -143,15 +143,14 @@ def __init__(self) -> None: use_flashinfer = supports_flashinfer if use_flashinfer: - logger.info_once("Using FlashInfer GDN prefill kernel", scope="local") + logger.info_once("Using FlashInfer GDN prefill kernel") logger.info_once( "FlashInfer GDN prefill kernel is JIT-compiled; first run may " "take a while to compile. 
Set `--gdn-prefill-backend triton` to " "avoid JIT compile time.", - scope="local", ) else: - logger.info_once("Using Triton/FLA GDN prefill kernel", scope="local") + logger.info_once("Using Triton/FLA GDN prefill kernel") self._forward_method = ( self.forward_cuda if use_flashinfer else self.forward_native diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py index 57ebb961d487..9d3e0e7a787f 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py @@ -44,10 +44,10 @@ def __init__(self, moe): self.use_cutlass_mxfp4 = CutlassExpertsMxfp4._supports_current_device() self.experts_cls: type[mk.FusedMoEExperts] if self.use_cutlass_mxfp4: - logger.info_once("Using CutlassExpertsMxfp4 for MXFP4 MoE", scope="local") + logger.info_once("Using CutlassExpertsMxfp4 for MXFP4 MoE") self.experts_cls = CutlassExpertsMxfp4 else: - logger.info_once("Using MarlinExperts for MXFP4 MoE", scope="local") + logger.info_once("Using MarlinExperts for MXFP4 MoE") self.experts_cls = MarlinExperts def create_weights( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_wna16_marlin.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_wna16_marlin.py index 216eed6372a9..81b7efaa6d7e 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_wna16_marlin.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_wna16_marlin.py @@ -87,7 +87,6 @@ def __init__( logger.info_once( f"Using {self.kernel_backend} backend for WNA16 MoE " f"(group_size={self.group_size}, num_bits={self.num_bits})", - scope="local", ) def get_weight_shape( diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 019bb45d65dc..b53c7cc9ac1a 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -83,7 +83,6 @@ def get_quant_method( logger.debug_once( "MXFP4 linear layer is not implemented - falling back to " "UnquantizedLinearMethod.", - scope="local", ) return UnquantizedLinearMethod() elif isinstance(layer, FusedMoE): @@ -92,7 +91,6 @@ def get_quant_method( logger.debug_once( "MXFP4 attention layer is not implemented. 
" "Skipping quantization for this layer.", - scope="local", ) return None diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py index 32c7a772f3fd..973f759698f0 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -111,7 +111,6 @@ def get_flashinfer_moe_backend() -> FlashinferMoeBackend: logger.info_once( "Flashinfer TRTLLM MOE backend is only supported on " "SM100 and later, using CUTLASS backend instead", - scope="local", ) return FlashinferMoeBackend.CUTLASS return backend_map[flashinfer_moe_backend] @@ -239,7 +238,6 @@ def align_fp4_moe_weights_for_fi( "Padding intermediate size from %d to %d for up/down projection weights.", intermediate, padded_intermediate, - scope="local", ) up_mult = 2 if is_act_and_mul else 1 @@ -285,7 +283,6 @@ def align_trtllm_fp4_moe_hidden_dim_for_fi( "performance degradation.", hidden_size, padded_hidden_size, - scope="local", ) padded_w13 = w13.new_zeros((num_experts, gate_up_dim, padded_hidden_size // 2)) @@ -331,7 +328,6 @@ def align_fp8_moe_weights_for_fi( "Padding intermediate size from %d to %d for up/down projection weights.", intermediate, padded_intermediate, - scope="local", ) up_mult = 2 if is_act_and_mul else 1 diff --git a/vllm/model_executor/model_loader/base_loader.py b/vllm/model_executor/model_loader/base_loader.py index d6c38664fde6..fb2f77d1b112 100644 --- a/vllm/model_executor/model_loader/base_loader.py +++ b/vllm/model_executor/model_loader/base_loader.py @@ -70,7 +70,6 @@ def load_model( logger.debug_once( "Peak GPU memory after loading weights: %s GiB", format_gib(peak_memory), - scope="local", ) # Process weights into kernel format. Note that when using online diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index 5c9c97f4b64a..037195b9063a 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -384,7 +384,6 @@ def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None: logger.info_once( "Loading weights took %.2f seconds", self.counter_after_loading_weights - self.counter_before_loading_weights, - scope="local", ) # We only enable strict check for non-quantized models # that have loaded weights tracking currently. 
diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py index a87731e8bc0b..87b4b72db2a1 100644 --- a/vllm/model_executor/model_loader/sharded_state_loader.py +++ b/vllm/model_executor/model_loader/sharded_state_loader.py @@ -157,7 +157,6 @@ def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None: logger.info_once( "Loading weights took %.2f seconds", counter_after_loading_weights - counter_before_loading_weights, - scope="local", ) if state_dict: raise ValueError(f"Missing keys {tuple(state_dict)} in loaded state!") diff --git a/vllm/model_executor/offloader/base.py b/vllm/model_executor/offloader/base.py index b8c1b6cfa48a..ceff60cd4cde 100644 --- a/vllm/model_executor/offloader/base.py +++ b/vllm/model_executor/offloader/base.py @@ -118,11 +118,9 @@ def set_offloader(instance: BaseOffloader) -> None: global _instance _instance = instance if isinstance(instance, NoopOffloader): - logger.debug_once( - "Offloader set to NoopOffloader (no offloading).", scope="local" - ) + logger.debug_once("Offloader set to NoopOffloader (no offloading).") else: - logger.info_once("Offloader set to %s", type(instance).__name__, scope="local") + logger.info_once("Offloader set to %s", type(instance).__name__) def create_offloader(offload_config: "OffloadConfig") -> BaseOffloader: diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index d79d31918204..4f9b9d7bf234 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -369,7 +369,6 @@ def get_attn_backend_cls( "Using %s attention backend out of potential backends: %s.", selected_backend.name, "[" + ", ".join(f"'{b[0].name}'" for b in valid_backends_priorities) + "]", - scope="local", ) return selected_backend.get_path() @@ -423,7 +422,6 @@ def get_vit_attn_backend( if is_backend_supported: logger.info_once( f"Using backend {vit_attn_backend} for vit attention", - scope="local", ) return vit_attn_backend except ImportError: diff --git a/vllm/profiler/wrapper.py b/vllm/profiler/wrapper.py index 7cd4d8874df2..201b45078492 100644 --- a/vllm/profiler/wrapper.py +++ b/vllm/profiler/wrapper.py @@ -63,7 +63,7 @@ def _call_stop(self) -> None: """Call _stop with error handling but no safeguards.""" try: self._stop() - logger.info_once("Profiler stopped successfully.", scope="local") + logger.info_once("Profiler stopped successfully.") except Exception as e: logger.warning("Failed to stop profiler: %s", e) self._running = False # Always mark as not running, assume stop worked @@ -93,7 +93,7 @@ def step(self) -> None: and self._delay_iters > 0 and self._active_iteration_count == self._delay_iters ): - logger.info_once("Starting profiler after delay...", scope="local") + logger.info_once("Starting profiler after delay...") self._call_start() # Call profiler step for schedule-based profiling @@ -109,9 +109,7 @@ def step(self) -> None: # Automatically stop the profiler after max iters # will be marked as not running, but leave as active so that stop # can clean up properly - logger.info_once( - "Max profiling iterations reached. Stopping profiler...", scope="local" - ) + logger.info_once("Max profiling iterations reached. 
Stopping profiler...") self._call_stop() return @@ -141,7 +139,7 @@ def stop(self) -> None: def shutdown(self) -> None: """Ensure profiler is stopped when shutting down.""" - logger.info_once("Shutting down profiler", scope="local") + logger.info_once("Shutting down profiler") if self._running: self.stop() @@ -176,7 +174,6 @@ def __init__( logger.info_once( "Torch profiling enabled. Traces will be saved to: %s", torch_profiler_trace_dir, - scope="local", ) logger.debug( "Profiler config: record_shapes=%s," @@ -216,7 +213,6 @@ def __init__( profiler_config.wait_iterations, profiler_config.warmup_iterations, profiler_config.active_iterations, - scope="local", ) self.profiler = torch.profiler.profile( diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index a2e10ea3951f..637e9ec37e08 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -106,16 +106,14 @@ def is_deep_gemm_e8m0_used() -> bool: _lazy_init() if _fp8_gemm_nt_impl is None: - logger.info_once( - "DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not found", scope="local" - ) + logger.info_once("DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not found") return False if envs.VLLM_USE_DEEP_GEMM_E8M0: - logger.info_once("DeepGEMM E8M0 enabled on current platform.", scope="local") + logger.info_once("DeepGEMM E8M0 enabled on current platform.") return True - logger.info_once("DeepGEMM E8M0 disabled on current configuration.", scope="local") + logger.info_once("DeepGEMM E8M0 disabled on current configuration.") return False diff --git a/vllm/utils/import_utils.py b/vllm/utils/import_utils.py index 31b63d1e6b49..6cf57c6894ab 100644 --- a/vllm/utils/import_utils.py +++ b/vllm/utils/import_utils.py @@ -66,14 +66,12 @@ def import_triton_kernels(): logger.debug_once( f"Loading module triton_kernels from {triton_kernels.__file__}.", - scope="local", ) elif _has_module("vllm.third_party.triton_kernels"): import vllm.third_party.triton_kernels as triton_kernels logger.debug_once( f"Loading module triton_kernels from {triton_kernels.__file__}.", - scope="local", ) sys.modules["triton_kernels"] = triton_kernels else: diff --git a/vllm/v1/attention/backends/fa_utils.py b/vllm/v1/attention/backends/fa_utils.py index db8cafeb7487..76f98965623d 100644 --- a/vllm/v1/attention/backends/fa_utils.py +++ b/vllm/v1/attention/backends/fa_utils.py @@ -118,7 +118,6 @@ def get_flash_attn_version( logger.warning_once( "Cannot use FA version 4 with batch invariance, " "defaulting to FA version 2.", - scope="local", ) fa_version = 2 diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 6af0fa7c4966..4926851903b0 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -637,7 +637,6 @@ def __init__( logger.info_once( "Using FlashAttention version %s", self.vllm_flash_attn_version, - scope="local", ) # Cache the batch invariant result for use in forward passes self.batch_invariant_enabled = envs.VLLM_BATCH_INVARIANT diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 3f6999b82a4d..8f4963fcc873 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -1334,7 +1334,7 @@ def _report_kv_cache_config( dcp_size, ) num_tokens_str = f"{num_tokens:,}" - logger.info_once("GPU KV cache size: %s tokens", num_tokens_str, scope="local") + logger.info_once("GPU KV cache size: %s tokens", num_tokens_str) max_model_len_str = f"{vllm_config.model_config.max_model_len:,}" max_concurrency = 
get_max_concurrency_for_kv_cache_config( vllm_config, kv_cache_config @@ -1343,7 +1343,6 @@ def _report_kv_cache_config( "Maximum concurrency for %s tokens per request: %.2fx", max_model_len_str, max_concurrency, - scope="local", ) @@ -1445,7 +1444,6 @@ def _auto_fit_max_model_len( "Auto-fit max_model_len: attention-free model, " "using derived max_model_len=%d", original_max, - scope="local", ) return @@ -1472,7 +1470,6 @@ def _auto_fit_max_model_len( "Auto-fit max_model_len: full model context length %d fits in " "available GPU memory", original_max, - scope="local", ) else: # Need to reduce max_model_len to fit in memory @@ -1483,7 +1480,6 @@ def _auto_fit_max_model_len( original_max, auto_fit_max, format_gib(limiting_worker_mem), - scope="local", ) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index c2c1a239adb2..6bf6910cc6f2 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -293,7 +293,6 @@ def _initialize_kv_caches(self, vllm_config: VllmConfig) -> KVCacheConfig: compile_time + encoder_compile_time, compile_time, encoder_compile_time, - scope="local", ) elif compile_time > 0: logger.info_once( @@ -301,13 +300,11 @@ def _initialize_kv_caches(self, vllm_config: VllmConfig) -> KVCacheConfig: "%.2f s (compilation: %.2f s)", elapsed, compile_time, - scope="local", ) else: logger.info_once( "init engine (profile, create kv cache, warmup model) took %.2f s", elapsed, - scope="local", ) return scheduler_kv_cache_config diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 52969783f091..db21d7cee779 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -1032,7 +1032,6 @@ def set_multiprocessing_worker_envs(): "external environment to tune this value as needed.", current_parallelism, default_omp_num_threads, - scope="local", ) os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads) torch.set_num_threads(default_omp_num_threads) diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py index 051fe42155ee..fbc88f81db8b 100644 --- a/vllm/v1/worker/dp_utils.py +++ b/vllm/v1/worker/dp_utils.py @@ -29,7 +29,6 @@ def _get_device_and_group(parallel_config: ParallelConfig): if parallel_config.disable_nccl_for_dp_synchronization: logger.info_once( "Using CPU all reduce to synchronize DP padding between ranks.", - scope="local", ) device = "cpu" group = get_dp_group().cpu_group diff --git a/vllm/v1/worker/gpu/eplb_utils.py b/vllm/v1/worker/gpu/eplb_utils.py index 61d70fafea33..4ffb081ca303 100644 --- a/vllm/v1/worker/gpu/eplb_utils.py +++ b/vllm/v1/worker/gpu/eplb_utils.py @@ -92,9 +92,7 @@ def maybe_register_model( if not is_mixture_of_experts(model): return False - logger.info_once( - "EPLB is enabled for model %s.", model_config.model, scope="local" - ) + logger.info_once("EPLB is enabled for model %s.", model_config.model) assert self.state is not None self.state.add_model(model, model_config) self._has_registered_models = True diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b6bc942fc857..386db4fecd4b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4855,7 +4855,6 @@ def load_model(self, load_dummy_weights: bool = False) -> None: "Model loading took %s GiB memory and %.6f seconds", format_gib(self.model_memory_usage), time_after_load - time_before_load, - scope="local", ) if not load_dummy_weights: prepare_communication_buffer_for_model(self.model) @@ -4989,7 +4988,7 @@ def 
reload_weights( ) # begin loading weights - logger.info_once("Reloading weights inplace...", scope="local") + logger.info_once("Reloading weights inplace...") if is_checkpoint_format: # load weights from checkpoint/ original model format initialize_layerwise_reload(model) @@ -5001,7 +5000,6 @@ def reload_weights( logger.warning_once( "Reloading with `is_checkpoint_format=True` requires that " "weights be in kernel format and already sharded", - scope="local", ) loaded_weights = set() for name, loaded_weight in weights_iterator: @@ -5015,7 +5013,6 @@ def reload_weights( logger.info_once( "Reloading and processing weights took %.2f seconds", diff_seconds, - scope="local", ) if self.model_config.quantization is None and loaded_weights is not None: weights_not_loaded = weights_to_load - loaded_weights @@ -5802,7 +5799,6 @@ def profile_run(self) -> None: encoder_budget, max_mm_items_per_batch, dummy_modality, - scope="local", ) # Create dummy batch of multimodal inputs. @@ -6099,7 +6095,6 @@ def capture_model(self) -> int: "Graph capturing finished in %.0f secs, took %.2f GiB", elapsed_time, cuda_graph_size / (1 << 30), - scope="local", ) return cuda_graph_size diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 98f3212bae0d..afbee95c4d7d 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -269,7 +269,7 @@ def init_device(self): ) if self.use_v2_model_runner: - logger.info_once("Using V2 Model Runner", scope="local") + logger.info_once("Using V2 Model Runner") # Set random seed. set_random_seed(self.model_config.seed) @@ -440,7 +440,6 @@ def determine_available_memory(self) -> int: logger.info_once( "Available KV cache memory: %s GiB", format_gib(self.available_kv_cache_memory_bytes), - scope="local", ) if cudagraph_memory_estimate > 0:
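
Every hunk above makes the same mechanical change: the `scope="local"` keyword argument is dropped from vLLM's once-per-process logging helpers (`info_once`, `warning_once`, `debug_once`), leaving the message and printf-style arguments untouched. A minimal sketch of the resulting call pattern, assuming the usual `vllm.logger.init_logger` module-level setup (that setup is an assumption; it is not shown in this diff):

    from vllm.logger import init_logger

    # Per-module logger, as used in the files touched above.
    logger = init_logger(__name__)

    # Before this patch the calls carried an extra keyword:
    #     logger.info_once("Using FlashInfer prefill for MLA", scope="local")
    # After this patch the same call relies on the helper's default scope:
    logger.info_once("Using FlashInfer prefill for MLA")

    # Format arguments still pass through unchanged; the value here is
    # hypothetical, purely for illustration.
    elapsed = 1.23
    logger.info_once("Dynamo bytecode transform time: %.2f s", elapsed)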