16 changes: 3 additions & 13 deletions vllm/compilation/backends.py
@@ -292,7 +292,6 @@ def compile(
"from the cache, took %.3f s",
str(compile_range),
elapsed,
scope="local",
)
return compiled_graph

@@ -377,15 +376,13 @@ def autograd_cache_key(*args, **kwargs):
logger.info_once(
"Cache the graph of compile range %s for later use",
str(compile_range),
scope="local",
)
logger.debug_once(
"Store the %s-th graph for compile range%s from %s via handle %s",
graph_index,
str(compile_range),
self.compiler.name,
handle,
scope="local",
)

# after compiling the last graph, record the end time
@@ -399,7 +396,6 @@ def autograd_cache_key(*args, **kwargs):
"Compiling a graph for compile range %s takes %.2f s",
str(compile_range),
elapsed,
scope="local",
)

return compiled_graph
@@ -1072,12 +1068,11 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any:
disable_cache = disable_cache or is_ngram_gpu_enabled

if disable_cache:
logger.info_once("vLLM's torch.compile cache is disabled.", scope="local")
logger.info_once("vLLM's torch.compile cache is disabled.")
else:
logger.info_once(
"Using cache directory: %s for vLLM's torch.compile",
local_cache_dir,
scope="local",
)

self.compiler_manager.initialize_cache(
@@ -1134,9 +1129,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any:
from .monitor import torch_compile_start_time

dynamo_time = time.perf_counter() - torch_compile_start_time
logger.info_once(
"Dynamo bytecode transform time: %.2f s", dynamo_time, scope="local"
)
logger.info_once("Dynamo bytecode transform time: %.2f s", dynamo_time)
if self.is_encoder:
self.compilation_config.encoder_compilation_time += dynamo_time
else:
@@ -1215,7 +1208,6 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any:
logger.info_once(
"Saved compiler manager cache in %.2f seconds.",
elapsed,
scope="local",
)

from torch._guards import detect_fake_mode
@@ -1254,9 +1246,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any:
with open(graph_path, "w") as f:
f.write(src)

logger.debug_once(
"Computation graph saved to %s", graph_path, scope="local"
)
logger.debug_once("Computation graph saved to %s", graph_path)

self._called = True
graph_to_serialize = (
1 change: 0 additions & 1 deletion vllm/compilation/decorators.py
@@ -665,7 +665,6 @@ def save_aot_compiled_function(self: type[_T]) -> None:
logger.info_once(
"saved AOT compiled function to %s",
self._aot_compilation_path,
scope="local",
)
except Exception as e:
logger.warning(
3 changes: 1 addition & 2 deletions vllm/compilation/monitor.py
@@ -45,7 +45,7 @@ def monitor_torch_compile(
else:
total_compile_time = time.perf_counter() - torch_compile_start_time
if compilation_config.mode == CompilationMode.VLLM_COMPILE:
logger.info_once(message, total_compile_time, scope="local")
logger.info_once(message, total_compile_time)
finally:
if depyf_cm is not None:
try:
@@ -76,7 +76,6 @@ def monitor_profiling_run() -> Generator[None, None, None]:
logger.info_once(
"Initial profiling/warmup run took %.2f s",
elapsed,
scope="local",
)


1 change: 0 additions & 1 deletion vllm/config/scheduler.py
@@ -239,7 +239,6 @@ def __post_init__(self, max_model_len: int, is_encoder_decoder: bool) -> None:
logger.info_once(
"Chunked prefill is enabled with max_num_batched_tokens=%d.",
self.max_num_batched_tokens,
scope="local",
)

if self.max_num_partial_prefills > 1:
11 changes: 1 addition & 10 deletions vllm/config/vllm.py
@@ -716,9 +716,7 @@ def __post_init__(self):
self.instance_id = f"{time.time_ns()}"

if self.performance_mode != "balanced":
logger.info_once(
"Performance mode set to '%s'.", self.performance_mode, scope="local"
)
logger.info_once("Performance mode set to '%s'.", self.performance_mode)

self.try_verify_and_update_config()

@@ -818,7 +816,6 @@ def __post_init__(self):
"Async scheduling not supported with %s-based "
"speculative decoding and will be disabled.",
self.speculative_config.method,
scope="local",
)
self.scheduler_config.async_scheduling = False
elif (
@@ -828,15 +825,13 @@
logger.warning_once(
"Async scheduling is not compatible with "
"disable_padded_drafter_batch=True and will be disabled.",
scope="local",
)
self.scheduler_config.async_scheduling = False
elif not executor_supports_async_sched:
logger.warning_once(
"Async scheduling will be disabled because it is not supported "
"with the `%s` distributed executor backend. ",
executor_backend,
scope="local",
)
self.scheduler_config.async_scheduling = False
else:
@@ -855,7 +850,6 @@
logger.info_once(
"Disabling NCCL for DP synchronization "
"when using async scheduling.",
scope="local",
)
self.parallel_config.disable_nccl_for_dp_synchronization = True
else:
@@ -870,7 +864,6 @@
logger.warning_once(
"Disabling cascade attention (not yet compatible with "
"async speculative decoding).",
scope="local",
)
self.model_config.disable_cascade_attn = True

@@ -1231,7 +1224,6 @@ def has_blocked_weights():
self.model_config.disable_cascade_attn = True
logger.warning_once(
"Disabling cascade attention when VLLM_BATCH_INVARIANT is enabled.",
scope="local",
)

if self.parallel_config.use_ubatching:
@@ -1418,7 +1410,6 @@ def _set_max_num_scheduled_tokens(self):
" performance. Consider increasing max_num_batched_tokens to"
" accommodate the additional draft token slots, or decrease"
" num_speculative_tokens or max_num_seqs.",
scope="local",
)

max_num_scheduled_tokens = self.scheduler_config.max_num_scheduled_tokens
4 changes: 1 addition & 3 deletions vllm/distributed/device_communicators/pynccl.py
@@ -108,9 +108,7 @@ def __init__(
if self.rank == 0:
# get the unique id from NCCL
self.unique_id = self.nccl.ncclGetUniqueId()
logger.info_once(
"vLLM is using nccl==%s", self.nccl.ncclGetVersion(), scope="local"
)
logger.info_once("vLLM is using nccl==%s", self.nccl.ncclGetVersion())
else:
# construct an empty unique id
self.unique_id = ncclUniqueId()
3 changes: 0 additions & 3 deletions vllm/engine/arg_utils.py
@@ -2254,7 +2254,6 @@ def _set_default_chunked_prefill_and_prefix_caching_args(
"This model does not officially support disabling chunked prefill. "
"Disabling this manually may cause the engine to crash "
"or produce incorrect outputs.",
scope="local",
)
elif (
model_config.runner_type == "pooling"
@@ -2265,7 +2264,6 @@ def _set_default_chunked_prefill_and_prefix_caching_args(
"This model does not officially support chunked prefill. "
"Enabling this manually may cause the engine to crash "
"or produce incorrect outputs.",
scope="local",
)

if self.enable_prefix_caching is None:
@@ -2284,7 +2282,6 @@ def _set_default_chunked_prefill_and_prefix_caching_args(
"This model does not officially support prefix caching. "
"Enabling this manually may cause the engine to crash "
"or produce incorrect outputs.",
scope="local",
)

# Disable chunked prefill and prefix caching for:
1 change: 0 additions & 1 deletion vllm/lora/model_manager.py
@@ -387,7 +387,6 @@ def _parent_module(module_name: str) -> str:
"LoRA is not supported for non-gated MoE gate module."
" %s will be ignored.",
module_name,
scope="local",
)
continue

1 change: 0 additions & 1 deletion vllm/model_executor/layers/attention/attention.py
@@ -332,7 +332,6 @@ def __init__(
logger.warning_once(
"Disabling prefix caching for FLASHINFER/TRITON_MLA "
"with batch invariance, as it is not yet supported.",
scope="local",
)
cache_config.enable_prefix_caching = False

17 changes: 5 additions & 12 deletions vllm/model_executor/layers/attention/mla_attention.py
@@ -427,7 +427,6 @@ def __init__(
logger.warning_once(
"Disabling prefix caching for TRITON_MLA / FLASHINFER "
"with batch invariance, as it is not yet supported.",
scope="local",
)
cache_config.enable_prefix_caching = False

@@ -1523,17 +1522,14 @@ def determine_prefill_query_data_type(

if use_fp8:
fp8_dtype = current_platform.fp8_dtype()
logger.info_once(
"FP8 prefill attention enabled: query data type is FP8", scope="local"
)
logger.info_once("FP8 prefill attention enabled: query data type is FP8")
return fp8_dtype
elif vllm_config.attention_config.use_prefill_query_quantization:
logger.info_once(
"Unable to perform FP8 prefill attention when"
" use_prefill_query_quantization is enabled. Please"
" ensure that --kv-cache-dtype is set to fp8 and your prefill"
" backend is compatible with FP8 attention.",
scope="local",
)
return model_dtype
elif (
@@ -1547,7 +1543,6 @@
"prefill latency. To enable, add: "
'--attention-config \'{"use_prefill_query_quantization"'
": true}'",
scope="local",
)

return model_dtype
@@ -2225,21 +2220,19 @@ def __init__(
)

if use_trtllm_ragged_deepseek_prefill():
logger.info_once(
"Using TRT-LLM ragged DeepSeek prefill for MLA", scope="local"
)
logger.info_once("Using TRT-LLM ragged DeepSeek prefill for MLA")
self._run_prefill_context_chunk = (
self._run_prefill_context_chunk_trtllm_ragged
)
self._run_prefill_new_tokens = self._run_prefill_new_tokens_trtllm_ragged
self._pad_v = False
elif use_flashinfer_prefill():
logger.info_once("Using FlashInfer prefill for MLA", scope="local")
logger.info_once("Using FlashInfer prefill for MLA")
self._run_prefill_context_chunk = self._run_prefill_context_chunk_fi
self._run_prefill_new_tokens = self._run_prefill_new_tokens_fi
self._pad_v = False
elif use_cudnn_prefill():
logger.info_once("Using CUDNN prefill for MLA", scope="local")
logger.info_once("Using CUDNN prefill for MLA")
self._run_prefill_context_chunk = self._run_prefill_context_chunk_cudnn
self._run_prefill_new_tokens = self._run_prefill_new_tokens_cudnn
self._pad_v = False
@@ -2250,7 +2243,7 @@
"available. Please install flash_attn or use "
"--attention-backend ROCM_AITER_MLA."
)
logger.info_once("Using FlashAttention prefill for MLA", scope="local")
logger.info_once("Using FlashAttention prefill for MLA")
self._run_prefill_context_chunk = self._run_prefill_context_chunk_fa
self._run_prefill_new_tokens = self._run_prefill_new_tokens_fa

Expand Down
@@ -227,9 +227,7 @@ def __init__(
if self.attn_backend == AttentionBackendEnum.FLASHINFER:
_get_flashinfer_workspace_buffer()

logger.info_once(
f"Using {self.attn_backend} for MMEncoderAttention.", scope="local"
)
logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.")

@classmethod
def enabled(cls) -> bool:
2 changes: 1 addition & 1 deletion vllm/model_executor/layers/batch_invariant.py
@@ -1020,7 +1020,7 @@ def override_envs_for_invariance(
"You are using a non-decode-invariant form of batch invariance. "
"This will not be invariant between prefill and decode."
)
logger.warning_once(warning, scope="local")
logger.warning_once(warning)
os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"

os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
@@ -369,7 +369,6 @@ def estimate_expected_m(
logger.warning_once(
"DPMetadata unavailable. Defaulting expected_m to "
f"{max_tokens_per_expert}.",
scope="local",
)
return max_tokens_per_expert

1 change: 0 additions & 1 deletion vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1091,7 +1091,6 @@ def get_moe_configs(
"Using default MoE config. Performance might be sub-optimal! "
"Config file not found at %s",
", ".join(config_file_paths),
scope="local",
)
return None

@@ -123,7 +123,6 @@ def post_init_setup(self, fused_experts: mk.FusedMoEExperts):
"NixlEPPrepareAndFinalize is setup to dispatch raw/unquantized "
f"activations despite ({fused_experts.__class__.__name__}) being able "
"to support quantized activations.",
scope="local",
)

def num_dispatchers(self) -> int:
14 changes: 6 additions & 8 deletions vllm/model_executor/layers/fused_moe/oracle/fp8.py
@@ -266,7 +266,7 @@ def _return_or_raise(
k_cls, config, weight_key, activation_key, activation_format
)
if supported:
logger.info_once(_make_log_backend(backend), scope="local")
logger.info_once(_make_log_backend(backend))
return backend, k_cls
raise ValueError(_make_log_unsupported(backend, reason))

@@ -337,12 +337,10 @@ def _return_or_raise(
)

if supported:
logger.info_once(_make_log_backend(backend), scope="local")
logger.info_once(_make_log_backend(backend))
return backend, k_cls
else:
logger.debug_once(
_make_log_unsupported(backend, reason), scope="local"
)
logger.debug_once(_make_log_unsupported(backend, reason))

raise NotImplementedError(
"Found VLLM_USE_FLASHINFER_MOE_FP8=1, but no "
@@ -396,10 +394,10 @@ def _return_or_raise(
activation_format,
)
if supported:
logger.info_once(_make_log_backend(backend), scope="local")
logger.info_once(_make_log_backend(backend))
return backend, k_cls
else:
logger.debug_once(_make_log_unsupported(backend, reason), scope="local")
logger.debug_once(_make_log_unsupported(backend, reason))

# TODO(rob): per discussion with TPU team, we need a way to register
# MoE backends by OOT plugins, rather than having an explicit list
@@ -580,7 +578,7 @@ def make_fp8_moe_kernel(
)
assert prepare_finalize is not None

logger.info_once("Using %s", prepare_finalize.__class__.__name__, scope="local")
logger.info_once("Using %s", prepare_finalize.__class__.__name__)

# Create Experts.
if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts: