Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion vllm/compilation/backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,13 +371,15 @@ def autograd_cache_key(*args, **kwargs):
logger.info_once(
"Cache the graph of compile range %s for later use",
str(compile_range),
scope="local",
)
logger.debug(
logger.debug_once(
"Store the %s-th graph for compile range%s from %s via handle %s",
graph_index,
str(compile_range),
self.compiler.name,
handle,
scope="local",
)

# after compiling the last graph, record the end time
Expand Down
3 changes: 2 additions & 1 deletion vllm/config/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,9 +228,10 @@ def __post_init__(self, max_model_len: int, is_encoder_decoder: bool) -> None:
self.encoder_cache_size = self.max_num_batched_tokens

if self.enable_chunked_prefill:
logger.info(
logger.info_once(
"Chunked prefill is enabled with max_num_batched_tokens=%d.",
self.max_num_batched_tokens,
scope="local",
)

if self.max_num_partial_prefills > 1:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,9 @@ def __init__(
if self.attn_backend == AttentionBackendEnum.FLASHINFER:
_get_flashinfer_workspace_buffer()

logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.")
logger.info_once(
f"Using {self.attn_backend} for MMEncoderAttention.", scope="local"
)

@classmethod
def enabled(cls) -> bool:
Expand Down
7 changes: 4 additions & 3 deletions vllm/model_executor/models/qwen3_next.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,14 +188,15 @@ def __init__(self) -> None:
use_flashinfer = supports_flashinfer

if use_flashinfer:
logger.info_once("Using FlashInfer GDN prefill kernel")
logger.info_once("Using FlashInfer GDN prefill kernel", scope="local")
logger.info_once(
"FlashInfer GDN prefill kernel is JIT-compiled; first run may "
"take a while to compile. Set `--gdn-prefill-backend triton` to "
"avoid JIT compile time."
"avoid JIT compile time.",
scope="local",
)
else:
logger.info_once("Using Triton/FLA GDN prefill kernel")
logger.info_once("Using Triton/FLA GDN prefill kernel", scope="local")

self._forward_method = (
self.forward_cuda if use_flashinfer else self.forward_native
Expand Down
3 changes: 2 additions & 1 deletion vllm/platforms/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,8 @@ def get_vit_attn_backend(
)
if is_backend_supported:
logger.info_once(
f"Using backend {vit_attn_backend} for vit attention"
f"Using backend {vit_attn_backend} for vit attention",
scope="local",
)
return vit_attn_backend
except ImportError:
Expand Down
3 changes: 2 additions & 1 deletion vllm/v1/executor/multiproc_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -994,12 +994,13 @@ def set_multiprocessing_worker_envs():
"OMP_NUM_THREADS" not in os.environ
and (current_parallelism := torch.get_num_threads()) > default_omp_num_threads
):
logger.warning(
logger.warning_once(
"Reducing Torch parallelism from %d threads to %d to avoid "
"unnecessary CPU contention. Set OMP_NUM_THREADS in the "
"external environment to tune this value as needed.",
current_parallelism,
default_omp_num_threads,
scope="local",
)
os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
torch.set_num_threads(default_omp_num_threads)
3 changes: 2 additions & 1 deletion vllm/v1/worker/dp_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ def _get_device_and_group(parallel_config: ParallelConfig):
# this optimization if we run into this case.
if parallel_config.disable_nccl_for_dp_synchronization:
logger.info_once(
"Using CPU all reduce to synchronize DP padding between ranks."
"Using CPU all reduce to synchronize DP padding between ranks.",
scope="local",
)
device = "cpu"
group = get_dp_group().cpu_group
Expand Down
3 changes: 2 additions & 1 deletion vllm/v1/worker/gpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -5510,13 +5510,14 @@ def profile_run(self) -> None:
dummy_modality
]

logger.info(
logger.info_once(
"Encoder cache will be initialized with a "
"budget of %s tokens, and profiled with "
"%s %s items of the maximum feature size.",
encoder_budget,
max_mm_items_per_batch,
dummy_modality,
scope="local",
)

# Create dummy batch of multimodal inputs.
Expand Down
Loading