16 changes: 15 additions & 1 deletion vllm/config/__init__.py
@@ -388,6 +388,10 @@ class ModelConfig:
interleave_mm_strings: bool = False
"""Enable fully interleaved support for multimodal prompts, while using
--chat-template-content-format=string. Defaults to False."""
skip_mm_profiling: bool = False
Contributor

I'm a big fan of "positive flag names". Empirically, enable_xyz=False causes less cognitive overhead than disable_xyz=True. Maybe we can set this as enable_mm_profiling and default it to True instead?

Member Author
@ywang96 ywang96 Aug 15, 2025

We actually tried positive flags for a few things, but it ends up being a bit messy IMO (we have --enable-prefix-caching and --no-enable-prefix-caching).

Personally, I think vllm serve model_name --skip-mm-profiling is more intuitive than vllm serve model_name --enable-mm-profiling=False or vllm serve model_name --no-enable-mm-profiling, and it is more consistent with the other negative flags we have (e.g., disable_sliding_window, skip_tokenizer_init, etc.) when we want the positive flag to be the default behavior. What do you think?

Member
@DarkLight1337 DarkLight1337 Aug 15, 2025

I think from a user's perspective it is better to use --skip- rather than --no-enable-. But maybe we can adjust the argument parser to support both spellings while keeping a positive name for the Python variable.

Contributor

Ah okay. If the underlying parser uses the "--no-enable-xyz" style instead of "--enable-xyz=false" (similar to how C++ gflags work), then I guess "skip" is indeed cleaner.

Member

--arg and --no-arg is actually built-in Python behaviour, so we adopted it to be more standard from a Python perspective.

https://docs.python.org/3/library/argparse.html#argparse.BooleanOptionalAction

Member Author
@ywang96 ywang96 Aug 15, 2025

> --arg and --no-arg is actually built-in Python behaviour, so we adopted it to be more standard from a Python perspective.

Yeah - I meant more that having the default behavior be an explicit positive flag (instead of just having --disable-prefix-caching) seems a bit weird to me.

Member

Yeah I agree, I was just providing context :)
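For readers following this thread, here is a minimal, self-contained sketch of the two parser styles being discussed: argparse's built-in BooleanOptionalAction, which generates both --enable-mm-profiling and --no-enable-mm-profiling, and a negative --skip-mm-profiling flag that still stores into a positively named variable. This is illustrative only, not vLLM's actual parser setup, and the flag/attribute names are placeholders.

```python
import argparse

parser = argparse.ArgumentParser()

# Style 1: BooleanOptionalAction (Python 3.9+) generates both
# --enable-mm-profiling and --no-enable-mm-profiling automatically.
parser.add_argument(
    "--enable-mm-profiling",
    action=argparse.BooleanOptionalAction,
    default=True,
)

# Style 2: a negative CLI flag that keeps a positive Python attribute name,
# as suggested above.
parser.add_argument(
    "--skip-mm-profiling",
    dest="mm_profiling_enabled",
    action="store_false",
    default=True,
)

args = parser.parse_args(["--no-enable-mm-profiling", "--skip-mm-profiling"])
print(args.enable_mm_profiling)   # False
print(args.mm_profiling_enabled)  # False
```

Either way the Python-side attribute stays positive; the disagreement is only about which spelling the CLI exposes.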

"""When enabled, skips multimodal memory profiling during engine
initialization.
"""
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
@@ -837,7 +841,8 @@ def _init_multimodal_config(self) -> Optional["MultiModalConfig"]:
media_io_kwargs=self.media_io_kwargs,
mm_processor_kwargs=self.mm_processor_kwargs,
mm_processor_cache_gb=self.mm_processor_cache_gb,
interleave_mm_strings=self.interleave_mm_strings)
interleave_mm_strings=self.interleave_mm_strings,
skip_mm_profiling=self.skip_mm_profiling)

return None

@@ -2511,6 +2516,15 @@ class MultiModalConfig:
Enable fully interleaved support for multimodal prompts.
"""

skip_mm_profiling: bool = False
"""
When enabled, skips multimodal profiling during engine initialization.

This reduces engine startup time, but shifts to the user the responsibility
for estimating the peak memory usage of the multimodal encoder activations
and the embedding cache.
"""

def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
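As a usage note (a hedged sketch, not part of this PR): with the flag wired through EngineArgs below, it can be passed on the command line as vllm serve <model> --skip-mm-profiling, and — assuming LLM(...) keeps forwarding keyword arguments to EngineArgs as it does for other engine options — from the offline API roughly as follows. The model name is a placeholder.

```python
from vllm import LLM

# Placeholder model name; substitute any multimodal model you actually run.
llm = LLM(
    model="org/some-multimodal-model",
    skip_mm_profiling=True,  # new option added in this PR; defaults to False
)
```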
4 changes: 4 additions & 0 deletions vllm/engine/arg_utils.py
@@ -350,6 +350,7 @@ class EngineArgs:
MultiModalConfig.mm_processor_kwargs
disable_mm_preprocessor_cache: bool = False # DEPRECATED
mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb
skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
# LoRA fields
enable_lora: bool = False
enable_lora_bias: bool = LoRAConfig.bias_enabled
@@ -716,6 +717,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
multimodal_group.add_argument(
"--interleave-mm-strings",
**multimodal_kwargs["interleave_mm_strings"])
multimodal_group.add_argument("--skip-mm-profiling",
**multimodal_kwargs["skip_mm_profiling"])

# LoRA related configs
lora_kwargs = get_kwargs(LoRAConfig)
@@ -918,6 +921,7 @@ def create_model_config(self) -> ModelConfig:
limit_mm_per_prompt=self.limit_mm_per_prompt,
interleave_mm_strings=self.interleave_mm_strings,
media_io_kwargs=self.media_io_kwargs,
skip_mm_profiling=self.skip_mm_profiling,
use_async_output_proc=not self.disable_async_output_proc,
config_format=self.config_format,
mm_processor_kwargs=self.mm_processor_kwargs,
84 changes: 45 additions & 39 deletions vllm/v1/worker/gpu_model_runner.py
@@ -2479,50 +2479,56 @@ def _dummy_pooler_run(
def profile_run(self) -> None:
# Profile with multimodal encoder & encoder cache.
if self.supports_mm_inputs:
mm_budget = self.mm_budget
assert mm_budget is not None

# TODO: handle encoder-decoder models once we support them.
if (encoder_budget := mm_budget.get_encoder_budget()) > 0:
# NOTE: Currently model is profiled with a single non-text
# modality with the max possible input tokens even when
# it supports multiple.
(
dummy_modality,
max_tokens,
) = mm_budget.get_modality_with_max_tokens()
(
max_mm_items_per_prompt,
max_mm_items_per_batch,
) = mm_budget.get_max_items(dummy_modality, max_tokens)

if self.model_config.multimodal_config.skip_mm_profiling:
logger.info(
"Encoder cache will be initialized with a budget of "
"%s tokens, and profiled with %s %s items of the maximum "
"feature size.",
encoder_budget,
max_mm_items_per_batch,
dummy_modality,
)
"Skipping memory profiling for multimodal encoder and "
"encoder cache.")
else:
mm_budget = self.mm_budget
assert mm_budget is not None

# TODO: handle encoder-decoder models once we support them.
if (encoder_budget := mm_budget.get_encoder_budget()) > 0:
# NOTE: Currently model is profiled with a single non-text
# modality with the max possible input tokens even when
# it supports multiple.
(
dummy_modality,
max_tokens,
) = mm_budget.get_modality_with_max_tokens()
(
max_mm_items_per_prompt,
max_mm_items_per_batch,
) = mm_budget.get_max_items(dummy_modality, max_tokens)

logger.info(
"Encoder cache will be initialized with a budget of "
"%s tokens, and profiled with %s %s items of the "
"maximum feature size.",
encoder_budget,
max_mm_items_per_batch,
dummy_modality,
)

# Create dummy batch of multimodal inputs.
batched_dummy_mm_inputs = self._get_mm_dummy_batch(
dummy_modality,
max_mm_items_per_batch,
)
# Create dummy batch of multimodal inputs.
batched_dummy_mm_inputs = self._get_mm_dummy_batch(
dummy_modality,
max_mm_items_per_batch,
)

# Run multimodal encoder.
dummy_encoder_outputs = self.model.get_multimodal_embeddings(
**batched_dummy_mm_inputs)
# Run multimodal encoder.
dummy_encoder_outputs = \
self.model.get_multimodal_embeddings(
Contributor

Do we need a stream synchronize here? The TPU version seems to explicitly do a sync. Is that not needed for CUDA?

Member Author
@ywang96 ywang96 Aug 15, 2025

I'm not too familiar with why the TPU runner does that, but at least on CUDA the two models run on the same stream (this is by design, since we don't want the encoder to affect the decoder implicitly in any possible way).

**batched_dummy_mm_inputs)

sanity_check_mm_encoder_outputs(
dummy_encoder_outputs,
expected_num_items=max_mm_items_per_batch,
)
sanity_check_mm_encoder_outputs(
dummy_encoder_outputs,
expected_num_items=max_mm_items_per_batch,
)

# Cache the dummy encoder outputs.
self.encoder_cache["tmp"] = dict(
enumerate(dummy_encoder_outputs))
# Cache the dummy encoder outputs.
self.encoder_cache["tmp"] = dict(
enumerate(dummy_encoder_outputs))

# Add `is_profile` here to pre-allocate communication buffers
hidden_states, last_hidden_states \
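Since the diff above is mostly a re-indentation of the existing profiling code under a new else branch, here is a condensed sketch of what the GPU runner's profile_run now does for multimodal models. It is a summary of the changes above (not runnable on its own), with the logging and the per-prompt item count elided; all names are taken from the diff.

```python
def profile_run(self) -> None:
    if self.supports_mm_inputs:
        if self.model_config.multimodal_config.skip_mm_profiling:
            logger.info("Skipping memory profiling for multimodal encoder "
                        "and encoder cache.")
        else:
            mm_budget = self.mm_budget
            assert mm_budget is not None

            if (encoder_budget := mm_budget.get_encoder_budget()) > 0:
                # Profile a single non-text modality at its max feature size.
                (dummy_modality,
                 max_tokens) = mm_budget.get_modality_with_max_tokens()
                _, max_mm_items_per_batch = mm_budget.get_max_items(
                    dummy_modality, max_tokens)

                batched_dummy_mm_inputs = self._get_mm_dummy_batch(
                    dummy_modality, max_mm_items_per_batch)
                dummy_encoder_outputs = self.model.get_multimodal_embeddings(
                    **batched_dummy_mm_inputs)

                sanity_check_mm_encoder_outputs(
                    dummy_encoder_outputs,
                    expected_num_items=max_mm_items_per_batch)

                # Cache the dummy encoder outputs for the decoder dummy run.
                self.encoder_cache["tmp"] = dict(
                    enumerate(dummy_encoder_outputs))

    # ... the decoder dummy run and the rest of profile_run are unchanged ...
```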
106 changes: 56 additions & 50 deletions vllm/v1/worker/tpu_model_runner.py
@@ -1529,60 +1529,66 @@ def profile_run(
) -> None:
# Profile with multimodal encoder & encoder cache.
if self.supports_mm_inputs:
mm_budget = self.mm_budget
assert mm_budget is not None

# TODO: handle encoder-decoder models once we support them.
if (encoder_budget := mm_budget.get_encoder_budget()) > 0:
# NOTE: Currently model is profiled with a single non-text
# modality with the max possible input tokens even when
# it supports multiple.
(
dummy_modality,
max_tokens,
) = mm_budget.get_modality_with_max_tokens()
(
max_mm_items_per_prompt,
max_mm_items_per_batch,
) = mm_budget.get_max_items(dummy_modality, max_tokens)

if self.model_config.multimodal_config.skip_mm_profiling:
logger.info(
"Encoder cache will be initialized with a budget of "
"%s tokens, and profiled with %s %s items of the maximum "
"feature size.",
encoder_budget,
max_mm_items_per_batch,
dummy_modality,
)

# Create dummy batch of multimodal inputs.
batched_dummy_mm_inputs = self._get_mm_dummy_batch(
dummy_modality,
max_mm_items_per_batch,
)
"Skipping memory profiling for multimodal encoder and "
"encoder cache.")
else:
mm_budget = self.mm_budget
assert mm_budget is not None

# TODO: handle encoder-decoder models once we support them.
if (encoder_budget := mm_budget.get_encoder_budget()) > 0:
# NOTE: Currently model is profiled with a single non-text
# modality with the max possible input tokens even when
# it supports multiple.
(
dummy_modality,
max_tokens,
) = mm_budget.get_modality_with_max_tokens()
(
max_mm_items_per_prompt,
max_mm_items_per_batch,
) = mm_budget.get_max_items(dummy_modality, max_tokens)

logger.info(
"Encoder cache will be initialized with a budget of "
"%s tokens, and profiled with %s %s items of the "
"maximum feature size.",
encoder_budget,
max_mm_items_per_batch,
dummy_modality,
)

# Run multimodal encoder.
# Isolate encoder graph from post-processing to minimize
# impact of recompilation until it's fixed.
start = time.perf_counter()
xm.mark_step()
dummy_encoder_outputs = self.model.get_multimodal_embeddings(
**batched_dummy_mm_inputs)
xm.mark_step()
xm.wait_device_ops()
end = time.perf_counter()
logger.info(
"Multimodal Encoder profiling finished in in %.2f [secs].",
end - start)
# Create dummy batch of multimodal inputs.
batched_dummy_mm_inputs = self._get_mm_dummy_batch(
dummy_modality,
max_mm_items_per_batch,
)

sanity_check_mm_encoder_outputs(
dummy_encoder_outputs,
expected_num_items=max_mm_items_per_batch,
)
# Run multimodal encoder.
# Isolate encoder graph from post-processing to minimize
# impact of recompilation until it's fixed.
start = time.perf_counter()
xm.mark_step()
dummy_encoder_outputs = \
self.model.get_multimodal_embeddings(
**batched_dummy_mm_inputs)
xm.mark_step()
xm.wait_device_ops()
end = time.perf_counter()
logger.info(
"Multimodal Encoder profiling finished in %.2f [secs].",
end - start)

sanity_check_mm_encoder_outputs(
dummy_encoder_outputs,
expected_num_items=max_mm_items_per_batch,
)

# Cache the dummy encoder outputs.
self.encoder_cache["tmp"] = dict(
enumerate(dummy_encoder_outputs))
# Cache the dummy encoder outputs.
self.encoder_cache["tmp"] = dict(
enumerate(dummy_encoder_outputs))

# Trigger compilation for general shape.
self._dummy_run(num_tokens, self.num_reqs_max_model_len,
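On the synchronization question raised in the review thread above: the TPU runner brackets the encoder run with xm.mark_step() and xm.wait_device_ops(), which forces the lazily traced XLA graph to execute and finish on the device before the timer is read, while on CUDA the encoder and decoder intentionally share one stream. A minimal standalone sketch of that XLA timing pattern is below; the encoder call is replaced by a stub, and it assumes torch_xla is installed.

```python
import time

import torch
import torch_xla.core.xla_model as xm


def run_encoder_stub(device: torch.device) -> torch.Tensor:
    # Stand-in for model.get_multimodal_embeddings(**batched_dummy_mm_inputs).
    x = torch.randn(8, 1024, device=device)
    return x @ x.T


device = xm.xla_device()

start = time.perf_counter()
xm.mark_step()            # cut the lazy graph before the encoder runs
out = run_encoder_stub(device)
xm.mark_step()            # isolate the encoder graph from post-processing
xm.wait_device_ops()      # block until the device has actually finished
elapsed = time.perf_counter() - start

print(f"Encoder stub finished in {elapsed:.2f} s; output shape {tuple(out.shape)}")
```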