Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 20 additions & 2 deletions vllm_omni/entrypoints/omni.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Literal, overload

import huggingface_hub
from omegaconf import OmegaConf
from tqdm.auto import tqdm
from vllm import SamplingParams
Expand Down Expand Up @@ -40,6 +41,9 @@
)
from vllm_omni.inputs.data import OmniDiffusionSamplingParams, OmniPromptType, OmniSamplingParams
from vllm_omni.metrics import OrchestratorAggregator, StageRequestStats
from vllm_omni.model_executor.model_loader.weight_utils import (
download_weights_from_hf_specific,
)
from vllm_omni.outputs import OmniRequestOutput

logger = init_logger(__name__)
Expand All @@ -66,14 +70,28 @@ def _dummy_snapshot_download(model_id):


def omni_snapshot_download(model_id) -> str:
# If it's already a local path, just return it
if os.path.exists(model_id):
return model_id
# TODO: this is just a workaround to quickly use modelscope; we should support
Comment thread
zzhuoxin1508 marked this conversation as resolved.
# modelscope in weight loading feature instead of using `snapshot_download`
if os.environ.get("VLLM_USE_MODELSCOPE", False):
from modelscope.hub.snapshot_download import snapshot_download

return snapshot_download(model_id)
else:
return _dummy_snapshot_download(model_id)
# For other cases (Hugging Face), perform a real download to ensure all
# necessary files (including *.pt for audio/diffusion) are available locally
Comment thread
zzhuoxin1508 marked this conversation as resolved.
# before stage workers are spawned. This prevents initialization timeouts.
try:
return download_weights_from_hf_specific(
model_name_or_path=model_id,
cache_dir=None,
allow_patterns=["*"],
require_all=True,
Comment on lines +86 to +90
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Avoid forcing full repo download in offline mode

When `HF_HUB_OFFLINE` is set, `download_weights_from_hf_specific` passes `local_files_only=True`, so Hugging Face will only use cached files and will raise an error if any requested file is missing. With `allow_patterns=["*"]` here, the orchestrator now requests the entire repo before spawning stages, which means a partially cached model that previously loaded from the subset it needed will now fail during startup in offline environments. This is a regression for offline workflows where only the weights are cached. Consider skipping the prefetch when offline, or falling back to the narrower weight patterns in that case.

Useful? React with 👍 / 👎.

)
except huggingface_hub.errors.RepositoryNotFoundError:
logger.warning(f"Repository not found for '{model_id}'.")
return model_id


class OmniBase:
Expand Down
27 changes: 21 additions & 6 deletions vllm_omni/model_executor/model_loader/weight_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def download_weights_from_hf_specific(
allow_patterns: list[str],
revision: str | None = None,
ignore_patterns: str | list[str] | None = None,
require_all: bool = False,
) -> str:
"""Download model weights from Hugging Face Hub. Users can specify the
allow_patterns to download only the necessary weights.
Expand All @@ -35,6 +36,9 @@ def download_weights_from_hf_specific(
ignore_patterns (Optional[Union[str, list[str]]]): The patterns to
filter out the weight files. Files matched by any of the patterns
will be ignored.
require_all (bool): If True, pass the whole allow_patterns list to a
single snapshot_download call. If False, try the patterns one at a
time and stop after the first pattern that matches any files.

Returns:
str: The path to the downloaded model weights.
Expand All @@ -48,20 +52,31 @@ def download_weights_from_hf_specific(
# downloading the same model weights at the same time.
with get_lock(model_name_or_path, cache_dir):
start_time = time.perf_counter()
for allow_pattern in allow_patterns:
if require_all:
hf_folder = snapshot_download(
model_name_or_path,
allow_patterns=allow_pattern,
allow_patterns=allow_patterns,
ignore_patterns=ignore_patterns,
cache_dir=cache_dir,
revision=revision,
local_files_only=local_only,
**download_kwargs,
)
# If we have downloaded weights for this allow_pattern,
# we don't need to check the rest.
if any(Path(hf_folder).glob(allow_pattern)):
break
else:
for allow_pattern in allow_patterns:
hf_folder = snapshot_download(
model_name_or_path,
allow_patterns=allow_pattern,
ignore_patterns=ignore_patterns,
cache_dir=cache_dir,
revision=revision,
local_files_only=local_only,
**download_kwargs,
)
# If we have downloaded weights for this allow_pattern,
# we don't need to check the rest (require_all is False in this branch).
if any(Path(hf_folder).glob(allow_pattern)):
break
time_taken = time.perf_counter() - start_time
if time_taken > 0.5:
logger.info(
Expand Down