Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ def main(args: argparse.Namespace) -> None:
omni = Omni(
model=args.model,
enable_diffusion_pipeline_profiler=args.enable_diffusion_pipeline_profiler,
mode="image-to-text",
)

prompt = "<|startoftext|>You are an assistant that understands images and outputs text.<img>" + args.prompt
Expand Down
8 changes: 7 additions & 1 deletion examples/offline_inference/text_to_image/text_to_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,12 @@ def parse_args() -> argparse.Namespace:
action="store_true",
help="Enable logging of diffusion pipeline stats.",
)
parser.add_argument(
"--diffusion-only",
action="store_true",
help="Start only the diffusion (DIT) stage for model support multiple startup modes,"
"e.g., HunyuanImage-3.0 support both DIT and AR+DIT.",
)
return parser.parse_args()


Expand Down Expand Up @@ -330,7 +336,7 @@ def main():
"parallel_config": parallel_config,
"enforce_eager": args.enforce_eager,
"enable_cpu_offload": args.enable_cpu_offload,
"mode": "text-to-image",
"diffusion_only": args.diffusion_only,
"log_stats": args.log_stats,
"enable_diffusion_pipeline_profiler": args.enable_diffusion_pipeline_profiler,
**lora_args,
Expand Down
6 changes: 6 additions & 0 deletions vllm_omni/entrypoints/cli/serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,12 @@ def subparser_init(self, subparsers: argparse._SubParsersAction) -> FlexibleArgu
default=1,
help="Number of replica groups for HSDP. Each group holds a full sharded copy.",
)
omni_config_group.add_argument(
"--diffusion-only",
action="store_true",
help="Start only the diffusion (DIT) stage for model support multiple startup modes,"
"e.g., HunyuanImage-3.0 support both DIT and AR+DIT.",
)

# Cache optimization parameters
omni_config_group.add_argument(
Expand Down
108 changes: 10 additions & 98 deletions vllm_omni/entrypoints/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,100 +300,6 @@ def load_stage_configs_from_yaml(config_path: str, base_engine_args: dict | None
return stage_args


def filter_stages(
config_path: str | None,
stage_configs: list,
kwargs: dict | None,
) -> list:
"""Filter stage configs by mode when YAML defines a `modes` section.

The YAML can define, e.g.:

modes:
- mode: text-to-image
stages: [1]
- mode: image-to-text
stages: [0]

When users pass `mode="image-to-text"` into Omni(**kwargs), only the stages
listed for that mode are returned. If no mode is provided, defaults to
"text-to-image". If no modes are defined or filtering fails, returns the
original stage_configs unchanged.

Args:
config_path: Path to the YAML config (used to read `modes`).
stage_configs: Loaded list of stage configs.
kwargs: Engine/caller kwargs; may contain "mode".

Returns:
Filtered list of stage configs (or original list if filtering not applied).
"""
if not stage_configs or config_path is None:
return stage_configs

try:
cfg = load_yaml_config(config_path)
yaml_modes = getattr(cfg, "modes", None)
if yaml_modes is None:
return stage_configs

mode_to_stage_ids: dict[str, list[int]] = {}
if yaml_modes is not None:
for entry in yaml_modes:
mode_name = None
stages = None
if hasattr(entry, "mode") or hasattr(entry, "stages"):
mode_name = getattr(entry, "mode", None)
stages = getattr(entry, "stages", None)
elif isinstance(entry, dict):
mode_name = entry.get("mode")
stages = entry.get("stages")

if mode_name is None or stages is None:
continue

if isinstance(stages, int):
stage_list = [stages]
else:
stage_list = list(stages)

mode_to_stage_ids[str(mode_name)] = [int(sid) for sid in stage_list]

# No modes section or empty mapping: use all stages and return early.
active_mode: str | None = None
if isinstance(kwargs, dict):
active_mode = kwargs.get("mode")

if active_mode is None:
active_mode = "text-to-image"

if active_mode not in mode_to_stage_ids:
logger.warning(
"Requested mode '%s' not found in config '%s'; available modes: %s. Using all stages.",
active_mode,
config_path,
sorted(mode_to_stage_ids.keys()),
)
return stage_configs

allowed_ids = set(mode_to_stage_ids[active_mode])
filtered_stage_configs = [sc for sc in stage_configs if getattr(sc, "stage_id", None) in allowed_ids]
if not filtered_stage_configs:
logger.warning(
"Mode '%s' in config '%s' resolved to stage ids %s, but none matched loaded stage_args. "
"Falling back to all stages.",
active_mode,
config_path,
sorted(allowed_ids),
)
return stage_configs

return filtered_stage_configs
except Exception as e:
logger.warning("Failed to apply mode-based stage filtering: %s", e)
return stage_configs


def load_and_resolve_stage_configs(
model: str,
stage_configs_path: str | None,
Expand All @@ -408,14 +314,22 @@ def load_and_resolve_stage_configs(
kwargs: Engine arguments to merge with stage configs
default_stage_cfg_factory: Optional callable that takes no args and returns
default stage config list when no configs are found

Returns:
Tuple of (config_path, stage_configs)

The priority of stage configs should be:
Diffusion-only:
1. stage_configs_path
2. configs from default_stage_cfg_factory (construct from kwargs)

Other:
1. stage_configs_path
2. configs from resolve_model_config_path
"""
if stage_configs_path is None:
config_path = resolve_model_config_path(model)
stage_configs = load_stage_configs_from_model(model, base_engine_args=kwargs)
if not stage_configs:
if not stage_configs or kwargs.get("diffusion_only", False):
if default_stage_cfg_factory is not None:
default_stage_cfg = default_stage_cfg_factory()
stage_configs = create_config(default_stage_cfg)
Expand All @@ -424,8 +338,6 @@ def load_and_resolve_stage_configs(
else:
config_path = stage_configs_path
stage_configs = load_stage_configs_from_yaml(stage_configs_path, base_engine_args=kwargs)

stage_configs = filter_stages(config_path, stage_configs, kwargs)
logger.debug(f"stage_configs: {stage_configs}")

return config_path, stage_configs
Expand Down
39 changes: 0 additions & 39 deletions vllm_omni/model_executor/stage_configs/hunyuan_image_3_moe.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,6 @@
# Stage 0: AR Model (vLLM implementation)

# The following config has been verified on 8x L40S-48G GPU.
modes:
- mode: text-to-image
stages: [1]
- mode: image-to-text
stages: [0]
stage_args:
- stage_id: 0
Comment on lines 5 to 6
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Keep diffusion stage in default Hunyuan config

This change removes the stage_id: 1 diffusion stage from the default hunyuan_image_3_moe config, so launching without diffusion_only now resolves to an AR-only pipeline and breaks image generation flows that previously relied on default config resolution. That regresses both offline usage (tests/e2e/offline_inference/test_expert_parallel.py expects Omni(model="tencent/HunyuanImage-3.0") to return images) and serving paths (vllm_omni/entrypoints/openai/api_server.py rejects pipelines with no diffusion stage for /v1/images/*). The diffusion stage should remain in the default config, with diffusion_only controlling config-construction priority rather than deleting the stage.

Useful? React with 👍 / 👎.

stage_type: llm # Use llm stage type for AR stages
Expand Down Expand Up @@ -42,40 +37,6 @@ stage_args:
seed: 42
detokenize: True
repetition_penalty: 1.1
- stage_id: 1
stage_type: diffusion
runtime:
process: true
devices: "0,1,2,3,4,5,6,7"
max_batch_size: 1
engine_args:
model_stage: diffusion
gpu_memory_utilization: 0.9
enforce_eager: true
engine_output_type: image
distributed_executor_backend: "mp"
enable_prefix_caching: false
max_num_batched_tokens: 32768
vae_use_slicing: false
vae_use_tiling: false
cache_backend: null
cache_config: null
enable_cache_dit_summary: false
parallel_config:
pipeline_parallel_size: 1
data_parallel_size: 1
tensor_parallel_size: 8
enable_expert_parallel: false
sequence_parallel_size: 1
ulysses_degree: 1
ring_degree: 1
cfg_parallel_size: 1
vae_patch_parallel_size: 1
use_hsdp: false
hsdp_shard_size: -1
hsdp_replicate_size: 1
final_output: true
final_output_type: image

# Top-level runtime config (concise): default windows and stage edges
runtime:
Expand Down
39 changes: 0 additions & 39 deletions vllm_omni/platforms/xpu/stage_configs/hunyuan_image_3_moe.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,6 @@
# Stage 0: AR Model (vLLM implementation)

# The following config has been verified on 8x Max 1550 GPU.
modes:
- mode: text-to-image
stages: [1]
- mode: image-to-text
stages: [0]
stage_args:
- stage_id: 0
stage_type: llm # Use llm stage type to launch OmniLLM
Expand Down Expand Up @@ -40,40 +35,6 @@ stage_args:
seed: 42
detokenize: True
repetition_penalty: 1.1
- stage_id: 1
stage_type: diffusion
runtime:
process: true
devices: "0,1,2,3,4,5,6,7"
max_batch_size: 1
engine_args:
model_stage: diffusion
gpu_memory_utilization: 0.9
enforce_eager: true
engine_output_type: image
distributed_executor_backend: "mp"
enable_prefix_caching: false
vae_use_slicing: false
vae_use_tiling: false
cache_backend: null
cache_config: null
enable_cache_dit_summary: false
quantization: "fp8"
parallel_config:
pipeline_parallel_size: 1
data_parallel_size: 1
tensor_parallel_size: 8
enable_expert_parallel: true
sequence_parallel_size: 1
ulysses_degree: 1
ring_degree: 1
cfg_parallel_size: 1
vae_patch_parallel_size: 1
use_hsdp: false
hsdp_shard_size: -1
hsdp_replicate_size: 1
final_output: true
final_output_type: image

# Top-level runtime config (concise): default windows and stage edges
runtime:
Expand Down
Loading