diff --git a/examples/online_serving/text_to_image/README.md b/examples/online_serving/text_to_image/README.md index a4f1ad6332..744b7b2921 100644 --- a/examples/online_serving/text_to_image/README.md +++ b/examples/online_serving/text_to_image/README.md @@ -116,6 +116,7 @@ Use `extra_body` to pass generation parameters: | `seed` | int | None | Random seed (reproducible) | | `negative_prompt` | str | None | Negative prompt | | `num_outputs_per_prompt` | int | 1 | Number of images to generate | +| `--cfg-parallel-size` | int | 1 | Number of GPUs for CFG parallelism | ## Response Format diff --git a/vllm_omni/entrypoints/async_omni.py b/vllm_omni/entrypoints/async_omni.py index e41569281c..3c275147fa 100644 --- a/vllm_omni/entrypoints/async_omni.py +++ b/vllm_omni/entrypoints/async_omni.py @@ -132,9 +132,10 @@ def _create_default_diffusion_stage_cfg(self, kwargs: dict[str, Any]) -> dict[st ring_degree = kwargs.get("ring_degree") or 1 sequence_parallel_size = kwargs.get("sequence_parallel_size") tensor_parallel_size = kwargs.get("tensor_parallel_size") or 1 + cfg_parallel_size = kwargs.get("cfg_parallel_size") or 1 if sequence_parallel_size is None: sequence_parallel_size = ulysses_degree * ring_degree - num_devices = sequence_parallel_size * tensor_parallel_size + num_devices = sequence_parallel_size * tensor_parallel_size * cfg_parallel_size for i in range(1, num_devices): devices += f",{i}" parallel_config = DiffusionParallelConfig( @@ -144,7 +145,7 @@ def _create_default_diffusion_stage_cfg(self, kwargs: dict[str, Any]) -> dict[st sequence_parallel_size=sequence_parallel_size, ulysses_degree=ulysses_degree, ring_degree=ring_degree, - cfg_parallel_size=1, + cfg_parallel_size=cfg_parallel_size, ) default_stage_cfg = [ { diff --git a/vllm_omni/entrypoints/cli/serve.py b/vllm_omni/entrypoints/cli/serve.py index 3b222c8179..c3a37e3c82 100644 --- a/vllm_omni/entrypoints/cli/serve.py +++ b/vllm_omni/entrypoints/cli/serve.py @@ -208,6 +208,9 @@ def subparser_init(self, 
subparsers: argparse._SubParsersAction) -> FlexibleArgu default=None, help="Scheduler flow_shift for video models (e.g., 5.0 for 720p, 12.0 for 480p).", ) + omni_config_group.add_argument( + "--cfg-parallel-size", type=int, default=1, help="Number of GPUs for CFG parallel computation" + ) return serve_parser diff --git a/vllm_omni/entrypoints/omni_stage.py b/vllm_omni/entrypoints/omni_stage.py index 804ab7b7fb..a2790cd06e 100644 --- a/vllm_omni/entrypoints/omni_stage.py +++ b/vllm_omni/entrypoints/omni_stage.py @@ -474,12 +474,14 @@ def _stage_worker( data_parallel_size = parallel_config.get("data_parallel_size", 1) prefill_context_parallel_size = 1 # not used for diffusion sequence_parallel_size = parallel_config.get("sequence_parallel_size", 1) + cfg_parallel_size = parallel_config.get("cfg_parallel_size", 1) else: tensor_parallel_size = engine_args.get("tensor_parallel_size", 1) pipeline_parallel_size = engine_args.get("pipeline_parallel_size", 1) data_parallel_size = engine_args.get("data_parallel_size", 1) prefill_context_parallel_size = engine_args.get("prefill_context_parallel_size", 1) sequence_parallel_size = 1 # not use in omni model + cfg_parallel_size = 1 # not used in omni model # Calculate total number of devices needed for this stage # For a single stage worker: @@ -488,7 +490,8 @@ def _stage_worker( # - DP: replicates model, but each replica uses TP devices # - PCP: context parallelism, typically uses TP devices # - SP: sequence parallelism, typically uses TP devices - # The number of devices per stage is determined by TP * PP * DP * PCP * SP size + # - CFG: Classifier-Free Guidance parallelism for diffusion models + # The number of devices per stage is determined by TP * PP * DP * PCP * SP * CFG size # (PP/DP/PCP are higher-level parallelism that don't add devices per stage) num_devices_per_stage = ( tensor_parallel_size @@ -496,6 +499,7 @@ def _stage_worker( * data_parallel_size * prefill_context_parallel_size * sequence_parallel_size + * 
cfg_parallel_size ) # Get physical device IDs from CUDA_VISIBLE_DEVICES