diff --git a/docs/user_guide/diffusion/parallelism_acceleration.md b/docs/user_guide/diffusion/parallelism_acceleration.md
index 37fd5b1d1c..20f09d2c09 100644
--- a/docs/user_guide/diffusion/parallelism_acceleration.md
+++ b/docs/user_guide/diffusion/parallelism_acceleration.md
@@ -24,22 +24,22 @@ The following table shows which models are currently supported by parallelism me
 ### ImageGen
 
-| Model                    | Model Identifier                     | Ulysses-SP | Ring-SP | CFG-Parallel | Tensor-Parallel | VAE-Patch-Parallel | Expert-Parallel |
-|--------------------------|--------------------------------------|:----------:|:-------:|:------------:|:---------------:|:------------------:|:---------------:|
-| **LongCat-Image**        | `meituan-longcat/LongCat-Image`      | ✅ | ✅ | ❌ | ✅ | ❌ | N/A |
-| **LongCat-Image-Edit**   | `meituan-longcat/LongCat-Image-Edit` | ✅ | ✅ | ❌ | ✅ | ❌ | N/A |
-| **Ovis-Image**           | `OvisAI/Ovis-Image`                  | ❌ | ❌ | ❌ | ❌ | ❌ | N/A |
-| **Qwen-Image**           | `Qwen/Qwen-Image`                    | ✅ | ✅ | ✅ | ✅ | ✅ | N/A |
-| **Qwen-Image-Edit**      | `Qwen/Qwen-Image-Edit`               | ✅ | ✅ | ✅ | ✅ | ❌ | N/A |
-| **Qwen-Image-Edit-2509** | `Qwen/Qwen-Image-Edit-2509`          | ✅ | ✅ | ✅ | ✅ | ❌ | N/A |
-| **Qwen-Image-Layered**   | `Qwen/Qwen-Image-Layered`            | ✅ | ✅ | ✅ | ✅ | ❌ | N/A |
-| **Z-Image**              | `Tongyi-MAI/Z-Image-Turbo`           | ✅ | ✅ | ❌ | ✅ (TP=2 only) | ✅ | N/A |
-| **Stable-Diffusion3.5**  | `stabilityai/stable-diffusion-3.5`   | ❌ | ❌ | ❌ | ✅ | ✅ | N/A |
-| **FLUX.2-klein**         | `black-forest-labs/FLUX.2-klein-4B`  | ✅ | ✅ | ❌ | ✅ | ❌ | N/A |
-| **FLUX.1-dev**           | `black-forest-labs/FLUX.1-dev`       | ❌ | ❌ | ✅ | ✅ | ❌ | N/A |
-| **FLUX.2-dev**           | `black-forest-labs/FLUX.2-dev`       | ❌ | ❌ | ❌ | ✅ | ❌ | N/A |
-| **HunyuanImage3.0**      | `tencent/HunyuanImage-3.0`, `tencent/HunyuanImage-3.0-Instruct` | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ |
-| **DreamID-Omni**         | `XuGuo699/DreamID-Omni`              | ❌ | ❌ | ✅ | ❌ | ❌ | N/A |
+| Model                    | Model Identifier                     | Ulysses-SP | Ring-SP | CFG-Parallel | Tensor-Parallel | VAE-Patch-Parallel | Expert-Parallel | HSDP |
+|--------------------------|--------------------------------------|:----------:|:-------:|:------------:|:---------------:|:------------------:|:---------------:|:----:|
+| **LongCat-Image**        | `meituan-longcat/LongCat-Image`      | ✅ | ✅ | ❌ | ✅ | ❌ | N/A | ❌ |
+| **LongCat-Image-Edit**   | `meituan-longcat/LongCat-Image-Edit` | ✅ | ✅ | ❌ | ✅ | ❌ | N/A | ❌ |
+| **Ovis-Image**           | `OvisAI/Ovis-Image`                  | ❌ | ❌ | ❌ | ❌ | ❌ | N/A | ❌ |
+| **Qwen-Image**           | `Qwen/Qwen-Image`                    | ✅ | ✅ | ✅ | ✅ | ✅ | N/A | ❌ |
+| **Qwen-Image-Edit**      | `Qwen/Qwen-Image-Edit`               | ✅ | ✅ | ✅ | ✅ | ❌ | N/A | ❌ |
+| **Qwen-Image-Edit-2509** | `Qwen/Qwen-Image-Edit-2509`          | ✅ | ✅ | ✅ | ✅ | ❌ | N/A | ❌ |
+| **Qwen-Image-Layered**   | `Qwen/Qwen-Image-Layered`            | ✅ | ✅ | ✅ | ✅ | ❌ | N/A | ❌ |
+| **Z-Image**              | `Tongyi-MAI/Z-Image-Turbo`           | ✅ | ✅ | ❌ | ✅ (TP=2 only) | ✅ | N/A | ❌ |
+| **Stable-Diffusion3.5**  | `stabilityai/stable-diffusion-3.5`   | ❌ | ❌ | ❌ | ✅ | ✅ | N/A | ❌ |
+| **FLUX.2-klein**         | `black-forest-labs/FLUX.2-klein-4B`  | ✅ | ✅ | ❌ | ✅ | ❌ | N/A | ✅ |
+| **FLUX.1-dev**           | `black-forest-labs/FLUX.1-dev`       | ❌ | ❌ | ✅ | ✅ | ❌ | N/A | ✅ |
+| **FLUX.2-dev**           | `black-forest-labs/FLUX.2-dev`       | ❌ | ❌ | ❌ | ✅ | ❌ | N/A | ✅ |
+| **HunyuanImage3.0**      | `tencent/HunyuanImage-3.0`, `tencent/HunyuanImage-3.0-Instruct` | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ |
+| **DreamID-Omni**         | `XuGuo699/DreamID-Omni`              | ❌ | ❌ | ✅ | ❌ | ❌ | N/A | ❌ |
 
 !!! note "TP Limitations for Diffusion Models"
     We currently implement Tensor Parallelism (TP) only for the DiT (Diffusion Transformer) blocks.
     This is because the `text_encoder` component in vLLM-Omni uses the original Transformers implementation, which does not yet support TP.
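For context on the new HSDP column: HSDP here denotes hybrid sharded data parallelism, i.e. FSDP-style parameter sharding inside one group of devices combined with DDP-style replication across groups. A minimal sketch of the 2-D device mesh this implies, in plain PyTorch; the 2 x 4 layout and the dimension names below are illustrative assumptions, not values taken from vLLM-Omni:

```python
# Illustrative only: an 8-GPU HSDP layout, run under `torchrun --nproc-per-node 8 ...`.
# The outer "replicate" dim behaves like DDP, the inner "shard" dim like FSDP.
from torch.distributed.device_mesh import init_device_mesh

mesh = init_device_mesh("cuda", (2, 4), mesh_dim_names=("replicate", "shard"))
print(mesh["replicate"].size(), mesh["shard"].size())  # 2, 4 on every rank
```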
diff --git a/vllm_omni/diffusion/models/flux/flux_transformer.py b/vllm_omni/diffusion/models/flux/flux_transformer.py
index 2979fd4f65..1a003b86c0 100644
--- a/vllm_omni/diffusion/models/flux/flux_transformer.py
+++ b/vllm_omni/diffusion/models/flux/flux_transformer.py
@@ -470,6 +470,12 @@ class FluxTransformer2DModel(nn.Module):
     # -- typically a transformer layer
     # used for torch compile optimizations
     _repeated_blocks = ["FluxTransformerBlock"]
+
+    @staticmethod
+    def _is_transformer_block(name: str, module) -> bool:
+        return ("transformer_blocks" in name or "single_transformer_blocks" in name) and name.split(".")[-1].isdigit()
+
+    _hsdp_shard_conditions = [_is_transformer_block]
     packed_modules_mapping = {
         "to_qkv": ["to_q", "to_k", "to_v"],
         "add_kv_proj": ["add_q_proj", "add_k_proj", "add_v_proj"],
diff --git a/vllm_omni/diffusion/models/flux/pipeline_flux.py b/vllm_omni/diffusion/models/flux/pipeline_flux.py
index d4793d34ec..3955fee120 100644
--- a/vllm_omni/diffusion/models/flux/pipeline_flux.py
+++ b/vllm_omni/diffusion/models/flux/pipeline_flux.py
@@ -160,10 +160,10 @@ def __init__(
         )
         self.text_encoder = CLIPTextModel.from_pretrained(
             model, subfolder="text_encoder", local_files_only=local_files_only
-        )
+        ).to(self.device)
         self.text_encoder_2 = T5EncoderModel.from_pretrained(
             model, subfolder="text_encoder_2", local_files_only=local_files_only
-        )
+        ).to(self.device)
         self.vae = AutoencoderKL.from_pretrained(model, subfolder="vae", local_files_only=local_files_only).to(
             self.device
         )
diff --git a/vllm_omni/diffusion/models/flux2/flux2_transformer.py b/vllm_omni/diffusion/models/flux2/flux2_transformer.py
index 040f2779a8..62f415393e 100644
--- a/vllm_omni/diffusion/models/flux2/flux2_transformer.py
+++ b/vllm_omni/diffusion/models/flux2/flux2_transformer.py
@@ -553,6 +553,12 @@ class Flux2Transformer2DModel(nn.Module):
 
     _repeated_blocks = ["Flux2TransformerBlock", "Flux2SingleTransformerBlock"]
+
+    @staticmethod
+    def _is_transformer_block(name: str, module) -> bool:
+        return ("transformer_blocks" in name or "single_transformer_blocks" in name) and name.split(".")[-1].isdigit()
+
+    _hsdp_shard_conditions = [_is_transformer_block]
+
     def __init__(
         self,
         patch_size: int = 1,
diff --git a/vllm_omni/diffusion/models/flux2/pipeline_flux2.py b/vllm_omni/diffusion/models/flux2/pipeline_flux2.py
index 7f07adf6f1..1da0f0cdaf 100644
--- a/vllm_omni/diffusion/models/flux2/pipeline_flux2.py
+++ b/vllm_omni/diffusion/models/flux2/pipeline_flux2.py
@@ -366,7 +366,7 @@ def __init__(
         )
         self.text_encoder = Mistral3ForConditionalGeneration.from_pretrained(
             model, subfolder="text_encoder", local_files_only=local_files_only
-        )
+        ).to(self._execution_device)
         self.tokenizer = PixtralProcessor.from_pretrained(
             model, subfolder="tokenizer", local_files_only=local_files_only
         )
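The `_hsdp_shard_conditions` hooks added above are plain `(name, module) -> bool` predicates over fully qualified module names. The engine-side code that consumes them is not part of this diff, so the following is only a sketch of how such a list could drive per-block sharding with PyTorch's FSDP2-style `fully_shard` API; the `apply_hsdp` helper and its arguments are hypothetical, not vLLM-Omni functions:

```python
import torch.nn as nn
from torch.distributed.device_mesh import DeviceMesh
from torch.distributed.fsdp import fully_shard  # FSDP2-style API in recent PyTorch


def apply_hsdp(model: nn.Module, conditions, mesh: DeviceMesh) -> nn.Module:
    """Sketch: shard every submodule matched by any condition, then the root module."""
    for name, module in model.named_modules():
        if any(cond(name, module) for cond in conditions):
            fully_shard(module, mesh=mesh)  # each numbered DiT block becomes a shard unit
    fully_shard(model, mesh=mesh)  # root wrap so any remaining parameters are sharded too
    return model


# Hypothetical usage with a 2-D (replicate x shard) mesh, i.e. HSDP:
# mesh = torch.distributed.device_mesh.init_device_mesh(
#     "cuda", (2, 4), mesh_dim_names=("replicate", "shard"))
# transformer = apply_hsdp(transformer, type(transformer)._hsdp_shard_conditions, mesh)
```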
diff --git a/vllm_omni/diffusion/models/flux2_klein/flux2_klein_transformer.py b/vllm_omni/diffusion/models/flux2_klein/flux2_klein_transformer.py
index c10f06751d..fa7bbc4fe5 100644
--- a/vllm_omni/diffusion/models/flux2_klein/flux2_klein_transformer.py
+++ b/vllm_omni/diffusion/models/flux2_klein/flux2_klein_transformer.py
@@ -741,6 +741,11 @@ class Flux2Transformer2DModel(nn.Module):
 
     _repeated_blocks = ["Flux2TransformerBlock", "Flux2SingleTransformerBlock"]
+
+    @staticmethod
+    def _is_transformer_block(name: str, module) -> bool:
+        return ("transformer_blocks" in name or "single_transformer_blocks" in name) and name.split(".")[-1].isdigit()
+
+    _hsdp_shard_conditions = [_is_transformer_block]
     _sp_plan = {
         "": {
             "hidden_states": SequenceParallelInput(split_dim=1, expected_dims=3, auto_pad=True),
diff --git a/vllm_omni/diffusion/models/flux2_klein/pipeline_flux2_klein.py b/vllm_omni/diffusion/models/flux2_klein/pipeline_flux2_klein.py
index d43748380b..551be4f069 100644
--- a/vllm_omni/diffusion/models/flux2_klein/pipeline_flux2_klein.py
+++ b/vllm_omni/diffusion/models/flux2_klein/pipeline_flux2_klein.py
@@ -218,7 +218,7 @@ def __init__(
             model,
             subfolder="text_encoder",
             local_files_only=local_files_only,
-        )
+        ).to(self._execution_device)
         self.tokenizer = Qwen2TokenizerFast.from_pretrained(
             model,
             subfolder="tokenizer",
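For reference, the name predicate duplicated across the three transformers selects only the numbered block entries themselves (e.g. `transformer_blocks.3`), never their children or unrelated modules, so whole DiT blocks become the sharding unit. A standalone check of that logic (the `module` argument is unused by the predicate, so `None` is passed here):

```python
def _is_transformer_block(name: str, module) -> bool:
    # Same predicate as added in the diff: match "transformer_blocks.N" or
    # "single_transformer_blocks.N", where the last path component is an index.
    return ("transformer_blocks" in name or "single_transformer_blocks" in name) and name.split(".")[-1].isdigit()


assert _is_transformer_block("transformer_blocks.0", None)
assert _is_transformer_block("single_transformer_blocks.17", None)
assert not _is_transformer_block("transformer_blocks.0.attn.to_q", None)  # child, not a block
assert not _is_transformer_block("proj_out", None)  # unrelated module
```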