From 9bb72f6aa9d8ee9bbb1d207fd0df719f94ea3029 Mon Sep 17 00:00:00 2001
From: Nick Cao <ncao@redhat.com>
Date: Wed, 1 Apr 2026 14:42:21 -0400
Subject: [PATCH 01/10] [Refactor] Let diffusion pipelines declare offloadable
 modules via SupportsModuleOffload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ModuleDiscovery previously hardcoded attribute names to find DiT,
encoder, and VAE modules for CPU offload. This silently failed for
pipelines using non-standard names (e.g. OmniGen2's 'mllm', Bagel's
'vit_model', MammothModa2's 'gen_transformer'/'gen_vae'), leaving
multi-GB models idle on GPU during the denoising loop.

Add SupportsModuleOffload protocol to the pipeline interface.
Pipelines declare _dit_modules, _encoder_modules, and _vae_modules
as class variables, and ModuleDiscovery.discover() reads them
directly. Both DiT and encoder lists are needed because the offload
hooks use mutual exclusion. Pipelines without the protocol fall back
to the existing attribute name scan.

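Also replace PipelineModules.vae with PipelineModules.vaes (a list)
to support pipelines with multiple VAEs (e.g. LTX2's audio_vae,
DreamIDOmni's vae_model_audio). Both the sequential and layerwise
offload backends are updated to iterate the list. Note that the
fallback scan previously stopped at the first non-None VAE attribute;
it now collects every match, so a pipeline exposing both 'vae' and
'audio_vae' has both moved to GPU.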

Behavioral changes from unifying collection logic into
_collect_modules:
- Encoder collection now checks isinstance(nn.Module) (original
  did not) — prevents non-Module objects from reaching .to(device).
- Encoder collection now deduplicates (original did not) — avoids
  double hook registration when two attrs point to the same module.
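- Non-Module attributes now produce a warning on both the protocol
  and fallback paths (they are skipped either way). Missing or None
  attributes are warned about only when declared via the protocol
  (pipeline authoring bug) and are silently skipped in the fallback
  path.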

Co-authored-by: Claude <noreply@anthropic.com>
Signed-off-by: Nick Cao <ncao@redhat.com>
---
 vllm_omni/diffusion/models/interface.py       |  30 +++++
 .../diffusion/offloader/layerwise_backend.py  |   6 +-
 .../diffusion/offloader/module_collector.py   | 125 ++++++++++++------
 .../diffusion/offloader/sequential_backend.py |   6 +-
 4 files changed, 120 insertions(+), 47 deletions(-)

diff --git a/vllm_omni/diffusion/models/interface.py b/vllm_omni/diffusion/models/interface.py
index ef906472bd0..1aa12e7da39 100644
--- a/vllm_omni/diffusion/models/interface.py
+++ b/vllm_omni/diffusion/models/interface.py
@@ -58,6 +58,36 @@ def post_decode(self, state: DiffusionRequestState, **kwargs: Any) -> DiffusionO
         """Decode output after denoise loop."""
 
 
+@runtime_checkable
+class SupportsModuleOffload(Protocol):
+    """Declares which submodules participate in sequential CPU offload.
+
+    The offload system uses mutual exclusion: when one group runs,
+    the other is moved to CPU.  Pipelines must declare both groups
+    because only the pipeline knows its own architecture.
+
+    ``_dit_modules``: attribute names of denoising submodules (kept
+    on GPU during the diffusion loop).
+
+    ``_encoder_modules``: attribute names of encoder/vision
+    submodules (offloaded to CPU during the diffusion loop).
+
+    ``_vae_modules``: attribute names of VAE(s) (always kept on GPU,
+    not part of the mutual exclusion hooks).
+
+    Example::
+
+        class MyPipeline(nn.Module, SupportsModuleOffload):
+            _dit_modules: ClassVar[list[str]] = ["transformer"]
+            _encoder_modules: ClassVar[list[str]] = ["text_encoder", "vit"]
+            _vae_modules: ClassVar[list[str]] = ["vae"]
+    """
+
+    _dit_modules: ClassVar[list[str]]
+    _encoder_modules: ClassVar[list[str]]
+    _vae_modules: ClassVar[list[str]]
+
+
 def supports_step_execution(pipeline: object) -> bool:
     """Return whether `pipeline` implements :class:`SupportsStepExecution`."""
 
diff --git a/vllm_omni/diffusion/offloader/layerwise_backend.py b/vllm_omni/diffusion/offloader/layerwise_backend.py
index d1216e2f28c..96725ae1a16 100644
--- a/vllm_omni/diffusion/offloader/layerwise_backend.py
+++ b/vllm_omni/diffusion/offloader/layerwise_backend.py
@@ -298,10 +298,10 @@ def enable(self, pipeline: nn.Module) -> None:
         for enc in modules.encoders:
             enc.to(self.device)
 
-        # Move VAE to GPU if available
-        if modules.vae is not None:
+        # Move VAE(s) to GPU if available
+        for vae in modules.vaes:
             try:
-                modules.vae.to(self.device, non_blocking=True)
+                vae.to(self.device, non_blocking=True)
             except Exception as exc:
                 logger.debug("Failed to move VAE to GPU: %s", exc)
 
diff --git a/vllm_omni/diffusion/offloader/module_collector.py b/vllm_omni/diffusion/offloader/module_collector.py
index a09a337001e..bb3ea1a4515 100644
--- a/vllm_omni/diffusion/offloader/module_collector.py
+++ b/vllm_omni/diffusion/offloader/module_collector.py
@@ -6,6 +6,8 @@
 from torch import nn
 from vllm.logger import init_logger
 
+from vllm_omni.diffusion.models.interface import SupportsModuleOffload
+
 logger = init_logger(__name__)
 
 
@@ -15,15 +17,78 @@ class PipelineModules:
     dit_names: list[str]
     encoders: list[nn.Module]
     encoder_names: list[str]
-    vae: nn.Module | None = None
+    vaes: list[nn.Module]
 
 
 class ModuleDiscovery:
-    """Discovers pipeline components for offloading"""
+    """Discovers pipeline components for offloading.
+
+    If the pipeline implements :class:`SupportsModuleOffload`,
+    its ``_dit_modules``, ``_encoder_modules``, and ``_vae_modules``
+    class variables are used directly.  Otherwise, falls back to
+    scanning well-known attribute names.
+    """
+
+    # Fallback attribute names for pipelines that do not implement
+    # SupportsModuleOffload.
+    _FALLBACK_DIT_ATTRS = [
+        "transformer",
+        "transformer_2",
+        "dit",
+        "sr_dit",
+        "language_model",
+        "transformer_blocks",
+        "model",
+    ]
+    _FALLBACK_ENCODER_ATTRS = [
+        "text_encoder",
+        "text_encoder_2",
+        "text_encoder_3",
+        "image_encoder",
+    ]
+    _FALLBACK_VAE_ATTRS = [
+        "vae",
+        "audio_vae",
+    ]
 
-    DIT_ATTRS = ["transformer", "transformer_2", "dit", "sr_dit", "language_model", "transformer_blocks", "model"]
-    ENCODER_ATTRS = ["text_encoder", "text_encoder_2", "text_encoder_3", "image_encoder"]
-    VAE_ATTRS = ["vae", "audio_vae"]
+    @staticmethod
+    def _collect_modules(
+        pipeline: nn.Module,
+        attr_names: list[str],
+        *,
+        warn_missing: bool = False,
+    ) -> tuple[list[nn.Module], list[str]]:
+        """Resolve attribute names to (module, name) pairs, skipping missing.
+
+        When *warn_missing* is True (protocol path), warn about
+        declared attributes that do not exist.  Non-``nn.Module``
+        attributes always produce a warning regardless (they indicate
+        a real bug, even in the fallback scan).
+        """
+        modules: list[nn.Module] = []
+        names: list[str] = []
+        seen: set[int] = set()
+        for attr in attr_names:
+            module = getattr(pipeline, attr, None)
+            if module is None:
+                if warn_missing:
+                    logger.warning(
+                        "Pipeline declares '%s' as offloadable but the attribute does not exist or is None",
+                        attr,
+                    )
+                continue
+            if not isinstance(module, nn.Module):
+                logger.warning(
+                    "Expected '%s' to be nn.Module, got %r",
+                    attr,
+                    type(module),
+                )
+                continue
+            if id(module) not in seen:
+                seen.add(id(module))
+                modules.append(module)
+                names.append(attr)
+        return modules, names
 
     @staticmethod
     def discover(pipeline: nn.Module) -> PipelineModules:
@@ -35,46 +100,24 @@ def discover(pipeline: nn.Module) -> PipelineModules:
         Returns:
             PipelineModules with lists of discovered modules and names
         """
-        # Collect DiT/transformer modules
-        dit_modules: list[nn.Module] = []
-        dit_names: list[str] = []
-        for attr in ModuleDiscovery.DIT_ATTRS:
-            if not hasattr(pipeline, attr):
-                continue
-            module_obj = getattr(pipeline, attr)
-            if module_obj is None:
-                continue
-
-            if not isinstance(module_obj, nn.Module):
-                logger.warning(f"Expected {attr} to be nn.Module, got {type(module_obj)!r}")
-                continue
-
-            if module_obj in dit_modules:
-                continue
-
-            dit_modules.append(module_obj)
-            dit_names.append(attr)
-
-        # Collect all encoders
-        encoders: list[nn.Module] = []
-        encoder_names: list[str] = []
-        for attr in ModuleDiscovery.ENCODER_ATTRS:
-            if hasattr(pipeline, attr) and getattr(pipeline, attr) is not None:
-                encoders.append(getattr(pipeline, attr))
-                encoder_names.append(attr)
-
-        # Collect VAE
-        vae = None
-        for attr in ModuleDiscovery.VAE_ATTRS:
-            module = getattr(pipeline, attr, None)
-            if module is not None:
-                vae = module
-                break
+        declared = isinstance(pipeline, SupportsModuleOffload)
+        if declared:
+            dit_attrs = pipeline._dit_modules
+            enc_attrs = pipeline._encoder_modules
+            vae_attrs = pipeline._vae_modules
+        else:
+            dit_attrs = ModuleDiscovery._FALLBACK_DIT_ATTRS
+            enc_attrs = ModuleDiscovery._FALLBACK_ENCODER_ATTRS
+            vae_attrs = ModuleDiscovery._FALLBACK_VAE_ATTRS
+
+        dit_modules, dit_names = ModuleDiscovery._collect_modules(pipeline, dit_attrs, warn_missing=declared)
+        encoders, encoder_names = ModuleDiscovery._collect_modules(pipeline, enc_attrs, warn_missing=declared)
+        vaes, _ = ModuleDiscovery._collect_modules(pipeline, vae_attrs, warn_missing=declared)
 
         return PipelineModules(
             dits=dit_modules,
             dit_names=dit_names,
             encoders=encoders,
             encoder_names=encoder_names,
-            vae=vae,
+            vaes=vaes,
         )
diff --git a/vllm_omni/diffusion/offloader/sequential_backend.py b/vllm_omni/diffusion/offloader/sequential_backend.py
index 46f48e99c5d..06454ad5c63 100644
--- a/vllm_omni/diffusion/offloader/sequential_backend.py
+++ b/vllm_omni/diffusion/offloader/sequential_backend.py
@@ -210,10 +210,10 @@ def enable(self, pipeline: nn.Module) -> None:
         for enc in modules.encoders:
             enc.to(self.device)
 
-        # Move VAE to GPU if available
-        if modules.vae is not None:
+        # Move VAE(s) to GPU if available
+        for vae in modules.vaes:
             try:
-                modules.vae.to(self.device, non_blocking=True)
+                vae.to(self.device, non_blocking=True)
             except Exception as exc:
                 logger.debug("Failed to move VAE to GPU: %s", exc)
 

From d9dde26619829071fd79be837faaa1cc008a0d4a Mon Sep 17 00:00:00 2001
From: Nick Cao <ncao@redhat.com>
Date: Wed, 1 Apr 2026 15:00:52 -0400
Subject: [PATCH 02/10] [Perf][OmniGen2] Declare offloadable modules for CPU
 offload discovery

Add SupportsModuleOffload to OmniGen2Pipeline so ModuleDiscovery
can find the Qwen2.5-VL text encoder ('mllm', ~6-16 GB) for
sequential CPU offload. Previously, 'mllm' was not in the hardcoded
attribute scan list, so enable_cpu_offload silently left it on GPU
during the entire denoising loop.

Co-authored-by: Claude <noreply@anthropic.com>
Signed-off-by: Nick Cao <ncao@redhat.com>
---
 vllm_omni/diffusion/models/omnigen2/pipeline_omnigen2.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/diffusion/models/omnigen2/pipeline_omnigen2.py b/vllm_omni/diffusion/models/omnigen2/pipeline_omnigen2.py
index 04720c932ff..46d634bfdc0 100644
--- a/vllm_omni/diffusion/models/omnigen2/pipeline_omnigen2.py
+++ b/vllm_omni/diffusion/models/omnigen2/pipeline_omnigen2.py
@@ -8,7 +8,7 @@
 import warnings
 from collections.abc import Iterable
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, ClassVar
 
 import numpy as np
 import PIL.Image
@@ -32,6 +32,7 @@
 from vllm_omni.diffusion.distributed.cfg_parallel import CFGParallelMixin
 from vllm_omni.diffusion.distributed.utils import get_local_device
 from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader
+from vllm_omni.diffusion.models.interface import SupportsModuleOffload
 from vllm_omni.diffusion.models.omnigen2.omnigen2_transformer import (
     OmniGen2RotaryPosEmbed,
     OmniGen2Transformer2DModel,
@@ -620,7 +621,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-class OmniGen2Pipeline(CFGParallelMixin, nn.Module):
+class OmniGen2Pipeline(CFGParallelMixin, nn.Module, SupportsModuleOffload):
     """
     Pipeline for text-to-image generation using OmniGen2.
 
@@ -634,6 +635,10 @@ class OmniGen2Pipeline(CFGParallelMixin, nn.Module):
         od_config (OmniDiffusionConfig): The OmniDiffusion configuration.
     """
 
+    _dit_modules: ClassVar[list[str]] = ["transformer"]
+    _encoder_modules: ClassVar[list[str]] = ["mllm"]
+    _vae_modules: ClassVar[list[str]] = ["vae"]
+
     def __init__(
         self,
         *,

From 24e5500fb8bf55163cb1092ba84a3b9c77400fa0 Mon Sep 17 00:00:00 2001
From: Nick Cao <ncao@redhat.com>
Date: Thu, 16 Apr 2026 15:59:57 -0400
Subject: [PATCH 03/10] [Test] Add unit tests for ModuleDiscovery and
 SupportsModuleOffload

Co-authored-by: Claude <noreply@anthropic.com>
Signed-off-by: Nick Cao <ncao@redhat.com>
---
 .../offloader/test_module_collector.py        | 163 ++++++++++++++++++
 1 file changed, 163 insertions(+)
 create mode 100644 tests/diffusion/offloader/test_module_collector.py

diff --git a/tests/diffusion/offloader/test_module_collector.py b/tests/diffusion/offloader/test_module_collector.py
new file mode 100644
index 00000000000..0c7edfb5a76
--- /dev/null
+++ b/tests/diffusion/offloader/test_module_collector.py
@@ -0,0 +1,163 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Unit tests for ModuleDiscovery and SupportsModuleOffload."""
+
+from typing import ClassVar
+
+import pytest
+from torch import nn
+
+from vllm_omni.diffusion.models.interface import SupportsModuleOffload
+from vllm_omni.diffusion.offloader.module_collector import ModuleDiscovery
+
+pytestmark = [pytest.mark.diffusion, pytest.mark.cpu, pytest.mark.core_model]
+
+# NOTE: tests for skipped/warned attributes verify the *behavioral*
+# outcome (attribute excluded from results) but do not assert on log
+# output.  vllm's logger sets propagate=False, preventing caplog from
+# capturing records.  See https://github.com/pytest-dev/pytest/issues/3697
+
+
+# ---------------------------------------------------------------------------
+# Test pipelines
+# ---------------------------------------------------------------------------
+
+
+class FallbackPipeline(nn.Module):
+    """Pipeline with standard attribute names (no protocol)."""
+
+    def __init__(self):
+        super().__init__()
+        self.transformer = nn.Linear(10, 10)
+        self.text_encoder = nn.Linear(10, 10)
+        self.text_encoder_2 = nn.Linear(10, 10)
+        self.vae = nn.Linear(10, 10)
+
+
+class NonModuleAttrPipeline(nn.Module):
+    """Pipeline where an attribute is not an nn.Module (fallback path)."""
+
+    def __init__(self):
+        super().__init__()
+        self.transformer = nn.Linear(10, 10)
+        self.text_encoder = "not_a_module"
+        self.vae = nn.Linear(10, 10)
+
+
+class DuplicateAttrPipeline(nn.Module):
+    """Pipeline where two encoder attrs point to the same module."""
+
+    def __init__(self):
+        super().__init__()
+        self.transformer = nn.Linear(10, 10)
+        encoder = nn.Linear(10, 10)
+        self.text_encoder = encoder
+        self.text_encoder_2 = encoder
+        self.vae = nn.Linear(10, 10)
+
+
+class ProtocolPipeline(nn.Module, SupportsModuleOffload):
+    """Pipeline with non-standard names, using the protocol."""
+
+    _dit_modules: ClassVar[list[str]] = ["gen_transformer"]
+    _encoder_modules: ClassVar[list[str]] = ["mllm", "vision_model"]
+    _vae_modules: ClassVar[list[str]] = ["gen_vae"]
+
+    def __init__(self):
+        super().__init__()
+        self.gen_transformer = nn.Linear(10, 10)
+        self.mllm = nn.Linear(10, 10)
+        self.vision_model = nn.Linear(10, 10)
+        self.gen_vae = nn.Linear(10, 10)
+        # Standard name present but NOT declared — should be ignored
+        self.transformer = nn.Linear(10, 10)
+
+
+class MissingAttrPipeline(nn.Module, SupportsModuleOffload):
+    """Pipeline that declares a non-existent attribute."""
+
+    _dit_modules: ClassVar[list[str]] = ["transformer"]
+    _encoder_modules: ClassVar[list[str]] = ["nonexistent_encoder"]
+    _vae_modules: ClassVar[list[str]] = ["vae"]
+
+    def __init__(self):
+        super().__init__()
+        self.transformer = nn.Linear(10, 10)
+        self.vae = nn.Linear(10, 10)
+
+
+class MultiVaePipeline(nn.Module, SupportsModuleOffload):
+    """Pipeline with multiple VAEs."""
+
+    _dit_modules: ClassVar[list[str]] = ["transformer"]
+    _encoder_modules: ClassVar[list[str]] = ["text_encoder"]
+    _vae_modules: ClassVar[list[str]] = ["vae", "audio_vae"]
+
+    def __init__(self):
+        super().__init__()
+        self.transformer = nn.Linear(10, 10)
+        self.text_encoder = nn.Linear(10, 10)
+        self.vae = nn.Linear(10, 10)
+        self.audio_vae = nn.Linear(10, 10)
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+class TestFallbackDiscovery:
+    """Test the fallback attribute scan (no SupportsModuleOffload)."""
+
+    def test_discovers_standard_attrs(self):
+        pipeline = FallbackPipeline()
+        result = ModuleDiscovery.discover(pipeline)
+
+        assert not isinstance(pipeline, SupportsModuleOffload)
+        assert result.dit_names == ["transformer"]
+        assert result.dits[0] is pipeline.transformer
+        assert result.encoder_names == ["text_encoder", "text_encoder_2"]
+        assert result.vaes[0] is pipeline.vae
+
+    def test_deduplicates_encoders(self):
+        pipeline = DuplicateAttrPipeline()
+        result = ModuleDiscovery.discover(pipeline)
+
+        assert len(result.encoders) == 1
+        assert result.encoder_names == ["text_encoder"]
+
+    def test_skips_non_module_attr(self):
+        pipeline = NonModuleAttrPipeline()
+        result = ModuleDiscovery.discover(pipeline)
+
+        assert len(result.encoders) == 0
+
+
+class TestProtocolDiscovery:
+    """Test discovery via SupportsModuleOffload protocol."""
+
+    def test_discovers_declared_attrs_and_ignores_undeclared(self):
+        pipeline = ProtocolPipeline()
+        result = ModuleDiscovery.discover(pipeline)
+
+        assert isinstance(pipeline, SupportsModuleOffload)
+        assert result.dit_names == ["gen_transformer"]
+        assert result.encoder_names == ["mllm", "vision_model"]
+        assert len(result.vaes) == 1
+        # self.transformer exists but is NOT in _dit_modules
+        assert "transformer" not in result.dit_names
+
+    def test_skips_missing_attr(self):
+        pipeline = MissingAttrPipeline()
+        result = ModuleDiscovery.discover(pipeline)
+
+        assert len(result.encoders) == 0
+
+    def test_multiple_vaes(self):
+        pipeline = MultiVaePipeline()
+        result = ModuleDiscovery.discover(pipeline)
+
+        assert len(result.vaes) == 2
+        assert result.vaes[0] is pipeline.vae
+        assert result.vaes[1] is pipeline.audio_vae

From 503716fe0f0818d88edb93668f0c73d5536336ef Mon Sep 17 00:00:00 2001
From: Nick Cao <ncao@redhat.com>
Date: Mon, 20 Apr 2026 08:25:38 -0400
Subject: [PATCH 04/10] [Feat] Support dotted paths in SupportsModuleOffload
 module lists

Allow dotted attribute names (e.g. "pipe.transformer") in
_dit_modules, _encoder_modules, and _vae_modules to resolve
nested modules via operator.attrgetter.  This handles pipelines
like LTX2TwoStagesPipeline where the transformer lives under a
child pipeline (pipe.transformer), and Bagel where the encoder
is at language_model.model.

Flat attribute names continue to work unchanged.

Co-authored-by: Claude <noreply@anthropic.com>
Signed-off-by: Nick Cao <ncao@redhat.com>
---
 .../offloader/test_module_collector.py        | 41 +++++++++++++++++++
 .../diffusion/offloader/module_collector.py   |  9 +++-
 2 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/tests/diffusion/offloader/test_module_collector.py b/tests/diffusion/offloader/test_module_collector.py
index 0c7edfb5a76..c207da30ca9 100644
--- a/tests/diffusion/offloader/test_module_collector.py
+++ b/tests/diffusion/offloader/test_module_collector.py
@@ -87,6 +87,32 @@ def __init__(self):
         self.vae = nn.Linear(10, 10)
 
 
+class MissingIntermediatePipeline(nn.Module, SupportsModuleOffload):
+    """Pipeline with dotted path referencing non-existent intermediate."""
+
+    _dit_modules: ClassVar[list[str]] = ["nonexistent.transformer"]
+    _encoder_modules: ClassVar[list[str]] = []
+    _vae_modules: ClassVar[list[str]] = []
+
+    def __init__(self):
+        super().__init__()
+
+
+class NestedPipeline(nn.Module, SupportsModuleOffload):
+    """Pipeline with nested modules accessed via dotted paths."""
+
+    _dit_modules: ClassVar[list[str]] = ["pipe.transformer"]
+    _encoder_modules: ClassVar[list[str]] = ["pipe.text_encoder"]
+    _vae_modules: ClassVar[list[str]] = ["vae"]
+
+    def __init__(self):
+        super().__init__()
+        self.pipe = nn.Module()
+        self.pipe.transformer = nn.Linear(10, 10)
+        self.pipe.text_encoder = nn.Linear(10, 10)
+        self.vae = nn.Linear(10, 10)
+
+
 class MultiVaePipeline(nn.Module, SupportsModuleOffload):
     """Pipeline with multiple VAEs."""
 
@@ -154,6 +180,21 @@ def test_skips_missing_attr(self):
 
         assert len(result.encoders) == 0
 
+    def test_skips_missing_intermediate(self):
+        result = ModuleDiscovery.discover(MissingIntermediatePipeline())
+
+        assert len(result.dits) == 0
+
+    def test_dotted_path_resolves_nested_modules(self):
+        pipeline = NestedPipeline()
+        result = ModuleDiscovery.discover(pipeline)
+
+        assert result.dit_names == ["pipe.transformer"]
+        assert result.dits[0] is pipeline.pipe.transformer
+        assert result.encoder_names == ["pipe.text_encoder"]
+        assert result.encoders[0] is pipeline.pipe.text_encoder
+        assert result.vaes[0] is pipeline.vae
+
     def test_multiple_vaes(self):
         pipeline = MultiVaePipeline()
         result = ModuleDiscovery.discover(pipeline)
diff --git a/vllm_omni/diffusion/offloader/module_collector.py b/vllm_omni/diffusion/offloader/module_collector.py
index bb3ea1a4515..b4f390c8a4c 100644
--- a/vllm_omni/diffusion/offloader/module_collector.py
+++ b/vllm_omni/diffusion/offloader/module_collector.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
+from operator import attrgetter
 
 from torch import nn
 from vllm.logger import init_logger
@@ -60,6 +61,9 @@ def _collect_modules(
     ) -> tuple[list[nn.Module], list[str]]:
         """Resolve attribute names to (module, name) pairs, skipping missing.
 
+        Dotted names (e.g. ``"pipe.transformer"``) are resolved by
+        walking the attribute chain via :func:`operator.attrgetter`.
+
         When *warn_missing* is True (protocol path), warn about
         declared attributes that do not exist.  Non-``nn.Module``
         attributes always produce a warning regardless (they indicate
@@ -69,7 +73,10 @@ def _collect_modules(
         names: list[str] = []
         seen: set[int] = set()
         for attr in attr_names:
-            module = getattr(pipeline, attr, None)
+            try:
+                module = attrgetter(attr)(pipeline)
+            except AttributeError:
+                module = None
             if module is None:
                 if warn_missing:
                     logger.warning(

From 1f28322f23ddf1a10cdb454c04c941c4ceb2c658 Mon Sep 17 00:00:00 2001
From: Nick Cao <ncao@redhat.com>
Date: Mon, 20 Apr 2026 08:34:50 -0400
Subject: [PATCH 05/10] [Feat] Add _resident_modules for GPU-pinned modules
 during layerwise offload

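Add _resident_modules class variable to SupportsModuleOffload for
small submodules that must stay on GPU during layer-wise offloading
(e.g. embedders, connectors).  Defaults to an empty list for
pipelines that subclass SupportsModuleOffload explicitly; a class
that only matches the protocol structurally must define the
attribute itself, since the runtime isinstance check requires every
protocol member to be present.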

During layerwise offload, pipelines load everything to CPU and the
offloader selectively moves dit/encoder/vae groups to GPU.  Modules
outside these groups stay on CPU, which breaks pipelines like Bagel
where time_embedder, vae2llm, vit_model etc. are needed every
forward pass but are not children of any discovered group.

_resident_modules lets pipelines declare these modules explicitly.
The layerwise backend pins them on GPU alongside encoders and VAEs.

Co-authored-by: Claude <noreply@anthropic.com>
Signed-off-by: Nick Cao <ncao@redhat.com>
---
 .../offloader/test_module_collector.py        | 36 +++++++++++++++++++
 vllm_omni/diffusion/models/interface.py       |  9 +++++
 .../diffusion/offloader/layerwise_backend.py  |  7 ++++
 .../diffusion/offloader/module_collector.py   |  9 ++++-
 4 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/tests/diffusion/offloader/test_module_collector.py b/tests/diffusion/offloader/test_module_collector.py
index c207da30ca9..ab15ad8df60 100644
--- a/tests/diffusion/offloader/test_module_collector.py
+++ b/tests/diffusion/offloader/test_module_collector.py
@@ -113,6 +113,27 @@ def __init__(self):
         self.vae = nn.Linear(10, 10)
 
 
+class ResidentPipeline(nn.Module, SupportsModuleOffload):
+    """Pipeline with resident modules that must stay on GPU."""
+
+    _dit_modules: ClassVar[list[str]] = ["language_model.model"]
+    _encoder_modules: ClassVar[list[str]] = []
+    _vae_modules: ClassVar[list[str]] = ["vae"]
+    _resident_modules: ClassVar[list[str]] = [
+        "bagel.time_embedder",
+        "bagel.vae2llm",
+    ]
+
+    def __init__(self):
+        super().__init__()
+        self.language_model = nn.Module()
+        self.language_model.model = nn.Linear(10, 10)
+        self.bagel = nn.Module()
+        self.bagel.time_embedder = nn.Linear(10, 10)
+        self.bagel.vae2llm = nn.Linear(10, 10)
+        self.vae = nn.Linear(10, 10)
+
+
 class MultiVaePipeline(nn.Module, SupportsModuleOffload):
     """Pipeline with multiple VAEs."""
 
@@ -145,6 +166,7 @@ def test_discovers_standard_attrs(self):
         assert result.dits[0] is pipeline.transformer
         assert result.encoder_names == ["text_encoder", "text_encoder_2"]
         assert result.vaes[0] is pipeline.vae
+        assert result.resident_modules == []
 
     def test_deduplicates_encoders(self):
         pipeline = DuplicateAttrPipeline()
@@ -173,6 +195,8 @@ def test_discovers_declared_attrs_and_ignores_undeclared(self):
         assert len(result.vaes) == 1
         # self.transformer exists but is NOT in _dit_modules
         assert "transformer" not in result.dit_names
+        # No _resident_modules declared — defaults to empty
+        assert result.resident_modules == []
 
     def test_skips_missing_attr(self):
         pipeline = MissingAttrPipeline()
@@ -195,6 +219,18 @@ def test_dotted_path_resolves_nested_modules(self):
         assert result.encoders[0] is pipeline.pipe.text_encoder
         assert result.vaes[0] is pipeline.vae
 
+    def test_resident_modules(self):
+        pipeline = ResidentPipeline()
+        result = ModuleDiscovery.discover(pipeline)
+
+        assert result.resident_names == [
+            "bagel.time_embedder",
+            "bagel.vae2llm",
+        ]
+        assert result.resident_modules[0] is pipeline.bagel.time_embedder
+        assert result.resident_modules[1] is pipeline.bagel.vae2llm
+        assert result.dits[0] is pipeline.language_model.model
+
     def test_multiple_vaes(self):
         pipeline = MultiVaePipeline()
         result = ModuleDiscovery.discover(pipeline)
diff --git a/vllm_omni/diffusion/models/interface.py b/vllm_omni/diffusion/models/interface.py
index 1aa12e7da39..34567104374 100644
--- a/vllm_omni/diffusion/models/interface.py
+++ b/vllm_omni/diffusion/models/interface.py
@@ -75,17 +75,26 @@ class SupportsModuleOffload(Protocol):
     ``_vae_modules``: attribute names of VAE(s) (always kept on GPU,
     not part of the mutual exclusion hooks).
 
+    ``_resident_modules``: attribute names of small submodules that
+    must stay on GPU during layer-wise offloading (e.g. embedders,
+    connectors).  Optional — defaults to ``[]``.
+
+    All attribute names support dotted paths (e.g.
+    ``"bagel.time_embedder"``) for nested submodules.
+
     Example::
 
         class MyPipeline(nn.Module, SupportsModuleOffload):
             _dit_modules: ClassVar[list[str]] = ["transformer"]
             _encoder_modules: ClassVar[list[str]] = ["text_encoder", "vit"]
             _vae_modules: ClassVar[list[str]] = ["vae"]
+            _resident_modules: ClassVar[list[str]] = ["time_embedder"]
     """
 
     _dit_modules: ClassVar[list[str]]
     _encoder_modules: ClassVar[list[str]]
     _vae_modules: ClassVar[list[str]]
+    _resident_modules: ClassVar[list[str]] = []
 
 
 def supports_step_execution(pipeline: object) -> bool:
diff --git a/vllm_omni/diffusion/offloader/layerwise_backend.py b/vllm_omni/diffusion/offloader/layerwise_backend.py
index 96725ae1a16..9979d01b103 100644
--- a/vllm_omni/diffusion/offloader/layerwise_backend.py
+++ b/vllm_omni/diffusion/offloader/layerwise_backend.py
@@ -305,6 +305,13 @@ def enable(self, pipeline: nn.Module) -> None:
             except Exception as exc:
                 logger.debug("Failed to move VAE to GPU: %s", exc)
 
+        # Move resident modules to GPU (small modules needed every forward)
+        for name, module in zip(modules.resident_names, modules.resident_modules):
+            try:
+                module.to(self.device)
+            except Exception as exc:
+                logger.debug("Failed to move resident module %s to GPU: %s", name, exc)
+
         logger.info("Applying layer-wise offloading on %s", modules.dit_names)
 
         # Apply block-wise offloading hook for each of the blocks in DiT model(s)
diff --git a/vllm_omni/diffusion/offloader/module_collector.py b/vllm_omni/diffusion/offloader/module_collector.py
index b4f390c8a4c..c0aa56522d7 100644
--- a/vllm_omni/diffusion/offloader/module_collector.py
+++ b/vllm_omni/diffusion/offloader/module_collector.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from operator import attrgetter
 
 from torch import nn
@@ -19,6 +19,8 @@ class PipelineModules:
     encoders: list[nn.Module]
     encoder_names: list[str]
     vaes: list[nn.Module]
+    resident_modules: list[nn.Module] = field(default_factory=list)
+    resident_names: list[str] = field(default_factory=list)
 
 
 class ModuleDiscovery:
@@ -112,14 +114,17 @@ def discover(pipeline: nn.Module) -> PipelineModules:
             dit_attrs = pipeline._dit_modules
             enc_attrs = pipeline._encoder_modules
             vae_attrs = pipeline._vae_modules
+            res_attrs = pipeline._resident_modules
         else:
             dit_attrs = ModuleDiscovery._FALLBACK_DIT_ATTRS
             enc_attrs = ModuleDiscovery._FALLBACK_ENCODER_ATTRS
             vae_attrs = ModuleDiscovery._FALLBACK_VAE_ATTRS
+            res_attrs = []
 
         dit_modules, dit_names = ModuleDiscovery._collect_modules(pipeline, dit_attrs, warn_missing=declared)
         encoders, encoder_names = ModuleDiscovery._collect_modules(pipeline, enc_attrs, warn_missing=declared)
         vaes, _ = ModuleDiscovery._collect_modules(pipeline, vae_attrs, warn_missing=declared)
+        residents, resident_names = ModuleDiscovery._collect_modules(pipeline, res_attrs, warn_missing=declared)
 
         return PipelineModules(
             dits=dit_modules,
@@ -127,4 +132,6 @@ def discover(pipeline: nn.Module) -> PipelineModules:
             encoders=encoders,
             encoder_names=encoder_names,
             vaes=vaes,
+            resident_modules=residents,
+            resident_names=resident_names,
         )

From 2e31a756fcb40d4c1e207b2be18866e509bc7d08 Mon Sep 17 00:00:00 2001
From: Nick Cao <ncao@redhat.com>
Date: Thu, 2 Apr 2026 11:26:49 -0400
Subject: [PATCH 06/10] [Doc] Add SupportsModuleOffload documentation for CPU
 offload

Add 'To Support a Model' section under model-level offloading showing
how to implement the SupportsModuleOffload protocol. Restore the
layerwise 'To Support a Model' section under its own parent. Update
the Module Discovery section to document both protocol-based and
fallback attribute scan discovery paths.

Co-authored-by: Claude <noreply@anthropic.com>
Signed-off-by: Nick Cao <ncao@redhat.com>
---
 .../diffusion/cpu_offload_diffusion.md        | 55 +++++++++++++++++--
 1 file changed, 51 insertions(+), 4 deletions(-)

diff --git a/docs/user_guide/diffusion/cpu_offload_diffusion.md b/docs/user_guide/diffusion/cpu_offload_diffusion.md
index f80005ccb7e..b85c7e40815 100644
--- a/docs/user_guide/diffusion/cpu_offload_diffusion.md
+++ b/docs/user_guide/diffusion/cpu_offload_diffusion.md
@@ -36,6 +36,45 @@ m = Omni(model="Wan-AI/Wan2.2-T2V-A14B-Diffusers", enable_cpu_offload=True)
 vllm-omni serve diffusion Wan-AI/Wan2.2-T2V-A14B-Diffusers --enable-cpu-offload
 ```
 
+### To Support a Model
+
+Implement the `SupportsModuleOffload` protocol to declare which
+submodules participate in offloading:
+
+```python
+from typing import ClassVar
+from vllm_omni.diffusion.models.interface import SupportsModuleOffload
+
+class MyPipeline(nn.Module, SupportsModuleOffload):
+    _dit_modules: ClassVar[list[str]] = ["transformer"]
+    _encoder_modules: ClassVar[list[str]] = ["text_encoder", "vision_model"]
+    _vae_modules: ClassVar[list[str]] = ["vae"]
+    _resident_modules: ClassVar[list[str]] = []  # optional
+
+    def __init__(self):
+        super().__init__()
+        self.transformer = ...     # DiT — stays on GPU during denoising
+        self.text_encoder = ...    # Encoder — offloaded to CPU during denoising
+        self.vision_model = ...    # Encoder — offloaded to CPU during denoising
+        self.vae = ...             # VAE — always on GPU
+```
+
+- `_dit_modules`: attribute names of denoising submodules (kept on GPU
+  during the diffusion loop).
+- `_encoder_modules`: attribute names of encoder/vision submodules
+  (offloaded to CPU during the diffusion loop).
+- `_vae_modules`: attribute names of VAE(s) (always kept on GPU, not
+  part of the mutual exclusion hooks).
+- `_resident_modules`: attribute names of small submodules that must
+  stay on GPU during layerwise offloading (e.g. embedders, connectors).
+  Only used by the layerwise backend. Optional — defaults to `[]`.
+
+All attribute names support dotted paths for nested submodules
+(e.g. `"pipe.transformer"`, `"bagel.time_embedder"`).
+
+Both DiT and encoder lists are needed because the offload hooks use
+mutual exclusion: when one group runs, the other moves to CPU.
+
 ### Limitations
 - Cold start latency increases
 - Adds overhead from CPU-GPU transfers between encoder and denoising phases
@@ -116,11 +155,19 @@ class Flux2Transformer2DModel(nn.Module):
 
 **Module Discovery**
 
-The offloader automatically discovers pipeline components:
+The offloader discovers pipeline components in two ways:
+
+1. **Protocol-based** (preferred): If the pipeline implements
+    `SupportsModuleOffload`, its `_dit_modules`, `_encoder_modules`,
+    `_vae_modules`, and `_resident_modules` class variables are used
+    directly.  All attribute names support dotted paths (e.g.
+    `"pipe.transformer"`, `"bagel.time_embedder"`) for nested submodules.
 
-- **DiT modules**: `transformer`, `transformer_2`, `dit`
-- **Encoders**: `text_encoder`, `text_encoder_2`, `text_encoder_3`, `image_encoder`
-- **VAE**: `vae`
+2. **Fallback attribute scan**: Otherwise, the offloader scans for
+    well-known attribute names:
+    - **DiT modules**: `transformer`, `transformer_2`, `dit`, `sr_dit`, `language_model`, `transformer_blocks`, `model`
+    - **Encoders**: `text_encoder`, `text_encoder_2`, `text_encoder_3`, `image_encoder`
+    - **VAE**: `vae`, `audio_vae`
 
 **Hook System**
 

From 77cf6b1a8fa0524e8635dc8c353c8a0f413a42ea Mon Sep 17 00:00:00 2001
From: Nick Cao <ncao@redhat.com>
Date: Mon, 20 Apr 2026 08:42:57 -0400
Subject: [PATCH 07/10] [Perf][LTX2] Declare offloadable modules for CPU
 offload discovery

LTX2 two-stage pipelines have a nested module structure where the
DiT, encoders, and VAEs live under self.pipe.  The fallback
attribute scan cannot find them, causing layerwise offloading
to skip DiT discovery entirely.

Implement SupportsModuleOffload on LTX2TwoStagesPipeline and
LTX2ImageToVideoTwoStagesPipeline using dotted paths to reach
nested modules (pipe.transformer, pipe.text_encoder, pipe.vae,
pipe.audio_vae).

Co-authored-by: Claude <noreply@anthropic.com>
Signed-off-by: Nick Cao <ncao@redhat.com>
---
 vllm_omni/diffusion/models/ltx2/pipeline_ltx2.py         | 9 +++++++--
 .../diffusion/models/ltx2/pipeline_ltx2_image2video.py   | 9 +++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/vllm_omni/diffusion/models/ltx2/pipeline_ltx2.py b/vllm_omni/diffusion/models/ltx2/pipeline_ltx2.py
index f06ffab165a..4f62d72c9b6 100644
--- a/vllm_omni/diffusion/models/ltx2/pipeline_ltx2.py
+++ b/vllm_omni/diffusion/models/ltx2/pipeline_ltx2.py
@@ -9,7 +9,7 @@
 import os
 from collections.abc import Iterable
 from contextlib import nullcontext
-from typing import Any
+from typing import Any, ClassVar
 
 import numpy as np
 import torch
@@ -34,6 +34,7 @@
 from vllm_omni.diffusion.lora.manager import DiffusionLoRAManager
 from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader
 from vllm_omni.diffusion.models.dmd2 import DMD2PipelineMixin
+from vllm_omni.diffusion.models.interface import SupportsModuleOffload
 from vllm_omni.diffusion.models.progress_bar import ProgressBarMixin
 from vllm_omni.diffusion.request import OmniDiffusionRequest
 from vllm_omni.lora.request import LoRARequest
@@ -1152,9 +1153,13 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         return loader.load_weights(weights)
 
 
-class LTX2TwoStagesPipeline(nn.Module):
+class LTX2TwoStagesPipeline(nn.Module, SupportsModuleOffload):
     """LTX2TwoStagesPipeline is for two stages image to video generation"""
 
+    _dit_modules: ClassVar[list[str]] = ["pipe.transformer"]
+    _encoder_modules: ClassVar[list[str]] = ["pipe.text_encoder"]
+    _vae_modules: ClassVar[list[str]] = ["pipe.vae", "pipe.audio_vae"]
+
     def __init__(
         self,
         *,
diff --git a/vllm_omni/diffusion/models/ltx2/pipeline_ltx2_image2video.py b/vllm_omni/diffusion/models/ltx2/pipeline_ltx2_image2video.py
index 50a71a54b61..4cc65f74908 100644
--- a/vllm_omni/diffusion/models/ltx2/pipeline_ltx2_image2video.py
+++ b/vllm_omni/diffusion/models/ltx2/pipeline_ltx2_image2video.py
@@ -6,7 +6,7 @@
 import copy
 import os
 from collections.abc import Iterable
-from typing import Any
+from typing import Any, ClassVar
 
 import numpy as np
 import PIL.Image
@@ -26,6 +26,7 @@
 from vllm_omni.diffusion.lora.manager import DiffusionLoRAManager
 from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader
 from vllm_omni.diffusion.models.dmd2 import DMD2PipelineMixin
+from vllm_omni.diffusion.models.interface import SupportsModuleOffload
 from vllm_omni.diffusion.request import OmniDiffusionRequest
 from vllm_omni.lora.request import LoRARequest
 
@@ -733,11 +734,15 @@ def forward(
         return DiffusionOutput(output=(video, audio))
 
 
-class LTX2ImageToVideoTwoStagesPipeline(nn.Module):
+class LTX2ImageToVideoTwoStagesPipeline(nn.Module, SupportsModuleOffload):
     """LTXImageToVideoTwoStagesPipeline is for two stages image to video generation"""
 
     support_image_input = True
 
+    _dit_modules: ClassVar[list[str]] = ["pipe.transformer"]
+    _encoder_modules: ClassVar[list[str]] = ["pipe.text_encoder"]
+    _vae_modules: ClassVar[list[str]] = ["pipe.vae", "pipe.audio_vae"]
+
     def __init__(
         self,
         *,

From bd876d3e3e07e3ca499f1178e99a592a40a4db8a Mon Sep 17 00:00:00 2001
From: Nick Cao <ncao@redhat.com>
Date: Mon, 20 Apr 2026 08:43:40 -0400
Subject: [PATCH 08/10] [Perf][Bagel] Declare offloadable and resident modules
 for CPU offload

BagelPipeline has a non-standard module layout: the DiT lives at
language_model.model, and several small modules under self.bagel
(time_embedder, vae2llm, llm2vae, latent_pos_embed, vit_model,
connector, vit_pos_embed) are needed every forward pass but are
not children of the DiT.

Implement SupportsModuleOffload with _resident_modules to pin
these small modules on GPU during layerwise offloading.  Without
this, they stay on CPU (offload pipelines skip self.to(device))
and forward() fails with device mismatch.

Co-authored-by: Claude <noreply@anthropic.com>
Signed-off-by: Nick Cao <ncao@redhat.com>
---
 .../diffusion/models/bagel/pipeline_bagel.py    | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/vllm_omni/diffusion/models/bagel/pipeline_bagel.py b/vllm_omni/diffusion/models/bagel/pipeline_bagel.py
index 90baf5f6761..0290e950292 100644
--- a/vllm_omni/diffusion/models/bagel/pipeline_bagel.py
+++ b/vllm_omni/diffusion/models/bagel/pipeline_bagel.py
@@ -12,6 +12,7 @@
 from copy import deepcopy
 from dataclasses import dataclass
 from math import isqrt
+from typing import ClassVar
 
 import numpy as np
 import torch
@@ -26,6 +27,7 @@
 from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig
 from vllm_omni.diffusion.distributed.utils import get_local_device
 from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader
+from vllm_omni.diffusion.models.interface import SupportsModuleOffload
 from vllm_omni.diffusion.profiler.diffusion_pipeline_profiler import DiffusionPipelineProfilerMixin
 from vllm_omni.diffusion.request import OmniDiffusionRequest
 from vllm_omni.model_executor.model_loader.weight_utils import download_weights_from_hf_specific
@@ -148,12 +150,25 @@ def forward(self, packed_pixel_values, packed_flattened_position_ids, cu_seqlens
         return outputs.last_hidden_state.squeeze(0)
 
 
-class BagelPipeline(nn.Module, DiffusionPipelineProfilerMixin):
+class BagelPipeline(nn.Module, SupportsModuleOffload, DiffusionPipelineProfilerMixin):
     """Bagel generation pipeline (MoT) packaged for vllm-omni diffusion engine.
 
     This pipeline is self-contained and uses the ported Bagel core files.
     """
 
+    _dit_modules: ClassVar[list[str]] = ["language_model.model"]
+    _encoder_modules: ClassVar[list[str]] = []
+    _vae_modules: ClassVar[list[str]] = ["vae"]
+    _resident_modules: ClassVar[list[str]] = [
+        "bagel.time_embedder",
+        "bagel.vae2llm",
+        "bagel.llm2vae",
+        "bagel.latent_pos_embed",
+        "bagel.vit_model",
+        "bagel.connector",
+        "bagel.vit_pos_embed",
+    ]
+
     def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = ""):
         super().__init__()
         self.od_config = od_config

From a3ba7d90ff8ae7b0607001cbc5c9637cccf99262 Mon Sep 17 00:00:00 2001
From: Nick Cao <ncao@redhat.com>
Date: Mon, 20 Apr 2026 10:51:45 -0400
Subject: [PATCH 09/10] [Doc] Update add-diffusion-model skill for
 SupportsModuleOffload

Add Step 11 (CPU Offload Support) covering SupportsModuleOffload
protocol: _dit_modules, _encoder_modules, _vae_modules,
_resident_modules, dotted path support.

Add cpu_offload_diffusion.md to Step 7 required docs list.

Co-authored-by: Claude <noreply@anthropic.com>
Signed-off-by: Nick Cao <ncao@redhat.com>
---
 .claude/skills/add-diffusion-model/SKILL.md | 40 ++++++++++++++++++---
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/.claude/skills/add-diffusion-model/SKILL.md b/.claude/skills/add-diffusion-model/SKILL.md
index a7e0bbf9a57..0b979e1a984 100644
--- a/.claude/skills/add-diffusion-model/SKILL.md
+++ b/.claude/skills/add-diffusion-model/SKILL.md
@@ -282,10 +282,11 @@ For Omni or custom models, create:
 
 Required updates:
 1. `docs/user_guide/diffusion/parallelism_acceleration.md` — parallelism support table
-2. `docs/user_guide/diffusion/teacache.md` — if TeaCache supported
-3. `docs/user_guide/diffusion/cache_dit_acceleration.md` — if Cache-DiT supported
-4. `examples/offline_inference/xxx/README.md` — offline example docs
-5. `examples/online_serve/xxx/README.md` — online serve docs
+2. `docs/user_guide/diffusion/cpu_offload_diffusion.md` — if CPU offload supported (add to supported models table)
+3. `docs/user_guide/diffusion/teacache.md` — if TeaCache supported
+4. `docs/user_guide/diffusion/cache_dit_acceleration.md` — if Cache-DiT supported
+5. `examples/offline_inference/xxx/README.md` — offline example docs
+6. `examples/online_serve/xxx/README.md` — online serve docs
 
 ### Step 8: Add E2E Tests (Recommended)
 
@@ -512,6 +513,37 @@ After adding parallelism support, update:
 1. `docs/user_guide/diffusion/parallelism_acceleration.md` — add your model to the support table
 2. Record which parallelism methods are supported (USP, Ring, CFG, TP, HSDP, VAE-Patch)
 
+### Step 11: Add CPU Offload Support
+
+Implement `SupportsModuleOffload` on your pipeline class to enable
+`--enable-cpu-offload` and `--enable-layerwise-offload`. The protocol
+declares which submodules the offloader should manage:
+
+```python
+from typing import ClassVar
+from vllm_omni.diffusion.models.interface import SupportsModuleOffload
+
+class YourPipeline(nn.Module, SupportsModuleOffload):
+    _dit_modules: ClassVar[list[str]] = ["transformer"]
+    _encoder_modules: ClassVar[list[str]] = ["text_encoder"]
+    _vae_modules: ClassVar[list[str]] = ["vae"]
+    _resident_modules: ClassVar[list[str]] = []  # optional
+```
+
+- `_dit_modules`: denoising submodules (kept on GPU during diffusion loop)
+- `_encoder_modules`: encoder/vision submodules (offloaded to CPU during diffusion loop)
+- `_vae_modules`: VAE(s) (always kept on GPU; not part of the mutual exclusion hooks)
+- `_resident_modules`: additional modules to pin on GPU during layerwise
+  offloading (e.g. embedders, connectors). Only used by the layerwise
+  backend. Optional — defaults to `[]`.
+
+All attribute names support dotted paths for nested submodules
+(e.g. `"pipe.transformer"`, `"bagel.time_embedder"`).
+
+Pipelines without `SupportsModuleOffload` fall back to scanning
+well-known attribute names (`transformer`, `text_encoder`, `vae`,
+etc.), which silently skips modules with non-standard names.
+
 ---
 
 ## Iterative Development Tips

From 7df49c3c39d9b63d877e181bd24d3256ac8a07a3 Mon Sep 17 00:00:00 2001
From: Nick Cao <ncao@redhat.com>
Date: Tue, 21 Apr 2026 09:31:30 -0400
Subject: [PATCH 10/10] [Cleanup] Trim docstrings in SupportsModuleOffload and
 ModuleDiscovery

Co-authored-by: Claude <noreply@anthropic.com>
Signed-off-by: Nick Cao <ncao@redhat.com>
---
 vllm_omni/diffusion/models/interface.py       | 36 +++++--------------
 .../diffusion/offloader/module_collector.py   |  9 ++---
 2 files changed, 11 insertions(+), 34 deletions(-)

diff --git a/vllm_omni/diffusion/models/interface.py b/vllm_omni/diffusion/models/interface.py
index 34567104374..00d54420dfe 100644
--- a/vllm_omni/diffusion/models/interface.py
+++ b/vllm_omni/diffusion/models/interface.py
@@ -60,35 +60,17 @@ def post_decode(self, state: DiffusionRequestState, **kwargs: Any) -> DiffusionO
 
 @runtime_checkable
 class SupportsModuleOffload(Protocol):
-    """Declares which submodules participate in sequential CPU offload.
+    """Declares which submodules participate in CPU offload.
 
-    The offload system uses mutual exclusion: when one group runs,
-    the other is moved to CPU.  Pipelines must declare both groups
-    because only the pipeline knows its own architecture.
+    All attribute names support dotted paths for nested submodules
+    (e.g. ``"pipe.transformer"``).
 
-    ``_dit_modules``: attribute names of denoising submodules (kept
-    on GPU during the diffusion loop).
-
-    ``_encoder_modules``: attribute names of encoder/vision
-    submodules (offloaded to CPU during the diffusion loop).
-
-    ``_vae_modules``: attribute names of VAE(s) (always kept on GPU,
-    not part of the mutual exclusion hooks).
-
-    ``_resident_modules``: attribute names of small submodules that
-    must stay on GPU during layer-wise offloading (e.g. embedders,
-    connectors).  Optional — defaults to ``[]``.
-
-    All attribute names support dotted paths (e.g.
-    ``"bagel.time_embedder"``) for nested submodules.
-
-    Example::
-
-        class MyPipeline(nn.Module, SupportsModuleOffload):
-            _dit_modules: ClassVar[list[str]] = ["transformer"]
-            _encoder_modules: ClassVar[list[str]] = ["text_encoder", "vit"]
-            _vae_modules: ClassVar[list[str]] = ["vae"]
-            _resident_modules: ClassVar[list[str]] = ["time_embedder"]
+    Attributes:
+        _dit_modules: Denoising submodules (on GPU during diffusion).
+        _encoder_modules: Encoder submodules (offloaded during diffusion).
+        _vae_modules: VAE(s) (always on GPU).
+        _resident_modules: Extra modules pinned on GPU during layerwise
+            offloading.  Optional, defaults to ``[]``.
     """
 
     _dit_modules: ClassVar[list[str]]
diff --git a/vllm_omni/diffusion/offloader/module_collector.py b/vllm_omni/diffusion/offloader/module_collector.py
index c0aa56522d7..dfd81e98b89 100644
--- a/vllm_omni/diffusion/offloader/module_collector.py
+++ b/vllm_omni/diffusion/offloader/module_collector.py
@@ -63,13 +63,8 @@ def _collect_modules(
     ) -> tuple[list[nn.Module], list[str]]:
         """Resolve attribute names to (module, name) pairs, skipping missing.
 
-        Dotted names (e.g. ``"pipe.transformer"``) are resolved by
-        walking the attribute chain via :func:`operator.attrgetter`.
-
-        When *warn_missing* is True (protocol path), warn about
-        declared attributes that do not exist.  Non-``nn.Module``
-        attributes always produce a warning regardless (they indicate
-        a real bug, even in the fallback scan).
+        Supports dotted paths via :func:`operator.attrgetter`.
+        Warns on missing attributes when *warn_missing* is True.
         """
         modules: list[nn.Module] = []
         names: list[str] = []