Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
26a4e8d
add time cost log for different stages
SamitHuang Mar 6, 2026
b3b70a8
reduce hop3 overhead
SamitHuang Mar 6, 2026
104d71c
perf: reduce IPC overhead for single-stage diffusion serving
SamitHuang Mar 6, 2026
bf2ddb0
perf: reduce IPC overhead for single-stage diffusion serving (~6.5s, …
SamitHuang Mar 6, 2026
735b2ca
Merge branch 'main' into main
SamitHuang Mar 6, 2026
dd4468c
fix conflicts
SamitHuang Mar 6, 2026
ff62a1e
rm redundancy
SamitHuang Mar 9, 2026
870963e
Merge branch 'main' into main
SamitHuang Mar 9, 2026
5414a42
rm logs
SamitHuang Mar 9, 2026
e3dec54
fix inline
SamitHuang Mar 9, 2026
2cd9f9f
fix ci
SamitHuang Mar 9, 2026
172040a
fix ci
SamitHuang Mar 9, 2026
0a86fc5
fix log
SamitHuang Mar 9, 2026
9b9c597
fix
SamitHuang Mar 9, 2026
bda0f2d
fix log
SamitHuang Mar 9, 2026
3452ad3
Merge branch 'main' of https://github.com/samithuang/vllm-omni
SamitHuang Mar 9, 2026
9ab7c55
Merge remote-tracking branch 'upstream/main'
SamitHuang Mar 12, 2026
c32a78a
Merge remote-tracking branch 'upstream/main'
SamitHuang Mar 12, 2026
5fcf302
[Enhancement] Upgrade cache-dit from 1.2.0 to 1.3.0
SamitHuang Mar 12, 2026
a05183c
[Enhancement] Add cache-dit force_refresh support for Helios and GLM-…
SamitHuang Mar 12, 2026
2ba9814
[Enhancement] Add cache-dit CLI support to GLM-Image end2end.py
SamitHuang Mar 12, 2026
c36e4b4
[Enhancement] Remove cache-dit support for Helios, keep GLM-Image only
SamitHuang Mar 12, 2026
c9e7f43
Merge branch 'main' into feat/cache-dit-helios-glm-image
SamitHuang Mar 12, 2026
833ce12
Merge branch 'main' into feat/cache-dit-helios-glm-image
wtomin Mar 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions examples/offline_inference/glm_image/end2end.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,15 +238,34 @@ def main(args: argparse.Namespace) -> None:
if args.negative_prompt:
prompt_dict["negative_prompt"] = args.negative_prompt

# Build cache-dit config if requested
cache_config = None
if args.cache_backend == "cache_dit":
cache_config = {
"Fn_compute_blocks": 1,
"Bn_compute_blocks": 0,
"max_warmup_steps": 4,
"residual_diff_threshold": 0.24,
"max_continuous_cached_steps": 3,
"enable_taylorseer": False,
"taylorseer_order": 1,
"scm_steps_mask_policy": None,
"scm_steps_policy": "dynamic",
}

# Initialize Omni with multistage config
print("\nInitializing Omni with multistage pipeline...")
print(f" Cache backend: {args.cache_backend or 'None (no acceleration)'}")
start_time = time.time()

omni = Omni(
model=args.model_path,
stage_configs_path=config_path,
log_stats=args.enable_stats,
stage_init_timeout=args.stage_init_timeout,
cache_backend=args.cache_backend,
cache_config=cache_config,
enable_cache_dit_summary=getattr(args, "enable_cache_dit_summary", False),
enable_diffusion_pipeline_profiler=args.enable_diffusion_pipeline_profiler,
)

Expand Down Expand Up @@ -444,6 +463,20 @@ def parse_args() -> argparse.Namespace:
help="Number of images to generate (default: 1)",
)

# Cache acceleration
parser.add_argument(
"--cache-backend",
type=str,
default=None,
choices=["cache_dit"],
help="Cache backend for DiT acceleration. Default: None (no cache).",
)
parser.add_argument(
"--enable-cache-dit-summary",
action="store_true",
help="Enable cache-dit summary logging after diffusion forward passes.",
)

# Runtime options
parser.add_argument(
"--enable-stats",
Expand Down
33 changes: 33 additions & 0 deletions vllm_omni/diffusion/cache/cache_dit_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ def _build_db_cache_config(cache_config: Any) -> DBCacheConfig:
max_cached_steps=cache_config.max_cached_steps,
max_continuous_cached_steps=cache_config.max_continuous_cached_steps,
residual_diff_threshold=cache_config.residual_diff_threshold,
force_refresh_step_hint=cache_config.force_refresh_step_hint,
force_refresh_step_policy=cache_config.force_refresh_step_policy,
)


Expand Down Expand Up @@ -1091,6 +1093,36 @@ def refresh_cache_context(pipeline: Any, num_inference_steps: int, verbose: bool
return refresh_cache_context


def enable_cache_for_glm_image(pipeline: Any, cache_config: Any) -> None:
    """Enable cache-dit DBCache acceleration on the GLM-Image transformer.

    GLM-Image processes prompt and image by calling the transformer before the
    denoising loop. When an input image is provided (editing mode), the cache must
    be force-refreshed after the preprocessing step so stale hidden states are
    discarded. Set force_refresh_step_hint = 1 for editing, None for text-to-image.

    Unlike the other ``enable_cache_for_*`` helpers (which return a
    ``refresh_cache_context`` callable), this function intentionally returns
    ``None``: the force-refresh behaviour is carried inside the DBCache config
    itself via ``force_refresh_step_hint`` / ``force_refresh_step_policy``, so
    no per-run refresh hook is needed. The original ``Callable[[int], None]``
    return annotation did not match the actual (implicit ``None``) return.
    NOTE(review): confirm the registry caller tolerates a ``None`` entry where
    siblings return a refresh callable.

    Args:
        pipeline: GLM-Image pipeline object; only its ``transformer`` module is
            wrapped by cache-dit.
        cache_config: Cache configuration object (DiffusionCacheConfig-like)
            providing DBCache fields plus ``enable_taylorseer`` and
            ``taylorseer_order``.
    """
    db_cache_config = _build_db_cache_config(cache_config)

    # TaylorSeer calibration is optional; only build its config when enabled.
    calibrator_config = None
    if cache_config.enable_taylorseer:
        calibrator_config = TaylorSeerCalibratorConfig(taylorseer_order=cache_config.taylorseer_order)
        logger.info(f"TaylorSeer enabled with order={cache_config.taylorseer_order}")

    logger.info(
        f"Enabling cache-dit on GLM-Image transformer: "
        f"Fn={db_cache_config.Fn_compute_blocks}, "
        f"Bn={db_cache_config.Bn_compute_blocks}, "
        f"W={db_cache_config.max_warmup_steps}, "
        f"force_refresh_step_hint={db_cache_config.force_refresh_step_hint}, "
    )

    # Wrap only the transformer (the DiT denoiser), not the whole pipeline.
    cache_dit.enable_cache(
        pipeline.transformer,
        cache_config=db_cache_config,
        calibrator_config=calibrator_config,
    )


def enable_cache_for_flux2(pipeline: Any, cache_config: Any) -> Callable[[int], None]:
"""Enable cache-dit for Flux.2-dev pipeline.

Expand Down Expand Up @@ -1180,6 +1212,7 @@ def refresh_cache_context(pipeline: Any, num_inference_steps: int, verbose: bool
"LTX2Pipeline": enable_cache_for_ltx2,
"LTX2ImageToVideoPipeline": enable_cache_for_ltx2,
"BagelPipeline": enable_cache_for_bagel,
"GlmImagePipeline": enable_cache_for_glm_image,
"Flux2Pipeline": enable_cache_for_flux2,
}
)
Expand Down
6 changes: 6 additions & 0 deletions vllm_omni/diffusion/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,12 @@ class DiffusionCacheConfig:
# Used by cache-dit for scm mask generation. If this value changes during inference,
# we will re-generate the scm mask and refresh the cache context.
num_inference_steps: int | None = None
# Force refresh the cache at a specific step index hint, useful for models like
# GLM-Image (image preprocessing step in editing mode).
force_refresh_step_hint: int | None = None
# Policy for force refresh: "once" refreshes only at the hint step,
# "repeat" refreshes every force_refresh_step_hint steps.
force_refresh_step_policy: str = "once"

# Additional parameters that may be passed but not explicitly defined
_extra_params: dict[str, Any] = field(default_factory=dict, repr=False)
Expand Down
Loading