From 71fb970abea17fdfae1fde37371377aed814e70b Mon Sep 17 00:00:00 2001 From: jyli Date: Tue, 28 Apr 2026 06:56:22 +0000 Subject: [PATCH 1/3] shrink wan_t2v_size for xpu Co-authored-by: JianyuLi01 <109940438+JianyuLi01@users.noreply.github.com> --- .../text_to_video/run_t2v.sh | 19 +++++++ .../text_to_video/text_to_video.py | 8 +++ vllm_omni/diffusion/diffusion_engine.py | 4 ++ .../models/wan2_2/pipeline_wan2_2.py | 50 ++++++++++++++++--- .../models/wan2_2/wan2_2_transformer.py | 5 ++ 5 files changed, 79 insertions(+), 7 deletions(-) create mode 100755 examples/offline_inference/text_to_video/run_t2v.sh diff --git a/examples/offline_inference/text_to_video/run_t2v.sh b/examples/offline_inference/text_to_video/run_t2v.sh new file mode 100755 index 0000000000..1059d328f4 --- /dev/null +++ b/examples/offline_inference/text_to_video/run_t2v.sh @@ -0,0 +1,19 @@ +export VLLM_OMNI_WAN_DUMMY_TEXT_ENCODER=1 +export VLLM_OMNI_SKIP_DUMMY_RUN=1 +export VLLM_OMNI_WAN_PROFILE_TRANSFORMER_ONLY=1 +export VLLM_TORCH_PROFILER_DIR="./" + +MODEL_PATH="/mnt/disk2/hf_models/Wan2.1-T2V-1.3B-Diffusers" +MODEL_PATH="/mnt/disk2/hf_models/Wan2.2-T2V-A14B-Diffusers" + +python text_to_video.py \ + --model "$MODEL_PATH" \ + --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." \ + --height 128 --width 128 --num-frames 9 \ + --guidance-scale 1.0 --guidance-scale-high 1.0 \ + --boundary-ratio 0.0 --flow-shift 12.0 \ + --num-inference-steps 10 --fps 16 \ + --output t2v_out.mp4 \ + --enforce-eager + +# --profiler-config '{"profiler":"torch","torch_profiler_dir":"./perf"}' diff --git a/examples/offline_inference/text_to_video/text_to_video.py b/examples/offline_inference/text_to_video/text_to_video.py index a96949fa54..d1bbf27cb4 100644 --- a/examples/offline_inference/text_to_video/text_to_video.py +++ b/examples/offline_inference/text_to_video/text_to_video.py @@ -319,6 +319,7 @@ def main(): if args.negative_prompt: prompt_dict["negative_prompt"] = args.negative_prompt + output_type = "latent" sampling_kwargs = dict( height=args.height, width=args.width, @@ -326,6 +327,7 @@ def main(): guidance_scale=args.guidance_scale, num_inference_steps=args.num_inference_steps, num_frames=args.num_frames, + output_type=output_type, ) if args.guidance_scale_high is not None: sampling_kwargs["guidance_scale_2"] = args.guidance_scale_high @@ -340,6 +342,12 @@ def main(): # Print profiling results print(f"Total generation time: {generation_time:.4f} seconds ({generation_time * 1000:.2f} ms)") + if output_type == "latent": + if profiler_enabled: + print("\n[Profiler] Stopping profiler and collecting results...") + profile_results = omni.stop_profile() + print(profile_results) + return audio = None if isinstance(frames, list): diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py index abaf598959..d0215385a9 100644 --- a/vllm_omni/diffusion/diffusion_engine.py +++ b/vllm_omni/diffusion/diffusion_engine.py @@ -403,6 +403,10 @@ def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> N raise RuntimeError(f"Could not {action} profiler: {e}") from e def _dummy_run(self): + if os.environ.get("VLLM_OMNI_SKIP_DUMMY_RUN", "0") == "1": + logger.warning("Skipping diffusion dummy run because VLLM_OMNI_SKIP_DUMMY_RUN=1") + return + """A dummy run to warm up the model.""" num_inference_steps = 1 height = 512 diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py index 188eb70b2b..34045a8aaa 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py @@ -316,13 +316,25 @@ def __init__( ) ) - self.tokenizer = AutoTokenizer.from_pretrained(model, subfolder="tokenizer", local_files_only=local_files_only) - self.text_encoder = UMT5EncoderModel.from_pretrained( - model, subfolder="text_encoder", torch_dtype=dtype, local_files_only=local_files_only - ).to(self.device) - self.vae = DistributedAutoencoderKLWan.from_pretrained( - model, subfolder="vae", torch_dtype=dtype, local_files_only=local_files_only - ).to(self.device) + self.transformer_only_profile = os.environ.get("VLLM_OMNI_WAN_PROFILE_TRANSFORMER_ONLY", "0") == "1" + + if self.transformer_only_profile: + logger.warning( + "VLLM_OMNI_WAN_PROFILE_TRANSFORMER_ONLY=1: " + "skipping tokenizer, text_encoder and VAE loading. " + "This mode is only for transformer/operator profiling." + ) + self.tokenizer = None + self.text_encoder = None + self.vae = None + else: + self.tokenizer = AutoTokenizer.from_pretrained(model, subfolder="tokenizer", local_files_only=local_files_only) + self.text_encoder = UMT5EncoderModel.from_pretrained( + model, subfolder="text_encoder", torch_dtype=dtype, local_files_only=local_files_only + ).to(self.device) + self.vae = DistributedAutoencoderKLWan.from_pretrained( + model, subfolder="vae", torch_dtype=torch.float32, local_files_only=local_files_only + ).to(self.device) # Initialize transformers with correct config (weights loaded via load_weights) if load_transformer: @@ -510,6 +522,11 @@ def forward( width = req.sampling_params.width or width num_frames = req.sampling_params.num_frames if req.sampling_params.num_frames else frame_num + if req.sampling_params.output_type is not None: + output_type = req.sampling_params.output_type + if os.environ.get("VLLM_OMNI_WAN_PROFILE_TRANSFORMER_ONLY", "0") == "1": + output_type = "latent" + # Ensure dimensions are compatible with VAE and patch size # For expand_timesteps mode, we need latent dims to be even (divisible by patch_size) patch_size = self.transformer_config.patch_size @@ -821,6 +838,25 @@ def encode_prompt( prompt_clean = [self._prompt_clean(p) for p in prompt] batch_size = len(prompt_clean) + if ( + os.environ.get("VLLM_OMNI_WAN_DUMMY_TEXT_ENCODER", "0") == "1" + or os.environ.get("VLLM_OMNI_WAN_PROFILE_TRANSFORMER_ONLY", "0") == "1" + ): + text_dim = self.transformer_config.text_dim + prompt_embeds = torch.zeros( + batch_size * num_videos_per_prompt, + max_sequence_length, + text_dim, + device=device, + dtype=dtype, + ) + + negative_prompt_embeds = None + if do_classifier_free_guidance: + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + + return prompt_embeds, negative_prompt_embeds + text_inputs = self.tokenizer( prompt_clean, padding="max_length", diff --git a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py index 98e10b4087..0edd221428 100644 --- a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py +++ b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py @@ -995,6 +995,11 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: if weight_name not in original_name: continue lookup_name = original_name.replace(weight_name, param_name) + + if lookup_name not in params_dict: + logger.warning(f"Skipping weight {original_name} -> {lookup_name}") + break + param = params_dict[lookup_name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) From dcecdc9a0a1d2b7febd0911ce02ef83928c5e97c Mon Sep 17 00:00:00 2001 From: jyli Date: Tue, 28 Apr 2026 10:28:51 +0000 Subject: [PATCH 2/3] enable profiling Co-authored-by: JianyuLi01 <109940438+JianyuLi01@users.noreply.github.com> --- examples/offline_inference/text_to_video/run_t2v.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/offline_inference/text_to_video/run_t2v.sh b/examples/offline_inference/text_to_video/run_t2v.sh index 1059d328f4..ef3c467bca 100755 --- a/examples/offline_inference/text_to_video/run_t2v.sh +++ b/examples/offline_inference/text_to_video/run_t2v.sh @@ -12,7 +12,7 @@ python text_to_video.py \ --height 128 --width 128 --num-frames 9 \ --guidance-scale 1.0 --guidance-scale-high 1.0 \ --boundary-ratio 0.0 --flow-shift 12.0 \ - --num-inference-steps 10 --fps 16 \ + --num-inference-steps 40 --fps 16 \ --output t2v_out.mp4 \ --enforce-eager From 5cb7fb721c6a7c34dfc102bcabb0468337da0e4d Mon Sep 17 00:00:00 2001 From: jyli Date: Wed, 29 Apr 2026 00:55:15 +0000 Subject: [PATCH 3/3] restore original images size Co-authored-by: JianyuLi01 <109940438+JianyuLi01@users.noreply.github.com> --- examples/offline_inference/text_to_video/run_t2v.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/offline_inference/text_to_video/run_t2v.sh b/examples/offline_inference/text_to_video/run_t2v.sh index ef3c467bca..a1b97e0f0d 100755 --- a/examples/offline_inference/text_to_video/run_t2v.sh +++ b/examples/offline_inference/text_to_video/run_t2v.sh @@ -9,7 +9,7 @@ MODEL_PATH="/mnt/disk2/hf_models/Wan2.2-T2V-A14B-Diffusers" python text_to_video.py \ --model "$MODEL_PATH" \ --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." \ - --height 128 --width 128 --num-frames 9 \ + --height 480 --width 832 --num-frames 9 \ --guidance-scale 1.0 --guidance-scale-high 1.0 \ --boundary-ratio 0.0 --flow-shift 12.0 \ --num-inference-steps 40 --fps 16 \