diff --git a/examples/stable-diffusion/README.md b/examples/stable-diffusion/README.md index c2dc936464..78cf511b51 100644 --- a/examples/stable-diffusion/README.md +++ b/examples/stable-diffusion/README.md @@ -339,25 +339,6 @@ Stable Video Diffusion (SVD) was unveiled in [Stable Video Diffusion Announcemen by the Stability AI team. Stable Video Diffusion XT version (SVD-XT) is tuned to generate 25 frames of video from a single image. -# CogvideoX Examples - -CogVideoX is an open-source version of the video generation model originating from QingYing, unveiled in https://huggingface.co/THUDM/CogVideoX-5b. - -```bash -python text_to_video_generation.py \ - --model_name_or_path "THUDM/CogVideoX-2b" \ - --pipeline_type "cogvideox" \ - --prompts "An astronaut riding a horse" \ - --use_habana \ - --use_hpu_graphs \ - --num_videos_per_prompt 1 \ - --num_inference_steps 50 \ - --num_frames 49 \ - --guidance_scale 6 \ - --dtype bf16 -``` - - ## Image-to-Video Generation Script `image_to_video_generation.py` showcases how to perform image-to-video generation using Stable Video Diffusion on Intel Gaudi. @@ -421,8 +402,7 @@ python image_to_video_generation.py \ --width=512 \ --height=512 ``` - -# I2vgen-xl +### Image-to-Video with I2vgen-xl I2vgen-xl is high quality Image-to-Video synthesis via cascaded diffusion models. Please refer to [Huggingface i2vgen-xl doc](https://huggingface.co/ali-vilab/i2vgen-xl). Here is how to generate video with one image and text prompt: @@ -447,6 +427,24 @@ python image_to_video_generation.py \ --bf16 ``` +### Text-to-Video with CogVideoX + +CogVideoX is an open-source version of the video generation model originating from QingYing, unveiled in https://huggingface.co/THUDM/CogVideoX-5b. 
+ +```bash +python text_to_video_generation.py \ + --model_name_or_path "THUDM/CogVideoX-2b" \ + --pipeline_type "cogvideox" \ + --prompts "An astronaut riding a horse" \ + --use_habana \ + --use_hpu_graphs \ + --num_videos_per_prompt 1 \ + --num_inference_steps 50 \ + --num_frames 49 \ + --guidance_scale 6 \ + --dtype bf16 +``` + # Important Notes for Gaudi3 Users - **Batch Size Limitation**: Due to a known issue, batch sizes for some Stable Diffusion models need to be reduced. diff --git a/optimum/habana/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py b/optimum/habana/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py index 409f77995f..4d037fc272 100644 --- a/optimum/habana/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +++ b/optimum/habana/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py @@ -15,7 +15,6 @@ from typing import Optional, Union import torch -import torch.nn.functional as F from diffusers.models.autoencoders.vae import DecoderOutput @@ -120,13 +119,14 @@ def CogVideoXCausalConv3dforwardGaudi( inputs = self.fake_context_parallel_forward(inputs, conv_cache) # conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone() - padding_2d = (self.width_pad, self.width_pad, self.height_pad, self.height_pad) - inputs_pad = F.pad(inputs, padding_2d, mode="constant", value=0) + if self.pad_mode == "replicate": + conv_cache = None + else: + if self.time_kernel_size > 1: + if conv_cache is not None and conv_cache.shape == inputs[:, :, -self.time_kernel_size + 1 :].shape: + conv_cache.copy_(inputs[:, :, -self.time_kernel_size + 1 :]) + else: + conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone() - output = self.conv(inputs_pad) - if self.time_kernel_size > 1: - if conv_cache is not None and conv_cache.shape == inputs[:, :, -self.time_kernel_size + 1 :].shape: - conv_cache.copy_(inputs[:, :, -self.time_kernel_size + 1 :]) - else: - conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone() + output = 
self.conv(inputs) return output, conv_cache diff --git a/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py b/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py index 17894db5ae..8ada22a143 100644 --- a/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py +++ b/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py @@ -27,7 +27,14 @@ from diffusers.models.transformers import FluxTransformer2DModel from diffusers.pipelines.flux.pipeline_flux_img2img import FluxImg2ImgPipeline, calculate_shift, retrieve_timesteps from diffusers.utils import BaseOutput, replace_example_docstring -from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTokenizer, + CLIPVisionModelWithProjection, + T5EncoderModel, + T5TokenizerFast, +) from optimum.utils import logging @@ -262,7 +269,7 @@ def __call__( class GaudiFluxImg2ImgPipeline(GaudiDiffusionPipeline, FluxImg2ImgPipeline): r""" - Adapted from https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/pipelines/flux/pipeline_flux_img2img.py#L162 + Adapted from https://github.com/huggingface/diffusers/blob/v0.33.1/src/diffusers/pipelines/flux/pipeline_flux_img2img.py#L169 The Flux pipeline for image-to-image generation. 
@@ -314,6 +321,8 @@ def __init__( text_encoder_2: T5EncoderModel, tokenizer_2: T5TokenizerFast, transformer: FluxTransformer2DModel, + image_encoder: CLIPVisionModelWithProjection = None, + feature_extractor: CLIPImageProcessor = None, use_habana: bool = False, use_hpu_graphs: bool = False, gaudi_config: Union[str, GaudiConfig] = None, diff --git a/setup.py b/setup.py index f49b043990..e54d6626d7 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ "optimum", "torch", "accelerate >= 1.7.0", - "diffusers >= 0.32.0, < 0.32.1", + "diffusers >= 0.33.1, < 0.33.2", "huggingface_hub >= 0.24.7", "sentence-transformers == 3.3.1", ] diff --git a/tests/test_diffusers.py b/tests/test_diffusers.py index 8a06f6b330..7678d205ed 100644 --- a/tests/test_diffusers.py +++ b/tests/test_diffusers.py @@ -5195,6 +5195,12 @@ def get_dummy_components(self): } return components + # IG: this test passes locally, but crashes on CI with uncleared graph. Adding teardown and gc.collect to remediate. + def test_attention_slicing_forward_pass(self): + super().tearDown() + gc.collect() + super().test_attention_slicing_forward_pass(expected_max_diff=3e-3) + def get_dummy_inputs(self, device, seed=0): # TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched # ensure determinism for the device-dependent torch.Generator on HPU