Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 19 additions & 21 deletions examples/stable-diffusion/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -339,25 +339,6 @@ Stable Video Diffusion (SVD) was unveiled in [Stable Video Diffusion Announcemen
by the Stability AI team. Stable Video Diffusion XT version (SVD-XT) is tuned to generate 25 frames of video from a single image.


# CogvideoX Examples

CogVideoX is an open-source version of the video generation model originating from QingYing, unveiled in https://huggingface.co/THUDM/CogVideoX-5b.

```bash
python text_to_video_generation.py \
--model_name_or_path "THUDM/CogVideoX-2b" \
--pipeline_type "cogvideox" \
--prompts "An astronaut riding a horse" \
--use_habana \
--use_hpu_graphs \
--num_videos_per_prompt 1 \
--num_inference_steps 50 \
--num_frames 49 \
--guidance_scale 6 \
--dtype bf16
```


## Image-to-Video Generation

Script `image_to_video_generation.py` showcases how to perform image-to-video generation using Stable Video Diffusion on Intel Gaudi.
Expand Down Expand Up @@ -421,8 +402,7 @@ python image_to_video_generation.py \
--width=512 \
--height=512
```

# I2vgen-xl
### Image-to-Video with I2vgen-xl
I2vgen-xl provides high-quality image-to-video synthesis via cascaded diffusion models. For details, please refer to the [Hugging Face i2vgen-xl documentation](https://huggingface.co/ali-vilab/i2vgen-xl).

Here is how to generate a video from a single image and a text prompt:
Expand All @@ -447,6 +427,24 @@ python image_to_video_generation.py \
--bf16
```

### Text-to-Video with CogVideoX

CogVideoX is an open-source version of the video generation model originating from QingYing. For details, see the model card at https://huggingface.co/THUDM/CogVideoX-5b.

```bash
python text_to_video_generation.py \
--model_name_or_path "THUDM/CogVideoX-2b" \
--pipeline_type "cogvideox" \
--prompts "An astronaut riding a horse" \
--use_habana \
--use_hpu_graphs \
--num_videos_per_prompt 1 \
--num_inference_steps 50 \
--num_frames 49 \
--guidance_scale 6 \
--dtype bf16
```

# Important Notes for Gaudi3 Users

- **Batch Size Limitation**: Due to a known issue, batch sizes for some Stable Diffusion models need to be reduced.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from typing import Optional, Union

import torch
import torch.nn.functional as F
from diffusers.models.autoencoders.vae import DecoderOutput


Expand Down Expand Up @@ -120,13 +119,14 @@ def CogVideoXCausalConv3dforwardGaudi(
inputs = self.fake_context_parallel_forward(inputs, conv_cache)
# conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone()

padding_2d = (self.width_pad, self.width_pad, self.height_pad, self.height_pad)
inputs_pad = F.pad(inputs, padding_2d, mode="constant", value=0)
if self.pad_mode == "replicate":
conv_cache = None
else:
if self.time_kernel_size > 1:
if conv_cache is not None and conv_cache.shape == inputs[:, :, -self.time_kernel_size + 1 :].shape:
conv_cache.copy_(inputs[:, :, -self.time_kernel_size + 1 :])
else:
conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone()

output = self.conv(inputs_pad)
if self.time_kernel_size > 1:
if conv_cache is not None and conv_cache.shape == inputs[:, :, -self.time_kernel_size + 1 :].shape:
conv_cache.copy_(inputs[:, :, -self.time_kernel_size + 1 :])
else:
conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone()
output = self.conv(inputs)
return output, conv_cache
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,14 @@
from diffusers.models.transformers import FluxTransformer2DModel
from diffusers.pipelines.flux.pipeline_flux_img2img import FluxImg2ImgPipeline, calculate_shift, retrieve_timesteps
from diffusers.utils import BaseOutput, replace_example_docstring
from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
from transformers import (
CLIPImageProcessor,
CLIPTextModel,
CLIPTokenizer,
CLIPVisionModelWithProjection,
T5EncoderModel,
T5TokenizerFast,
)

from optimum.utils import logging

Expand Down Expand Up @@ -262,7 +269,7 @@ def __call__(

class GaudiFluxImg2ImgPipeline(GaudiDiffusionPipeline, FluxImg2ImgPipeline):
r"""
Adapted from https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/pipelines/flux/pipeline_flux_img2img.py#L162
Adapted from https://github.com/huggingface/diffusers/blob/v0.33.1/src/diffusers/pipelines/flux/pipeline_flux_img2img.py#L169

The Flux pipeline for image-to-image generation.

Expand Down Expand Up @@ -314,6 +321,8 @@ def __init__(
text_encoder_2: T5EncoderModel,
tokenizer_2: T5TokenizerFast,
transformer: FluxTransformer2DModel,
image_encoder: CLIPVisionModelWithProjection = None,
feature_extractor: CLIPImageProcessor = None,
use_habana: bool = False,
use_hpu_graphs: bool = False,
gaudi_config: Union[str, GaudiConfig] = None,
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
"optimum",
"torch",
"accelerate >= 1.7.0",
"diffusers >= 0.32.0, < 0.32.1",
"diffusers >= 0.33.1, < 0.33.2",
"huggingface_hub >= 0.24.7",
"sentence-transformers == 3.3.1",
]
Expand Down
6 changes: 6 additions & 0 deletions tests/test_diffusers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5195,6 +5195,12 @@ def get_dummy_components(self):
}
return components

# IG: this test passes locally, but crashes on CI with uncleared graph. Adding teardown and gc.collect to remediate.
def test_attention_slicing_forward_pass(self):
    """Run the inherited attention-slicing forward-pass test after an explicit cleanup.

    Calls the parent class's ``tearDown`` and ``gc.collect()`` first, then
    delegates to the parent implementation of this test with
    ``expected_max_diff=3e-3``.
    """
    # NOTE(review): tearDown is invoked manually *before* the test body as a
    # cleanup step; presumably the test runner calls it again after the test
    # finishes -- confirm the parent tearDown is safe to run twice.
    super().tearDown()
    # Force a garbage-collection pass so unreachable objects are reclaimed
    # before the parent test runs.
    gc.collect()
    # Delegate to the inherited test, passing an explicit tolerance.
    super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)

def get_dummy_inputs(self, device, seed=0):
# TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched
# ensure determinism for the device-dependent torch.Generator on HPU
Expand Down