huggingface · regisss · May 19, 2025 · May 7, 2025 · May 7, 2025 · May 9, 2025
@@ -339,25 +339,6 @@ Stable Video Diffusion (SVD) was unveiled in [Stable Video Diffusion Announcemen
 by the Stability AI team. Stable Video Diffusion XT version (SVD-XT) is tuned to generate 25 frames of video from a single image.
 
 
-# CogvideoX Examples
-
-CogVideoX is an open-source version of the video generation model originating from QingYing, unveiled in https://huggingface.co/THUDM/CogVideoX-5b.
-
-```bash
-python text_to_video_generation.py \
-    --model_name_or_path "THUDM/CogVideoX-2b" \
-    --pipeline_type "cogvideox" \
-    --prompts "An astronaut riding a horse" \
-    --use_habana \
-    --use_hpu_graphs \
-    --num_videos_per_prompt 1 \
-    --num_inference_steps 50 \
-    --num_frames 49 \
-    --guidance_scale 6 \
-    --dtype bf16
-```
-
-
 ## Image-to-Video Generation
 
 Script `image_to_video_generation.py` showcases how to perform image-to-video generation using Stable Video Diffusion on Intel Gaudi.
@@ -421,8 +402,7 @@ python image_to_video_generation.py \
     --width=512 \
     --height=512
 ```
-
-# I2vgen-xl
+### Image-to-Video with I2VGen-XL
 I2VGen-XL is a high-quality image-to-video synthesis model based on cascaded diffusion models. Please refer to the [Hugging Face I2VGen-XL documentation](https://huggingface.co/ali-vilab/i2vgen-xl).
 
 Here is how to generate a video from one image and a text prompt:
@@ -447,6 +427,24 @@ python image_to_video_generation.py \
     --bf16
 ```
 
+### Text-to-Video with CogVideoX
+
+CogVideoX is an open-source version of the video generation model originating from QingYing, unveiled at https://huggingface.co/THUDM/CogVideoX-5b.
+
+```bash
+python text_to_video_generation.py \
+    --model_name_or_path "THUDM/CogVideoX-2b" \
+    --pipeline_type "cogvideox" \
+    --prompts "An astronaut riding a horse" \
+    --use_habana \
+    --use_hpu_graphs \
+    --num_videos_per_prompt 1 \
+    --num_inference_steps 50 \
+    --num_frames 49 \
+    --guidance_scale 6 \
+    --dtype bf16
+```
+
 # Important Notes for Gaudi3 Users
 
  - **Batch Size Limitation**: Due to a known issue, batch sizes for some Stable Diffusion models need to be reduced.

@@ -15,7 +15,6 @@
 from typing import Optional, Union
 
 import torch
-import torch.nn.functional as F
 from diffusers.models.autoencoders.vae import DecoderOutput
 
 
@@ -120,13 +119,14 @@ def CogVideoXCausalConv3dforwardGaudi(
     inputs = self.fake_context_parallel_forward(inputs, conv_cache)
     # conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone()
 
-    padding_2d = (self.width_pad, self.width_pad, self.height_pad, self.height_pad)
-    inputs_pad = F.pad(inputs, padding_2d, mode="constant", value=0)
+    if self.pad_mode == "replicate":
+        conv_cache = None
+    else:
+        if self.time_kernel_size > 1:
+            if conv_cache is not None and conv_cache.shape == inputs[:, :, -self.time_kernel_size + 1 :].shape:
+                conv_cache.copy_(inputs[:, :, -self.time_kernel_size + 1 :])
+            else:
+                conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone()
 
-    output = self.conv(inputs_pad)
-    if self.time_kernel_size > 1:
-        if conv_cache is not None and conv_cache.shape == inputs[:, :, -self.time_kernel_size + 1 :].shape:
-            conv_cache.copy_(inputs[:, :, -self.time_kernel_size + 1 :])
-        else:
-            conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone()
+    output = self.conv(inputs)
     return output, conv_cache
@@ -27,7 +27,14 @@
 from diffusers.models.transformers import FluxTransformer2DModel
 from diffusers.pipelines.flux.pipeline_flux_img2img import FluxImg2ImgPipeline, calculate_shift, retrieve_timesteps
 from diffusers.utils import BaseOutput, replace_example_docstring
-from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+    T5EncoderModel,
+    T5TokenizerFast,
+)
 
 from optimum.utils import logging
 
@@ -262,7 +269,7 @@ def __call__(
 
 class GaudiFluxImg2ImgPipeline(GaudiDiffusionPipeline, FluxImg2ImgPipeline):
     r"""
-    Adapted from https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/pipelines/flux/pipeline_flux_img2img.py#L162
+    Adapted from https://github.com/huggingface/diffusers/blob/v0.33.1/src/diffusers/pipelines/flux/pipeline_flux_img2img.py#L169
 
     The Flux pipeline for image-to-image generation.
 
@@ -314,6 +321,8 @@ def __init__(
         text_encoder_2: T5EncoderModel,
         tokenizer_2: T5TokenizerFast,
         transformer: FluxTransformer2DModel,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
         use_habana: bool = False,
         use_hpu_graphs: bool = False,
         gaudi_config: Union[str, GaudiConfig] = None,

@@ -33,7 +33,7 @@
     "optimum",
     "torch",
     "accelerate >= 1.7.0",
-    "diffusers >= 0.32.0, < 0.32.1",
+    "diffusers >= 0.33.1, < 0.33.2",
     "huggingface_hub >= 0.24.7",
     "sentence-transformers == 3.3.1",
 ]

@@ -5195,6 +5195,12 @@ def get_dummy_components(self):
         }
         return components
 
+    # IG: this test passes locally but crashes on CI because of an uncleared graph. Calling tearDown() and gc.collect() before running it remediates the crash.
+    def test_attention_slicing_forward_pass(self):
+        super().tearDown()
+        gc.collect()
+        super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)
+
     def get_dummy_inputs(self, device, seed=0):
         # TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched
         # ensure determinism for the device-dependent torch.Generator on HPU