From 8fdfd0ef1ff80a20e362fe5ed9ff0191026b1db3 Mon Sep 17 00:00:00 2001
From: Iman Gohari <s.m.iman.gohari@intel.com>
Date: Wed, 7 May 2025 22:21:37 +0000
Subject: [PATCH 01/10] fea(): skipping contrastive_generate_dynamic_shapes
 based on RD team list

---
 tests/transformers/tests/models/t5/test_modeling_t5.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/transformers/tests/models/t5/test_modeling_t5.py b/tests/transformers/tests/models/t5/test_modeling_t5.py
index 46754368aa..7f9da6f9ca 100644
--- a/tests/transformers/tests/models/t5/test_modeling_t5.py
+++ b/tests/transformers/tests/models/t5/test_modeling_t5.py
@@ -800,6 +800,10 @@ def test_decoder_model_past_with_large_inputs(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
 
+    @pytest.mark.skip("Skipped for Gaudi")
+    def test_contrastive_generate_dynamic_shapes(self):
+        pass  # body intentionally empty: the skip marker above keeps this test from running
+
     @pytest.mark.skip("Skipped for Gaudi")
     def test_generate_with_past_key_values(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()

From e1d3c069b9b09779cd80434acbb1bf5f66e1c6b4 Mon Sep 17 00:00:00 2001
From: Iman Gohari <s.m.iman.gohari@intel.com>
Date: Wed, 7 May 2025 22:22:22 +0000
Subject: [PATCH 02/10] fea(): skipping roberta inputs_embeds_decoder_only due
 to HF changes

---
 .../tests/models/roberta/test_modeling_roberta.py            | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/transformers/tests/models/roberta/test_modeling_roberta.py b/tests/transformers/tests/models/roberta/test_modeling_roberta.py
index 7723c753e4..d9c6794f4b 100644
--- a/tests/transformers/tests/models/roberta/test_modeling_roberta.py
+++ b/tests/transformers/tests/models/roberta/test_modeling_roberta.py
@@ -16,6 +16,7 @@
 
 import unittest
 
+import pytest
 from transformers import RobertaConfig, is_torch_available
 from transformers.testing_utils import TestCasePlus, require_torch, slow
 
@@ -477,6 +478,10 @@ def test_for_question_answering(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
 
+    @pytest.mark.skip("Skipped since HF upstream test is modified starting v4.47")
+    def test_generate_from_inputs_embeds_decoder_only(self):
+        pass  # body intentionally empty: the skip marker above keeps this test from running
+
     @slow
     def test_model_from_pretrained(self):
         model_name = "FacebookAI/roberta-base"

From 96c8b2c761982d5343803632509b748abd770b03 Mon Sep 17 00:00:00 2001
From: Iman Gohari <s.m.iman.gohari@intel.com>
Date: Fri, 9 May 2025 23:30:16 +0000
Subject: [PATCH 03/10] Bump diffusers requirement to >= 0.33.1, < 0.33.2

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index c4ef7f50f5..463726e452 100644
--- a/setup.py
+++ b/setup.py
@@ -33,7 +33,7 @@
     "optimum",
     "torch",
     "accelerate >= 1.5.0",
-    "diffusers >= 0.32.0, < 0.32.1",
+    "diffusers >= 0.33.1, < 0.33.2",
     "huggingface_hub >= 0.24.7",
     "sentence-transformers == 3.3.1",
 ]

From c888ce57d61510c267f2fe15bf47f825285f5eed Mon Sep 17 00:00:00 2001
From: Iman Gohari <s.m.iman.gohari@intel.com>
Date: Mon, 12 May 2025 17:21:54 +0000
Subject: [PATCH 04/10] fixed flux changes

---
 .../pipelines/flux/pipeline_flux_img2img.py         | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py b/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py
index 17894db5ae..8ada22a143 100644
--- a/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py
+++ b/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py
@@ -27,7 +27,14 @@
 from diffusers.models.transformers import FluxTransformer2DModel
 from diffusers.pipelines.flux.pipeline_flux_img2img import FluxImg2ImgPipeline, calculate_shift, retrieve_timesteps
 from diffusers.utils import BaseOutput, replace_example_docstring
-from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+    T5EncoderModel,
+    T5TokenizerFast,
+)
 
 from optimum.utils import logging
 
@@ -262,7 +269,7 @@ def __call__(
 
 class GaudiFluxImg2ImgPipeline(GaudiDiffusionPipeline, FluxImg2ImgPipeline):
     r"""
-    Adapted from https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/pipelines/flux/pipeline_flux_img2img.py#L162
+    Adapted from https://github.com/huggingface/diffusers/blob/v0.33.1/src/diffusers/pipelines/flux/pipeline_flux_img2img.py#L169
 
     The Flux pipeline for image-to-image generation.
 
@@ -314,6 +321,8 @@ def __init__(
         text_encoder_2: T5EncoderModel,
         tokenizer_2: T5TokenizerFast,
         transformer: FluxTransformer2DModel,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
         use_habana: bool = False,
         use_hpu_graphs: bool = False,
         gaudi_config: Union[str, GaudiConfig] = None,

From 1922a64d7ef9df55cf6298308389052fb11be38b Mon Sep 17 00:00:00 2001
From: Iman Gohari <s.m.iman.gohari@intel.com>
Date: Tue, 13 May 2025 20:02:30 +0000
Subject: [PATCH 05/10] fea(): Added test_attention_slicing_forward_pass in
 StableDiffusionInpaintPipelineTests class

---
 tests/test_diffusers.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/test_diffusers.py b/tests/test_diffusers.py
index 8a06f6b330..da286dee24 100644
--- a/tests/test_diffusers.py
+++ b/tests/test_diffusers.py
@@ -5234,6 +5234,9 @@ def test_stable_diffusion_inpaint(self):
 
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
+    def test_attention_slicing_forward_pass(self):
+        super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)
+
     def test_inference_batch_single_identical(self):
         super().test_inference_batch_single_identical(expected_max_diff=3e-3)
 

From 587c14a2161094e4a1bb31ebfba3cd601f74696c Mon Sep 17 00:00:00 2001
From: Iman Gohari <s.m.iman.gohari@intel.com>
Date: Tue, 13 May 2025 22:11:18 +0000
Subject: [PATCH 06/10] Revert "fea(): Added
 test_attention_slicing_forward_pass in StableDiffusionInpaintPipelineTests
 class"

This reverts commit 1922a64d7ef9df55cf6298308389052fb11be38b.
---
 tests/test_diffusers.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/test_diffusers.py b/tests/test_diffusers.py
index da286dee24..8a06f6b330 100644
--- a/tests/test_diffusers.py
+++ b/tests/test_diffusers.py
@@ -5234,9 +5234,6 @@ def test_stable_diffusion_inpaint(self):
 
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
-    def test_attention_slicing_forward_pass(self):
-        super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)
-
     def test_inference_batch_single_identical(self):
         super().test_inference_batch_single_identical(expected_max_diff=3e-3)
 

From f237a3b9381c8a9cc7a3d251cb3142f212a8007f Mon Sep 17 00:00:00 2001
From: Iman Gohari <s.m.iman.gohari@intel.com>
Date: Tue, 13 May 2025 22:43:43 +0000
Subject: [PATCH 07/10] fea(): try test_attention_slicing_forward_pass with
 gc.collect

---
 tests/test_diffusers.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/test_diffusers.py b/tests/test_diffusers.py
index 8a06f6b330..82df8b823b 100644
--- a/tests/test_diffusers.py
+++ b/tests/test_diffusers.py
@@ -5195,6 +5195,11 @@ def get_dummy_components(self):
         }
         return components
 
+    def test_attention_slicing_forward_pass(self):
+        super().tearDown()
+        gc.collect()
+        super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)
+
     def get_dummy_inputs(self, device, seed=0):
         # TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched
         # ensure determinism for the device-dependent torch.Generator on HPU

From 82f669ad415705647dc3037d3352bcbc4a93eb97 Mon Sep 17 00:00:00 2001
From: Iman Gohari <s.m.iman.gohari@intel.com>
Date: Wed, 14 May 2025 17:47:26 +0000
Subject: [PATCH 08/10] Added a comment on test_attention_slicing_forward_pass
 CI behaviour

---
 tests/test_diffusers.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_diffusers.py b/tests/test_diffusers.py
index 82df8b823b..7678d205ed 100644
--- a/tests/test_diffusers.py
+++ b/tests/test_diffusers.py
@@ -5195,6 +5195,7 @@ def get_dummy_components(self):
         }
         return components
 
+    # IG: passes locally but crashes on CI with an uncleared graph; call tearDown() and gc.collect() first to remediate.
     def test_attention_slicing_forward_pass(self):
         super().tearDown()
         gc.collect()

From f75cec938e6d390ab61e47f7925fd873a444fb46 Mon Sep 17 00:00:00 2001
From: "Zhou, Huijuan" <huijuan.zhou@intel.com>
Date: Thu, 15 May 2025 03:13:06 -0700
Subject: [PATCH 09/10] update CogVideoXCausalConv3dforwardGaudi for diffusers
 v0.33.x

---
 .../autoencoders/autoencoder_kl_cogvideox.py  | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/optimum/habana/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py b/optimum/habana/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
index 409f77995f..7f7db81e18 100644
--- a/optimum/habana/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
+++ b/optimum/habana/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
@@ -120,13 +120,14 @@ def CogVideoXCausalConv3dforwardGaudi(
     inputs = self.fake_context_parallel_forward(inputs, conv_cache)
     # conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone()
 
-    padding_2d = (self.width_pad, self.width_pad, self.height_pad, self.height_pad)
-    inputs_pad = F.pad(inputs, padding_2d, mode="constant", value=0)
-
-    output = self.conv(inputs_pad)
-    if self.time_kernel_size > 1:
-        if conv_cache is not None and conv_cache.shape == inputs[:, :, -self.time_kernel_size + 1 :].shape:
-            conv_cache.copy_(inputs[:, :, -self.time_kernel_size + 1 :])
-        else:
-            conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone()
+    if self.pad_mode == "replicate":
+        conv_cache = None
+    else:
+        if self.time_kernel_size > 1:
+            if conv_cache is not None and conv_cache.shape == inputs[:, :, -self.time_kernel_size + 1 :].shape:
+                conv_cache.copy_(inputs[:, :, -self.time_kernel_size + 1 :])
+            else:
+                conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone()
+
+    output = self.conv(inputs)
     return output, conv_cache

From b4d87c16ac2747800100907ad3f5b04c1d08c1b2 Mon Sep 17 00:00:00 2001
From: Iman Gohari <s.m.iman.gohari@intel.com>
Date: Thu, 15 May 2025 22:50:16 +0000
Subject: [PATCH 10/10] fea(): cleaned up the readme. make style

---
 examples/stable-diffusion/README.md           | 40 +++++++++----------
 .../autoencoders/autoencoder_kl_cogvideox.py  |  1 -
 2 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/examples/stable-diffusion/README.md b/examples/stable-diffusion/README.md
index c2dc936464..78cf511b51 100644
--- a/examples/stable-diffusion/README.md
+++ b/examples/stable-diffusion/README.md
@@ -339,25 +339,6 @@ Stable Video Diffusion (SVD) was unveiled in [Stable Video Diffusion Announcemen
 by the Stability AI team. Stable Video Diffusion XT version (SVD-XT) is tuned to generate 25 frames of video from a single image.
 
 
-# CogvideoX Examples
-
-CogVideoX is an open-source version of the video generation model originating from QingYing, unveiled in https://huggingface.co/THUDM/CogVideoX-5b.
-
-```bash
-python text_to_video_generation.py \
-    --model_name_or_path "THUDM/CogVideoX-2b" \
-    --pipeline_type "cogvideox" \
-    --prompts "An astronaut riding a horse" \
-    --use_habana \
-    --use_hpu_graphs \
-    --num_videos_per_prompt 1 \
-    --num_inference_steps 50 \
-    --num_frames 49 \
-    --guidance_scale 6 \
-    --dtype bf16
-```
-
-
 ## Image-to-Video Generation
 
 Script `image_to_video_generation.py` showcases how to perform image-to-video generation using Stable Video Diffusion on Intel Gaudi.
@@ -421,8 +402,7 @@ python image_to_video_generation.py \
     --width=512 \
     --height=512
 ```
-
-# I2vgen-xl
+### Image-to-Video with I2vgen-xl
 I2vgen-xl is high quality Image-to-Video synthesis via cascaded diffusion models. Please refer to  [Huggingface i2vgen-xl doc](https://huggingface.co/ali-vilab/i2vgen-xl).
 
 Here is how to generate video with one image and text prompt:
@@ -447,6 +427,24 @@ python image_to_video_generation.py \
     --bf16
 ```
 
+### Text-to-Video with CogVideoX
+
+CogVideoX is an open-source version of the video generation model originating from QingYing, unveiled in https://huggingface.co/THUDM/CogVideoX-5b.
+
+```bash
+python text_to_video_generation.py \
+    --model_name_or_path "THUDM/CogVideoX-2b" \
+    --pipeline_type "cogvideox" \
+    --prompts "An astronaut riding a horse" \
+    --use_habana \
+    --use_hpu_graphs \
+    --num_videos_per_prompt 1 \
+    --num_inference_steps 50 \
+    --num_frames 49 \
+    --guidance_scale 6 \
+    --dtype bf16
+```
+
 # Important Notes for Gaudi3 Users
 
  - **Batch Size Limitation**: Due to a known issue, batch sizes for some Stable Diffusion models need to be reduced.
diff --git a/optimum/habana/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py b/optimum/habana/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
index 7f7db81e18..4d037fc272 100644
--- a/optimum/habana/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
+++ b/optimum/habana/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
@@ -15,7 +15,6 @@
 from typing import Optional, Union
 
 import torch
-import torch.nn.functional as F
 from diffusers.models.autoencoders.vae import DecoderOutput