From 8fdfd0ef1ff80a20e362fe5ed9ff0191026b1db3 Mon Sep 17 00:00:00 2001 From: Iman Gohari Date: Wed, 7 May 2025 22:21:37 +0000 Subject: [PATCH 01/10] fea(): skipping contrastive_generate_dynamic_shapes based on RD team list --- tests/transformers/tests/models/t5/test_modeling_t5.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/transformers/tests/models/t5/test_modeling_t5.py b/tests/transformers/tests/models/t5/test_modeling_t5.py index 46754368aa..7f9da6f9ca 100644 --- a/tests/transformers/tests/models/t5/test_modeling_t5.py +++ b/tests/transformers/tests/models/t5/test_modeling_t5.py @@ -800,6 +800,10 @@ def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + @pytest.mark.skip("Skipped for Gaudi") + def test_contrastive_generate_dynamic_shapes(self): + pass + @pytest.mark.skip("Skipped for Gaudi") def test_generate_with_past_key_values(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() From e1d3c069b9b09779cd80434acbb1bf5f66e1c6b4 Mon Sep 17 00:00:00 2001 From: Iman Gohari Date: Wed, 7 May 2025 22:22:22 +0000 Subject: [PATCH 02/10] fea(): skipping roberta inputs_embeds_decoder_only due to HF changes --- .../tests/models/roberta/test_modeling_roberta.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/transformers/tests/models/roberta/test_modeling_roberta.py b/tests/transformers/tests/models/roberta/test_modeling_roberta.py index 7723c753e4..d9c6794f4b 100644 --- a/tests/transformers/tests/models/roberta/test_modeling_roberta.py +++ b/tests/transformers/tests/models/roberta/test_modeling_roberta.py @@ -16,6 +16,7 @@ import unittest +import pytest from transformers import RobertaConfig, is_torch_available from transformers.testing_utils import TestCasePlus, require_torch, slow @@ -477,6 +478,10 @@ def test_for_question_answering(self): config_and_inputs = 
self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + @pytest.mark.skip("Skipped since HF upstream test is modified starting v4.47") + def test_generate_from_inputs_embeds_decoder_only(self): + pass + @slow def test_model_from_pretrained(self): model_name = "FacebookAI/roberta-base" From 96c8b2c761982d5343803632509b748abd770b03 Mon Sep 17 00:00:00 2001 From: Iman Gohari Date: Fri, 9 May 2025 23:30:16 +0000 Subject: [PATCH 03/10] initial commit --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c4ef7f50f5..463726e452 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ "optimum", "torch", "accelerate >= 1.5.0", - "diffusers >= 0.32.0, < 0.32.1", + "diffusers >= 0.33.1, < 0.33.2", "huggingface_hub >= 0.24.7", "sentence-transformers == 3.3.1", ] From c888ce57d61510c267f2fe15bf47f825285f5eed Mon Sep 17 00:00:00 2001 From: Iman Gohari Date: Mon, 12 May 2025 17:21:54 +0000 Subject: [PATCH 04/10] fixed flux changes --- .../pipelines/flux/pipeline_flux_img2img.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py b/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py index 17894db5ae..8ada22a143 100644 --- a/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py +++ b/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py @@ -27,7 +27,14 @@ from diffusers.models.transformers import FluxTransformer2DModel from diffusers.pipelines.flux.pipeline_flux_img2img import FluxImg2ImgPipeline, calculate_shift, retrieve_timesteps from diffusers.utils import BaseOutput, replace_example_docstring -from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTokenizer, + CLIPVisionModelWithProjection, + T5EncoderModel, + T5TokenizerFast, +)
from optimum.utils import logging @@ -262,7 +269,7 @@ def __call__( class GaudiFluxImg2ImgPipeline(GaudiDiffusionPipeline, FluxImg2ImgPipeline): r""" - Adapted from https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/pipelines/flux/pipeline_flux_img2img.py#L162 + Adapted from https://github.com/huggingface/diffusers/blob/v0.33.1/src/diffusers/pipelines/flux/pipeline_flux_img2img.py#L169 The Flux pipeline for image-to-image generation. @@ -314,6 +321,8 @@ def __init__( text_encoder_2: T5EncoderModel, tokenizer_2: T5TokenizerFast, transformer: FluxTransformer2DModel, + image_encoder: CLIPVisionModelWithProjection = None, + feature_extractor: CLIPImageProcessor = None, use_habana: bool = False, use_hpu_graphs: bool = False, gaudi_config: Union[str, GaudiConfig] = None, From 1922a64d7ef9df55cf6298308389052fb11be38b Mon Sep 17 00:00:00 2001 From: Iman Gohari Date: Tue, 13 May 2025 20:02:30 +0000 Subject: [PATCH 05/10] fea(): Added test_attention_slicing_forward_pass in StableDiffusionInpaintPipelineTests class --- tests/test_diffusers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_diffusers.py b/tests/test_diffusers.py index 8a06f6b330..da286dee24 100644 --- a/tests/test_diffusers.py +++ b/tests/test_diffusers.py @@ -5234,6 +5234,9 @@ def test_stable_diffusion_inpaint(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + def test_attention_slicing_forward_pass(self): + super().test_attention_slicing_forward_pass(expected_max_diff=3e-3) + def test_inference_batch_single_identical(self): super().test_inference_batch_single_identical(expected_max_diff=3e-3) From 587c14a2161094e4a1bb31ebfba3cd601f74696c Mon Sep 17 00:00:00 2001 From: Iman Gohari Date: Tue, 13 May 2025 22:11:18 +0000 Subject: [PATCH 06/10] Revert "fea(): Added test_attention_slicing_forward_pass in StableDiffusionInpaintPipelineTests class" This reverts commit 1922a64d7ef9df55cf6298308389052fb11be38b. 
--- tests/test_diffusers.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test_diffusers.py b/tests/test_diffusers.py index da286dee24..8a06f6b330 100644 --- a/tests/test_diffusers.py +++ b/tests/test_diffusers.py @@ -5234,9 +5234,6 @@ def test_stable_diffusion_inpaint(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - def test_attention_slicing_forward_pass(self): - super().test_attention_slicing_forward_pass(expected_max_diff=3e-3) - def test_inference_batch_single_identical(self): super().test_inference_batch_single_identical(expected_max_diff=3e-3) From f237a3b9381c8a9cc7a3d251cb3142f212a8007f Mon Sep 17 00:00:00 2001 From: Iman Gohari Date: Tue, 13 May 2025 22:43:43 +0000 Subject: [PATCH 07/10] fea(): try test_attention_slicing_forward_pass with gc.collect --- tests/test_diffusers.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_diffusers.py b/tests/test_diffusers.py index 8a06f6b330..82df8b823b 100644 --- a/tests/test_diffusers.py +++ b/tests/test_diffusers.py @@ -5195,6 +5195,11 @@ def get_dummy_components(self): } return components + def test_attention_slicing_forward_pass(self): + super().tearDown() + gc.collect() + super().test_attention_slicing_forward_pass(expected_max_diff=3e-3) + def get_dummy_inputs(self, device, seed=0): # TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched # ensure determinism for the device-dependent torch.Generator on HPU From 82f669ad415705647dc3037d3352bcbc4a93eb97 Mon Sep 17 00:00:00 2001 From: Iman Gohari Date: Wed, 14 May 2025 17:47:26 +0000 Subject: [PATCH 08/10] Added a comment on test_attention_slicing_forward_pass CI behaviour --- tests/test_diffusers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_diffusers.py b/tests/test_diffusers.py index 82df8b823b..7678d205ed 100644 --- a/tests/test_diffusers.py +++ b/tests/test_diffusers.py @@ -5195,6 +5195,7 @@ def get_dummy_components(self): } return 
components + # IG: this test passes locally, but crashes on CI with uncleared graph. Adding teardown and gc.collect to remediate. def test_attention_slicing_forward_pass(self): super().tearDown() gc.collect() From f75cec938e6d390ab61e47f7925fd873a444fb46 Mon Sep 17 00:00:00 2001 From: "Zhou, Huijuan" Date: Thu, 15 May 2025 03:13:06 -0700 Subject: [PATCH 09/10] update CogVideoXCausalConv3dforwardGaudi for diffuser v0.33.x --- .../autoencoders/autoencoder_kl_cogvideox.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/optimum/habana/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py b/optimum/habana/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py index 409f77995f..7f7db81e18 100644 --- a/optimum/habana/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +++ b/optimum/habana/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py @@ -120,13 +120,14 @@ def CogVideoXCausalConv3dforwardGaudi( inputs = self.fake_context_parallel_forward(inputs, conv_cache) # conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone() - padding_2d = (self.width_pad, self.width_pad, self.height_pad, self.height_pad) - inputs_pad = F.pad(inputs, padding_2d, mode="constant", value=0) - - output = self.conv(inputs_pad) - if self.time_kernel_size > 1: - if conv_cache is not None and conv_cache.shape == inputs[:, :, -self.time_kernel_size + 1 :].shape: - conv_cache.copy_(inputs[:, :, -self.time_kernel_size + 1 :]) - else: - conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone() + if self.pad_mode == "replicate": + conv_cache = None + else: + if self.time_kernel_size > 1: + if conv_cache is not None and conv_cache.shape == inputs[:, :, -self.time_kernel_size + 1 :].shape: + conv_cache.copy_(inputs[:, :, -self.time_kernel_size + 1 :]) + else: + conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone() + + output = self.conv(inputs) return output, conv_cache From b4d87c16ac2747800100907ad3f5b04c1d08c1b2 Mon Sep 17 
00:00:00 2001 From: Iman Gohari Date: Thu, 15 May 2025 22:50:16 +0000 Subject: [PATCH 10/10] fea(): cleaned up the readme. make style --- examples/stable-diffusion/README.md | 40 +++++++++---------- .../autoencoders/autoencoder_kl_cogvideox.py | 1 - 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/examples/stable-diffusion/README.md b/examples/stable-diffusion/README.md index c2dc936464..78cf511b51 100644 --- a/examples/stable-diffusion/README.md +++ b/examples/stable-diffusion/README.md @@ -339,25 +339,6 @@ Stable Video Diffusion (SVD) was unveiled in [Stable Video Diffusion Announcemen by the Stability AI team. Stable Video Diffusion XT version (SVD-XT) is tuned to generate 25 frames of video from a single image. -# CogvideoX Examples - -CogVideoX is an open-source version of the video generation model originating from QingYing, unveiled in https://huggingface.co/THUDM/CogVideoX-5b. - -```bash -python text_to_video_generation.py \ - --model_name_or_path "THUDM/CogVideoX-2b" \ - --pipeline_type "cogvideox" \ - --prompts "An astronaut riding a horse" \ - --use_habana \ - --use_hpu_graphs \ - --num_videos_per_prompt 1 \ - --num_inference_steps 50 \ - --num_frames 49 \ - --guidance_scale 6 \ - --dtype bf16 -``` - - ## Image-to-Video Generation Script `image_to_video_generation.py` showcases how to perform image-to-video generation using Stable Video Diffusion on Intel Gaudi. @@ -421,8 +402,7 @@ python image_to_video_generation.py \ --width=512 \ --height=512 ``` - -# I2vgen-xl +### Image-to-Video with I2vgen-xl I2vgen-xl is high quality Image-to-Video synthesis via cascaded diffusion models. Please refer to [Huggingface i2vgen-xl doc](https://huggingface.co/ali-vilab/i2vgen-xl). 
Here is how to generate video with one image and text prompt: @@ -447,6 +427,24 @@ python image_to_video_generation.py \ --bf16 ``` +### Text-to-Video with CogVideoX + +CogVideoX is an open-source version of the video generation model originating from QingYing, unveiled in https://huggingface.co/THUDM/CogVideoX-5b. + +```bash +python text_to_video_generation.py \ + --model_name_or_path "THUDM/CogVideoX-2b" \ + --pipeline_type "cogvideox" \ + --prompts "An astronaut riding a horse" \ + --use_habana \ + --use_hpu_graphs \ + --num_videos_per_prompt 1 \ + --num_inference_steps 50 \ + --num_frames 49 \ + --guidance_scale 6 \ + --dtype bf16 +``` + # Important Notes for Gaudi3 Users - **Batch Size Limitation**: Due to a known issue, batch sizes for some Stable Diffusion models need to be reduced. diff --git a/optimum/habana/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py b/optimum/habana/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py index 7f7db81e18..4d037fc272 100644 --- a/optimum/habana/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +++ b/optimum/habana/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py @@ -15,7 +15,6 @@ from typing import Optional, Union import torch -import torch.nn.functional as F from diffusers.models.autoencoders.vae import DecoderOutput