From e49d11263e0933119d1c18024bd81f1b341b3c81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=92=8B=E7=A1=95?= Date: Fri, 11 Oct 2024 09:44:44 +0800 Subject: [PATCH 1/8] Improve NPU performance --- src/diffusers/models/attention_processor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 9f9bc5a46e10..54d0c9d997dd 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -4277,6 +4277,7 @@ def __init__(self): CROSS_ATTENTION_PROCESSORS = ( AttnProcessor, AttnProcessor2_0, + AttnProcessorNPU, XFormersAttnProcessor, SlicedAttnProcessor, IPAdapterAttnProcessor, @@ -4286,6 +4287,7 @@ def __init__(self): AttentionProcessor = Union[ AttnProcessor, AttnProcessor2_0, + AttnProcessorNPU, FusedAttnProcessor2_0, XFormersAttnProcessor, SlicedAttnProcessor, From d25d229be047be9d42822cb5dbd4d63d0896bfb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=92=8B=E7=A1=95?= Date: Sat, 12 Oct 2024 17:29:42 +0800 Subject: [PATCH 2/8] Improve NPU performance --- src/diffusers/models/attention_processor.py | 86 ++++++++++++++------- 1 file changed, 58 insertions(+), 28 deletions(-) diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 54d0c9d997dd..a7912a2867ae 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -4274,32 +4274,62 @@ def __init__(self): XFormersAttnAddedKVProcessor, ) -CROSS_ATTENTION_PROCESSORS = ( - AttnProcessor, - AttnProcessor2_0, - AttnProcessorNPU, - XFormersAttnProcessor, - SlicedAttnProcessor, - IPAdapterAttnProcessor, - IPAdapterAttnProcessor2_0, -) +if is_torch_npu_available(): + CROSS_ATTENTION_PROCESSORS = ( + AttnProcessor, + AttnProcessor2_0, + AttnProcessorNPU, + XFormersAttnProcessor, + SlicedAttnProcessor, + IPAdapterAttnProcessor, + IPAdapterAttnProcessor2_0, + ) + + AttentionProcessor = Union[ + AttnProcessor, + AttnProcessor2_0, + AttnProcessorNPU, + FusedAttnProcessor2_0, + XFormersAttnProcessor, + SlicedAttnProcessor, + AttnAddedKVProcessor, + SlicedAttnAddedKVProcessor, + AttnAddedKVProcessor2_0, + XFormersAttnAddedKVProcessor, + CustomDiffusionAttnProcessor, + CustomDiffusionXFormersAttnProcessor, + CustomDiffusionAttnProcessor2_0, + PAGCFGIdentitySelfAttnProcessor2_0, + PAGIdentitySelfAttnProcessor2_0, + PAGCFGHunyuanAttnProcessor2_0, + PAGHunyuanAttnProcessor2_0, + ] -AttentionProcessor = Union[ - AttnProcessor, - AttnProcessor2_0, - AttnProcessorNPU, - FusedAttnProcessor2_0, - XFormersAttnProcessor, - SlicedAttnProcessor, - AttnAddedKVProcessor, - SlicedAttnAddedKVProcessor, - AttnAddedKVProcessor2_0, - XFormersAttnAddedKVProcessor, - CustomDiffusionAttnProcessor, - CustomDiffusionXFormersAttnProcessor, - CustomDiffusionAttnProcessor2_0, - PAGCFGIdentitySelfAttnProcessor2_0, - PAGIdentitySelfAttnProcessor2_0, - PAGCFGHunyuanAttnProcessor2_0, - PAGHunyuanAttnProcessor2_0, -] +else: + CROSS_ATTENTION_PROCESSORS = ( + AttnProcessor, + AttnProcessor2_0, + XFormersAttnProcessor, + SlicedAttnProcessor, + IPAdapterAttnProcessor, + IPAdapterAttnProcessor2_0, + ) + + AttentionProcessor = Union[ + AttnProcessor, + AttnProcessor2_0, + FusedAttnProcessor2_0, + XFormersAttnProcessor, + SlicedAttnProcessor, + AttnAddedKVProcessor, + SlicedAttnAddedKVProcessor, + AttnAddedKVProcessor2_0, + XFormersAttnAddedKVProcessor, + CustomDiffusionAttnProcessor, + CustomDiffusionXFormersAttnProcessor, + CustomDiffusionAttnProcessor2_0, + 
PAGCFGIdentitySelfAttnProcessor2_0, + PAGIdentitySelfAttnProcessor2_0, + PAGCFGHunyuanAttnProcessor2_0, + PAGHunyuanAttnProcessor2_0, + ] From 9555cf3d80e6c65a9ff13c47d25582dc2ad656a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=92=8B=E7=A1=95?= Date: Sat, 12 Oct 2024 17:46:14 +0800 Subject: [PATCH 3/8] Improve NPU performance --- src/diffusers/models/attention_processor.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index a7912a2867ae..46893b8b59d6 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -4275,7 +4275,7 @@ def __init__(self): ) if is_torch_npu_available(): - CROSS_ATTENTION_PROCESSORS = ( + cross_attention_processors = ( AttnProcessor, AttnProcessor2_0, AttnProcessorNPU, @@ -4285,7 +4285,7 @@ def __init__(self): IPAdapterAttnProcessor2_0, ) - AttentionProcessor = Union[ + attentionProcessor = Union[ AttnProcessor, AttnProcessor2_0, AttnProcessorNPU, @@ -4306,7 +4306,7 @@ def __init__(self): ] else: - CROSS_ATTENTION_PROCESSORS = ( + cross_attention_processors = ( AttnProcessor, AttnProcessor2_0, XFormersAttnProcessor, @@ -4315,7 +4315,7 @@ def __init__(self): IPAdapterAttnProcessor2_0, ) - AttentionProcessor = Union[ + attentionProcessor= Union[ AttnProcessor, AttnProcessor2_0, FusedAttnProcessor2_0, @@ -4333,3 +4333,6 @@ def __init__(self): PAGCFGHunyuanAttnProcessor2_0, PAGHunyuanAttnProcessor2_0, ] + +CROSS_ATTENTION_PROCESSORS = cross_attention_processors +AttentionProcessor = attentionProcessor \ No newline at end of file From 2b0febea9f46c1bbc05fe5b1173f7150913dfab5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=92=8B=E7=A1=95?= Date: Mon, 14 Oct 2024 08:35:26 +0800 Subject: [PATCH 4/8] Improve NPU performance --- src/diffusers/models/attention_processor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 46893b8b59d6..7bd3bc3b8d18 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -4285,7 +4285,7 @@ def __init__(self): IPAdapterAttnProcessor2_0, ) - attentionProcessor = Union[ + attention_processor = Union[ AttnProcessor, AttnProcessor2_0, AttnProcessorNPU, @@ -4315,7 +4315,7 @@ def __init__(self): IPAdapterAttnProcessor2_0, ) - attentionProcessor= Union[ + attention_processor= Union[ AttnProcessor, AttnProcessor2_0, FusedAttnProcessor2_0, @@ -4335,4 +4335,4 @@ def __init__(self): ] CROSS_ATTENTION_PROCESSORS = cross_attention_processors -AttentionProcessor = attentionProcessor \ No newline at end of file +AttentionProcessor = attention_processor \ No newline at end of file From 5a742e656c933c70a54a256e816d15fa0493e4bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=92=8B=E7=A1=95?= Date: Thu, 24 Oct 2024 19:18:44 +0800 Subject: [PATCH 5/8] [bugfix] bugfix for npu free memory --- src/diffusers/training_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/training_utils.py b/src/diffusers/training_utils.py index 9c898ad141ee..0e0d0ce5b568 100644 --- a/src/diffusers/training_utils.py +++ b/src/diffusers/training_utils.py @@ -284,7 +284,7 @@ def free_memory(): elif torch.backends.mps.is_available(): torch.mps.empty_cache() elif is_torch_npu_available(): - torch_npu.empty_cache() + torch_npu.npu.empty_cache() # Adapted from torch-ema 
https://github.com/fadel/pytorch_ema/blob/master/torch_ema/ema.py#L14 From 42271f13f3c4116fd6f7f49f7e56bdde9401caa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=92=8B=E7=A1=95?= Date: Thu, 24 Oct 2024 19:28:23 +0800 Subject: [PATCH 6/8] [bugfix] bugfix for npu free memory --- src/diffusers/models/attention_processor.py | 87 ++++++--------------- 1 file changed, 26 insertions(+), 61 deletions(-) diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index a3c893b1bf90..91924b345fd9 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -4290,65 +4290,30 @@ def __init__(self): XFormersAttnAddedKVProcessor, ) -if is_torch_npu_available(): - cross_attention_processors = ( - AttnProcessor, - AttnProcessor2_0, - AttnProcessorNPU, - XFormersAttnProcessor, - SlicedAttnProcessor, - IPAdapterAttnProcessor, - IPAdapterAttnProcessor2_0, - ) - - attention_processor = Union[ - AttnProcessor, - AttnProcessor2_0, - AttnProcessorNPU, - FusedAttnProcessor2_0, - XFormersAttnProcessor, - SlicedAttnProcessor, - AttnAddedKVProcessor, - SlicedAttnAddedKVProcessor, - AttnAddedKVProcessor2_0, - XFormersAttnAddedKVProcessor, - CustomDiffusionAttnProcessor, - CustomDiffusionXFormersAttnProcessor, - CustomDiffusionAttnProcessor2_0, - PAGCFGIdentitySelfAttnProcessor2_0, - PAGIdentitySelfAttnProcessor2_0, - PAGCFGHunyuanAttnProcessor2_0, - PAGHunyuanAttnProcessor2_0, - ] +CROSS_ATTENTION_PROCESSORS = ( + AttnProcessor, + AttnProcessor2_0, + XFormersAttnProcessor, + SlicedAttnProcessor, + IPAdapterAttnProcessor, + IPAdapterAttnProcessor2_0, +) -else: - cross_attention_processors = ( - AttnProcessor, - AttnProcessor2_0, - XFormersAttnProcessor, - SlicedAttnProcessor, - IPAdapterAttnProcessor, - IPAdapterAttnProcessor2_0, - ) - - attention_processor= Union[ - AttnProcessor, - AttnProcessor2_0, - FusedAttnProcessor2_0, - XFormersAttnProcessor, - SlicedAttnProcessor, - AttnAddedKVProcessor, - SlicedAttnAddedKVProcessor, - AttnAddedKVProcessor2_0, - XFormersAttnAddedKVProcessor, - CustomDiffusionAttnProcessor, - CustomDiffusionXFormersAttnProcessor, - CustomDiffusionAttnProcessor2_0, - PAGCFGIdentitySelfAttnProcessor2_0, - PAGIdentitySelfAttnProcessor2_0, - PAGCFGHunyuanAttnProcessor2_0, - PAGHunyuanAttnProcessor2_0, - ] - -CROSS_ATTENTION_PROCESSORS = cross_attention_processors -AttentionProcessor = attention_processor \ No newline at end of file +AttentionProcessor= Union[ + AttnProcessor, + AttnProcessor2_0, + FusedAttnProcessor2_0, + XFormersAttnProcessor, + SlicedAttnProcessor, + AttnAddedKVProcessor, + SlicedAttnAddedKVProcessor, + AttnAddedKVProcessor2_0, + XFormersAttnAddedKVProcessor, + CustomDiffusionAttnProcessor, + CustomDiffusionXFormersAttnProcessor, + CustomDiffusionAttnProcessor2_0, + PAGCFGIdentitySelfAttnProcessor2_0, + PAGIdentitySelfAttnProcessor2_0, + PAGCFGHunyuanAttnProcessor2_0, + PAGHunyuanAttnProcessor2_0, +] \ No newline at end of file From 62a718ca12620162690302636a28baf66b46c3a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=92=8B=E7=A1=95?= Date: Thu, 24 Oct 2024 19:29:45 +0800 Subject: [PATCH 7/8] [bugfix] bugfix for npu free memory --- src/diffusers/models/attention_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 91924b345fd9..e735c4ee7d17 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -4299,7 +4299,7 @@ 
def __init__(self): IPAdapterAttnProcessor2_0, ) -AttentionProcessor= Union[ +AttentionProcessor = Union[ AttnProcessor, AttnProcessor2_0, FusedAttnProcessor2_0, @@ -4316,4 +4316,4 @@ def __init__(self): PAGIdentitySelfAttnProcessor2_0, PAGCFGHunyuanAttnProcessor2_0, PAGHunyuanAttnProcessor2_0, -] \ No newline at end of file +] From 2dbbe34bc89f32fd28f95bed7c544d5785c3f571 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=92=8B=E7=A1=95?= Date: Fri, 1 Nov 2024 12:39:15 +0800 Subject: [PATCH 8/8] Reduce memory cost for flux training process --- examples/dreambooth/train_dreambooth_flux.py | 6 ++++++ examples/dreambooth/train_dreambooth_lora_flux.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/examples/dreambooth/train_dreambooth_flux.py b/examples/dreambooth/train_dreambooth_flux.py index bd1c29009976..9fd95fe823a5 100644 --- a/examples/dreambooth/train_dreambooth_flux.py +++ b/examples/dreambooth/train_dreambooth_flux.py @@ -1740,6 +1740,9 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): torch_npu.npu.empty_cache() gc.collect() + images = None + del pipeline + # Save the lora layers accelerator.wait_for_everyone() if accelerator.is_main_process: @@ -1798,6 +1801,9 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): ignore_patterns=["step_*", "epoch_*"], ) + images = None + del pipeline + accelerator.end_training() diff --git a/examples/dreambooth/train_dreambooth_lora_flux.py b/examples/dreambooth/train_dreambooth_lora_flux.py index a0a197b1b2ee..bba95a326b25 100644 --- a/examples/dreambooth/train_dreambooth_lora_flux.py +++ b/examples/dreambooth/train_dreambooth_lora_flux.py @@ -1842,6 +1842,9 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): del text_encoder_one, text_encoder_two free_memory() + images = None + del pipeline + # Save the lora layers accelerator.wait_for_everyone() if accelerator.is_main_process: @@ -1906,6 +1909,9 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): ignore_patterns=["step_*", "epoch_*"], ) + images = None + del pipeline + accelerator.end_training()
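Note on the attention-processor patches (1 through 4, then 6 and 7): the series first registers AttnProcessorNPU in CROSS_ATTENTION_PROCESSORS and in the AttentionProcessor union, then tries gating both definitions on is_torch_npu_available(), and finally reverts to the original unconditional definitions without AttnProcessorNPU; the processor already guards itself, since its constructor raises when torch_npu is not installed. Opting into the NPU fused-attention kernel therefore remains an explicit call on the model. A minimal sketch of that opt-in, assuming an Ascend device is present; the model id is illustrative:

    import torch
    from diffusers import UNet2DConditionModel
    from diffusers.models.attention_processor import AttnProcessorNPU
    from diffusers.utils.import_utils import is_torch_npu_available

    unet = UNet2DConditionModel.from_pretrained(
        "runwayml/stable-diffusion-v1-5", subfolder="unet", torch_dtype=torch.float16
    )
    if is_torch_npu_available():
        # Route every attention layer through torch_npu's fused attention kernel.
        unet.set_attn_processor(AttnProcessorNPU())
        unet.to("npu")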
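Patch 5 is the substantive bug fix: like torch.cuda, torch_npu keeps its allocator helpers on a submodule, so there is no top-level torch_npu.empty_cache() and the old call failed with AttributeError; the correct spelling is torch_npu.npu.empty_cache(). A sketch of the corrected helper, following the backend checks visible in the training_utils.py hunk (any additional backends upstream are omitted here):

    import gc
    import torch
    from diffusers.utils.import_utils import is_torch_npu_available

    def free_memory():
        """Run garbage collection, then clear the active accelerator's cache."""
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        elif torch.backends.mps.is_available():
            torch.mps.empty_cache()
        elif is_torch_npu_available():
            import torch_npu  # Ascend extension; provides the torch_npu.npu namespace

            torch_npu.npu.empty_cache()  # fixed call: empty_cache lives under .npu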
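Patch 8 applies the same memory discipline to both Flux DreamBooth scripts: once the validation images have been logged, the scripts drop the last Python references to the images and to the temporary validation pipeline, before the final checkpoint save and again after the final validation run. The ordering matters, because empty_cache() can only return blocks that no live tensor still occupies. A condensed illustration of the pattern; the objects below are stand-ins for the scripts' real FluxPipeline and validation outputs:

    import gc
    import torch

    # Stand-ins: in the scripts, `pipeline` is the FluxPipeline built for
    # validation and `images` holds the sampled validation images.
    pipeline = object()
    images = [torch.empty(3, 512, 512)]

    # Drop every live reference first, then flush the device cache.
    images = None
    del pipeline
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()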